### Notebook to scrub ebola dataset for Project 2

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# File to Load (Remember to Change These)
ebola_data_to_load = "Data/ebola_2014_2016_clean.csv"
centroid_data_to_load = "Data/country_centroids_az8.csv"
pop_data_to_load = "Data/pop_data_2009-2019.csv"

# Read Population Data from "Estimates" sheet
ebola_data = pd.read_csv(ebola_data_to_load, encoding="ISO-8859-1")
centroid_data = pd.read_csv(centroid_data_to_load)
pop_data = pd.read_csv(pop_data_to_load)

In [2]:
# remove spaces and '.' in column names
ebola_data.columns = [c.replace(' ', '_') for c in ebola_data.columns]
ebola_data.columns = [c.replace('.', '') for c in ebola_data.columns]

# Add year column to dataframe
ebola_data['year'] = pd.DatetimeIndex(ebola_data['Date']).year

# Find earliest and latest dates
print(ebola_data.Date.min())
print(ebola_data.Date.max())
ebola_data

2014-08-29
2016-03-23


Unnamed: 0,Country,Date,No_of_suspected_cases,No_of_probable_cases,No_of_confirmed_cases,"No_of_confirmed,_probable_and_suspected_cases",No_of_suspected_deaths,No_of_probable_deaths,No_of_confirmed_deaths,"No_of_confirmed,_probable_and_suspected_deaths",year
0,Guinea,2014-08-29,25.0,141.0,482.0,648.0,2.0,141.0,287.0,430.0,2014
1,Nigeria,2014-08-29,3.0,1.0,15.0,19.0,0.0,1.0,6.0,7.0,2014
2,Sierra Leone,2014-08-29,54.0,37.0,935.0,1026.0,8.0,34.0,380.0,422.0,2014
3,Liberia,2014-08-29,382.0,674.0,322.0,1378.0,168.0,301.0,225.0,694.0,2014
4,Sierra Leone,2014-09-05,78.0,37.0,1146.0,1261.0,11.0,37.0,443.0,491.0,2014
...,...,...,...,...,...,...,...,...,...,...,...
2480,Liberia,2016-03-23,5636.0,1879.0,3151.0,10666.0,,,,4806.0,2016
2481,Italy,2016-03-23,0.0,0.0,1.0,1.0,,,,0.0,2016
2482,Liberia,2016-03-23,0.0,3.0,2.0,5.0,,3.0,1.0,4.0,2016
2483,Nigeria,2016-03-23,0.0,1.0,19.0,20.0,0.0,1.0,7.0,8.0,2016


In [3]:
# How many occurrences of each country in dataset?
ebola_data['Country'].value_counts()

Liberia                     365
Sierra Leone                259
Guinea                      259
Nigeria                     255
Senegal                     254
United States of America    245
Spain                       243
Mali                        243
United Kingdom              221
Italy                       141
Name: Country, dtype: int64

In [4]:
# Group by country and year
ebola_1 = ebola_data.groupby(["Country", "year"], as_index = False).agg(
    {
        'No_of_confirmed_cases':'max',    
        'No_of_confirmed_deaths': 'max'
    }
)

ebola_1

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths
0,Guinea,2014,2397.0,1433.0
1,Guinea,2015,3351.0,2083.0
2,Guinea,2016,3351.0,2083.0
3,Italy,2015,1.0,0.0
4,Italy,2016,1.0,
5,Liberia,2014,3110.0,1241.0
6,Liberia,2015,3153.0,3858.0
7,Liberia,2016,3151.0,3.0
8,Mali,2014,7.0,5.0
9,Mali,2015,7.0,


In [12]:
# Merge longitude and latitude data by country
# Get name, Longitude, and Latitude from centroid_data
centroid_data_1 = centroid_data[['name', 'Longitude', 'Latitude']]
centroid_data_1['name'].replace(['United States'], 'United States of America', inplace=True)
centroid_data_1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Unnamed: 0,name,Longitude,Latitude
0,Aruba,-69.982677,12.52088
1,Afghanistan,66.004734,33.835231
2,Angola,17.537368,-12.293361
3,Anguilla,-63.064989,18.223959
4,Albania,20.049834,41.14245
5,Aland,19.953288,60.214887
6,Andorra,1.560544,42.542291
7,United Arab Emirates,54.300167,23.905282
8,Argentina,-65.179807,-35.381349
9,Armenia,44.929933,40.289526


In [13]:
# Merge centroid lon, lat 
ebola_2 = pd.merge(ebola_1, centroid_data_1, left_on='Country', right_on='name', how = 'left')
ebola_2

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths,name,Longitude,Latitude
0,Guinea,2014,2397.0,1433.0,Guinea,-10.940666,10.436216
1,Guinea,2015,3351.0,2083.0,Guinea,-10.940666,10.436216
2,Guinea,2016,3351.0,2083.0,Guinea,-10.940666,10.436216
3,Italy,2015,1.0,0.0,Italy,12.070013,42.796626
4,Italy,2016,1.0,,Italy,12.070013,42.796626
5,Liberia,2014,3110.0,1241.0,Liberia,-9.322076,6.452785
6,Liberia,2015,3153.0,3858.0,Liberia,-9.322076,6.452785
7,Liberia,2016,3151.0,3.0,Liberia,-9.322076,6.452785
8,Mali,2014,7.0,5.0,Mali,-3.542691,17.345816
9,Mali,2015,7.0,,Mali,-3.542691,17.345816


In [14]:
# Get subset of pop_data, Country, 2014, 2015, 2016
pop_data_1 = pop_data[['Country', '2014', '2015', '2016']]
pop_data_1

Unnamed: 0,Country,2014,2015,2016
0,WORLD,7295290.759,7379796.967,7464021.934
1,Burundi,9844.301,10160.034,10488.002
2,Comoros,759.39,777.435,795.597
3,Djibouti,898.707,913.998,929.117
4,Eritrea,3311.444,3342.818,3376.558
5,Ethiopia,98094.264,100835.453,103603.461
6,Kenya,46700.063,47878.339,49051.531
7,Madagascar,23589.897,24234.08,24894.37
8,Malawi,16289.55,16745.305,17205.253
9,Mauritius,1257.351,1259.457,1261.87


In [15]:
# Merge Population Data
# Merge centroid lon, lat 
ebola_3 = pd.merge(ebola_2, pop_data_1, on='Country', how = 'left')
ebola_3

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths,name,Longitude,Latitude,2014,2015,2016
0,Guinea,2014,2397.0,1433.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434
1,Guinea,2015,3351.0,2083.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434
2,Guinea,2016,3351.0,2083.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434
3,Italy,2015,1.0,0.0,Italy,12.070013,42.796626,60409.622,60578.489,60663.068
4,Italy,2016,1.0,,Italy,12.070013,42.796626,60409.622,60578.489,60663.068
5,Liberia,2014,3110.0,1241.0,Liberia,-9.322076,6.452785,4359.508,4472.229,4586.788
6,Liberia,2015,3153.0,3858.0,Liberia,-9.322076,6.452785,4359.508,4472.229,4586.788
7,Liberia,2016,3151.0,3.0,Liberia,-9.322076,6.452785,4359.508,4472.229,4586.788
8,Mali,2014,7.0,5.0,Mali,-3.542691,17.345816,16934.213,17438.772,17965.448
9,Mali,2015,7.0,,Mali,-3.542691,17.345816,16934.213,17438.772,17965.448


In [21]:
# Add population column
ebola_3['population'] = ebola_3['2014'] * 1000
ebola_3['population'] = np.where(ebola_3['year'] == 2015, ebola_3['2015'], ebola_3['population'] * 1000)
ebola_3['population'] = np.where(ebola_3['year'] == 2016, ebola_3['2016'], ebola_3['population'] * 1000)
ebola_3['Pandemic'] = "Ebola"
# Replace Nan values in No_of_confirmed_deaths column to 0
ebola_3['No_of_confirmed_deaths'].fillna(0, inplace=True)
ebola_3

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths,name,Longitude,Latitude,2014,2015,2016,population,Pandemic
0,Guinea,2014,2397.0,1433.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434,11150970000000.0,Ebola
1,Guinea,2015,3351.0,2083.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434,11432100.0,Ebola
2,Guinea,2016,3351.0,2083.0,Guinea,-10.940666,10.436216,11150.97,11432.096,11738.434,11738.43,Ebola
3,Italy,2015,1.0,0.0,Italy,12.070013,42.796626,60409.622,60578.489,60663.068,60578490.0,Ebola
4,Italy,2016,1.0,0.0,Italy,12.070013,42.796626,60409.622,60578.489,60663.068,60663.07,Ebola
5,Liberia,2014,3110.0,1241.0,Liberia,-9.322076,6.452785,4359.508,4472.229,4586.788,4359508000000.0,Ebola
6,Liberia,2015,3153.0,3858.0,Liberia,-9.322076,6.452785,4359.508,4472.229,4586.788,4472229.0,Ebola
7,Liberia,2016,3151.0,3.0,Liberia,-9.322076,6.452785,4359.508,4472.229,4586.788,4586.788,Ebola
8,Mali,2014,7.0,5.0,Mali,-3.542691,17.345816,16934.213,17438.772,17965.448,16934210000000.0,Ebola
9,Mali,2015,7.0,0.0,Mali,-3.542691,17.345816,16934.213,17438.772,17965.448,17438770.0,Ebola


In [22]:
# Get columns in following order: Pandemic, Country, Year, Cases, Deaths, lon,lat, population
ebola_4 = ebola_3[['Pandemic', 'Country', 'year', 'No_of_confirmed_cases', 'No_of_confirmed_deaths', 'Longitude', 'Latitude', 'population']]
ebola_5 = ebola_4.rename(columns={"year": "Year", "Longitude": "Lon", "Latitude": "Lat", "No_of_confirmed_cases": "Cases", "No_of_confirmed_deaths": "Deaths"  })
ebola_5

Unnamed: 0,Pandemic,Country,Year,Cases,Deaths,Lon,Lat,population
0,Ebola,Guinea,2014,2397.0,1433.0,-10.940666,10.436216,11150970000000.0
1,Ebola,Guinea,2015,3351.0,2083.0,-10.940666,10.436216,11432100.0
2,Ebola,Guinea,2016,3351.0,2083.0,-10.940666,10.436216,11738.43
3,Ebola,Italy,2015,1.0,0.0,12.070013,42.796626,60578490.0
4,Ebola,Italy,2016,1.0,0.0,12.070013,42.796626,60663.07
5,Ebola,Liberia,2014,3110.0,1241.0,-9.322076,6.452785,4359508000000.0
6,Ebola,Liberia,2015,3153.0,3858.0,-9.322076,6.452785,4472229.0
7,Ebola,Liberia,2016,3151.0,3.0,-9.322076,6.452785,4586.788
8,Ebola,Mali,2014,7.0,5.0,-3.542691,17.345816,16934210000000.0
9,Ebola,Mali,2015,7.0,0.0,-3.542691,17.345816,17438770.0


In [23]:
# Save scrubbed file to .csv
ebola_5.to_csv(r'Data/ebola_data.csv', index = False, header=True)