### Notebook to scrub the Ebola dataset for Project 2

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# Raise pandas' display limits so wide/long frames show more in cell output
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Files to Load (Remember to Change These)
ebola_data_to_load = "Data/ebola_2014_2016_clean.csv"
centroid_data_to_load = "Data/clean_country_centroids_az8.csv"
pop_data_to_load = "Data/pop_data_2009-2019.csv"

# Read the three CSV inputs: Ebola case reports, country centroids and
# population figures. Only the Ebola file is read with an explicit
# ISO-8859-1 encoding; the other two use the pandas default.
ebola_data = pd.read_csv(ebola_data_to_load, encoding="ISO-8859-1")
centroid_data = pd.read_csv(centroid_data_to_load)
pop_data = pd.read_csv(pop_data_to_load)

In [6]:
# Normalise column names: spaces become underscores and '.' is removed
ebola_data.columns = [
    c.replace(' ', '_').replace('.', '') for c in ebola_data.columns
]

# Parse the report dates once. The deprecated infer_datetime_format flag is
# dropped: recent pandas infers a consistent format by default and warns
# when the flag is passed.
ebola_data['Date'] = pd.to_datetime(ebola_data['Date'])

# Add year column to dataframe, derived from the already-parsed dates
ebola_data['year'] = ebola_data['Date'].dt.year

# Find earliest and latest dates
print(ebola_data.Date.min())
print(ebola_data.Date.max())
ebola_data

2014-08-29 00:00:00
2016-03-23 00:00:00


Unnamed: 0,Country,Date,No_of_suspected_cases,No_of_probable_cases,No_of_confirmed_cases,"No_of_confirmed,_probable_and_suspected_cases",No_of_suspected_deaths,No_of_probable_deaths,No_of_confirmed_deaths,"No_of_confirmed,_probable_and_suspected_deaths",year
0,Guinea,2014-08-29,25.0,141.0,482.0,648.0,2.0,141.0,287.0,430.0,2014
1,Nigeria,2014-08-29,3.0,1.0,15.0,19.0,0.0,1.0,6.0,7.0,2014
2,Sierra Leone,2014-08-29,54.0,37.0,935.0,1026.0,8.0,34.0,380.0,422.0,2014
3,Liberia,2014-08-29,382.0,674.0,322.0,1378.0,168.0,301.0,225.0,694.0,2014
4,Sierra Leone,2014-09-05,78.0,37.0,1146.0,1261.0,11.0,37.0,443.0,491.0,2014
...,...,...,...,...,...,...,...,...,...,...,...
2480,Liberia,2016-03-23,5636.0,1879.0,3151.0,10666.0,,,,4806.0,2016
2481,Italy,2016-03-23,0.0,0.0,1.0,1.0,,,,0.0,2016
2482,Liberia,2016-03-23,0.0,3.0,2.0,5.0,,3.0,1.0,4.0,2016
2483,Nigeria,2016-03-23,0.0,1.0,19.0,20.0,0.0,1.0,7.0,8.0,2016


In [7]:
# Count the number of rows recorded for each country
ebola_data.Country.value_counts()

Liberia                     365
Guinea                      259
Sierra Leone                259
Nigeria                     255
Senegal                     254
United States of America    245
Mali                        243
Spain                       243
United Kingdom              221
Italy                       141
Name: Country, dtype: int64

In [8]:
# Collapse to one row per (Country, year), keeping the largest confirmed-case
# and confirmed-death values reported within that year
ebola_1 = (
    ebola_data
    .groupby(["Country", "year"], as_index=False)[
        ['No_of_confirmed_cases', 'No_of_confirmed_deaths']
    ]
    .max()
)

ebola_1

Unnamed: 0,Country,year,No_of_confirmed_cases,No_of_confirmed_deaths
0,Guinea,2014,2397.0,1433.0
1,Guinea,2015,3351.0,2083.0
2,Guinea,2016,3351.0,2083.0
3,Italy,2015,1.0,0.0
4,Italy,2016,1.0,
5,Liberia,2014,3110.0,1241.0
6,Liberia,2015,3153.0,3858.0
7,Liberia,2016,3151.0,3.0
8,Mali,2014,7.0,5.0
9,Mali,2015,7.0,


In [None]:
# Prepare longitude and latitude data for the merge by country
# Get name, Longitude, and Latitude from centroid_data; .copy() makes this an
# independent frame so the edit below never targets a view of centroid_data
centroid_data_1 = centroid_data[['name', 'Longitude', 'Latitude']].copy()
# Rename 'United States' to the spelling used in the Ebola data's Country
# column. The result is assigned back explicitly: a chained
# `frame['name'].replace(..., inplace=True)` acts on a temporary Series and
# is silently lost under pandas Copy-on-Write.
centroid_data_1['name'] = centroid_data_1['name'].replace(
    {'United States': 'United States of America'}
)
centroid_data_1

In [None]:
# Attach centroid coordinates to each country/year row. The left join keeps
# every row of ebola_1, matching its 'Country' against the centroid 'name'.
ebola_2 = ebola_1.merge(
    centroid_data_1,
    how='left',
    left_on='Country',
    right_on='name',
)
ebola_2

In [None]:
# Restrict pop_data to the country name plus the 2014, 2015 and 2016 columns
pop_data_1 = pop_data.loc[:, ['Country', '2014', '2015', '2016']]
pop_data_1

In [None]:
# Left-join the per-year population columns onto the country/year rows,
# matching on the shared 'Country' column
ebola_3 = ebola_2.merge(pop_data_1, how='left', on='Country')
ebola_3

In [None]:
# Add population column: pick the population figure from the column that
# matches each row's own year, then scale it exactly once.
# The previous chain of np.where calls fed the already-scaled result back in
# as the fallback, so 2014 rows were multiplied by 1000 three times, 2015
# rows once, and 2016 rows not at all.
# NOTE(review): the x1000 factor is carried over from the original code,
# which presumably means the source figures are in thousands - confirm
# against the population file.
ebola_3['population'] = 1000 * np.select(
    [ebola_3['year'] == 2014, ebola_3['year'] == 2015, ebola_3['year'] == 2016],
    [ebola_3['2014'], ebola_3['2015'], ebola_3['2016']],
    default=np.nan,  # any other year has no matching population column
)
ebola_3['Pandemic'] = "Ebola"
# Replace NaN values in No_of_confirmed_deaths column with 0. Assigned back
# explicitly because a chained `frame[col].fillna(..., inplace=True)` does
# not update the frame under pandas Copy-on-Write.
ebola_3['No_of_confirmed_deaths'] = ebola_3['No_of_confirmed_deaths'].fillna(0)
ebola_3

In [None]:
# Select the output columns in their final order:
# Pandemic, Country, year, cases, deaths, longitude, latitude, population
ebola_4 = ebola_3.loc[:, [
    'Pandemic',
    'Country',
    'year',
    'No_of_confirmed_cases',
    'No_of_confirmed_deaths',
    'Longitude',
    'Latitude',
    'population',
]]
# Shorten the column names for the exported file
ebola_5 = ebola_4.rename(
    columns={
        "No_of_confirmed_cases": "Cases",
        "No_of_confirmed_deaths": "Deaths",
        "year": "Year",
        "Longitude": "Lon",
        "Latitude": "Lat",
    }
)
ebola_5

In [None]:
# Write the scrubbed frame to CSV, with a header row and without the index
ebola_5.to_csv(r'Data/ebola_data.csv', header=True, index=False)