In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.

# Compiling ebola data

**TASK**: The `DATA_FOLDER/ebola` folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country. Use pandas to import these data files into a single `DataFrame`. Using this `DataFrame`, calculate for *each country*, the *daily average per month* of *new cases* and *deaths*. Make sure you handle all the different expressions for *new cases* and *deaths* that are used in the reports.

First we load the data for each country and we collect it all in one single `DataFrame` called `ebola_alldata`.

In [2]:
def import_data(country, data_folder='./Data/ebola'):
    """Load every daily report of one country into a single DataFrame.

    Parameters
    ----------
    country : str
        Folder prefix of the country; the reports are read from
        ``<data_folder>/<country>_data/*.csv``.
    data_folder : str, optional
        Root folder of the ebola reports. The default is the path this
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all the reports, taken in sorted file-name
        order. Each file keeps its own row labels, so labels repeat across
        files. ``sort=True`` sorts the columns when the reports do not share
        the same columns.

    Raises
    ------
    ValueError
        If no CSV file matches the pattern.
    """
    pattern = '{}/{}_data/*.csv'.format(data_folder, country)
    # glob returns files in an arbitrary, filesystem-dependent order; sorting
    # makes the row order of the result reproducible between runs.
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # pd.concat would fail on an empty list anyway, but without saying
        # which path was searched.
        raise ValueError('No report found matching {!r}'.format(pattern))
    return pd.concat([pd.read_csv(date_file) for date_file in all_files], sort=True)

In [3]:
# Load all the Guinea reports and tag every row with its country of origin.
guinea_data = import_data('guinea').assign(Country='guinea')
print(type(guinea_data))
print(len(guinea_data))
guinea_data

<class 'pandas.core.frame.DataFrame'>
714


Unnamed: 0,Beyla,Boffa,Conakry,Coyah,Dabola,Dalaba,Date,Description,Dinguiraye,Dubreka,...,Lola,Macenta,Mzerekore,Nzerekore,Pita,Siguiri,Telimele,Totals,Yomou,Country
0,,0,5,,0,,2014-08-04,New cases of suspects,0,0,...,,0,,0,0,0,0,5,,guinea
1,,0,0,,0,,2014-08-04,New cases of probables,0,0,...,,0,,0,0,0,0,0,,guinea
2,,0,1,,0,,2014-08-04,New cases of confirmed,0,0,...,,0,,0,0,0,0,4,,guinea
3,,0,6,,0,,2014-08-04,Total new cases registered so far,0,0,...,,0,,0,0,0,0,9,,guinea
4,,0,9,,0,,2014-08-04,Total cases of suspects,0,0,...,,0,,0,0,0,0,11,,guinea
5,,5,8,,3,,2014-08-04,Total cases of probables,1,0,...,,11,,0,1,0,3,133,,guinea
6,,18,78,,1,,2014-08-04,Total cases of confirmed,0,0,...,,28,,4,1,6,23,351,,guinea
7,,23,95,,4,,2014-08-04,Cumulative (confirmed + probable + suspects),1,0,...,,39,,4,2,6,26,495,,guinea
8,,0,0,,0,,2014-08-04,New deaths registered today,0,0,...,,0,,0,0,0,0,2,,guinea
9,,0,0,,0,,2014-08-04,New deaths registered today (confirmed),0,0,...,,0,,0,0,0,0,2,,guinea


In [4]:
# Load all the Liberia reports and tag every row with its country of origin.
liberia_data = import_data('liberia').assign(Country='liberia')
print(len(liberia_data))
liberia_data.head(1)

3152


Unnamed: 0,Bomi County,Bong County,Date,Gbarpolu County,Grand Bassa,Grand Cape Mount,Grand Gedeh,Grand Kru,Lofa County,Margibi County,Maryland County,Montserrado County,National,Nimba County,River Gee County,RiverCess County,Sinoe County,Unnamed: 18,Variable,Country
0,,,6/16/2014,,,,,,1.0,,,0.0,1.0,,,,,,Specimens collected,liberia


In [5]:
# Load all the Sierra Leone reports and tag every row with its country of origin.
sl_data = import_data('sl').assign(Country='sl')
print(len(sl_data))
sl_data.head(1)

3262


Unnamed: 0,34 Military Hospital,Bo,Bo EMC,Bombali,Bonthe,Hastings-F/Town,Kailahun,Kambia,Kenema,Kenema (IFRC),...,Pujehun,Tonkolili,Unnamed: 18,Western area,Western area combined,Western area rural,Western area urban,date,variable,Country
0,,654142,,494139,168729,,465048,341690,653013,,...,335574,434937,,,,263619,1040888,2014-08-12,population,sl


We fix the columns which do not have the same name but refer to the same measure, before merging together the data from the three countries.

In [6]:
# Harmonise the column names that differ between countries but hold the same
# measure. `index=str` additionally converts the row labels to strings.
guinea_data = guinea_data.rename(index=str, columns={"Totals": "National"})
liberia_data = liberia_data.rename(index=str, columns={"Variable": "Description"})
sl_data = sl_data.rename(index=str, columns={"date": "Date", "variable": "Description"})

# Merge the data from the three countries in one single DataFrame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; row order and `sort=True` are the same as before.
ebola_alldata = pd.concat([guinea_data, liberia_data, sl_data], sort=True)

We define a function `label_description()` that maps each description used in the .csv files to its corresponding category of interest. The descriptions must be inspected country by country to find all possible matches.

In [7]:
# Canonical category labels; they become the column names of the final table.
DEATHS_TOTAL    = 'Total New Deaths'
CASES_CONFIRMED = 'New Cases: Confirmed'
CASES_PROBABLES = 'New Cases: Probables'
CASES_SUSPECTS  = 'New Cases: Suspects'

In [8]:
def label_description(description):
    """Map a raw report description to its canonical category label.

    Parameters
    ----------
    description : object
        A value of the `Description` column. The match is exact
        (case- and whitespace-sensitive).

    Returns
    -------
    str or None
        One of `CASES_SUSPECTS`, `CASES_PROBABLES`, `CASES_CONFIRMED` or
        `DEATHS_TOTAL`; `None` when the description is not one of the
        tracked expressions (this includes missing and unhashable values).
    """
    labels = {"New cases of suspects": CASES_SUSPECTS,
              "New cases of probables": CASES_PROBABLES,
              "New cases of confirmed": CASES_CONFIRMED,
              "New deaths registered": DEATHS_TOTAL,
              "New deaths registered today": DEATHS_TOTAL,
              "Newly reported deaths": DEATHS_TOTAL,
              "New Case/s (Suspected)": CASES_SUSPECTS,
              "New Case/s (Probable)": CASES_PROBABLES,
              "New case/s (confirmed)": CASES_CONFIRMED,
              "new_suspected": CASES_SUSPECTS,
              "new_probable": CASES_PROBABLES,
              "new_confirmed": CASES_CONFIRMED,
              "etc_new_deaths": DEATHS_TOTAL,
              }
    try:
        # dict.get returns None for unknown descriptions, so no exception is
        # needed for the common "not of interest" case.
        return labels.get(description)
    except TypeError:
        # Unhashable values (e.g. a list) cannot be looked up in a dict.
        # Only this error is caught, so genuine problems such as a missing
        # constant or a keyboard interrupt are no longer silently swallowed.
        return None

With `filter_data()` we create a `DataFrame` containing only the data of interest, then with `compute_daily_av()` we compute the daily average per month of *new cases* and *deaths* for each country.

In [9]:
def filter_data(data_in, label_function):
    """Keep the rows of interest and pivot them into one column per category.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Must provide the columns 'Description', 'Country', 'National' and
        'Date'.
    label_function : callable
        Applied to every 'Description' value; rows for which it returns a
        null value are discarded, the others are relabelled with its result.

    Returns
    -------
    pandas.DataFrame
        Indexed by (Country, Year, Month, Day), with one column per label
        returned by `label_function`, holding the 'National' values.
    """
    labels = data_in['Description'].apply(label_function)
    keep = labels.notnull()

    # Parse the dates of the retained rows; the year, month and day are all
    # read from this single index.
    dates = pd.DatetimeIndex(data_in['Date'][keep])

    tidy = data_in.loc[keep, ['Country', 'National']].assign(
        Description=labels[keep],
        Year=dates.year,
        Month=dates.month,
        Day=dates.day,
    )

    national = tidy.set_index(['Description', 'Country', 'Year', 'Month', 'Day'])['National']
    # Move the category labels from the row index to the columns.
    return national.unstack('Description')

In [10]:
# Uncomment to inspect the intermediate (per-day, not yet averaged) table:
# filter_data(ebola_alldata, label_description)

In [11]:
def compute_daily_av(data_in, label_function):
    """Compute the daily average per month of every category returned by `filter_data`.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Raw reports, passed unchanged to `filter_data`.
    label_function : callable
        Description-to-label mapping, passed unchanged to `filter_data`.

    Returns
    -------
    pandas.DataFrame
        Mean of each column over the rows sharing the first three index
        levels of the `filter_data` result (Country, Year, Month).
    """
    data_out = filter_data(data_in, label_function)

    # A category that is missing for a given day is counted as zero, then
    # everything is converted to int (the conversion fails on values that
    # cannot be read as integers).
    data_out = data_out.fillna(0).astype(int)

    # DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in
    # pandas 2.0; grouping on the same levels is the supported equivalent.
    # sort=False keeps the groups in order of first appearance.
    return data_out.groupby(level=[0, 1, 2], sort=False).mean()

In [12]:
# Daily average per month of new cases and deaths, for each country.
daily_averages = compute_daily_av(ebola_alldata, label_description)
daily_averages

Unnamed: 0_level_0,Unnamed: 1_level_0,Description,New Cases: Confirmed,New Cases: Probables,New Cases: Suspects,Total New Deaths
Country,Year,Month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
guinea,2014,8,12.4,1.6,11.8,3.4
guinea,2014,9,13.0,1.1875,5.4375,3.5625
guinea,2014,10,6.0,0.0,28.0,15.0
liberia,2014,6,2.142857,1.142857,2.428571,2.0
liberia,2014,7,1.818182,3.727273,3.0,4.272727
liberia,2014,8,5.444444,19.777778,12.0,23.222222
liberia,2014,9,6.166667,29.333333,28.333333,36.041667
liberia,2014,10,1.36,17.76,26.44,28.04
liberia,2014,11,2.6,7.0,16.866667,13.466667
liberia,2014,12,1928.333333,1208.0,2042.222222,0.0
