# Third Party Data
Let's merge and impute the hospital beds and doctors data using the same process that we used for the food data.

In [63]:
import pickle
import pandas as pd
import numpy as np

In [2]:
# Load hospital beds and doctors data
# Pickles are binary, so read them in 'rb' mode; `with` closes each file even
# if unpickling raises.
# NOTE: pickle.load can execute arbitrary code - only load files this project produced.
with open('data/clean/beds.p', 'rb') as beds_file:
    beds = pickle.load(beds_file)
with open('data/clean/doctors.p', 'rb') as doctors_file:
    doctors = pickle.load(doctors_file)

In [14]:
# Load cleaned food data to see what countries should be included
# Binary mode matches how pickles are written; `with` closes the file even on error.
with open('data/final/food_2000_2012_cleaned.p', 'rb') as food_file:
    food = pickle.load(food_file)

# The index of the food table is the list of countries the other datasets are aligned to
countries = food.index

In [48]:
# Helper to print the share of missing (NaN) cells in a DataFrame or Series
def print_NaN_percent(df):
    '''
    Print the fraction of cells in a pandas object that are NaN

    Despite the "Percentage" label, the printed number is a fraction between
    0 and 1 (missing cells / total cells); it is not scaled to 100.

    Args:
        df: DataFrame or Series to inspect; any other type prints nothing
    Returns:
        None
    '''
    if isinstance(df, pd.DataFrame):
        missing = df.isnull().sum().sum()
        total = df.shape[0] * df.shape[1]
    elif isinstance(df, pd.Series):
        missing = df.isnull().sum()
        total = df.shape[0]
    else:
        return

    # A single pre-formatted argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Percentage NaN cells: %s" % (missing / float(total)))

In [10]:
# Report how sparse the doctors table is, then preview its first rows
print_NaN_percent(doctors)
doctors.head()

Percentage NaN cells: 0.658812729498


Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.034844,,,,,0.063428,,,,,...,0.146,0.145,0.175,0.194,0.234,0.225,0.266,,,
Angola,0.067068,,,,,0.076062,,,,,...,,,0.166,,,,,,,
Albania,0.276291,,,,,0.481283,,,,,...,1.146,,1.144,1.132,1.113,1.145,1.145,,,
United Arab Emirates,,,,,,,,,,,...,1.93,2.737,2.428,2.533,,,,,,
Argentina,1.350698,,,,,1.666652,,,,,...,,,,3.21,,,3.859,,,


In [12]:
# Report how sparse the hospital beds table is, then preview its first rows
print_NaN_percent(beds)
beds.head()

Percentage NaN cells: 0.762137902897


Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.170627,,,,,,,,,,...,0.42,0.42,0.4,0.4,,0.5,,,,
Angola,2.061462,,,,,,,,,,...,,,,,,,,,,
Albania,5.102676,,,,,,,,,,...,2.9,,2.8,,2.43,2.6,,,,
United Arab Emirates,,,,,,,,,,,...,1.86,1.9,,,,1.1,,,,
Argentina,6.352251,,,,,,,,,,...,,,,4.5,4.5,4.7,,,,


Both of these datasets are very sparse: roughly 66% of the doctors cells and 76% of the hospital beds cells are missing.

## Create merged dataframes for hospital beds and doctors data

In [58]:
def merge_df(df, start, end, countries):
    '''
    Helper to average a dataframe of country-year data over a given time period

    Args:
        df: dataframe to average; rows are countries and columns are selected
            by integer year label
        start: first year included in the average
        end: year at which the period stops; exclusive, so the average covers
            start, start + 1, ..., end - 1
        countries: countries to use as index of the resulting dataframe
    Returns:
        Single-column DataFrame with each country's mean over the time period.
        Countries that are absent from df get NaN.
    '''
    # list() keeps the column selection a plain list of labels even where
    # range() is lazy (Python 3)
    time_period = list(range(start, end))

    # Row-wise mean over the selected year columns
    merged = df[time_period].mean(axis=1)
    # Align to the requested countries. reindex (rather than .loc) turns
    # countries missing from df into NaN rows instead of raising KeyError on
    # newer pandas versions.
    merged = pd.DataFrame(merged.reindex(countries))

    return merged

In [59]:
# Get average data for hospital beds
# NOTE(review): merge_df builds its period with range(start, end), so the end
# year is excluded - these averages cover 1970-1999 and 2000-2011.
beds_1970_2000 = merge_df(beds, 1970, 2000, countries)
print_NaN_percent(beds_1970_2000)
beds_2000_2012 = merge_df(beds, 2000, 2012, countries)
print_NaN_percent(beds_2000_2012)

Percentage NaN cells: 0.142857142857
Percentage NaN cells: 0.123376623377


In [60]:
# Get average data for doctors
# NOTE(review): merge_df builds its period with range(start, end), so the end
# year is excluded - these averages cover 1970-1999 and 2000-2011.
doctors_1970_2000 = merge_df(doctors, 1970, 2000, countries)
print_NaN_percent(doctors_1970_2000)
doctors_2000_2012 = merge_df(doctors, 2000, 2012, countries)
print_NaN_percent(doctors_2000_2012)

Percentage NaN cells: 0.12987012987
Percentage NaN cells: 0.12987012987


In [64]:
# Let's use mean imputation as we did before to fill in the rest of the missing data
# Each frame's NaN cells are replaced with that frame's own column mean; the
# un-imputed frames are left untouched because fillna returns a new object.
beds_1970_2000_cleaned = beds_1970_2000.fillna(beds_1970_2000.mean())
beds_2000_2012_cleaned = beds_2000_2012.fillna(beds_2000_2012.mean())
doctors_1970_2000_cleaned = doctors_1970_2000.fillna(doctors_1970_2000.mean())
doctors_2000_2012_cleaned = doctors_2000_2012.fillna(doctors_2000_2012.mean())

In [66]:
# Sanity check that we have no more NaN's
cleaned_frames = [
    beds_1970_2000_cleaned,
    beds_2000_2012_cleaned,
    doctors_1970_2000_cleaned,
    doctors_2000_2012_cleaned,
]
for frame in cleaned_frames:
    print_NaN_percent(frame)
    # Fail loudly instead of only printing: mean imputation cannot fill a
    # column whose mean is itself NaN, and these frames get saved afterwards.
    assert not frame.isnull().values.any(), "NaN's remain after mean imputation"

Percentage NaN cells: 0.0
Percentage NaN cells: 0.0
Percentage NaN cells: 0.0
Percentage NaN cells: 0.0


In [None]:
# Save for use later
cleaned_outputs = [
    (beds_1970_2000_cleaned, 'data/final/beds_1970_2000_cleaned.p'),
    (beds_2000_2012_cleaned, 'data/final/beds_2000_2012_cleaned.p'),
    (doctors_1970_2000_cleaned, 'data/final/doctors_1970_2000_cleaned.p'),
    (doctors_2000_2012_cleaned, 'data/final/doctors_2000_2012_cleaned.p'),
]
for frame, path in cleaned_outputs:
    # The context manager flushes and closes each file as soon as its pickle
    # is written, rather than leaving the handle to the garbage collector.
    with open(path, 'wb') as handle:
        pickle.dump(frame, handle)