# Notebook 3: Missing Data
This notebook contains the code used to impute missing data for the dataset. Only code that implemented the final imputation method is included here. For other methods of imputation that were rejected, please see the Preliminary Missing Data iPython Notebook.

In [6]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from statsmodels.compat import lzip
import statsmodels.formula.api as sm
import statsmodels.stats.api as sms
import pickle
from sklearn.preprocessing import Imputer
%matplotlib inline

## Load in cleaned data
Let's load in the data that we've cleaned and preprocessed.

In [7]:
# Load cleaned NCD data from pickled files
out = open('data/clean/deaths_100k.p', 'r')
deaths_100k = pickle.load(out)
out.close()
out = open('data/clean/risk.p', 'r')
risk_of_death = pickle.load(out)
out.close()
out = open('data/clean/crops.p', 'r')
crops = pickle.load(out)
out.close()
out = open('data/clean/meat.p', 'r')
meat = pickle.load(out)
out.close()

### Get Response Variables
First let's isolate some possible response variables we might want to look at:
* Deaths for 100k people in 2000 from all causes
* Risk of death for 30 year old in 2000

Note that the following analysis could naturally be extended to 2012 as well, but for this milestone we'll focus on the year 2000.

In [8]:
# Get response variables
deaths_100k_all_2000 = deaths_100k['all'][2000]
deaths_100k_cancer_2000 = deaths_100k['cancer'][2000]
deaths_100k_cardio_2000 = deaths_100k['cardio'][2000]
deaths_100k_diabetes_2000 = deaths_100k['diabetes'][2000]
deaths_100k_resp_2000 = deaths_100k['resp'][2000]

risk_of_death_2000 = risk_of_death[2000]

### Get Food Predictor Variables
Next let's get the food predictors (crop and meat data). Rather than looking at single years, let's calculate the mean for each crop/meat in the 30 years preceding the year of our response variable.

In [9]:
# TODO: Turn this process into a function later?
time_period = range(1970, 2000)

# Calculate the mean for each crop/meat over the period 1970-2000
food_1970_2000 = pd.DataFrame(index=risk_of_death.index)

for crop in crops.iterkeys():
    food_1970_2000[crop] = crops[crop][time_period].mean(axis=1)
    
for m in meat.iterkeys():
    food_1970_2000[m] = meat[m][time_period].mean(axis=1)

food_1970_2000.head()

Unnamed: 0_level_0,Ricebran Oil,Oilcrops,Plantains,"Sugar, Raw Equivalent","Beverages, Alcoholic",Roots & Tuber Dry Equiv,Vegetable Oils,Olives (including preserved),Cloves,Millet and products,...,Offals,Bovine Meat,"Molluscs, Other","Fish, Body Oil","Aquatic Animals, Others",Animal fats,Honey,"Offals, Edible",Demersal Fish,Cream
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,,2.629,,14.402,0.002333,8.539333,7.995,0.065333,,5.360333,...,9.166,16.762667,,,,6.513,0.687667,9.166,,0.0
Albania,,7.044333,,50.132333,3.201,10.253333,20.501,7.017667,,,...,6.729333,18.681,0.416,0.0,0.0,8.995,0.488667,6.734667,0.604,0.005333
Algeria,,1.646,0.0,69.079333,0.108667,15.592333,33.352667,1.432667,0.004667,,...,3.079333,9.742333,0.001,0.0,0.0,5.180333,0.247333,3.079333,0.950333,
Angola,,3.671667,,26.933333,1.855,157.302,21.056,,0.0,16.849333,...,3.521333,21.081,0.000333,0.0,0.0,1.588,5.557,3.521333,3.859667,
Argentina,,1.813,,113.051667,10.766,41.883333,35.942667,1.235333,0.000333,,...,19.928,192.834333,0.675,0.0,0.0,10.627667,0.505,19.933333,13.161333,0.157


### Dropping sparse columns and rows

Unfortunately it looks like even after averaging over thirty years, we still have quite a bit of NaN's. Let's try dropping the countries that are almost half NaN's:

In [10]:
# list of countries to drop due to being less than 50% full
countries_to_drop = []

for index, row in food_1970_2000.iterrows():
    if row.isnull().sum() > len(row) / 2:
        countries_to_drop.append(index)

print countries_to_drop
print "Number of countries to drop:", len(countries_to_drop)

['Bahrain', 'Belgium', 'Bhutan', 'Burundi', 'Comoros', 'Democratic Republic of the Congo', 'Equatorial Guinea', 'Eritrea', 'Libya', 'Montenegro', 'Papua New Guinea', 'Qatar', 'Serbia', 'Singapore', 'Somalia', 'South Sudan', 'Sudan', 'Syrian Arab Republic']
Number of countries to drop: 18


In [11]:
# Drop the identified countries with very sparse data
food_1970_2000_cleaned = food_1970_2000.drop(countries_to_drop)

Let's also see how many columns are very sparse (< 50% full) and consider dropping them as well if there are not too many.

In [12]:
cols_to_drop = []

# Identify sparse columns to drop
for col in food_1970_2000_cleaned.columns:
    if food_1970_2000_cleaned[col].isnull().sum() > len(food_1970_2000_cleaned[col]) / 2:
        cols_to_drop.append(col)
        
print cols_to_drop
print "Number of columns to drop:", len(cols_to_drop)

['Ricebran Oil', 'Millet and products', 'Sugar non-centrifugal', 'Molasses', 'Sugar beet', 'Sorghum and products', 'Sunflower seed', 'Sugar Crops', 'Sugar cane', 'Yams', 'Meat, Aquatic Mammals', 'Meat Meal', 'Whey']
Number of columns to drop: 13


Overall, there aren't too many columns to drop. However, unfortunately data on sugar-related products seems to be disproportionately sparse (which makes sense since very few countries have the correct conditions to grow sugar domestically). We'll drop these columns for now. 

The good news is though that we still have many columns that still contain sugar-related products even after dropping, so we should still be able to use sugar as a predictor in some form.

In [13]:
# Drop identified crops with very sparse data
food_1970_2000_cleaned = food_1970_2000_cleaned.drop(cols_to_drop, axis=1)

In [14]:
# Sanity check to see what percentage of cells are missing
print "Percentage NaN cells after dropping:", food_1970_2000_cleaned.isnull().sum().sum() / float(food_1970_2000_cleaned.shape[0] * food_1970_2000_cleaned.shape[1])

Percentage NaN cells after dropping: 0.0638031693078


### Imputation of Missing Values
Since the number of NaN cells has drastically been reduced, now let's use simple mean imputation (global average per crop) to fill them in. It may be worth investigating other methods for imputation in the future, but given the small number of NaN's now, this method should not have a huge impact on our results. Imputing via the global average introduces the least bias, as the overall results will only gravitate toward the mean with this imputation.

In [18]:
# Impute by mean for each column (i.e. global average per crop)
imp = Imputer(axis=1)
food_1970_2000_cleaned = pd.DataFrame(imp.fit_transform(food_1970_2000_cleaned), index=food_1970_2000_cleaned.index, columns=food_1970_2000_cleaned.columns)

In [19]:
food_1970_2000_cleaned.head()

Unnamed: 0_level_0,Oilcrops,Plantains,"Sugar, Raw Equivalent","Beverages, Alcoholic",Roots & Tuber Dry Equiv,Vegetable Oils,Olives (including preserved),Cloves,Coconuts - Incl Copra,Treenuts,...,Offals,Bovine Meat,"Molluscs, Other","Fish, Body Oil","Aquatic Animals, Others",Animal fats,Honey,"Offals, Edible",Demersal Fish,Cream
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,2.629,29.068781,14.402,0.002333,8.539333,7.995,0.065333,29.068781,0.015,2.327333,...,9.166,16.762667,29.068781,29.068781,29.068781,6.513,0.687667,9.166,29.068781,0.0
Albania,7.044333,32.502693,50.132333,3.201,10.253333,20.501,7.017667,32.502693,0.006333,4.35,...,6.729333,18.681,0.416,0.0,0.0,8.995,0.488667,6.734667,0.604,0.005333
Algeria,1.646,0.0,69.079333,0.108667,15.592333,33.352667,1.432667,0.004667,0.012,1.453667,...,3.079333,9.742333,0.001,0.0,0.0,5.180333,0.247333,3.079333,0.950333,25.450712
Angola,3.671667,26.325172,26.933333,1.855,157.302,21.056,26.325172,0.0,0.0,0.401,...,3.521333,21.081,0.000333,0.0,0.0,1.588,5.557,3.521333,3.859667,26.325172
Argentina,1.813,44.950187,113.051667,10.766,41.883333,35.942667,1.235333,0.000333,0.584,1.133,...,19.928,192.834333,0.675,0.0,0.0,10.627667,0.505,19.933333,13.161333,0.157


In [22]:
# Write the cleaned food dataset to pickle for use later
pickle.dump(food_1970_2000_cleaned, open('data/imputed/food_1970_2000_cleaned.p', 'wb'))