# Notebook 9: Advanced Models
This notebook contains our exploration more sophisticated models. 

In [4]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import pickle
from collections import Counter
import sklearn as sk
from sklearn.preprocessing import Imputer
from sklearn import linear_model
%matplotlib inline

In [5]:
# Load cleaned NCD data from pickled files
out = open('data/clean/deaths_100k.p', 'r')
deaths_100k = pickle.load(out)
out.close()
out = open('data/clean/risk.p', 'r')
risk_of_death = pickle.load(out)
out.close()
out = open('data/clean/crops.p', 'r')
crops = pickle.load(out)
out.close()
out = open('data/clean/meat.p', 'r')
meat = pickle.load(out)
out.close()
out = open('data/clean/var_desc_livestock.p', 'r')
livestock_desc = pickle.load(out)
out.close()
out = open('data/clean/var_desc_crops.p', 'r')
crops_desc = pickle.load(out)
out.close()

# load cleaned food data with redundant variables removed
out = open('data/final/food_1970_2000_cleaned.p', 'r')
food_1970_2000_cleaned = pickle.load(out)

# Get response variables
deaths_100k_all_2000 = deaths_100k['all'][2000]
deaths_100k_cancer_2000 = deaths_100k['cancer'][2000]
deaths_100k_cardio_2000 = deaths_100k['cardio'][2000]
deaths_100k_diabetes_2000 = deaths_100k['diabetes'][2000]
deaths_100k_resp_2000 = deaths_100k['resp'][2000]

risk_of_death_2000 = risk_of_death[2000]

time_period = range(1970, 2000)

# Calculate the mean for each crop/meat over the period 1970-2000
food_1970_2000 = pd.DataFrame(index=risk_of_death.index)

for crop in crops.iterkeys():
    food_1970_2000[crop] = crops[crop][time_period].mean(axis=1)
    
for m in meat.iterkeys():
    food_1970_2000[m] = meat[m][time_period].mean(axis=1)

# list of countries to drop due to being less than 50% full
countries_to_drop = []

for index, row in food_1970_2000.iterrows():
    if row.isnull().sum() > len(row) / 2:
        countries_to_drop.append(index)

# I. Lasso Regression
To do variable selection efficiently, as explained in Baseline Models and Revised Baseline Models.

## Risk of Death in 2000 as Response Variable

We use `sklearn`'s `GridSearchCV` function to find the best hyperparameter ($\alpha$) value.

In [6]:
lasso = linear_model.Lasso(alpha = 1.0)
X = food_1970_2000_cleaned.values
y_death_lasso = risk_of_death_2000.drop(countries_to_drop).values
lasso.fit(X, y_death_lasso)
lasso.score(X, y_death_lasso)
lasso.coef_
params = {
    'alpha': [0,0.3,0.6,1.0,1.3,1.6,2.0,2.3,2.6,3.0,4.0,5.0, 6.0, 7.0, 8.0] + 
             [4.3, 4.6, 5.3, 5.6, 6.0, 6.3, 6.6, 7.0],
    'fit_intercept': [True, False],
    'normalize': [True, False],
}

CV_model = sk.grid_search.GridSearchCV(lasso, param_grid=params, cv=5)
CV_model.fit(X, y_death_lasso)
CV_model.best_params_

AttributeError: 'module' object has no attribute 'grid_search'

We then fit the model using the optimum alpha provided to fit the model using `statsmodels`.

In [None]:
# Fit linear model with risk of death in 2000 as response variable
risk_2000_model_lasso = sm.OLS(risk_of_death_2000.drop(countries_to_drop), food_1970_2000_cleaned)
risk_2000_results_lasso = risk_2000_model_lasso.fit_regularized(alpha = 5.6)
print(risk_2000_results_lasso.summary())

Checking the same diagnostics as before, while the Condition Number is much smaller (indicating a reduced likelihood of multicollinearity affecting the model), for the model considering overall percentage risk of death in 2000 the residuals do not appear to be normally distributed (according to the Omnibus and Jarque-Bera Tests), so we should address that issue in follow-up models.

In [None]:
plot_sig_coeffs(risk_2000_results_lasso, 0.1)

## Deaths per 100k from All Causes as Response Variable

In [None]:
lasso = linear_model.Lasso(alpha = 1.0)
X = food_1970_2000_cleaned
y_death_100k_cancer_lasso = deaths_100k_all_2000.drop(countries_to_drop).sort_index().loc[food_1970_2000_cleaned.index].values
lasso.fit(X, y_death_100k_cancer_lasso)
lasso.score(X, y_death_100k_cancer_lasso)
lasso.coef_
params = {
    'alpha': [0,0.3,0.6,1.0,1.3,1.6,2.0,2.3,2.6,3.0,4.0,5.0],
    'fit_intercept': [True, False],
    'normalize': [True, False],
}

CV_model = sk.grid_search.GridSearchCV(lasso, param_grid=params, cv=5)
CV_model.fit(X, y_death_100k_cancer_lasso)
CV_model.best_params_