## Demonstration of machine learning models

This notebook tests the effectiveness of a variety of machine learning models on our data.

In [1]:
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import brier_score_loss, log_loss, f1_score

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold

import warnings

In [3]:
cohort_79 = pd.read_csv('data/cohort79_Jun8.csv')

In [4]:
cohort_97 = pd.read_csv('data/cohort97_Jun8.csv')

In [5]:
merged_data = pd.concat([cohort_79, cohort_97], sort=False)
merged_data.drop(merged_data[merged_data["adjusted_income"] <= 1000].index, inplace=True)
merged_data.fillna(0, inplace=True)

predictors = list(merged_data.columns)
vars_to_drop = ["case_id","urban_or_rural","family_size", "sample_id", "year", "shock", "region", "highest_grade", "industry", "occupation", "Unnamed: 0",'marital_status', 'race', "region_1", "region_2", "region_3", "region_4", "work_kind_limited", "work_amount_limited"]
for var in vars_to_drop:
    predictors.remove(var)
    
X = merged_data[predictors]
y = np.ravel(merged_data["shock"])

In [6]:
def score_model(X, y, estimator, n_folds, scale=False, **kwargs):
    """
    Test various estimators.
    """ 
    
    kf = KFold(n_splits = n_folds)
    f1_scores = []
    brier_scores = []
    log_loss_scores = []
    
    for train_index, test_index in kf.split(X):
    
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if scale:
            # The standard scaler will raise a warning about variable types. To suppress those for the moment...    
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                scaler = StandardScaler().fit(X_train)
                X_train = scaler.transform(X_train)
                X_test = scaler.transform(X_test)

        estimator.fit(X_train, y_train, **kwargs)  

        expected  = y_test
        predicted = estimator.predict(X_test)
        predicted_proba = estimator.predict_proba(X_test)[:,1]
        
        f1_scores.append(f1_score(expected, predicted, average="weighted"))
        brier_scores.append(brier_score_loss(expected, predicted_proba))
        log_loss_scores.append(log_loss(expected, predicted_proba))
    
    print("Model: {}".format(estimator.__class__.__name__))
    print("F1 score (higher is better): {:.03f}".format(np.mean(f1_scores)))
    print("Brier score loss (lower is better): {:.03f}".format(np.mean(brier_scores)))
    print("Log loss (lower is better): {:.03f}\n".format(np.mean(log_loss_scores)))


In [7]:
dummy_predictions = np.full((len(X), 1), merged_data["shock"].median())
dummy_probs = np.full((len(X), 1), merged_data["shock"].mean())
brier_skill_ref = brier_score_loss(y, dummy_probs)

print("Model: DummyPredictor")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print("F1 score (higher is better): {:.03f}".format(f1_score(y, dummy_predictions, average="weighted")))
print("Brier score loss (lower is better): {:.03f}".format(brier_skill_ref))
print("Log loss (lower is better): {:.03f}\n".format(log_loss(y, dummy_probs)))

models = [
    SGDClassifier(loss="log", max_iter=1000, tol=.001),
    LogisticRegression(solver="lbfgs"),
    GradientBoostingClassifier(),
    RandomForestClassifier(n_estimators=100),
    BaggingClassifier(),
    GaussianNB(),
    MLPClassifier(hidden_layer_sizes=(71,), activation='tanh', alpha=.001),
    GradientBoostingClassifier(n_estimators=100, learning_rate=.3, max_features=30, max_depth=4, min_samples_leaf=75),
]

for model in models:
    score_model(X, y, model, 5, True)

Model: DummyPredictor
F1 score (higher is better): 0.662
Brier score loss (lower is better): 0.180
Log loss (lower is better): 0.546

Model: SGDClassifier
F1 score (higher is better): 0.694
Brier score loss (lower is better): 0.175
Log loss (lower is better): 0.559

Model: LogisticRegression
F1 score (higher is better): 0.693
Brier score loss (lower is better): 0.171
Log loss (lower is better): 0.521

Model: GradientBoostingClassifier
F1 score (higher is better): 0.679
Brier score loss (lower is better): 0.169
Log loss (lower is better): 0.517

Model: RandomForestClassifier
F1 score (higher is better): 0.691
Brier score loss (lower is better): 0.171
Log loss (lower is better): 0.520

Model: BaggingClassifier
F1 score (higher is better): 0.696
Brier score loss (lower is better): 0.192
Log loss (lower is better): 1.090

Model: GaussianNB
F1 score (higher is better): 0.603
Brier score loss (lower is better): 0.359
Log loss (lower is better): 3.394

Model: MLPClassifier
F1 score (higher is