## Model selection

Build ML pipelines and through cross-validation and assessment choose a model that performs the best

### Import libraries

In [6]:
# print_function for compatibility with Python 3
from __future__ import print_function
# NumPy for numerical computing
import numpy as np
# Pandas for DataFrames
import pandas as pd
# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline
# Seaborn for easier visualization
import seaborn as sns
# Scikit-Learn for Modeling
import sklearn
# Pickle for saving model files
import pickle

In [7]:
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression
# Import RandomForestClassifier and GradientBoostingClassifer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [8]:
# Function for splitting training and test set
from sklearn.model_selection import train_test_split
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV

In [24]:
# Classification metrics (added later)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Load ABT

In [64]:
abt = pd.read_csv('abt.csv', index_col=0)

## Split dataset

Into training and testing data. Don't touch test data until late in the process.

In [65]:
abt.head()

Unnamed: 0,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,days_between,Neighbourhood_ANDORINHAS,Neighbourhood_ANTÔNIO HONÓRIO,...,Neighbourhood_SÃO PEDRO,Neighbourhood_TABUAZEIRO,Neighbourhood_UNIVERSITÁRIO,Neighbourhood_VILA RUBIM,IsReschedule,PrevAppointments,PrevNoShows,Gender_F,Gender_M,age_group
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,45-65
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,45-65
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,45-65
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,6-12
4,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,45-65


In [66]:
# create dummy variables for the age-group but keep in a separate variable for stratification
#age_group = abt['age_group']
abt = pd.get_dummies(abt, 'age_group')

target = abt['No-show']
input_features = abt.drop('No-show', axis=1)

input_train, input_test, target_train, target_test = train_test_split(
    input_features, target, test_size=0.2, random_state=1234)

In [68]:
# Make sure the target variable is not present
input_train.columns

Index(['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap',
       'SMS_received', 'days_between', 'Neighbourhood_ANDORINHAS',
       'Neighbourhood_ANTÔNIO HONÓRIO', 'Neighbourhood_ARIOVALDO FAVALESSA',
       'Neighbourhood_BARRO VERMELHO', 'Neighbourhood_BELA VISTA',
       'Neighbourhood_BENTO FERREIRA', 'Neighbourhood_BOA VISTA',
       'Neighbourhood_BONFIM', 'Neighbourhood_CARATOÍRA',
       'Neighbourhood_CENTRO', 'Neighbourhood_COMDUSA',
       'Neighbourhood_CONQUISTA', 'Neighbourhood_CONSOLAÇÃO',
       'Neighbourhood_CRUZAMENTO', 'Neighbourhood_DA PENHA',
       'Neighbourhood_DE LOURDES', 'Neighbourhood_DO CABRAL',
       'Neighbourhood_DO MOSCOSO', 'Neighbourhood_DO QUADRO',
       'Neighbourhood_ENSEADA DO SUÁ', 'Neighbourhood_ESTRELINHA',
       'Neighbourhood_FONTE GRANDE', 'Neighbourhood_FORTE SÃO JOÃO',
       'Neighbourhood_FRADINHOS', 'Neighbourhood_GOIABEIRAS',
       'Neighbourhood_GRANDE VITÓRIA', 'Neighbourhood_GURIGICA',
       'Neighbourhood_HO

In [71]:
print(input_train.shape, input_test.shape)

(88421, 94) (22106, 94)


In [72]:
print(target_train.shape, target_test.shape)

(88421,) (22106,)


## Create pipelines and declare hyperparameters

In [74]:
# Pipeline dictionary
pipelines = {
    'l1': make_pipeline(StandardScaler(), LogisticRegression(random_state=123, penalty='l1')),
    'l2': make_pipeline(StandardScaler(), LogisticRegression(random_state=123, penalty='l2')),
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123))
}

# Hyperparameters to tune
hyperparameters = {
    'l1': {
        'logisticregression__C': np.linspace(1e-3, 1e3, 10)
    },
    'l2': {
        'logisticregression__C': np.linspace(1e-3, 1e3, 10)
    },
    'rf': {
        'randomforestclassifier__n_estimators': [100, 200],
        'randomforestclassifier__max_features': ['auto', 'sqrt', 0.33]
    },
    'gb': {
        'gradientboostingclassifier__n_estimators': [100, 200],
        'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
        'gradientboostingclassifier__max_depth': [1,3,5]
    }
}

## Run cross-validation loops and train models

In [75]:
def fit_models(X_train, y_train):
    # Create empty dictionary called fitted_models
    fitted_models = {}

    # Loop through model pipelines, tuning each one and saving it to fitted_models
    for pipeline in pipelines.keys():

        # Create cross-validation object from pipeline and hyperparameters
        model = GridSearchCV(pipelines[pipeline], hyperparameters[pipeline], cv=10, n_jobs=-1)

        # Fit model on X_train, y_train
        model.fit(X_train, y_train)

        # Store model in fitted_models[name] 
        fitted_models[pipeline] = model

        # Print '{name} has been fitted'
        print('{} has been fitted'.format(pipeline))
        
    return fitted_models

In [76]:
fitted_models = fit_models(input_train, target_train)

l1 has been fitted
l2 has been fitted
rf has been fitted
gb has been fitted


## Evaluate performance and choose the winner

Higher R^2 score and lower MAE is better.

In [77]:
## Display best R^2 holdout score (red)
for name, model in fitted_models.items():
    print(name, model.best_score_)

l1 0.797638570023
l2 0.797638570023
rf 0.774103436966
gb 0.803553454496


In [78]:
def evaluate_perf(fitted_models, X_test, y_test):
    # Predict classes using L1-regularized logistic regression 
    for model in fitted_models.keys():
        y_pred = fitted_models[model].predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        print(model, accuracy_score(y_test, y_pred))
        print(cm)
        print(classification_report(y_test, y_pred))

In [80]:
## Evaluate performance of score prediction for reds
evaluate_perf(fitted_models, input_test, target_test)

l1 0.796706776441
[[17414   222]
 [ 4272   198]]
             precision    recall  f1-score   support

          0       0.80      0.99      0.89     17636
          1       0.47      0.04      0.08      4470

avg / total       0.74      0.80      0.72     22106

l2 0.796797249615
[[17424   212]
 [ 4280   190]]
             precision    recall  f1-score   support

          0       0.80      0.99      0.89     17636
          1       0.47      0.04      0.08      4470

avg / total       0.74      0.80      0.72     22106

rf 0.776033656021
[[16178  1458]
 [ 3493   977]]
             precision    recall  f1-score   support

          0       0.82      0.92      0.87     17636
          1       0.40      0.22      0.28      4470

avg / total       0.74      0.78      0.75     22106

gb 0.802632769384
[[17458   178]
 [ 4185   285]]
             precision    recall  f1-score   support

          0       0.81      0.99      0.89     17636
          1       0.62      0.06      0.12      4470