## Model selection

Build ML pipelines and through cross-validation and assessment choose a model that performs the best

### Import libraries

In [1]:
# print_function for compatibility with Python 3
from __future__ import print_function
# NumPy for numerical computing
import numpy as np
# Pandas for DataFrames
import pandas as pd
# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline
# Seaborn for easier visualization
import seaborn as sns
# Scikit-Learn for Modeling
import sklearn
# Pickle for saving model files
import pickle

In [2]:
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression
# Import RandomForestClassifier and GradientBoostingClassifer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# Function for splitting training and test set
from sklearn.model_selection import train_test_split
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV

In [23]:
# Classification metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

### Load dataset

In [5]:
abt = pd.read_csv('abt.csv')

### Split dataset

In [7]:
y = abt.color
X = abt.drop('color', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234, stratify=abt.color)

In [9]:
## Check the size of train/test splits for red wine
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(5197, 11) (1300, 11)
(5197,) (1300,)


### Create pipelines and declare hyperparameters

In [10]:
# Pipeline dictionary
pipelines = {
    'l1': make_pipeline(StandardScaler(), LogisticRegression(random_state=123, penalty='l1')),
    'l2': make_pipeline(StandardScaler(), LogisticRegression(random_state=123, penalty='l2')),
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123))
}

# Hyperparameters to tune
hyperparameters = {
    'l1': {
        'logisticregression__C': np.linspace(1e-3, 1e3, 10)
    },
    'l2': {
        'logisticregression__C': np.linspace(1e-3, 1e3, 10)
    },
    'rf': {
        'randomforestclassifier__n_estimators': [100, 200],
        'randomforestclassifier__max_features': ['auto', 'sqrt', 0.33]
    },
    'gb': {
        'gradientboostingclassifier__n_estimators': [100, 200],
        'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
        'gradientboostingclassifier__max_depth': [1,3,5]
    }
}

### Run cross-validation loops and train models

In [12]:
def fit_models(X_train, y_train):
    # Create empty dictionary called fitted_models
    fitted_models = {}

    # Loop through model pipelines, tuning each one and saving it to fitted_models
    for pipeline in pipelines.keys():

        # Create cross-validation object from pipeline and hyperparameters
        model = GridSearchCV(pipelines[pipeline], hyperparameters[pipeline], cv=10, n_jobs=-1)

        # Fit model on X_train, y_train
        model.fit(X_train, y_train)

        # Store model in fitted_models[name] 
        fitted_models[pipeline] = model

        # Print '{name} has been fitted'
        print('{} has been fitted'.format(pipeline))
        
    return fitted_models

In [14]:
fitted_models = fit_models(X_train, y_train)

l1 has been fitted
l2 has been fitted
rf has been fitted
gb has been fitted


### Evaluate performance and choose the winner

Higher R^2 score and lower MAE is better.

In [15]:
## Display best R^2 holdout score (red)
for name, model in fitted_models.items():
    print(name, model.best_score_)

l1 0.993842601501
l2 0.993842601501
rf 0.995381951126
gb 0.995574369829


In [19]:
def evaluate_perf(fitted_models, X_test, y_test):
    # Predict classes using L1-regularized logistic regression 
    for model in fitted_models.keys():
        y_pred = fitted_models[model].predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        print(model, accuracy_score(y_test, y_pred))
        print(cm)
        print(classification_report(y_test, y_pred))

In [20]:
## Evaluate performance of score prediction for reds
evaluate_perf(fitted_models, X_test, y_test)

l1 0.990769230769
[[975   5]
 [  7 313]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       980
          1       0.98      0.98      0.98       320

avg / total       0.99      0.99      0.99      1300

l2 0.990769230769
[[975   5]
 [  7 313]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       980
          1       0.98      0.98      0.98       320

avg / total       0.99      0.99      0.99      1300

rf 0.993846153846
[[978   2]
 [  6 314]]
             precision    recall  f1-score   support

          0       0.99      1.00      1.00       980
          1       0.99      0.98      0.99       320

avg / total       0.99      0.99      0.99      1300

gb 0.994615384615
[[978   2]
 [  5 315]]
             precision    recall  f1-score   support

          0       0.99      1.00      1.00       980
          1       0.99      0.98      0.99       320

avg / total       0.99      0.

## Compute AUROC

In [25]:
for model in fitted_models.keys():
    pred = fitted_models[model].predict_proba(X_test)
    pred = [p[1] for p in pred]
    fpr, tpr, threshold = roc_curve(y_test, pred)
    print(model, auc(fpr, tpr), fitted_models[model].best_score_)

l1 0.987085459184 0.993842601501
l2 0.987091836735 0.993842601501
rf 0.99912627551 0.995381951126
gb 0.99856505102 0.995574369829


## Conclusion

A model based on random forest achieves 99% AUROC score. We can confidently predict the color of wine based on its physical characteristics.