In [1]:
%load_ext autoreload
%autoreload 2

## Part II: Model Building

Here you try your hand at model building to predict appointment no shows.

### Preprocessing

Package 'noshow_lib' now includes code to carry out preprocessing steps from part I. Here's how to use it:

In [4]:
import noshow_lib.util as utils

First, it includes a dictionary used for configuring path and file names
used through the project

In [5]:
utils.file_config

{'raw_data_path': 'data',
 'raw_data_csv': 'KaggleV2-May-2016.csv',
 'processed_data_path': 'processed_data',
 'train_csv': 'train_set.csv',
 'test_csv': 'test_set.csv',
 'objstore_path': 'objects',
 'feature_pipeline_file': 'feature_pipeline.pkl',
 'labels_pipeline_file': 'labels_pipeline.pkl'}

`feature_pipeline_file`: file storing the preprocessing pipeline used for preparing the feature matrix

`labels_pipeline_file`: file storing the preprocessing pipeline used for
preparing labels

`objstore_path`: directory to store python objects to disk

`processed_data_path`: directory containing processed data

`raw_data_csv`: name of the csv download from Kaggle

`raw_data_path`: directory containing raw data

`test_csv`: name of csv file containing test set

`train_csv`: name of csv file containing train set

You can change these paths and names to suit your project directory structure if you need so. E.g.,

In [16]:
file_config = utils.file_config
#config['raw_data_path'] = "some_other_directory"

First step is to create train test sets. Code is in file `noshow_lib/util.py` function `make_train_test_sets`. You
can edit that function as needed to include your own part I code if you so desire. The result will be to 
create files `train_set.csv` and `test_set.csv` in your `processed_data` directory (unless you change any of the entries in the configuration directory as above).

In [7]:
# ONLY NEED TO RUN THIS STEP ONCE (switch this to True to run it)
RUN_MAKE_TRAIN_TEST_FILES = False
if RUN_MAKE_TRAIN_TEST_FILES:
    utils.make_train_test_sets(config=file_config)

Next step is to fit the preprocessing pipelines. This is done in file `noshow_lib/preprocess.py`. Again you can edit code as needed in that file to incorporate your part I solution as you wish. The result will be to create files `feature_pipeline.pkl` and `labels_pipeline.pkl` containing the fit preprocessing pipelines we can then use to preprocess data.

In [8]:
import noshow_lib.preprocess as preprocess

# ONLY NEED TO RUN THIS STEP ONCE
RUN_FIT_PREPROCESSING = False
if RUN_FIT_PREPROCESSING:
    preprocess.fit_save_pipelines(config=file_config)

Finally, once we do that, we can get a training matrix and labels:

In [17]:
train_X, train_y = preprocess.load_train_data(config=file_config)

In [10]:
len(train_X)

90526

In [10]:
print(train_X.shape)
print(train_y.shape)

(90526, 113)
(90526,)


In [18]:
test_X, test_y = preprocess.load_test_data(config=file_config)

In [12]:
print(test_X.shape)
print(test_y.shape)

(20000, 113)
(20000,)


### Model Building

Using `sklearn` fit:
    - DecisionTree classifier
    - RandomForest classifier
    - Linear SVM classifier
    - SVM with Radial Basis Kernel classifier
    
Use default parameters for now.
Using 10-fold cross validation report both accuracy and AUC for each of the above four models.

QUESTION: Should you use accuracy or AUC for this task as a performance metric?

_ANSWER HERE_

#### Model 1 - DecisionTree classifier

In [11]:
#Classifier imports
from sklearn import tree

# Performance metrics
from sklearn.metrics import accuracy_score, classification_report

dt = tree.DecisionTreeClassifier()

dt.fit(train_X, train_y)
test_y_dt_model = dt.predict(test_X)
print("DecisionTree Accuracy :", accuracy_score(test_y, test_y_dt_model))

DecisionTree Accuracy : 0.73935


#### Model 2 - RandomForest classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(train_X, train_y)
test_y_rf_model = rf.predict(test_X)
print("RandomForest Accuracy :", accuracy_score(test_y, test_y_rf_model))

RandomForest Accuracy : 0.7771


#### Model 3 - Linear SVM classifier

In [13]:
# build your models here
# Linear SVM classifier
from sklearn import model_selection
from sklearn.svm import LinearSVC

lsvc = LinearSVC()
lsvc.fit(train_X, train_y)

test_y_lsvc_model = lsvc.predict(test_X)
print("LinearSVC Accuracy :", accuracy_score(test_y, test_y_lsvc_model))

LinearSVC Accuracy : 0.7956


In [None]:
from sklearn.svm import LinearSVC
from sklearn import model_selection
model = LinearSVC()
results = model_selection.cross_val_score(model, train_X, train_y, cv=10)
print(results.mean())

#### Model 4 - SVM with Radial Basis Kernel classifier

In [14]:
from sklearn.svm import SVC 

rbf = SVC(kernel='rbf')  
rbf.fit(train_X, train_y)

test_y_rbf_model = rbf.predict(test_X)
print("SVM with Radial Basis Kernel Accuracy :", accuracy_score(test_y, test_y_rbf_model))  

SVM with Radial Basis Kernel Accuracy : 0.79805


In [15]:
from sklearn.metrics import classification_report, confusion_matrix 

def show_results(model_name, pred_y):
    print(model_name + " Measurements:")
    print(confusion_matrix(test_y,pred_y))  
    print(classification_report(test_y,pred_y))
    print()

In [16]:
measurements =(("SVM with Radial Basis Kernel",test_y_rbf_model),
    ("RandomForest",test_y_rf_model),
    ("Linear SVM",test_y_lsvc_model),
    ("DecisionTree",test_y_dt_model))

for model_name, pred_y in measurements:
    show_results(model_name,pred_y)


SVM with Radial Basis Kernel Measurements:
[[15961     0]
 [ 4039     0]]
             precision    recall  f1-score   support

         -1       0.80      1.00      0.89     15961
          1       0.00      0.00      0.00      4039

avg / total       0.64      0.80      0.71     20000


RandomForest Measurements:
[[14706  1255]
 [ 3203   836]]
             precision    recall  f1-score   support

         -1       0.82      0.92      0.87     15961
          1       0.40      0.21      0.27      4039

avg / total       0.74      0.78      0.75     20000


Linear SVM Measurements:
[[15874    87]
 [ 4001    38]]
             precision    recall  f1-score   support

         -1       0.80      0.99      0.89     15961
          1       0.30      0.01      0.02      4039

avg / total       0.70      0.80      0.71     20000


DecisionTree Measurements:
[[13429  2532]
 [ 2681  1358]]
             precision    recall  f1-score   support

         -1       0.83      0.84      0.84     15961

  'precision', 'predicted', average, warn_for)


#### Conclusion
Based on classification_report result, Random Forest has the best performance on prediction.

### Model Tuning

Based on the above, choose two methods and fit a tuned model:
    - use 5-fold cross validation for model selection
    - use 10-fold cross validation for model assessment (based on appropriate performance metric)

Report estimated performance for both tuned classifiers

In [24]:
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

print("# Tuning Randon Forest parameters")
print()

rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'n_estimators': [100,150,200],
    'max_features': ['auto', 'sqrt', 'log2']
}

#use 10-fold cross validation 
rf_tuner = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10)
rf_tuner.fit(train_X, train_y)

print("Best parameters set found on 10-fold cross validation development set:")
print()
print(rf_tuner.best_params_)
print()
print("Grid scores on training set:")
print()
means = rf_tuner.cv_results_['mean_test_score']
stds = rf_tuner.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rf_tuner.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# Tuning Randon Forest parameters

Best parameters set found on 10-fold cross validation development set:

{'max_features': 'sqrt', 'n_estimators': 100}

Grid scores on training set:



AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

print("# Tuning Randon Forest parameters")
print()
parameters = [{'n_estimators': [50,150],
               'max_features': [20,30]}]
rf = RandomForestClassifier()
rf_tuner = GridSearchCV(estimator=rf, param_grid=parameters, cv=5,scoring='roc_auc',n_jobs = 6)
rf_tuner.fit(train_X, train_y)

print("Best parameters set found on 5-fold cross validation development set:")
print()
print(rf_tuner.best_params_)
print()
print("Grid scores on training set:")
print()
means = rf_tuner.cv_results_['mean_test_score']
stds = rf_tuner.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rf_tuner.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# Tuning Randon Forest parameters

Best parameters set found on 5-fold cross validation development set:

{'max_features': 20, 'n_estimators': 150}

Grid scores on training set:

0.720 (+/-0.005) for {'max_features': 20, 'n_estimators': 50}
0.725 (+/-0.006) for {'max_features': 20, 'n_estimators': 150}
0.721 (+/-0.007) for {'max_features': 30, 'n_estimators': 50}
0.725 (+/-0.006) for {'max_features': 30, 'n_estimators': 150}



In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

print("# Tuning Randon Forest parameters")
print()
parameters = [{'n_estimators': [50,150],
               'max_features': [20,30]}]
rf = RandomForestClassifier()
rf_tuner = GridSearchCV(estimator=rf, param_grid=parameters, cv=10,scoring='roc_auc',n_jobs = 6)
rf_tuner.fit(train_X, train_y)

print("Best parameters set found on 10-fold cross validation development set:")
print()
print(rf_tuner.best_params_)
print()
print("Grid scores on training set:")
print()
means = rf_tuner.cv_results_['mean_test_score']
stds = rf_tuner.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rf_tuner.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# Tuning Randon Forest parameters

Best parameters set found on 10-fold cross validation development set:

{'max_features': 20, 'n_estimators': 50}

Grid scores on training set:

0.718 (+/-0.010) for {'max_features': 20, 'n_estimators': 30}
0.722 (+/-0.012) for {'max_features': 20, 'n_estimators': 50}
0.718 (+/-0.013) for {'max_features': 30, 'n_estimators': 30}
0.722 (+/-0.013) for {'max_features': 30, 'n_estimators': 50}



In [25]:
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

print("# Tuning Randon Forest parameters")
print()

rfc = RandomForestClassifier(n_jobs=-1, oob_score = True) 

param_grid = { 
    'n_estimators': [100,150,200],
    'max_features': [30, 20]
}

#use 10-fold cross validation 
rf_tuner = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10)
rf_tuner.fit(train_X, train_y)

print("Best parameters set found on 10-fold cross validation development set:")
print()
print(rf_tuner.best_params_)
print()
print("Grid scores on training set:")
print()
means = rf_tuner.cv_results_['mean_test_score']
stds = rf_tuner.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rf_tuner.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# Tuning Randon Forest parameters

Best parameters set found on 10-fold cross validation development set:

{'max_features': 30, 'n_estimators': 200}

Grid scores on training set:



AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

#This one runs forever and eventually causes my machine dead.
print("# Tuning hyper-parameters")
print()
parameters = [{'kernel': ['rbf'],
               'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5],
                'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(decision_function_shape='ovr'), parameters, cv=5)
clf.fit(train_X, train_y)

print("Best parameters set found on 5-fold cross validation development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on training set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# Tuning hyper-parameters



### Linear SVM with Gradient Descent

In [12]:
import numpy as np

# initialize model parameters w and b
# intializing to 0 is not a good idea
# it should be a random vector see np.random.randn
# YOU NEED TO IMPLEMENT THIS
def _initialize_parameters(nfeatures):
    w_b = (np.random.randn(1, 2))/10000000
    
    #w = ((np.random.randn(1, nfeatures))/10000000).tolist()[0] #?? all w should be same or different?
    
    w = np.full((nfeatures), w_b[0][0])
    b = np.full((1), w_b[0][1])
    return w, b

# this is a vectorized version of positive_part operation
# we can use this for hinge loss as post_part(1.0 - y*f)
pos_part = np.vectorize(lambda u: u if u > 0. else 0.)

# compute the value of the linear SVM objective function
# given current signed distances, and parameter vector w
def _get_objective(f, y, w, lam):
    nobs = y.shape[0]
    loss = np.sum(pos_part(1.0 - y*f)) / nobs
    penalty = 0.5 * lam * np.dot(w,w)
    return loss + penalty

# compute the signed distances
# based on current model estimates
# w and b
# YOU NEED TO IMPLEMENT THIS
def _get_signed_distances(X, w, b):   
    result = np.full((len(X),len(w[0])), 0).tolist()
    
    for i in range(len(X)):
        for j in range(len(w[0])):
            for k in range(len(w)):
                result[i][j] += X[i][k] * w[k][j]
            result[i][j] += b

    f = np.asarray(result, dtype=np.float64)
    return f


# compute gradients with respect to w and b
# YOU NEEED TO IMPLEMENT THIS
signed = np.vectorize(lambda t: 1. if  t > 0. else 0.)

def _inverse_multiple(X,y):
    result = np.full((len(X),len(X[0])), 0).tolist()

    for i in range(len(X)):
        for j in range(len(X[i])):
            result[i][j] = X[i][j] * y[j][0]

    f = np.asarray(result)
    return f

def _get_gradients(f, X, y, w, b, lam):
    nobs = len(X)
    yf = y * f
    t = signed(1. - yf)
    ty = t * y
    
    gb = np.sum(ty) / nobs
    gw = np.sum(_inverse_multiple(X, ty), axis=0) / nobs
    gw += lam * w
    
    return gw, gb


# fit an SVM using gradient descent
# X: matrix of feature values
# y: labels (-1 or 1)
# lam: penalty parameter lambda
# n_iter: numer of iterations
# eta: learning rate
def fit_svm(X, y, lam, n_iter=100, eta=.4):
    nexamples, nfeatures = X.shape
    
    w, b = _initialize_parameters(nfeatures)
    
    for k in range(n_iter):
        f = _get_signed_distances(X, w, b)
        
        # print information and 
        # update the learning rate
        if k % 10 == 0:
            obj = _get_objective(f, y, w, lam)
            eta = eta / 2.0
            print("it: %d, obj %.2f" % (k, obj))
        
        gw, gb = _get_gradients(f, X, y, w, b, lam)
        w = w - eta * gw
        b = b - eta * gb
    return w, b

In [None]:
w,b = fit_svm(train_X, train_y, 1.0, n_iter=100)

In [15]:
#Test
X = np.array([[1, 3,2,6],
                   [4, 0,5,7],
                   [2, 1,6,8]])

Y = np.array([[1],[5],[9],[2]])
result = _get_signed_distances(X,Y,1)
print(result)

[[47.]
 [64.]
 [78.]]


In [14]:
X = np.array([[1, 3],
                   [4, 0],
                   [2, 1]])

y = np.array([[1],[5]])

result = _inverse_multiple(X,y)
print(result)

[[ 1 15]
 [ 4  0]
 [ 2  5]]
