In [16]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
%matplotlib inline

In [4]:
# Read the data from csv file
col_names = []
for i in range(20):
    if i == 0:
        col_names.append('quality')
    if i == 1:
        col_names.append('prescreen')
    if i >= 2 and i <= 7:
        col_names.append('ma' + str(i))
    if i >= 8 and i <= 15:
        col_names.append('exudate' + str(i))
    if i == 16:
        col_names.append('euDist')
    if i == 17:
        col_names.append('diameter')
    if i == 18:
        col_names.append('amfm_class')
    if i == 19:
        col_names.append('label')

data = pd.read_csv("messidor_features.txt", names = col_names)
print(data.shape)
data.head(10)

(1151, 20)


Unnamed: 0,quality,prescreen,ma2,ma3,ma4,ma5,ma6,ma7,exudate8,exudate9,exudate10,exudate11,exudate12,exudate13,exudate14,exudate15,euDist,diameter,amfm_class,label
0,1,1,22,22,22,19,18,14,49.895756,17.775994,5.27092,0.771761,0.018632,0.006864,0.003923,0.003923,0.486903,0.100025,1,0
1,1,1,24,24,22,18,16,13,57.709936,23.799994,3.325423,0.234185,0.003903,0.003903,0.003903,0.003903,0.520908,0.144414,0,0
2,1,1,62,60,59,54,47,33,55.831441,27.993933,12.687485,4.852282,1.393889,0.373252,0.041817,0.007744,0.530904,0.128548,0,1
3,1,1,55,53,53,50,43,31,40.467228,18.445954,9.118901,3.079428,0.840261,0.272434,0.007653,0.001531,0.483284,0.11479,0,0
4,1,1,44,44,44,41,39,27,18.026254,8.570709,0.410381,0.0,0.0,0.0,0.0,0.0,0.475935,0.123572,0,1
5,1,1,44,43,41,41,37,29,28.3564,6.935636,2.305771,0.323724,0.0,0.0,0.0,0.0,0.502831,0.126741,0,1
6,1,0,29,29,29,27,25,16,15.448398,9.113819,1.633493,0.0,0.0,0.0,0.0,0.0,0.541743,0.139575,0,1
7,1,1,6,6,6,6,2,1,20.679649,9.497786,1.22366,0.150382,0.0,0.0,0.0,0.0,0.576318,0.071071,1,0
8,1,1,22,21,18,15,13,10,66.691933,23.545543,6.151117,0.496372,0.0,0.0,0.0,0.0,0.500073,0.116793,0,1
9,1,1,79,75,73,71,64,47,22.141784,10.054384,0.874633,0.09978,0.023386,0.0,0.0,0.0,0.560959,0.109134,0,1


### 1. Data prep

In [5]:
# Separate the feature columns from the class label column

X = data.drop(['label'], axis = 1)
y = data['label']

print(X.shape, y.shape)
print()
print(X.head())

(1151, 19) (1151,)

   quality  prescreen  ma2  ma3  ma4  ma5  ma6  ma7   exudate8   exudate9  \
0        1          1   22   22   22   19   18   14  49.895756  17.775994   
1        1          1   24   24   22   18   16   13  57.709936  23.799994   
2        1          1   62   60   59   54   47   33  55.831441  27.993933   
3        1          1   55   53   53   50   43   31  40.467228  18.445954   
4        1          1   44   44   44   41   39   27  18.026254   8.570709   

   exudate10  exudate11  exudate12  exudate13  exudate14  exudate15    euDist  \
0   5.270920   0.771761   0.018632   0.006864   0.003923   0.003923  0.486903   
1   3.325423   0.234185   0.003903   0.003903   0.003903   0.003903  0.520908   
2  12.687485   4.852282   1.393889   0.373252   0.041817   0.007744  0.530904   
3   9.118901   3.079428   0.840261   0.272434   0.007653   0.001531  0.483284   
4   0.410381   0.000000   0.000000   0.000000   0.000000   0.000000  0.475935   

   diameter  amfm_class  
0  0

### 2. Support Vector Machines (SVM) and Pipelines

Q2. For some classification algorithms, like KNN, SVMs, and Neural Nets, scaling of the data is critical for the algorithm to operate correctly. For other classification algorithms, like Naive Bayes, and Decision Trees, data scaling is not necessary (take a minute to think about why that is the case). 

We discussed in class how the data scaling should happen on the _training set only_, which means that it should happen _inside_ of the cross validation loop. In other words, in each fold of the cross validation, the data will be separated in to training and test sets. The scaling (calculating mean and std, for instance) should happen based on the values in the _traning set only_. Then the test set can be scaled using the values found on the training set. (Refer to the concept of [data leakage](https://machinelearningmastery.com/data-leakage-machine-learning/).)

In order to do this with scikit-learn, you must create what's called a `Pipeline` and pass that in to the cross validation. This is a very important concept for Data Mining and Machine Learning, so let's practice it here.

Do the following:
* Create a `sklearn.preprocessing.StandardScaler` object to standardize the dataset’s features (mean = 0 and variance = 1). Do not call `fit` on it yet. Just create the `StandardScaler` object.
* Create a sklearn.svm.SVC classifier (do not set any arguments - use the defaults). Do not call fit on it yet. Just create the SVC object.
* Create a `sklearn.pipeline.Pipeline` and set the `steps` to the scaler and the SVC objects that you just created. 
* Pass the `pipeline` in to a `cross_val_score` as the estimator, along with the features and the labels, and use a 5-fold-CV. 

In each fold of the cross validation, the training phase will use _only_ the training data for scaling and training the model. Then the testing phase will scale the test data into the scaled space (found on the training data) and run the test data through the trained classifier, to return an accuracy measurement for each fold. Print the average accuracy across all 5 folds. 

In [6]:
# Create StandardScaler and SVC Object
scaler = StandardScaler()
svc = SVC()

# Create pipeline and set steps to scaler and SVC objects
pipe = Pipeline(steps = [('Scaler', scaler), ('svc', svc)])

scores = cross_val_score(pipe, X, y, cv = 5)
#Print accuracy
print('Accuracy:', scores.mean() * 100)

Accuracy: 70.11368341803124


In [7]:
# For the 'svm' part of the pipeline, tune the 'kernel' hyperparameter
param_grid = {'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

# Create GridSearchCV that takes in pipeline
grid = GridSearchCV(pipe, param_grid, cv = 5)
grid.fit(X, y)
# Print best kernel
print("Best kernel:", grid.best_params_)


Best kernel: {'svc__kernel': 'linear'}


Q4. Now put what you did in Q3 in to an outer CV loop to evaluate the accuracy of using that best-found kernel on unseen test data. 
* Pass the `GridSearchCV` in to a `cross_val_score` with 5-fold-CV. Print out the accuracy.

Note that the accuracy increases from Q2 because of a better choice of kernel function.

In [8]:
# Pass the GridSearchCV in to a cross_val_score with 5-fold-CV
grid_score = cross_val_score(grid, X, y, cv = 5, scoring = 'accuracy')
# Print accuracy
print('Accuracy:', grid_score.mean() * 100)

Accuracy: 72.28646715603239


Q5. Let's see if we can get the accuracy even higher by tuning additional hyperparameters. SVMs have a parameter called 'C' that is the cost for a misclassification. (More info [here](https://medium.com/@pushkarmandot/what-is-the-significance-of-c-value-in-support-vector-machine-28224e852c5a)).
* Create a parameter grid that includes the kernel (as you have above) and the C value as well. Try values of C from 50 to 100 by increments of 10. (You can use the range function to help you with this.)
* Create a `GridSearchCV` with the pipeline from above, this new parameter grid, and a 5-fold-CV.
* Pass the `GridSearchCV` into a `cross_val_score` with a 5-fold-CV and print out the accuracy.

Be patient as this can take some time to run. Note that the accurcay has increased even further because the best value of C was found and used on the test data.

Now we're actually starting to get closer to some decent accuracies on this dataset!

In [9]:
# Create parameter grid and GridSearchCV with pipeline from above
params = {'svc__C': range(50, 110, 10), 'svc__kernel': ['linear']}
grid_search = GridSearchCV(pipe, params, cv = 5)

# Pass the GridSearchCV into a cross_val_score with a 5-fold-CV 
grid_search_score = cross_val_score(grid_search, X, y, cv = 5)
# Print accuracy
print('Accuracy:', grid_search_score.mean() * 100)

Accuracy: 74.54357236965933


### 3. Neural Networks (NN)

In [10]:
# Create pipeline that includes scaling and an MLPClassifier
mlp = MLPClassifier()
parameters = {'mlp__activation': ['logistic', 'tanh', 'relu'], 'mlp__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,)]}
pipe_nn = Pipeline(steps = [('Scaler', scaler), ('mlp', mlp)])
grid_nn = GridSearchCV(pipe_nn, parameters, cv = 5)

# Fit GridSearchCV to determine best parameters
grid_nn.fit(X, y)
print("Best Params:", grid_nn.best_params_)

scores_nn = cross_val_score(grid_nn, X, y, cv = 5)
print('Accuracy:', scores_nn.mean() * 100)

Best Params: {'mlp__activation': 'relu', 'mlp__hidden_layer_sizes': (40,)}
Accuracy: 72.28609072087335


### 4. Ensemble Classifiers


**A. Random Forests**


In [15]:
# Use Random Forests to classify data
rfc = RandomForestClassifier()
params_rfc = {'max_depth': range(35, 56, 1), 'min_samples_leaf': [8, 10, 12], 'max_features': ['sqrt', 'log2']}

grid_rfc = GridSearchCV(rfc, params_rfc, cv = 5)
grid_rfc.fit(X, y)
print('Best Params:', grid_rfc.best_params_)

# GridSearchCV in a cross_val_score with 5-fold CV
scores_rfc = cross_val_score(grid_rfc, X, y, cv = 5, scoring = 'accuracy')
print('Accuracy:', scores_rfc.mean() * 100)

Best Params: {'max_depth': 43, 'max_features': 'sqrt', 'min_samples_leaf': 12}
Accuracy: 68.28496141539621


**B. AdaBoost**

Random Forests are a kind of ensemble classifier where many estimators are built independently in parallel. In contrast, there is another method of creating an ensemble classifier called *boosting*. Here the classifiers are trained one-by-one in sequence and each time the sampling of the training set depends on the performance of previously generated models.

Q8. Evaluate a `sklearn.ensemble.AdaBoostClassifier` classifier on the data. By default, `AdaBoostClassifier` uses decision trees as the base classifiers (but this can be changed). 
* Use a GridSearchCV to find the best number of trees in the ensemble (`n_estimators`). Try values from 50-250 with increments of 25. (you can use the range function to help you with this.)
* Wrap your GridSearchCV in a cross_val_score with 5-fold CV to report the accuracy of the model.

Be patient, this can take a few minutes to run.

In [18]:
ada = AdaBoostClassifier()
params_ada = {'n_estimators': range(50, 275, 25)}
grid_ada = GridSearchCV(ada, params_ada, cv = 5)

# Find best number of trees in ensemble
grid_ada.fit(X, y)
print('Best params:', grid_ada.best_params_)

# Wrap GridSearchCV in a cross_val_score with 5-fold CV
scores_ada = cross_val_score(grid_ada, X, y, cv = 5)
print('Accuracy:', scores_ada.mean() * 100)

Best params: {'n_estimators': 250}
Accuracy: 71.32806324110673


### 5. Deploying a final model

In [24]:
import pickle

params_final = {'mlp__activation': ['logistic', 'tanh', 'relu'], 'mlp__hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,), (60,)]}
pipe_final = Pipeline(steps = [('Scaler', scaler), ('mlp', mlp)])
grid_final = GridSearchCV(pipe_final, parameters, cv = 5)

# Fit GridSearchCV to determine best parameters
grid_final.fit(X, y)
print("Best Params:", grid_final.best_params_)
print('Accuracy:', grid_final.best_score_ * 100)

# Set this final_model to final model
final_model = grid_final

filename = 'finalized_model.sav'
pickle.dump(final_model, open(filename, 'wb'))

Best Params: {'mlp__activation': 'relu', 'mlp__hidden_layer_sizes': (60,)}
Accuracy: 73.93570807993049


In [31]:
# Use to record to classify
record = [ 0.05905386, 0.2982129, 0.68613149, 0.75078865, 0.87119216, 0.88615694,
  0.93600623, 0.98369184, -0.47426472, -0.57642756, -0.53115361, -0.42789774,
 -0.21907738, -0.20090532, -0.21496782, -0.2080998, 0.06692373, -2.81681183,
 -0.7117194 ]
 
# Load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

# Classify record
ans = loaded_model.predict([record])

if ans == 1:
    print('Positive for disease')
else:
    print('Negative for disease')

Positive for disease
