In [1]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
%matplotlib inline

In [3]:
# Read the data from csv file
col_names = []
for i in range(20):
    if i == 0:
        col_names.append('quality')
    if i == 1:
        col_names.append('prescreen')
    if i >= 2 and i <= 7:
        col_names.append('ma' + str(i))
    if i >= 8 and i <= 15:
        col_names.append('exudate' + str(i))
    if i == 16:
        col_names.append('euDist')
    if i == 17:
        col_names.append('diameter')
    if i == 18:
        col_names.append('amfm_class')
    if i == 19:
        col_names.append('label')

data = pd.read_csv("messidor_features.txt", names = col_names)
print(data.shape)
data.head(10)

(1151, 20)


Unnamed: 0,quality,prescreen,ma2,ma3,ma4,ma5,ma6,ma7,exudate8,exudate9,exudate10,exudate11,exudate12,exudate13,exudate14,exudate15,euDist,diameter,amfm_class,label
0,1,1,22,22,22,19,18,14,49.895756,17.775994,5.27092,0.771761,0.018632,0.006864,0.003923,0.003923,0.486903,0.100025,1,0
1,1,1,24,24,22,18,16,13,57.709936,23.799994,3.325423,0.234185,0.003903,0.003903,0.003903,0.003903,0.520908,0.144414,0,0
2,1,1,62,60,59,54,47,33,55.831441,27.993933,12.687485,4.852282,1.393889,0.373252,0.041817,0.007744,0.530904,0.128548,0,1
3,1,1,55,53,53,50,43,31,40.467228,18.445954,9.118901,3.079428,0.840261,0.272434,0.007653,0.001531,0.483284,0.11479,0,0
4,1,1,44,44,44,41,39,27,18.026254,8.570709,0.410381,0.0,0.0,0.0,0.0,0.0,0.475935,0.123572,0,1
5,1,1,44,43,41,41,37,29,28.3564,6.935636,2.305771,0.323724,0.0,0.0,0.0,0.0,0.502831,0.126741,0,1
6,1,0,29,29,29,27,25,16,15.448398,9.113819,1.633493,0.0,0.0,0.0,0.0,0.0,0.541743,0.139575,0,1
7,1,1,6,6,6,6,2,1,20.679649,9.497786,1.22366,0.150382,0.0,0.0,0.0,0.0,0.576318,0.071071,1,0
8,1,1,22,21,18,15,13,10,66.691933,23.545543,6.151117,0.496372,0.0,0.0,0.0,0.0,0.500073,0.116793,0,1
9,1,1,79,75,73,71,64,47,22.141784,10.054384,0.874633,0.09978,0.023386,0.0,0.0,0.0,0.560959,0.109134,0,1


## Part 1: Naive Bayes Classifier


In [4]:
# Split data
X = data.drop(['label'], axis = 1)
y = data['label']
print(X.shape)
print(y.shape)

# Create GaussianNB classifier
clf = GaussianNB()

# Run 10-fold cross validation
scores = cross_val_score(clf, X, y, cv = 10)
print("Scores:", scores)

print("Accuracy:", scores.mean() * 100)


(1151, 19)
(1151,)
Scores: [0.61206897 0.56521739 0.63478261 0.53913043 0.6        0.66086957
 0.56521739 0.54782609 0.60869565 0.64347826]
Accuracy: 59.77286356821588


In [5]:
# Predict y-values
pred_y = cross_val_predict(clf, X, y, cv = 10)
print("Predictions:", pred_y)

# Create confusion matrix
cnf_matrix = confusion_matrix(y, pred_y)
print("Confusion Matrix \n", cnf_matrix)

# Display classification report for clf
print("Classification Report \n", classification_report(y, pred_y))


Predictions: [0 0 1 ... 0 1 0]
Confusion Matrix 
 [[502  38]
 [425 186]]
Classification Report 
               precision    recall  f1-score   support

           0       0.54      0.93      0.68       540
           1       0.83      0.30      0.45       611

    accuracy                           0.60      1151
   macro avg       0.69      0.62      0.56      1151
weighted avg       0.69      0.60      0.56      1151



## Part 2: K Nearest Neighbor (KNN) Classifier
 

In [6]:
# Scale data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print()
print(type(X_scaled))
print(X_scaled.shape)

# Make sure the data got scaled correctly
print()
print(X_scaled[0])
print("mean=", X_scaled.mean()) #mean should be 0
print("std=", X_scaled.std())   #std should be 1



<class 'numpy.ndarray'>
(1151, 19)

[1.00000000e+00 1.00000000e+00 1.40000000e-01 1.60305344e-01
 1.76470588e-01 1.73076923e-01 1.77083333e-01 1.47727273e-01
 1.22764445e-01 1.06359374e-01 4.96928012e-02 1.29130181e-02
 3.62326676e-04 3.41516240e-04 6.60682519e-04 1.27091478e-03
 5.30801274e-01 2.61133465e-01 1.00000000e+00]
mean= 0.2812233141456987
std= 0.33490829012056916


In [7]:
# Classify normalized data
neigh = KNeighborsClassifier(n_neighbors=5)
predicted = cross_val_predict(neigh, X_scaled, y, cv = 10)
print(classification_report(y, predicted))


              precision    recall  f1-score   support

           0       0.60      0.65      0.62       540
           1       0.67      0.62      0.64       611

    accuracy                           0.64      1151
   macro avg       0.64      0.64      0.63      1151
weighted avg       0.64      0.64      0.64      1151



In [8]:
# Find best value of k
params = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]}
grid_search = GridSearchCV(neigh, params, cv=10, scoring='accuracy')

grid_search.fit(X_scaled, y)
print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_ * 100)

{'n_neighbors': 23}
Accuracy: 66.02953953084274


In [9]:
# Display the accuracy, precision, and recall of a KNN classifier
neigh_best = KNeighborsClassifier(n_neighbors = 23)
predicted_best = cross_val_predict(neigh_best, X_scaled, y, cv = 10)
print(classification_report(y, predicted_best))

              precision    recall  f1-score   support

           0       0.62      0.72      0.67       540
           1       0.71      0.61      0.65       611

    accuracy                           0.66      1151
   macro avg       0.66      0.66      0.66      1151
weighted avg       0.67      0.66      0.66      1151



In [10]:
# Run nested cross-validation loop
print(X_scaled.shape)
print(y.shape)
nested_score = cross_val_score(grid_search, X, y, cv = 10)

# Print accuracy
print("Accuracy:", nested_score.mean() * 100)

# Print precision and recall
pred_nested = cross_val_predict(grid_search, X_scaled, y, cv = 10)
print(classification_report(y, pred_nested))


(1151, 19)
(1151,)




Accuracy: 66.4647676161919




              precision    recall  f1-score   support

           0       0.61      0.74      0.67       540
           1       0.71      0.58      0.64       611

    accuracy                           0.66      1151
   macro avg       0.66      0.66      0.65      1151
weighted avg       0.67      0.66      0.65      1151



In [11]:
# Define a pipeline to search for the best combination of PCA dimensions and n_neighbors.

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Create a scaler
scaler = sk.preprocessing.MinMaxScaler()

# Create a PCA
pca = PCA()

# Create a KNN classifier
knn = KNeighborsClassifier()

# Create a pipeline that does scaling, then PCA, then KNN
pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('knn', knn)])

#Set up the parameters to tune for each of the pipeline steps
param_grid = {
    'pca__n_components': list(range(1, 19)), #find how many principal componenet to keep
    'knn__n_neighbors': list(range(1, 30)),  #find the best value of k
}

# Pass the pipeline and the parameters into a GridSearchCV with a 5-fold cross validation
grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "accuracy")

# Call fit() on the GridSearchCV and pass in the unscaled data (X_values, Y_values)
grid.fit(X, y)

# Print out the best_score_ and best_params_ from the GridSearchCV
print("Best Score:", grid.best_score_ * 100)
print("Best Parameters:", grid.best_params_)

Best Score: 65.94265855777584
Best Parameters: {'knn__n_neighbors': 27, 'pca__n_components': 8}


In [12]:
# Create outer CV loop
nested_score_2 = cross_val_score(grid, X, y, cv = 5)
print("Accuracy:", nested_score_2.mean() * 100)



Accuracy: 64.90269151138716
