# ---- COGS 118A FINAL PROJECT ----

# IMPORTS

In [120]:
import numpy as np
import pandas as pd
import requests, zipfile, io

from sklearn.metrics import accuracy_score, f1_score, auc, roc_auc_score, roc_curve, make_scorer
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#plots
import seaborn as sns
import matplotlib.pyplot as plt

# DATA


## 1- LETTER ('A'-'M' positive class, 'N'-'X' negative class)
### Source:
https://archive.ics.uci.edu/ml/datasets/Letter+Recognition
### Goal: 
Identify black-and-white rectangular pixel displays as one of the 26 capital letters in the English alphabet

In [93]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data', header=None)
print(data.shape)
data.head()

(20000, 17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [94]:
#checking if any observations have missing data
rows_to_drop = data[data.isnull().any(axis=1)] 
print(rows_to_drop)

Empty DataFrame
Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Index: []


In [95]:
# get inputs (X) & targets (y) 
letter_1_X = data.loc[: , 1:16] #feature inputs: taking colums "1" to "16"
labels = data[0]

#transform

#determining +/- classes, A-M = positive class, N-Z = negative class
letter_1_Y=[]
for i in labels:
    if i in ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M'):
        letter_1_Y += [1]
    else:
        letter_1_Y += [-1]
        
# add label column to original dataframe (for plot)
data['y'] = letter_1_Y


## 2- LETTER ("O" positive class)
### Source:
https://archive.ics.uci.edu/ml/datasets/Letter+Recognition
### Goal: 
Identify black-and-white rectangular pixel displays as one of the 26 capital letters in the English alphabet

In [96]:
data2 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data', header=None)
print(data2.shape)
data2.head()

(20000, 17)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [97]:
# get inputs (X) & targets (y) 
letter_2_X = data2.loc[: , 1:16] #feature inputs: taking colums "1" to "16"
labels = data2[0]


#transform

#determining +/- classes, A-M = positive class, N-Z = negative class
letter_2_Y=[]
for i in labels:
    if i in ('O'):
        letter_2_Y += [1]
    else:
        letter_2_Y += [-1]
        
# add label column to original dataframe (for plot)
data2['y'] = letter_2_Y

## 3 - COVTYPE

In [98]:
data3 = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz",
                    header=None)

In [99]:
rows_to_drop = data3[data3.isnull().any(axis=1)] 
print(rows_to_drop)

Empty DataFrame
Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
Index: []

[0 rows x 55 columns]


In [100]:
data3[54].value_counts() #looking for largest class to make as the positive class

2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: 54, dtype: int64

In [101]:
# get inputs (X) & targets (Y) 
cov_X = data3.loc[: , 0:53] #feature inputs: taking colums "0" to "53"
labels = data3[54]
print(cov_X)
print(labels)

#transform

#determining +/- classes, + class is Lodgepole Pine (2), - class is everything else
cov_Y=[]
for i in labels:
    if i == 2:
        cov_Y += [1]
    else:
        cov_Y += [-1]
        
# add label column to original dataframe (for plot)
data3['y'] = cov_Y
data3

          0    1   2    3    4     5    6    7    8     9   ...  44  45  46  \
0       2596   51   3  258    0   510  221  232  148  6279  ...   0   0   0   
1       2590   56   2  212   -6   390  220  235  151  6225  ...   0   0   0   
2       2804  139   9  268   65  3180  234  238  135  6121  ...   0   0   0   
3       2785  155  18  242  118  3090  238  238  122  6211  ...   0   0   0   
4       2595   45   2  153   -1   391  220  234  150  6172  ...   0   0   0   
...      ...  ...  ..  ...  ...   ...  ...  ...  ...   ...  ...  ..  ..  ..   
581007  2396  153  20   85   17   108  240  237  118   837  ...   0   0   0   
581008  2391  152  19   67   12    95  240  237  119   845  ...   0   0   0   
581009  2386  159  17   60    7    90  236  241  130   854  ...   0   0   0   
581010  2384  170  15   60    5    90  230  245  143   864  ...   0   0   0   
581011  2383  165  13   60    4    67  231  244  141   875  ...   0   0   0   

        47  48  49  50  51  52  53  
0        0   0

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,y
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,5,-1
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,5,-1
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,2,1
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,2,1
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,5,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,3,-1
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,3,-1
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,3,-1
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,3,-1


## 4 - MUSHROOM 
### Source:
https://archive.ics.uci.edu/ml/datasets/mushroom
### Goal:
Predict whether or not a mushroom is edible or not based off 22 different categorical characteristics

In [142]:
data4 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', 
                    header=None)
print(data4.shape)
data4.head() 

(8124, 23)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [143]:
rows_to_drop4 = data4[data4.isnull().any(axis=1)] 
print(rows_to_drop)

Empty DataFrame
Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
Index: []

[0 rows x 55 columns]


In [165]:
# Extract Labels
y = data4[0]

mush_Y=[] 
for i in y:
    if i == "e":
        mush_Y += [1]
    else:
        mush_Y += [-1]


# Ectract Data
d4 = data4.loc[:, 1:54]

# One hot encode categorical data
mush_X = pd.DataFrame(pd.get_dummies(d4))
mush_X.head()

Unnamed: 0,1_b,1_c,1_f,1_k,1_s,1_x,2_f,2_g,2_s,2_y,...,21_s,21_v,21_y,22_d,22_g,22_l,22_m,22_p,22_u,22_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


# LOGISTIC REGRESSION

In [7]:
def log_reg_classifier(X, Y):
    #initialize some lists to store results for training, testing performance tables
    test_ACC = []  
    test_FSC = []
    test_AUC = []
    train_ACC = []
    train_FSC = []
    train_AUC = []
    
    # run 5 trials 
    for i in range(5):
        # sampling w/ replacement
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=5000)
        
        # tansform data
        X_train = StandardScaler().fit_transform(X_train)    
        X_test = StandardScaler().fit_transform(X_test)
        
        # define hyperparameters to search through & error metrics 
        C_list = [10**-8,10**-7,10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3,10**4]
        penalty_list = ['l2', 'l1', 'none']
        solver = ['saga'] #this solver works for no penalty, l1 and l2 penalties
        scoring = {'Accuracy': make_scorer(accuracy_score), 'f1': 'f1', 'AUC': 'roc_auc'}
        classifier = LogisticRegression()
        
        # gridsearch 
        lr =  GridSearchCV(classifier, {'C':C_list, 'penalty':penalty_list,'solver':solver},
                                   scoring=scoring,refit=False, cv=5, n_jobs=-1, 
                                   return_train_score=True, verbose=2 ) 
        
        lr.fit(X_train,Y_train)
        results = pd.DataFrame(lr.cv_results_)
        
        # Get each parameter settings that gives best accuracy, F1, and AUC on validation set
        best_AUC = results[results['rank_test_AUC']==1]['params'].values[0]
        best_accuracy = results[results['rank_test_Accuracy']  ==1]['params'].values[0]
        best_F1 = results[results['rank_test_f1']  ==1]['params'].values[0]
        
        # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
        clf_AUC = LogisticRegression(C=best_AUC['C'], solver = best_AUC['solver'],
                             penalty=best_AUC['penalty']).fit(X_train, Y_train)

        clf_accuracy = LogisticRegression(C=best_accuracy['C'], solver = best_accuracy['solver'],
                             penalty=best_accuracy['penalty']).fit(X_train, Y_train)

        clf_F1 = LogisticRegression(C=best_F1['C'], solver = best_F1['solver'],
                                 penalty=best_F1['penalty']).fit(X_train, Y_train)
        
        # For average training performance
        train_accuracy = clf_accuracy.score(X_train, Y_train)  # Accuracy  
        train_F1 = f1_score(Y_train, clf_F1.predict(X_train))   # F1
        fpr, tpr, threshold = roc_curve(Y_train, clf_AUC.predict_proba(X_train)[:,1])
        train_auc = auc(fpr, tpr) # AUC
        
        # Find and store accuracy, F1, and AUC of the 3 models from previous line on test set
        test_accuracy = clf_accuracy.score(X_test, Y_test)  # Accuracy
    
        test_F1 = f1_score(Y_test, clf_F1.predict(X_test))   # F1
    
        false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, clf_AUC.predict_proba(X_test)[:,1])
        test_auc = auc(false_positive_rate, true_positive_rate) # AUC

        #data for train performance table
        train_ACC += [train_accuracy]
        train_FSC += [train_F1]
        train_AUC += [train_auc]
        #data for test performance table
        test_ACC += [test_accuracy]
        test_FSC += [test_F1]
        test_AUC += [test_auc]
        
    # TABLES 
    # training set performance across trials for each alg/data set combo, sep cols for each metric
    data1 = {'ACC': train_ACC, 'FSC': train_FSC, 'AUC': train_AUC}
    df1 = pd.DataFrame(data1, index =['Trial 1','Trial 2', 'Trial 3', 'Trial 4', 'Trial 5']) 
    mean_train_performance = df1.mean()
    display(df1) 
    print("mean_train_performance:")
    print(mean_train_performance)      #display mean performance
    
    # test set performance across trials for each alg/data set combo, sep cols for each metric
    data2 = {'ACC': test_ACC, 'FSC': test_FSC, 'AUC': test_AUC}
    df = pd.DataFrame(data2, index =['Trial 1','Trial 2', 'Trial 3', 'Trial 4', 'Trial 5']) 
    mean_test_performance = df.mean()
    display(df)
    print("mean_test_performance:")
    print(mean_test_performance)       #display mean performance
         
    

## Letter 1: Training and Testing Performance

In [24]:
log_reg_classifier(letter_1_X, letter_1_Y)

Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    5.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    2.5s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    2.7s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 188 out of 195 | elapsed:    3.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    3.5s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    3.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.731,0.731737,0.814482
Trial 2,0.7334,0.737029,0.820244
Trial 3,0.7278,0.734335,0.815601
Trial 4,0.7122,0.713175,0.807889
Trial 5,0.7326,0.737483,0.813854


mean_train_performance:
ACC    0.727400
FSC    0.730752
AUC    0.814414
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.726,0.728354,0.811824
Trial 2,0.726667,0.728908,0.811385
Trial 3,0.730867,0.736113,0.813027
Trial 4,0.725133,0.725115,0.813569
Trial 5,0.7276,0.732005,0.813553


mean_test_performance:
ACC    0.727253
FSC    0.730099
AUC    0.812672
dtype: float64


## Letter 2: Training and Testing Performance

In [25]:
log_reg_classifier(letter_2_X, letter_2_Y)

Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    4.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    3.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:    3.8s finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.9648,0.0,0.868449
Trial 2,0.962,0.0,0.879958
Trial 3,0.9608,0.0,0.865598
Trial 4,0.965,0.0,0.859079
Trial 5,0.9608,0.0,0.855535


mean_train_performance:
ACC    0.962680
FSC    0.000000
AUC    0.865724
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.961533,0.0,0.856231
Trial 2,0.962467,0.0,0.849148
Trial 3,0.962867,0.0,0.852891
Trial 4,0.961467,0.0,0.865008
Trial 5,0.962867,0.0,0.858367


mean_test_performance:
ACC    0.962240
FSC    0.000000
AUC    0.856329
dtype: float64


## COVTYPE Training and Testing Performance

In [102]:
log_reg_classifier(cov_X, cov_Y)

Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   22.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   17.5s finished


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   18.5s finished


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   21.7s finished


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   21.3s finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.7614,0.759237,0.833847
Trial 2,0.7556,0.747207,0.83054
Trial 3,0.7542,0.759067,0.824457
Trial 4,0.7528,0.757903,0.823384
Trial 5,0.7664,0.763179,0.836713


mean_train_performance:
ACC    0.758080
FSC    0.757319
AUC    0.829788
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.752278,0.750139,0.823943
Trial 2,0.754425,0.748884,0.824745
Trial 3,0.752892,0.75652,0.823443
Trial 4,0.751333,0.753667,0.823443
Trial 5,0.756614,0.753654,0.826947


mean_test_performance:
ACC    0.753509
FSC    0.752573
AUC    0.824504
dtype: float64


## MUSHROOM training and testing performance

In [166]:
log_reg_classifier(mush_X, mush_Y)

Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   39.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   37.4s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   38.5s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   37.1s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   33.5s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.9996,0.999612,1.0
Trial 2,0.9996,0.999608,1.0
Trial 3,0.9996,0.999615,1.0
Trial 4,0.9998,0.999807,0.999988
Trial 5,0.9994,0.999418,1.0


mean_train_performance:
ACC    0.999600
FSC    0.999612
AUC    0.999998
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.99968,0.999694,1.0
Trial 2,0.99968,0.999698,1.0
Trial 3,0.99968,0.99969,1.0
Trial 4,0.99936,0.999382,0.999955
Trial 5,1.0,1.0,1.0


mean_test_performance:
ACC    0.999680
FSC    0.999693
AUC    0.999991
dtype: float64


# RANDOM FOREST

In [8]:
def rf_classifier(X,Y):
    test_ACC = []
    test_FSC = []
    test_AUC = []
    train_ACC = []
    train_FSC = []
    train_AUC = []

    for i in range(5):
        # sampling w/ replacement
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=5000)
       
        # tansform data
        X_train = StandardScaler().fit_transform(X_train)    
        X_test = StandardScaler().fit_transform(X_test)
        
        # define hyperparameters to search through & error metrics  
        n_estimators = [1024]
        min_samples_split = [1,2,4,6,8,12,16,20]
        scoring = {'Accuracy': make_scorer(accuracy_score), 'f1': 'f1', 'AUC': 'roc_auc'}
        classifier = RandomForestClassifier()

        rf =  GridSearchCV(classifier, {'n_estimators':n_estimators, 'min_samples_split':min_samples_split},
                                   scoring=scoring,refit=False, cv=5, n_jobs=-1, 
                                   return_train_score=True, verbose=2) 

        rf.fit(X_train,Y_train)
        results = pd.DataFrame(rf.cv_results_)
        
        # Get each parameter settings that gives best accuracy, F1, and AUC on validation set
        best_AUC = results[results['rank_test_AUC']==1]['params'].values[0]
        best_accuracy = results[results['rank_test_Accuracy']  ==1]['params'].values[0]
        best_F1 = results[results['rank_test_f1']  ==1]['params'].values[0]

        # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
        clf_AUC = RandomForestClassifier(n_estimators=best_AUC['n_estimators'], 
                              min_samples_split=best_AUC['min_samples_split']).fit(X_train, Y_train)

        clf_accuracy = RandomForestClassifier(n_estimators=best_accuracy['n_estimators'], 
                              min_samples_split=best_accuracy['min_samples_split']).fit(X_train, Y_train)

        clf_F1 = RandomForestClassifier(n_estimators=best_F1['n_estimators'], 
                                  min_samples_split=best_F1['min_samples_split']).fit(X_train, Y_train)

         # For average training performance
        train_accuracy = clf_accuracy.score(X_train, Y_train)  # Accuracy  
        train_F1 = f1_score(Y_train, clf_F1.predict(X_train))   # F1
        fpr, tpr, threshold = roc_curve(Y_train, clf_AUC.predict_proba(X_train)[:,1])
        train_auc = auc(fpr, tpr) # AUC
        
        # Find and store accuracy, F1, and AUC of the 3 models from previous line on test set
        test_accuracy = clf_accuracy.score(X_test, Y_test)  # Accuracy
    
        test_F1 = f1_score(Y_test, clf_F1.predict(X_test))   # F1
    
        false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, clf_AUC.predict_proba(X_test)[:,1])
        test_auc = auc(false_positive_rate, true_positive_rate) # AUC

        #data for train performance table
        train_ACC += [train_accuracy]
        train_FSC += [train_F1]
        train_AUC += [train_auc]
        #data for test performance table
        test_ACC += [test_accuracy]
        test_FSC += [test_F1]
        test_AUC += [test_auc]
        
    # TABLES 
    # training set performance across trials for each alg/data set combo, sep cols for each metric
    data1 = {'ACC': train_ACC, 'FSC': train_FSC, 'AUC': train_AUC}
    df1 = pd.DataFrame(data1, index =['Trial 1','Trial 2', 'Trial 3', 'Trial 4', 'Trial 5']) 
    mean_train_performance = df1.mean()
    display(df1) 
    print("mean_train_performance:")
    print(mean_train_performance)      #display mean performance
    
    # test set performance across trials for each alg/data set combo, sep cols for each metric
    data2 = {'ACC': test_ACC, 'FSC': test_FSC, 'AUC': test_AUC}
    df = pd.DataFrame(data2, index =['Trial 1','Trial 2', 'Trial 3', 'Trial 4', 'Trial 5']) 
    mean_test_performance = df.mean()
    display(df)
    print("mean_test_performance:")
    print(mean_test_performance)       #display mean performance

## Letter 1: Training and Testing Performance

In [9]:
rf_classifier(letter_1_X, letter_1_Y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.5min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.4min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.9498,0.949164,0.991932
Trial 2,0.949533,0.949498,0.991508
Trial 3,0.944867,0.943564,0.990384
Trial 4,0.9446,0.946265,0.990244
Trial 5,0.9496,0.950271,0.991452


mean_test_performance:
ACC    0.947680
FSC    0.947752
AUC    0.991104
dtype: float64


## Letter 2: Training and Testing Performance

In [10]:
rf_classifier(letter_1_X, letter_1_Y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.2min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.9512,0.950298,0.991602
Trial 2,0.952,0.953312,0.992217
Trial 3,0.943267,0.943655,0.990299
Trial 4,0.946867,0.946121,0.99032
Trial 5,0.944067,0.944381,0.99022


mean_test_performance:
ACC    0.947480
FSC    0.947554
AUC    0.990932
dtype: float64


## Cov_type: Training and Testing Performance

In [103]:
rf_classifier(cov_X, cov_Y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.1min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.1min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.8min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.7min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.4min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,0.999185,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.000000
FSC    0.999837
AUC    1.000000
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.820005,0.817743,0.90059
Trial 2,0.822603,0.821279,0.899436
Trial 3,0.820882,0.819098,0.900162
Trial 4,0.823424,0.82383,0.902028
Trial 5,0.819431,0.818123,0.899196


mean_test_performance:
ACC    0.821269
FSC    0.820014
AUC    0.900282
dtype: float64


## Muhsroom Training and Testing Performance

In [167]:
rf_classifier(mush_X, mush_Y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.1s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   56.4s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   55.2s finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   49.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.0min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.1min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_test_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


# KNN

In [21]:
def knn_classifier(X,Y):
    test_ACC = []
    test_FSC = []
    test_AUC = []
    train_ACC = []
    train_FSC = []
    train_AUC = []

    for i in range(5):
        # sampling w/ replacement
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=5000)
        
        # tansform data
        X_train = StandardScaler().fit_transform(X_train)    
        X_test = StandardScaler().fit_transform(X_test)
        
        # define hyperparameters to search through & error metrics  
        n_neighbors = np.linspace(1, 105, num=27, dtype=int)
        weights = ['uniform', 'distance']
        p = [1,2]
        scoring = {'Accuracy': make_scorer(accuracy_score), 'f1': 'f1', 'AUC': 'roc_auc'}
        classifier = KNeighborsClassifier()

        rf =  GridSearchCV(classifier, {'n_neighbors':n_neighbors, 'weights': weights, 'p':p},
                                   scoring=scoring,refit=False, cv=5, n_jobs=-1, 
                                   return_train_score=True, verbose=2) 

        rf.fit(X_train,Y_train)
        results = pd.DataFrame(rf.cv_results_)
        
        # Get each parameter settings that gives best accuracy, F1, and AUC on validation set
        best_AUC = results[results['rank_test_AUC']==1]['params'].values[0]
        best_accuracy = results[results['rank_test_Accuracy']  ==1]['params'].values[0]
        best_F1 = results[results['rank_test_f1']  ==1]['params'].values[0]

        # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
        clf_AUC = KNeighborsClassifier(n_neighbors=best_AUC['n_neighbors'], 
                             weights=best_AUC['weights'], p = best_AUC['p']).fit(X_train, Y_train)

        clf_accuracy = KNeighborsClassifier(n_neighbors=best_accuracy['n_neighbors'], 
                              weights=best_accuracy['weights']).fit(X_train, Y_train)

        clf_F1 = KNeighborsClassifier(n_neighbors=best_F1['n_neighbors'], 
                                  weights=best_F1['weights'], p = best_F1['p']).fit(X_train, Y_train)

         # For average training performance
        train_accuracy = clf_accuracy.score(X_train, Y_train)  # Accuracy  
        train_F1 = f1_score(Y_train, clf_F1.predict(X_train))   # F1
        fpr, tpr, threshold = roc_curve(Y_train, clf_AUC.predict_proba(X_train)[:,1])
        train_auc = auc(fpr, tpr) # AUC
        
        # Find and store accuracy, F1, and AUC of the 3 models from previous line on test set
        test_accuracy = clf_accuracy.score(X_test, Y_test)  # Accuracy
    
        test_F1 = f1_score(Y_test, clf_F1.predict(X_test))   # F1
    
        false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, clf_AUC.predict_proba(X_test)[:,1])
        test_auc = auc(false_positive_rate, true_positive_rate) # AUC

        #data for train performance table
        train_ACC += [train_accuracy]
        train_FSC += [train_F1]
        train_AUC += [train_auc]
        #data for test performance table
        test_ACC += [test_accuracy]
        test_FSC += [test_F1]
        test_AUC += [test_auc]
        
    # TABLES 
    # training set performance across trials for each alg/data set combo, sep cols for each metric
    data1 = {'ACC': train_ACC, 'FSC': train_FSC, 'AUC': train_AUC}
    df1 = pd.DataFrame(data1, index =['Trial 1','Trial 2', 'Trial 3', 'Trial 4', 'Trial 5']) 
    mean_train_performance = df1.mean()
    display(df1) 
    print("mean_train_performance:")
    print(mean_train_performance)      #display mean performance
    
    # test set performance across trials for each alg/data set combo, sep cols for each metric
    data2 = {'ACC': test_ACC, 'FSC': test_FSC, 'AUC': test_AUC}
    df = pd.DataFrame(data2, index =['Trial 1','Trial 2', 'Trial 3', 'Trial 4', 'Trial 5']) 
    mean_test_performance = df.mean()
    display(df)
    print("mean_test_performance:")
    print(mean_test_performance)       #display mean performance

In [22]:
knn_classifier(letter_1_X, letter_1_Y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  6.3min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.4min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.1min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.3min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.4min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.9558,0.955308,0.991314
Trial 2,0.951133,0.950782,0.988374
Trial 3,0.9516,0.95233,0.989555
Trial 4,0.955867,0.955493,0.990377
Trial 5,0.9554,0.954694,0.989664


mean_test_performance:
ACC    0.953960
FSC    0.953721
AUC    0.989857
dtype: float64


In [23]:
knn_classifier(letter_2_X, letter_2_Y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.1min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 37.9min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.9min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.9min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  5.6min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.990733,0.876882,0.996647
Trial 2,0.988467,0.844284,0.989705
Trial 3,0.9914,0.880416,0.994623
Trial 4,0.9888,0.874317,0.996887
Trial 5,0.989867,0.86106,0.985138


mean_test_performance:
ACC    0.989853
FSC    0.867392
AUC    0.992600
dtype: float64


## Cov_type: Training and Testing Performance

In [104]:
knn_classifier(cov_X, cov_Y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   36.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 11.2min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 32.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 35.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 41.1min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 46.5min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 14.9min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 63.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 90.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 100.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 104.5min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 23.4min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,0.779286,0.77778,0.859113
Trial 2,0.782136,0.78077,0.864305
Trial 3,0.779338,0.778039,0.856681
Trial 4,0.781492,0.781603,0.859991
Trial 5,0.776284,0.779635,0.863182


mean_test_performance:
ACC    0.779707
FSC    0.779566
AUC    0.860654
dtype: float64


## Mushroom Training and Testing Performance

In [168]:
knn_classifier(mush_X, mush_Y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 92.9min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 26.6min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 25.6min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 23.2min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 30.7min finished


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_train_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64


Unnamed: 0,ACC,FSC,AUC
Trial 1,1.0,1.0,1.0
Trial 2,1.0,1.0,1.0
Trial 3,1.0,1.0,1.0
Trial 4,1.0,1.0,1.0
Trial 5,1.0,1.0,1.0


mean_test_performance:
ACC    1.0
FSC    1.0
AUC    1.0
dtype: float64
