# MUSHROOM
### Source:
https://archive.ics.uci.edu/ml/datasets/mushroom
### Goal:
Predict whether or not a mushroom is edible or not based off 22 different categorical characteristics

##  Imports

In [1]:
import numpy as np
import pandas as pd
import requests, zipfile, io

from sklearn.metrics import accuracy_score, f1_score, auc, roc_auc_score, roc_curve, make_scorer
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


# Algo 1: LOGISTIC REGRESSION


In [2]:
def log_reg_classifier(X_train, Y_train, X_test, Y_test):
    # define hyperparameters to search through & error metrics 
    C_list = [10**-8,10**-7,10**-6,10**-5,10**-4,10**-3,10**-2,10**-1,10**0,10**1,10**2,10**3,10**4]
    penalty_list = ['l2', 'l1', 'none']
    solver = ['saga'] #this solver works for no penalty, l1 and l2 penalties
    scoring = {'Accuracy': make_scorer(accuracy_score), 'f1': 'f1', 'AUC': 'roc_auc'}
    classifier = LogisticRegression()
        
    # gridsearch 
    lr =  GridSearchCV(classifier, {'C':C_list, 'penalty':penalty_list,'solver':solver},
                                   scoring=scoring,refit=False, cv=5, n_jobs=-1, 
                                   return_train_score=True, verbose=2 ) 
        
    lr.fit(X_train,Y_train)
    results = pd.DataFrame(lr.cv_results_)
        
    # Get each parameter settings that gives best accuracy, F1, and AUC on validation set
    best_AUC = results[results['rank_test_AUC']==1]['params'].values[0]
    best_accuracy = results[results['rank_test_Accuracy']  ==1]['params'].values[0]
    best_F1 = results[results['rank_test_f1']  ==1]['params'].values[0]
        
    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    clf_AUC = LogisticRegression(C=best_AUC['C'], solver = best_AUC['solver'],
                             penalty=best_AUC['penalty']).fit(X_train, Y_train)

    clf_accuracy = LogisticRegression(C=best_accuracy['C'], solver = best_accuracy['solver'],
                             penalty=best_accuracy['penalty']).fit(X_train, Y_train)

    clf_F1 = LogisticRegression(C=best_F1['C'], solver = best_F1['solver'],
                                penalty=best_F1['penalty']).fit(X_train, Y_train)
        
    # For average training performance
    train_accuracy = clf_accuracy.score(X_train, Y_train)  # Accuracy  
    train_F1 = f1_score(Y_train, clf_F1.predict(X_train))   # F1
    fpr, tpr, threshold = roc_curve(Y_train, clf_AUC.predict_proba(X_train)[:,1])
    train_auc = auc(fpr, tpr) # AUC
        
    # Find and store accuracy, F1, and AUC of the 3 models from previous line on test set
    test_accuracy = clf_accuracy.score(X_test, Y_test)  # Accuracy
    
    test_F1 = f1_score(Y_test, clf_F1.predict(X_test))   # F1
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, clf_AUC.predict_proba(X_test)[:,1])
    test_auc = auc(false_positive_rate, true_positive_rate) # AUC

    return train_accuracy, train_F1, train_auc, test_accuracy, test_F1, test_auc


# Algo 2: RANDOM FOREST

In [3]:
def rf_classifier(X_train,Y_train,X_test,Y_test):
    # define hyperparameters to search through & error metrics  
    n_estimators = [1024]
    min_samples_split = [1,2,4,6,8,12,16,20]
    scoring = {'Accuracy': make_scorer(accuracy_score), 'f1': 'f1', 'AUC': 'roc_auc'}
    classifier = RandomForestClassifier()

    rf =  GridSearchCV(classifier, {'n_estimators':n_estimators, 'min_samples_split':min_samples_split},
                                   scoring=scoring,refit=False, cv=5, n_jobs=-1, 
                                   return_train_score=True, verbose=2) 

    rf.fit(X_train,Y_train)
    results = pd.DataFrame(rf.cv_results_)
        
    # Get each parameter settings that gives best accuracy, F1, and AUC on validation set
    best_AUC = results[results['rank_test_AUC']==1]['params'].values[0]
    best_accuracy = results[results['rank_test_Accuracy']  ==1]['params'].values[0]
    best_F1 = results[results['rank_test_f1']  ==1]['params'].values[0]

    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    clf_AUC = RandomForestClassifier(n_estimators=best_AUC['n_estimators'], 
                              min_samples_split=best_AUC['min_samples_split']).fit(X_train, Y_train)

    clf_accuracy = RandomForestClassifier(n_estimators=best_accuracy['n_estimators'], 
                              min_samples_split=best_accuracy['min_samples_split']).fit(X_train, Y_train)

    clf_F1 = RandomForestClassifier(n_estimators=best_F1['n_estimators'], 
                                  min_samples_split=best_F1['min_samples_split']).fit(X_train, Y_train)

    # For average training performance
    train_accuracy = clf_accuracy.score(X_train, Y_train)  # Accuracy  
    train_F1 = f1_score(Y_train, clf_F1.predict(X_train))   # F1
    fpr, tpr, threshold = roc_curve(Y_train, clf_AUC.predict_proba(X_train)[:,1])
    train_auc = auc(fpr, tpr) # AUC
        
    # Find and store accuracy, F1, and AUC of the 3 models from previous line on test set
    test_accuracy = clf_accuracy.score(X_test, Y_test)  # Accuracy
    
    test_F1 = f1_score(Y_test, clf_F1.predict(X_test))   # F1
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, clf_AUC.predict_proba(X_test)[:,1])
    test_auc = auc(false_positive_rate, true_positive_rate) # AUC

    return train_accuracy, train_F1, train_auc, test_accuracy, test_F1, test_auc
  

# Algo 3: KNN

In [4]:
def knn_classifier(X_train,Y_train,X_test,Y_test):
    # define hyperparameters to search through & error metrics  
    n_neighbors = np.linspace(1, 105, num=27, dtype=int)
    weights = ['uniform', 'distance']
    p = [1,2]
    scoring = {'Accuracy': make_scorer(accuracy_score), 'f1': 'f1', 'AUC': 'roc_auc'}
    classifier = KNeighborsClassifier()

    rf =  GridSearchCV(classifier, {'n_neighbors':n_neighbors, 'weights': weights, 'p':p},
                                   scoring=scoring,refit=False, cv=5, n_jobs=-1, 
                                   return_train_score=True, verbose=2) 

    rf.fit(X_train,Y_train)
    results = pd.DataFrame(rf.cv_results_)
        
    # Get each parameter settings that gives best accuracy, F1, and AUC on validation set
    best_AUC = results[results['rank_test_AUC']==1]['params'].values[0]
    best_accuracy = results[results['rank_test_Accuracy']  ==1]['params'].values[0]
    best_F1 = results[results['rank_test_f1']  ==1]['params'].values[0]

    # Train 3 models using the 5000 samples and each of the 3 best parameter settings (one model per metric)
    clf_AUC = KNeighborsClassifier(n_neighbors=best_AUC['n_neighbors'], 
                             weights=best_AUC['weights'], p = best_AUC['p']).fit(X_train, Y_train)

    clf_accuracy = KNeighborsClassifier(n_neighbors=best_accuracy['n_neighbors'], 
                              weights=best_accuracy['weights']).fit(X_train, Y_train)

    clf_F1 = KNeighborsClassifier(n_neighbors=best_F1['n_neighbors'], 
                                  weights=best_F1['weights'], p = best_F1['p']).fit(X_train, Y_train)

    # For average training performance
    train_accuracy = clf_accuracy.score(X_train, Y_train)  # Accuracy  
    train_F1 = f1_score(Y_train, clf_F1.predict(X_train))   # F1
    fpr, tpr, threshold = roc_curve(Y_train, clf_AUC.predict_proba(X_train)[:,1])
    train_auc = auc(fpr, tpr) # AUC
        
    # Find and store accuracy, F1, and AUC of the 3 models from previous line on test set
    test_accuracy = clf_accuracy.score(X_test, Y_test)  # Accuracy
    
    test_F1 = f1_score(Y_test, clf_F1.predict(X_test))   # F1
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, clf_AUC.predict_proba(X_test)[:,1])
    test_auc = auc(false_positive_rate, true_positive_rate) # AUC

    return train_accuracy, train_F1, train_auc, test_accuracy, test_F1, test_auc  

## Data


In [5]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', 
                    header=None)
print(data.shape)
data.head() 

(8124, 23)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
rows_to_drop = data[data.isnull().any(axis=1)] 
print(rows_to_drop)

Empty DataFrame
Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
Index: []

[0 rows x 23 columns]


In [7]:
# Extract Labels
y = data[0]

Y=[] 
for i in y:
    if i == "e":
        Y += [1]
    else:
        Y += [-1]


# Ectract Data
d = data.loc[:, 1:54]

# One hot encode categorical data
X = pd.DataFrame(pd.get_dummies(d))
X.head()

Unnamed: 0,1_b,1_c,1_f,1_k,1_s,1_x,2_f,2_g,2_s,2_y,...,21_s,21_v,21_y,22_d,22_g,22_l,22_m,22_p,22_u,22_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [8]:
# %POZ
print(data.shape)
print(y.value_counts())

percent_positive = 4208/8124
print(percent_positive)

(8124, 23)
e    4208
p    3916
Name: 0, dtype: int64
0.517971442639094


## Running 5 trials across the three algos 

In [9]:
log_reg_data = []
rf_data = []
knn_data = []

for i in range(5):
    #split data for training and testing
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size=5000)

    log_reg_data += log_reg_classifier(X_train, Y_train, X_test, Y_test)
    rf_data += rf_classifier(X_train,Y_train, X_test, Y_test)
    knn_data += knn_classifier(X_train,Y_train, X_test, Y_test)
    
    

Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   35.2s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   49.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.0min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 26.1min finished


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   30.5s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.0min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 24.9min finished


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   27.9s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   47.5s finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 94.6min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 105.9min finished


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   39.9s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.1min finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 363.4min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 434.4min finished


Fitting 5 folds for each of 39 candidates, totalling 195 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 195 out of 195 | elapsed:   28.8s finished
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   59.8s finished


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 27.3min finished


In [15]:
#Logistic Regression Results
df_log_reg = pd.DataFrame(columns=('Train ACC', 'Train F1','Train AUC', 'Test ACC', 'Test F1', 'Test AUC'))
df_log_reg.loc['LR Mush Trial 1'] = log_reg_data[0:6]
df_log_reg.loc['LR Mush Trial 2'] = log_reg_data[6:12]
df_log_reg.loc['LR Mush Trial 3'] = log_reg_data[12:18]
df_log_reg.loc['LR Mush Trial 4'] = log_reg_data[18:24]
df_log_reg.loc['LR Mush Trial 5'] = log_reg_data[24:32]

#Random Forest Results
df_rf = pd.DataFrame(columns=('Train ACC', 'Train F1','Train AUC', 'Test ACC', 'Test F1', 'Test AUC'))
df_rf.loc['RF Mush Trial 1'] = rf_data[0:6]
df_rf.loc['RF Mush Trial 2'] = rf_data[6:12]
df_rf.loc['RF Mush Trial 3'] = rf_data[12:18]
df_rf.loc['RF Mush Trial 4'] = rf_data[18:24]
df_rf.loc['RF Mush Trial 5'] = rf_data[24:32]


#KNN Results
df_knn = pd.DataFrame(columns=('Train ACC', 'Train F1','Train AUC', 'Test ACC', 'Test F1', 'Test AUC'))
df_knn.loc['KNN Mush Trial 1'] = knn_data[0:6]
df_knn.loc['KNN Mush Trial 2'] = knn_data[6:12]
df_knn.loc['KNN Mush Trial 3'] = knn_data[12:18]
df_knn.loc['KNN Mush Trial 4'] = knn_data[18:24]
df_knn.loc['KNN Mush Trial 5'] = knn_data[24:32]

print('Logistic Regression Results')
display(df_log_reg)
print(df_log_reg.mean())

print('\n\nRandom Forest Results')
display(df_rf)
print(df_rf.mean())

print('\n\nKNN Results')
display(df_knn)
print(df_knn.mean())

Logistic Regression Results


Unnamed: 0,Train ACC,Train F1,Train AUC,Test ACC,Test F1,Test AUC
LR Mush Trial 1,1.0,1.0,1.0,1.0,1.0,1.0
LR Mush Trial 2,1.0,1.0,1.0,1.0,1.0,1.0
LR Mush Trial 3,1.0,1.0,1.0,1.0,1.0,1.0
LR Mush Trial 4,1.0,1.0,1.0,1.0,1.0,1.0
LR Mush Trial 5,1.0,1.0,1.0,1.0,1.0,1.0


Train ACC    1.0
Train F1     1.0
Train AUC    1.0
Test ACC     1.0
Test F1      1.0
Test AUC     1.0
dtype: float64


Random Forest Results


Unnamed: 0,Train ACC,Train F1,Train AUC,Test ACC,Test F1,Test AUC
RF Mush Trial 1,1.0,1.0,1.0,1.0,1.0,1.0
RF Mush Trial 2,1.0,1.0,1.0,1.0,1.0,1.0
RF Mush Trial 3,1.0,1.0,1.0,1.0,1.0,1.0
RF Mush Trial 4,1.0,1.0,1.0,1.0,1.0,1.0
RF Mush Trial 5,1.0,1.0,1.0,1.0,1.0,1.0


Train ACC    1.0
Train F1     1.0
Train AUC    1.0
Test ACC     1.0
Test F1      1.0
Test AUC     1.0
dtype: float64


KNN Results


Unnamed: 0,Train ACC,Train F1,Train AUC,Test ACC,Test F1,Test AUC
KNN Mush Trial 1,1.0,1.0,1.0,1.0,1.0,1.0
KNN Mush Trial 2,1.0,1.0,1.0,1.0,1.0,1.0
KNN Mush Trial 3,1.0,1.0,1.0,1.0,1.0,1.0
KNN Mush Trial 4,1.0,1.0,1.0,1.0,1.0,1.0
KNN Mush Trial 5,1.0,1.0,1.0,1.0,1.0,1.0


Train ACC    1.0
Train F1     1.0
Train AUC    1.0
Test ACC     1.0
Test F1      1.0
Test AUC     1.0
dtype: float64


In [16]:
# need to save the data for ttests
df_log_reg.to_csv("lr_mush.csv")
df_rf.to_csv("rf_mush.csv")
df_knn.to_csv("knn_mush.csv")