# Introduction
The dataset is a description of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family taken from the UCI Machine Learning Repository. The dataset consists of 8124 instances (2480 instances contain missing values), along with 23 categorical attributes. Furthermore, the class distribution is quite balanced, 51.8% of the samples belong to the edible class, while 48.2% belong to the poisonous class. The purpose of this dataset is to identify whether the mushroom is edible or poisonous based on its attribute values. 

In [30]:
# Import the packages
import numpy as np
import pandas as pd

from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#Feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, f_classif

file_path = 'agaricus-lepiota.data'

# Load dataset and set column
df = pd.read_csv(file_path, header=None)

# Mushroom Dataset before Pre-Processing

In [31]:
print(df)

     0  1  2  3  4  5  6  7  8  9   ... 13 14 15 16 17 18 19 20 21 22
0     p  x  s  n  t  p  f  c  n  k  ...  s  w  w  p  w  o  p  k  s  u
1     e  x  s  y  t  a  f  c  b  k  ...  s  w  w  p  w  o  p  n  n  g
2     e  b  s  w  t  l  f  c  b  n  ...  s  w  w  p  w  o  p  n  n  m
3     p  x  y  w  t  p  f  c  n  n  ...  s  w  w  p  w  o  p  k  s  u
4     e  x  s  g  f  n  f  w  b  k  ...  s  w  w  p  w  o  e  n  a  g
...  .. .. .. .. .. .. .. .. .. ..  ... .. .. .. .. .. .. .. .. .. ..
8119  e  k  s  n  f  n  a  c  b  y  ...  s  o  o  p  o  o  p  b  c  l
8120  e  x  s  n  f  n  a  c  b  y  ...  s  o  o  p  n  o  p  b  v  l
8121  e  f  s  n  f  n  a  c  b  n  ...  s  o  o  p  o  o  p  b  c  l
8122  p  k  y  n  f  y  f  c  n  b  ...  k  w  w  p  w  o  e  w  v  l
8123  e  x  s  n  f  n  a  c  b  y  ...  s  o  o  p  o  o  p  o  c  l

[8124 rows x 23 columns]


In [32]:
### PRE-PROCESSING ###

df.columns = ["class", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"]
# Move class attribtue to last column
classes = df['class']
df = df.drop(["class"],axis=1)
df['class'] = classes

# Discard the data points that contain missing values
df = df.replace('?',np.NaN)
df = df.dropna()

# Map acronyms to actual value
df["cap-shape"] = df["cap-shape"].map({'b': 'bell', 'c': 'conical', 'x':'convex', 'f':'flat', 'k':'knobbed', 's':'sunkens'})
df["cap-surface"] = df["cap-surface"].map({'f': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth'})
df["cap-color"] = df["cap-color"].map({'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'})
df["bruises"] = df["bruises"].map({'t': 'True', 'f': 'False'})
df["odor"] = df["odor"].map({'a': 'almond', 'l': 'anise', 'c': 'creosote', 'y': 'fishy', 'f': 'foul', 'm': 'musty', 'n': 'odor-none', 'p': 'pungent', 's': 'spicy'})
df["gill-attachment"] = df["gill-attachment"].map({'a': 'attached', 'd': 'descending', 'f': 'free', 'n': 'notched'})
df["gill-spacing"] = df["gill-spacing"].map({'c': 'close', 'w': 'crowded', 'd': 'distant'})
df["gill-size"] = df["gill-size"].map({'b': 'broad', 'n': 'narrow'})
df["gill-color"] = df["gill-color"].map({'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'g': 'gray', 'r': 'green', 'o': 'orange', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow'})
df["stalk-shape"] = df["stalk-shape"].map({'e': 'enlarging', 't': 'tapering'})
df["stalk-root"] = df["stalk-root"].map({'b': 'bulbous', 'c': 'club', 'u': 'cup', 'e': 'equal', 'z': 'rhizomorphs', 'r': 'rooted'})
df["stalk-surface-above-ring"] = df["stalk-surface-above-ring"].map({'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'})
df["stalk-surface-below-ring"] = df["stalk-surface-below-ring"].map({'f': 'fibrous', 'y': 'scaly', 'k': 'silky', 's': 'smooth'})
df["stalk-color-above-ring"] = df["stalk-color-above-ring"].map({'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'})
df["stalk-color-below-ring"] = df["stalk-color-below-ring"].map({'n': 'brown', 'b': 'buff', 'c': 'cinnamon', 'g': 'gray', 'o': 'orange', 'p': 'pink', 'e': 'red', 'w': 'white', 'y': 'yellow'})
df["veil-type"] = df["veil-type"].map({'p': 'partial', 'u': 'universal'})
df["veil-color"] = df["veil-color"].map({'n': 'brown', 'o': 'orange', 'w': 'white', 'y': 'yellow'})
df["ring-number"] = df["ring-number"].map({'n': 'ring-number-none', 'o': 'one', 't': 'two'})
df["ring-type"] = df["ring-type"].map({'c': 'cobwebby', 'e': 'evanescent', 'f': 'flaring', 'l': 'large', 'n': 'ring-type-none', 'p': 'pendant', 's': 'sheathing', 'z': 'zone'})
df["spore-print-color"] = df["spore-print-color"].map({'k': 'black', 'n': 'brown', 'b': 'buff', 'h': 'chocolate', 'r': 'green', 'o': 'orange', 'u': 'purple', 'w': 'white', 'y': 'yellow'})
df["population"] = df["population"].map({'a': 'abundant', 'c': 'clustered', 'n': 'numerous', 's': 'scattered', 'v': 'several', 'y': 'solitary'})
df["habitat"] = df["habitat"].map({'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'u': 'urban', 'w': 'waste', 'd': 'woods'})
df["class"] = df["class"].map({'e': 'edible', 'p': 'poisonous'})

# Added prefix to all values to remove duplicates, and clarity
for col in df.columns:
    if col != 'class':
        df[col] = col + '-' + df[col].astype(str)

target_attribute = df["class"];

# Convert all categorical variables into dummy variables 
for column in df.columns:
    dummy = pd.get_dummies(df[column], dtype=float)
    df = df.drop(columns=[column])
    df = pd.concat([df, dummy], axis=1)
df = df.rename(columns={"True": "bruises_yes", "False": "bruises_no"})    
df = df.rename(columns={"poisonous": "edible_no", "edible": "edible_yes"})

# Mushroom Dataset after Pre-Processing
Pre-processing Tasks:
- Added column names from agaricus-lepiota.names file
- Remove instances containing missing values indicated with '?'
- Map acronyms to actual value from agaricus-lepiota.names file
- Added prefix to common values to differentiate them for the next step
- Convert categorical variables to dummy variables

In [33]:
print(df)

      cap-shape-bell  cap-shape-conical  cap-shape-convex  cap-shape-flat  \
0                0.0                0.0               1.0             0.0   
1                0.0                0.0               1.0             0.0   
2                1.0                0.0               0.0             0.0   
3                0.0                0.0               1.0             0.0   
4                0.0                0.0               1.0             0.0   
...              ...                ...               ...             ...   
7986             1.0                0.0               0.0             0.0   
8001             0.0                0.0               1.0             0.0   
8038             0.0                0.0               1.0             0.0   
8095             0.0                0.0               1.0             0.0   
8114             0.0                0.0               0.0             1.0   

      cap-shape-knobbed  cap-shape-sunkens  cap-surface-fibrous  \
0       

In [34]:
# We need to have a separate df for DecisionTree model where the target attribute is not broken down to two attributes (edible_yes and edible_no)
df_decision_tree = df.drop(columns=["edible_no", "edible_yes"])
df_decision_tree['class'] = target_attribute

# Drop target attribute edible_no (poisonous)
df = df.drop('edible_no',axis=1)

# Data Split
x = df.drop(columns=['edible_yes']) 
y = df['edible_yes']
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.6, test_size=0.4, random_state=42)

# create a split for target attribute with categorical value of edible and poisonous
X_train, X_test, y_train_decision_tree, y_test_decision_tree = train_test_split(x, df_decision_tree['class'], train_size=0.6, test_size=0.4, random_state=42)

# Feature Selection (SelectPercentile)
selPercentile = SelectPercentile(score_func=f_classif, percentile=50)
x_fs_sp = selPercentile.fit_transform(x, y)
#print("\nSelected Features (selPercentile, best 50% features): ", selPercentile.get_feature_names_out())
#print("size: ", selPercentile.get_feature_names_out().size)
X_train_fs_sp, X_test_fs_sp, y_train, y_test = train_test_split(x_fs_sp, y, train_size=0.6, test_size=0.4, random_state=42)

# Feature Selection (VarianceThreshold)
varThres = VarianceThreshold(threshold=0.125) #threshold range works if [0, 0.25) because of binary values
x_fs_vt = varThres.fit_transform(x, y)
#print("\nSelected Features (varThres ): ", varThres.get_feature_names_out())
#print("size: ", varThres.get_feature_names_out().size)
X_train_fs_vt, X_test_fs_vt, y_train, y_test = train_test_split(x_fs_vt, y, train_size=0.6, test_size=0.4, random_state=42)

# Prepare models
dt = DecisionTreeClassifier(criterion="gini", max_depth=3)
nb = BernoulliNB()
neigh = KNeighborsClassifier()
svm = SVC(kernel='linear', probability=True)
forest = RandomForestClassifier(criterion="gini", max_depth=3, n_estimators = 100, random_state = 42)
gradient = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=3, random_state=42)

#To see which kernel is better
"""
param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best kernel:", grid.best_params_['kernel'])
"""

  f = msb / msw


'\nparam_grid = {\n    \'kernel\': [\'linear\', \'rbf\', \'poly\', \'sigmoid\'],\n    \'C\': [0.1, 1, 10],\n    \'gamma\': [\'scale\', \'auto\']\n}\n\ngrid = GridSearchCV(SVC(), param_grid, cv=5)\ngrid.fit(X_train, y_train)\nprint("Best kernel:", grid.best_params_[\'kernel\'])\n'

# Result
Performance of Decision Tree, Naive Bayes, KNeightbour, SVM, Random Forest, and Gradient Boosting. Features selection used are VarianceThreshold (threshold=0.125) and SelectPercentile(percentile=50)

The goal of this project is to compare the performance of all models with and without feature selection

In [None]:
models = [dt, nb, neigh, svm, forest, gradient]
for i in range(len(models)):
    yTest = ''
    yTrain = ''
    name = '';
    if(i == 0):
        name = "Decision Tree"
        yTest = y_test_decision_tree
        yTrain = y_train_decision_tree
    elif(i == 1):
        name = "Naive Bayes"
        yTest = y_test
        yTrain = y_train
    elif(i == 2):
        name = "K Neighbors"
        yTest = y_test
        yTrain = y_train
    elif(i == 3):
        name = "SVM"
        yTest = y_test_decision_tree
        yTrain = y_train_decision_tree
    elif(i == 4):
        name = "Random Forest"
        yTest = y_test_decision_tree
        yTrain = y_train_decision_tree
    elif(i == 5):
        name = "Gradient Booster"
        yTest = y_test_decision_tree
        yTrain = y_train_decision_tree
    # Perform Cross-Validation (StratifiedKFold Cross-validation, k = 10)
    for j in range(3):
        x_cv = ''
        fs = '';
        if(j == 0):
            fs = ""
            x_cv = x
        elif(j == 1):
            fs = "VarianceThreshold "
            x_cv = x_fs_vt
        elif(j == 2):
            fs = "SelectPercentile "
            x_cv = x_fs_sp
        accuracy = np.mean(cross_val_score(models[i], x_cv, y, cv=10, scoring=make_scorer(accuracy_score)))
        print(name + " " + fs + "Cross-validation: {:.2f}%".format(accuracy * 100))  
    #Perform Model Test
    for k in range(3):
        x_train = ''
        x_test = ''
        fs = '';
        if(k == 0):
            fs = ""
            x_train = X_train
            x_test = X_test
        elif(k == 1):
            fs = "with VarianceThreshold"
            x_train = X_train_fs_vt
            x_test = X_test_fs_vt
        elif(k == 2):
            fs = "with SelectPercentile"
            x_train = X_train_fs_sp
            x_test = X_test_fs_sp
        models[i].fit(x_train, yTrain)
        y_pred = models[i].predict(x_test);
        y_proba = models[i].predict_proba(x_test)[:, 1]
        accuracy = accuracy_score(yTest, y_pred)
        auc = roc_auc_score(yTest, y_proba)
        print(name + " " + fs + " | Accuracy: {:.2f}%".format(accuracy * 100) + " | AUC Score: {:.2f} ".format(auc))
    print("=======================================================================")

Decision Tree Cross-validation: 93.84%
Decision Tree VarianceThreshold Cross-validation: 90.29%
Decision Tree SelectPercentile Cross-validation: 96.58%
Decision Tree  | Accuracy: 97.34% | AUC Score: 0.99 
Decision Tree with VarianceThreshold | Accuracy: 93.58% | AUC Score: 0.98 
Decision Tree with SelectPercentile | Accuracy: 96.81% | AUC Score: 0.98 
Naive Bayes Cross-validation: 92.00%
Naive Bayes VarianceThreshold Cross-validation: 90.51%
Naive Bayes SelectPercentile Cross-validation: 92.44%
Naive Bayes  | Accuracy: 92.78% | AUC Score: 1.00 
Naive Bayes with VarianceThreshold | Accuracy: 89.73% | AUC Score: 0.96 
Naive Bayes with SelectPercentile | Accuracy: 91.98% | AUC Score: 1.00 
K Neighbors Cross-validation: 98.76%
K Neighbors VarianceThreshold Cross-validation: 96.62%
K Neighbors SelectPercentile Cross-validation: 98.26%
K Neighbors  | Accuracy: 100.00% | AUC Score: 1.00 
K Neighbors with VarianceThreshold | Accuracy: 99.60% | AUC Score: 1.00 
K Neighbors with SelectPercentile

# Note
Detailed explanation and analysis of the selected paramters and result can be seen in the Project Report PDF