In [1]:
import pandas as pd
import category_encoders as ce
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report, confusion_matrix
import plotly.figure_factory as ff

In [2]:
column_map = {
    'cap-shape_b': 'Bell Cap Shape',
    'cap-shape_c': 'Conical Cap Shape',
    'cap-shape_x': 'Convex Cap Shape',
    'cap-shape_f': 'Flat Cap Shape',
    'cap-shape_k': 'Knobbed Cap Shape',
    'cap-shape_s': 'Sunken Cap Shape',
    'cap-surface_f': 'Fibrous Cap Surface',
    'cap-surface_g': 'Groovey Cap Surface',
    'cap-surface_y': 'Scaly Cap Surface',
    'cap-surface_s': 'Smooth Cap Surface',
    'cap-color_n': 'Brown Cap Color',
    'cap-color_b': 'Buff Cap Color',
    'cap-color_c': 'Cinnamon Cap Color',
    'cap-color_g': 'Gray Cap Color',
    'cap-color_r': 'Green Cap Color',
    'cap-color_p': 'Pink Cap Color',
    'cap-color_u': 'Purple Cap Color',
    'cap-color_e': 'Red Cap Color',
    'cap-color_w': 'White Cap Color',
    'cap-color_y': 'Yellow Cap Color',
    'bruises?_t': 'Bruises',
    'bruises?_f': 'No Bruises',
    'odor_a': 'Almond Odor',
    'odor_l': 'Anise Odor',
    'odor_c': 'Creosote Odor',
    'odor_y': 'Fishy Odor',
    'odor_f': 'Foul Odor',
    'odor_m': 'Musty Odor',
    'odor_n': 'No Odor',
    'odor_p': 'Pungent Odor',
    'odor_s': 'Spicy Odor',
    'gill-attachment_a': 'Attached Gills',
    'gill-attachment_d': 'Descending Gills',
    'gill-attachment_f': 'Free Gills',
    'gill-attachment_n': 'Notched Gills',
    'gill-spacing_c': 'Closely Spaced Gills',
    'gill-spacing_w': 'Crowded Gills',
    'gill-spacing_d': 'Distantly Spaced Gills',
    'gill-size_b': 'Broad Gills',
    'gill-size_n': 'Narrow Gills',
    'gill-color_k': 'Black Gills',
    'gill-color_n': 'Brown Gills',
    'gill-color_b': 'Buff Gills',
    'gill-color_h': 'Chocolate Gills',
    'gill-color_g': 'Gray Gills',
    'gill-color_r': 'Gree Gills',
    'gill-color_o': 'Orange Gills',
    'gill-color_p': 'Pink Gills',
    'gill-color_u': 'Purple Gills',
    'gill-color_e': 'Red Gills',
    'gill-color_w': 'White Gills',
    'gill-color_y': 'Yellow Gills',
    'stalk-shape_e': 'Enlarging Stalk',
    'stalk-shape_t': 'Tapering Stalk',
    'stalk-root_b': 'Bulbous Root',
    'stalk-root_c': 'Cup Root',
    'stalk-root_u': 'Club Root',
    'stalk-root_e': 'Equal Root',
    'stalk-root_z': 'Rhizomorph Root',
    'stalk-root_r': 'Rooted Root',
    'stalk-root_m': 'Missing',
    'stalk-surface-above-ring_f': 'Fibrous Surface Above Ring',
    'stalk-surface-above-ring_y': 'Scaly Surface Above Ring',
    'stalk-surface-above-ring_k': 'Silky Surface Above Ring',
    'stalk-surface-above-ring_s': 'Smooth Surface Above Ring',
    'stalk-surface-below-ring_f': 'Fibrous Surface Below Ring',
    'stalk-surface-below-ring_y': 'Scaly Surface Below Ring',
    'stalk-surface-below-ring_k': 'Silky Surface Below Ring',
    'stalk-surface-below-ring_s': 'Smooth Surface Below Ring',
    'stalk-color-above-ring_n': 'Brown Stalk Above Ring',
    'stalk-color-above-ring_b': 'Buff Stalk Above Ring',
    'stalk-color-above-ring_c': 'Cinnamon Stalk Above Ring',
    'stalk-color-above-ring_g': 'Gray Stalk Above Ring',
    'stalk-color-above-ring_o': 'Orange Stalk Above Ring',
    'stalk-color-above-ring_p': 'Pink Stalk Above Ring',
    'stalk-color-above-ring_e': 'Red Stalk Above Ring',
    'stalk-color-above-ring_w': 'White Stalk Above Ring',
    'stalk-color-above-ring_y': 'Yellow Stalk Above Ring',
    'stalk-color-below-ring_n': 'Brown Stalk Below Ring',
    'stalk-color-below-ring_b': 'Buff Stalk Below Ring',
    'stalk-color-below-ring_c': 'Cinnamon Stalk Below Ring',
    'stalk-color-below-ring_g': 'Gray Stalk Below Ring',
    'stalk-color-below-ring_o': 'Orange Stalk Below Ring',
    'stalk-color-below-ring_p': 'Pink Stalk Below Ring',
    'stalk-color-below-ring_e': 'Red Stalk Below Ring',
    'stalk-color-below-ring_w': 'White Stalk Below Ring',
    'stalk-color-below-ring_y': 'Yellow Stalk Below Ring',
    'veil-type_p' : 'Partial Veil',
    'veil-type_u' : 'Universial Veil',
    'veil-color_n' : 'Brown Veil',
    'veil-color_o' : 'Orange Veil',
    'veil-color_w' : 'White Veil',
    'veil-color_y' : 'Yellow Veil',
    'ring-number_n' : 'No Ring',
    'ring-number_o' : 'One Ring',
    'ring-number_t' : 'Two Rings',
    'ring-type_c' : 'Cobwebby Ring',
    'ring-type_e' : 'Evanescent Ring',
    'ring-type_f' : 'Flaring Ring',
    'ring-type_l' : 'Large Ring',
    'ring-type_n' : 'No Ring',
    'ring-type_p' : 'Pendant Ring',
    'ring-type_s' : 'Sheathing Ring',
    'ring-type_z' : 'Zone Ring',
    'spore-print-color_k' : 'Black Spore Print',
    'spore-print-color_n' : 'Brown Spore Print',
    'spore-print-color_b' : 'Buff Spore Print',
    'spore-print-color_h' : 'Chocolate Spore Print',
    'spore-print-color_r' : 'Green Spore Print',
    'spore-print-color_o' : 'Orange Spore Print',
    'spore-print-color_u' : 'Purple Spore Print',
    'spore-print-color_w' : 'White Spore Print',
    'spore-print-color_y' : 'Yellow Spore Print',
    'population_a' : 'Abundant Population',
    'population_c' : 'Clustered Population',
    'population_n' : 'Numerous Population',
    'population_s' : 'Scattered Population',
    'population_v' : 'Several Population',
    'population_y' : 'Solitary Population',
    'habitat_g' : 'Habitat: Grass',
    'habitat_l' : 'Habitat: Leaves',
    'habitat_m' : 'Habitat: Meadows',
    'habitat_p' : 'Habitat: Paths',
    'habitat_u' : 'Habitat: Urban',
    'habitat_w' : 'Habitat: Waste',
    'habitat_d' : 'Habitat: Woods', 
}

In [3]:
# Load data
pickleFile = open("../assets/clean_mushroom_data.pkl", 'rb')
mushrooms = pickle.load(pickleFile)
pickleFile.close()


#### Create features and target and train test split
X = mushrooms.drop(columns='class')
X = ce.OneHotEncoder(use_cat_names=True).fit_transform(X)
X = X.rename(columns = column_map)

#X

y = mushrooms['class'].replace({'p':0, 'e':1})

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                    test_size=.2, stratify=y)

In [4]:
X_test.columns

Index(['Convex Cap Shape', 'Bell Cap Shape', 'Sunken Cap Shape',
       'Flat Cap Shape', 'Knobbed Cap Shape', 'Conical Cap Shape',
       'Smooth Cap Surface', 'Scaly Cap Surface', 'Fibrous Cap Surface',
       'Groovey Cap Surface',
       ...
       'Several Population', 'Solitary Population', 'Clustered Population',
       'Habitat: Urban', 'Habitat: Grass', 'Habitat: Meadows',
       'Habitat: Woods', 'Habitat: Paths', 'Habitat: Waste',
       'Habitat: Leaves'],
      dtype='object', length=117)

In [5]:
def pickle_model(model, filename):
    pickleFile = open(filename, 'wb')
    pickle.dump(model, pickleFile)
    pickleFile.close()

In [6]:
pickle_model(X_test, '../assets/X_test.pkl')
pickle_model(y_test, '../assets/y_test.pkl')

## Decision Stump

In [7]:
decision_stump = DecisionTreeClassifier(max_depth=1)

decision_stump.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [8]:
model_predictions = decision_stump.predict(X_test)

decision_stump_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
decision_stump_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,0.894214,0.882391,0.888615,0.888303,0.888088
precision,0.824353,0.974175,0.888615,0.899264,0.901984
recall,0.977011,0.806413,0.888615,0.891712,0.888615
support,783.0,842.0,0.888615,1625.0,1625.0


In [9]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,765,18
Actual Edible,163,679


In [10]:
pickle_model(decision_stump, "../assets/decision_stump.pkl")

## Vanilla Decision Tree

In [11]:
vanilla_decision_tree = DecisionTreeClassifier(max_depth=2)

vanilla_decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [12]:
model_predictions = vanilla_decision_tree.predict(X_test)

vanilla_decision_tree_report = pd.DataFrame(
    classification_report(y_test,
                          model_predictions,
                          target_names=['0-Poisonous', '1-Edible'],
                          output_dict = True)
    )

vanilla_decision_tree_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,0.950617,0.95092,0.950769,0.950769,0.950774
precision,0.919952,0.983503,0.950769,0.951727,0.952881
recall,0.983397,0.920428,0.950769,0.951912,0.950769
support,783.0,842.0,0.950769,1625.0,1625.0


In [13]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,770,13
Actual Edible,67,775


In [14]:
pickle_model(vanilla_decision_tree, "../assets/vanilla_decision_tree.pkl")

## Decision Tree with Hyperparameter Optomization

In [15]:
param_distributions = {
    'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
}

search = RandomizedSearchCV(estimator = DecisionTreeClassifier(),
                            param_distributions = param_distributions,
                            n_iter=100,
                            scoring='precision',
                            n_jobs=-1,
                            cv=10,
                            verbose=10,
                            return_train_score=True)

search.fit(X_train, y_train)

opto_decision_tree = search.best_estimator_

Fitting 10 folds for each of 11 candidates, totalling 110 fits



The total space of parameters 11 is smaller than n_iter=100. Running 11 iterations. For exhaustive searches, use GridSearchCV.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:   11.0s finished


In [16]:
model_predictions = opto_decision_tree.predict(X_test)

opto_decision_tree_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
opto_decision_tree_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,1.0,1.0,1.0,1.0,1.0
precision,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0
support,783.0,842.0,1.0,1625.0,1625.0


In [17]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,783,0
Actual Edible,0,842


In [18]:
pickle_model(opto_decision_tree, "../assets/opto_decision_tree.pkl")

## Vanilla Random Forest

In [19]:
vanilla_forest = DecisionTreeClassifier(max_depth=3)

vanilla_forest.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [20]:
model_predictions = vanilla_forest.predict(X_test)

vanilla_forest_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
vanilla_forest_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,0.984887,0.98556,0.985231,0.985223,0.985235
precision,0.971429,0.99878,0.985231,0.985105,0.985601
recall,0.998723,0.972684,0.985231,0.985703,0.985231
support,783.0,842.0,0.985231,1625.0,1625.0


In [21]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,782,1
Actual Edible,23,819


In [22]:
pickle_model(vanilla_forest, "../assets/vanilla_forest.pkl")

## Random Forest with Hyperparameter Optomization

In [23]:
"""param_distributions = {
    'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    'n_estimators': [10, 25, 50, 100]
}

search = RandomizedSearchCV(estimator = RandomForestClassifier(),
                            param_distributions = param_distributions,
                            n_iter=100,
                            scoring='precision',
                            n_jobs=-1,
                            cv=10,
                            verbose=10,
                            return_train_score=True)

search.fit(X_train, y_train)

opto_forest = search.best_estimator_"""

"param_distributions = {\n    'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],\n    'n_estimators': [10, 25, 50, 100]\n}\n\nsearch = RandomizedSearchCV(estimator = RandomForestClassifier(),\n                            param_distributions = param_distributions,\n                            n_iter=100,\n                            scoring='precision',\n                            n_jobs=-1,\n                            cv=10,\n                            verbose=10,\n                            return_train_score=True)\n\nsearch.fit(X_train, y_train)\n\nopto_forest = search.best_estimator_"

In [24]:
opto_forest = DecisionTreeClassifier(max_depth=4)
opto_forest.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [25]:
model_predictions = opto_forest.predict(X_test)
opto_forest_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
opto_forest_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,0.993016,0.993433,0.993231,0.993224,0.993232
precision,0.987374,0.9988,0.993231,0.993087,0.993294
recall,0.998723,0.988124,0.993231,0.993423,0.993231
support,783.0,842.0,0.993231,1625.0,1625.0


In [26]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,782,1
Actual Edible,10,832


In [27]:
pickle_model(opto_forest, "../assets/opto_forest.pkl")