In [56]:
import pandas as pd
import category_encoders as ce
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report, confusion_matrix
import plotly.figure_factory as ff

# Load data
pickleFile = open("../assets/clean_mushroom_data.pkl", 'rb')
mushrooms = pickle.load(pickleFile)
pickleFile.close()


#### Create features and target and train test split
X = mushrooms.drop(columns='class')
X = ce.OneHotEncoder(use_cat_names=True).fit_transform(X)
y = mushrooms['class'].replace({'p':0, 'e':1})

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                    test_size=.2, stratify=y)

In [57]:
def pickle_model(model, filename):
    pickleFile = open(filename, 'wb')
    pickle.dump(model, pickleFile)
    pickleFile.close()

In [78]:
pickle_model(X_test, "../assets/X_test.pkl")
pickle_model(y_test, "../assets/y_test.pkl")

## Decision Stump

In [58]:
decision_stump = DecisionTreeClassifier(max_depth=1)

decision_stump.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [59]:
model_predictions = decision_stump.predict(X_test)

decision_stump_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
decision_stump_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,0.894214,0.882391,0.888615,0.888303,0.888088
precision,0.824353,0.974175,0.888615,0.899264,0.901984
recall,0.977011,0.806413,0.888615,0.891712,0.888615
support,783.0,842.0,0.888615,1625.0,1625.0


In [60]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,765,18
Actual Edible,163,679


In [61]:
pickle_model(decision_stump, "../assets/decision_stump.pkl")

## Vanilla Decision Tree

In [62]:
vanilla_decision_tree = DecisionTreeClassifier()

vanilla_decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [63]:
model_predictions = vanilla_decision_tree.predict(X_test)

vanilla_decision_tree_report = pd.DataFrame(
    classification_report(y_test,
                          model_predictions,
                          target_names=['0-Poisonous', '1-Edible'],
                          output_dict = True)
    )

vanilla_decision_tree_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,1.0,1.0,1.0,1.0,1.0
precision,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0
support,783.0,842.0,1.0,1625.0,1625.0


In [64]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,783,0
Actual Edible,0,842


In [65]:
pickle_model(vanilla_decision_tree, "../assets/vanilla_decision_tree.pkl")

## Decision Tree with Hyperparameter Optomization

In [66]:
param_distributions = {
    'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
}

search = RandomizedSearchCV(estimator = DecisionTreeClassifier(),
                            param_distributions = param_distributions,
                            n_iter=100,
                            scoring='precision',
                            n_jobs=-1,
                            cv=10,
                            verbose=10,
                            return_train_score=True)

search.fit(X_train, y_train)

opto_decision_tree = search.best_estimator_

Fitting 10 folds for each of 11 candidates, totalling 110 fits



The total space of parameters 11 is smaller than n_iter=100. Running 11 iterations. For exhaustive searches, use GridSearchCV.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1995s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:    5.8s finished


In [67]:
model_predictions = opto_decision_tree.predict(X_test)

opto_decision_tree_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
opto_decision_tree_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,1.0,1.0,1.0,1.0,1.0
precision,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0
support,783.0,842.0,1.0,1625.0,1625.0


In [68]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,783,0
Actual Edible,0,842


In [69]:
pickle_model(opto_decision_tree, "../assets/opto_decision_tree.pkl")

## Vanilla Random Forest

In [70]:
vanilla_forest = RandomForestClassifier()

vanilla_forest.fit(X_train, y_train)


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [71]:
model_predictions = vanilla_forest.predict(X_test)

vanilla_forest_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
vanilla_forest_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,1.0,1.0,1.0,1.0,1.0
precision,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0
support,783.0,842.0,1.0,1625.0,1625.0


In [72]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,783,0
Actual Edible,0,842


In [73]:
pickle_model(vanilla_forest, "../assets/vanilla_forest.pkl")

## Random Forest with Hyperparameter Optomization

In [74]:
param_distributions = {
    'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    'n_estimators': [10, 25, 50, 100]
}

search = RandomizedSearchCV(estimator = RandomForestClassifier(),
                            param_distributions = param_distributions,
                            n_iter=100,
                            scoring='precision',
                            n_jobs=-1,
                            cv=10,
                            verbose=10,
                            return_train_score=True)

search.fit(X_train, y_train)

opto_forest = search.best_estimator_


The total space of parameters 44 is smaller than n_iter=100. Running 44 iterations. For exhaustive searches, use GridSearchCV.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 44 candidates, totalling 440 fits


[Parallel(n_jobs=-1)]: Batch computation too fast (0.1933s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0092s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done 

In [75]:
model_predictions = opto_forest.predict(X_test)
opto_forest_report = pd.DataFrame(classification_report(y_test, model_predictions, target_names=['0-Poisonous', '1-Edible'], output_dict = True))
opto_forest_report

Unnamed: 0,0-Poisonous,1-Edible,accuracy,macro avg,weighted avg
f1-score,1.0,1.0,1.0,1.0,1.0
precision,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0
support,783.0,842.0,1.0,1625.0,1625.0


In [76]:
con_matrix = pd.DataFrame(confusion_matrix(y_test, model_predictions),
                                columns=['Predicted Poison', 'Predicted Edible'], 
                                index=['Actual Poison', 'Actual Edible'])
con_matrix

Unnamed: 0,Predicted Poison,Predicted Edible
Actual Poison,783,0
Actual Edible,0,842


In [77]:
pickle_model(opto_forest, "../assets/opto_forest.pkl")