In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import os
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_curve



In [None]:
# Load the mushroom dataset.
# The location can be overridden with the MUSHROOMS_CSV environment variable so
# the notebook is not tied to a single machine; the original absolute path is
# kept as the fallback so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "MUSHROOMS_CSV",
    r"C:\Users\mwach\OneDrive\Desktop\Class\3rd Year\Artificial Intelligence\GroupAssignment2\mushrooms.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five raw rows.
df.head(n=5)

In [None]:
# Histogram of the raw stalk-root codes; the trailing ';' hides the Axes repr.
df.loc[:, "stalk-root"].hist();

In [None]:
# import matplotlib.pyplot as plt

# fig,axes = plt.subplots(4,2,figsize=(12,15))
# for idx,cat_col in enumerate(df["stalk-root"]):
#     row,col = idx//2,idx%2
#     sns.countplot(x=cat_col,data=df["stalk-root"],ax=axes[row,col])


# plt.subplots_adjust(hspace=1)

In [None]:
# Distinct stalk-root codes present in the data.
df.loc[:, "stalk-root"].unique()

In [None]:
# Frequency of each stalk-shape code.
df.loc[:, "stalk-shape"].value_counts()

In [None]:
# stalk-root value counts (raw letter codes):
# e = equal     1120
# c = club       556
# b = bulbous   3776
# r = rooted     192
# ? = missing   2480

In [None]:
# Legend for stalk-root codes: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
# Frequency of each gill-size code.
df.loc[:, "gill-size"].value_counts()

In [None]:
# Frequency of each stalk-shape code (same check as earlier in the notebook).
df.loc[:, "stalk-shape"].value_counts()

In [None]:
# Column names, non-null counts and dtypes.
df.info(verbose=None)

In [None]:
# Summary statistics for every column pandas selects by default.
df.describe(include=None)

In [None]:
# (number of rows, number of columns)
(len(df.index), len(df.columns))

In [None]:
# Distinct values of the target column.
df.loc[:, "class"].unique()

In [None]:
# Class balance of the target column.
df.loc[:, "class"].value_counts()

In [None]:
# Histogram of the target column.
df.loc[:, "class"].hist()

In [None]:
# Frequency of each gill-size code before encoding.
df.loc[:, "gill-size"].value_counts()

In [None]:
# Encode every column of df as integer codes, in place.
#
# A separate LabelEncoder is fitted for each column and stored in `encoders`,
# keyed by column name. Reusing one encoder instance for all columns (as
# before) throws away every mapping except the last one, which makes it
# impossible to translate raw letter codes (e.g. user input such as 'k') into
# the integers the models were trained on. With the dictionary this is:
#     encoders["gill-color"].transform(["k"])
#
# NOTE: this cell overwrites df; re-running it without reloading the CSV
# refits the encoders on the already-encoded integers.
encoders = {}
for column in df.columns:
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])
    encoders[column] = labelencoder

In [None]:
# Frequency of each stalk-shape value after label encoding.
df.loc[:, "stalk-shape"].value_counts()

In [None]:
# Frequency of each gill-size value after label encoding.
df.loc[:, "gill-size"].value_counts()

In [None]:
# Preview the first five rows after label encoding.
df.head(n=5)

In [None]:
# Frequency of each stalk-root value after label encoding.
df.loc[:, "stalk-root"].value_counts()

In [None]:
# Reshape to long format: one row per (class, characteristic, encoded value),
# which is the layout seaborn needs to draw one violin per characteristic.
df_div = df.melt(id_vars="class", var_name="Characteristics")
df_no_class = df.drop(columns=["class"])

fig, ax = plt.subplots(figsize=(16, 6))

# Split violins: the two halves of each violin are the two classes.
p = sns.violinplot(
    data=df_div,
    x="Characteristics",
    y="value",
    hue="class",
    split=True,
    inner="quartile",
    palette="Set1",
    ax=ax,
)

# Label the ticks with the feature names, rotated so they do not overlap.
p.set_xticklabels(labels=list(df_no_class.columns), rotation=90);
#plt.savefig("violinplot.png", format='png', dpi=500, bbox_inches='tight')



In [None]:
# Pairwise correlation of the label-encoded columns, annotated cell by cell.
plt.figure(figsize=(14, 12))
sns.heatmap(
    df.corr(),
    annot=True,
    annot_kws={"size": 7},
    cmap="Purples",
    linewidths=.1,
)
plt.yticks(rotation=0);
#plt.savefig("corr.png", format='png', dpi=400, bbox_inches='tight')

In [None]:
# Mean encoded class per gill-color value, highest first.
(
    df.loc[:, ["class", "gill-color"]]
    .groupby(["gill-color"], as_index=False)
    .mean()
    .sort_values(by="class", ascending=False)
)

Preparing the data

In [None]:
# Frequency of each encoded stalk-root value.
df.loc[:, "stalk-root"].value_counts()

In [None]:
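# stalk-root value counts (raw letter codes):
# e = equal     1120
# c = club       556
# b = bulbous   3776
# r = rooted     192
# ? = missing   2480

# LabelEncoder assigns integers in sorted order of the codes ('?' < 'b' < 'c' < 'e' < 'r'):
# 0 = missing (?)
# 1 = bulbous (b)
# 2 = club (c)
# 3 = equal (e)
# 4 = rooted (r)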

In [None]:
# Target is the encoded class column; every other column is a feature.
y = df["class"]
X = df.drop(columns=["class"])

# Hold out 10% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Preview the first five training rows.
X_train.head(n=5)

Classification Methods

In [None]:
# 1. Decision Tree Classification
from sklearn.tree import DecisionTreeClassifier

# Fix random_state: scikit-learn permutes the candidate features at every
# split, so without a seed equally good splits can be broken differently from
# run to run, changing the tree and its feature importances.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [None]:
# Horizontal bar chart of the decision tree's feature importances, sorted so
# the most important feature ends up at the top of the chart.
features_list = X.columns.values
feature_importance = dt.feature_importances_
sorted_idx = np.argsort(feature_importance)

plt.figure(figsize=(8, 7))
plt.barh(
    np.arange(sorted_idx.size),
    feature_importance[sorted_idx],
    align="center",
    color="red",
)
plt.yticks(np.arange(sorted_idx.size), features_list[sorted_idx])
plt.title('Feature importance')
plt.xlabel('Importance')
plt.draw()
#plt.savefig("featureimp.png", format='png', dpi=500, bbox_inches='tight')
plt.show()

Predicting and estimating the result

In [None]:
# Predict the held-out rows, then print per-class metrics and overall accuracy.
y_pred_dt = dt.predict(X_test)
print(
    "Decision Tree Classifier report: \n\n",
    classification_report(y_test, y_pred_dt),
)
print(
    "Test Accuracy: {}%".format(
        round(dt.score(X_test, y_test) * 100, 2)
    )
)

**Confusion Matrix for Decision Tree Classifier**

In [None]:
# Confusion matrix of the decision tree on the test split, drawn as an
# annotated heatmap.
cm = confusion_matrix(y_test, y_pred_dt)

x_axis_labels = ["Edible", "Poisonous"]
y_axis_labels = ["Edible", "Poisonous"]

f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt=".0f",
    cmap="Purples",
    linewidths=0.2,
    linecolor="black",
    xticklabels=x_axis_labels,
    yticklabels=y_axis_labels,
)
ax.set_xlabel("PREDICTED LABEL")
ax.set_ylabel("TRUE LABEL")
ax.set_title('Confusion Matrix for Decision Tree Classifier')
#plt.savefig("dtcm.png", format='png', dpi=500, bbox_inches='tight')
plt.show()

In [None]:
#KNN CLASSIFICATION
from sklearn.neighbors import KNeighborsClassifier

# Try k = 1..9 and keep the model with the highest test accuracy.
#
# Fixes over the previous version:
#   * best_score was overwritten with the TRAIN accuracy, so later candidates
#     were compared against the wrong number and the printed "Test Accuracy"
#     was really training accuracy.
#   * after the loop `knn` was always the k=9 model; later cells predict with
#     `knn`, so it is now rebound to the best model found.
#
# NOTE(review): choosing k on the test split makes the reported accuracy
# optimistic; consider cross-validation on the training split instead.
best_Kvalue = 0
best_score = -1.0  # accuracy is never negative, so the first candidate is always accepted
best_knn = None
for i in range(1, 10):
    candidate = KNeighborsClassifier(n_neighbors=i)
    candidate.fit(X_train, y_train)
    test_score = candidate.score(X_test, y_test)
    if test_score > best_score:
        best_score = test_score
        best_Kvalue = i
        best_knn = candidate

# Downstream cells use `knn`; make it the selected model.
knn = best_knn

print("Best KNN Value: {}".format(best_Kvalue))
print("Test Accuracy: {}%".format(round(best_score*100,2)))

Classification report of KNN Classifier

In [None]:
# Predict the held-out rows with `knn` and print the per-class metrics.
y_pred_knn = knn.predict(X_test)
print(
    "KNN Classifier report: \n\n",
    classification_report(y_test, y_pred_knn),
)

In [None]:
# Confusion Matrix for KNN Classifier
# Computed on the test split and drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_knn)

x_axis_labels = ["Edible", "Poisonous"]
y_axis_labels = ["Edible", "Poisonous"]

f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt=".0f",
    cmap="Purples",
    linewidths=0.2,
    linecolor="black",
    xticklabels=x_axis_labels,
    yticklabels=y_axis_labels,
)
ax.set_xlabel("PREDICTED LABEL")
ax.set_ylabel("TRUE LABEL")
ax.set_title('Confusion Matrix for KNN Classifier')
#plt.savefig("knncm.png", format='png', dpi=500, bbox_inches='tight')
plt.show()

In [None]:
# Print the first 36 decision-tree predictions next to the true labels.
preds = dt.predict(X_test)
print(preds[:36])
print(y_test.iloc[:36].values)

In [None]:
# Restrict the features to seven columns and redo the 90/10 split with the
# same seed as before.
X = df[
    [
        'gill-color',
        'spore-print-color',
        'population',
        'gill-size',
        'stalk-root',
        'bruises',
        'stalk-shape',
    ]
]
y = df["class"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
import matplotlib.pyplot as plt

# One count plot per selected feature on a 4x2 grid, filled row by row.
fig, axes = plt.subplots(4, 2, figsize=(12, 15))
for idx, cat_col in enumerate(X):
    row, col = divmod(idx, 2)
    sns.countplot(x=cat_col, data=X, ax=axes[row, col])

# The grid has eight panels; hide any panel that received no plot so the
# figure does not end with an empty set of axes.
for unused_ax in axes.ravel()[len(X.columns):]:
    unused_ax.set_visible(False)

plt.subplots_adjust(hspace=1)

In [None]:
# Frequency of each encoded gill-color value in the selected features.
X.loc[:, "gill-color"].value_counts()

In [None]:
# 1. Decision Tree Classification
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so the tree fitted here (and pickled below) is the same on
# every run; scikit-learn permutes the candidate features at each split.
dt1 = DecisionTreeClassifier(random_state=42)
dt1.fit(X_train, y_train)

In [None]:
# Display the estimator held in dt1.
dt1

In [None]:
# save the model to disk

import pickle

filename = 'decision_tree.pkl'
# The context manager flushes and closes the file even if pickling fails;
# a bare open(...) passed to pickle.dump leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(dt1, model_file)

In [None]:
# Reload the pickled model; the context manager closes the file afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_dt1_model = pickle.load(model_file)

In [None]:
# Predict the class of one held-out sample (the row at position 8 of X_test,
# kept as a one-row frame by the 8:9 slice) with the reloaded model.
loaded_dt1_model.predict(X_test.iloc[8:9])

In [None]:
# True label at position 8 of y_test, for comparison with the prediction.
y_test.iloc[8:9]

In [None]:
# Encoded feature values of the row at position 8 of X_test.
X_test.iloc[8:9]

In [None]:
# Preview the test features instead of displaying the whole frame.
X_test.head()

In [None]:
# Preview the first five training rows of the selected features.
X_train.head(n=5)

In [None]:
# Frequency of each encoded gill-color value in the training split.
X_train.loc[:, "gill-color"].value_counts()

In [None]:
# Frequency of each encoded gill-color value in the full data set.
df.loc[:, "gill-color"].value_counts()

In [None]:
# Example input received from the web page ("Your mushroom is ..."):
#   ['k', 'b', 'c', 'n', '2', 'f', 'e']

# Order of the values: gill_color, spore_print_color, population, gill_size, stalk_root, bruises, stalk_shape

In [None]:
# Row index of the gill-color column.
df.loc[:, "gill-color"].index

In [None]:
# Frequency of each encoded gill-color value in the training split (repeat check).
X_train.loc[:, "gill-color"].value_counts()