# Mushroom dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Import dataset e preparazione dati

In [2]:
dataset = pd.read_csv("mushroom_data_all.csv")
dataset.class_edible = dataset.class_edible.map({'p': 1, 'e': 0})
dataset

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,0,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,0,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,1,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,0,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,0,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,0,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,1,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [3]:
labelEncoder = LabelEncoder()
data = dataset.apply(LabelEncoder().fit_transform)
data

Unnamed: 0,class_edible,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


### Split dataset

In [4]:
train, test = train_test_split(data, test_size=0.2)

# se voglio considerare solo alcune feature prendo solo alcune colonne
#columns = ["class_edible","cap-shape","cap-surface","cap-color","gill-attachment","gill-spacing","gill-size","gill-color"]
#train = train[columns]
#test = test[columns]

y_train = train.class_edible
y_test = test.class_edible
x_train = train.loc[:, train.columns != "class_edible"]
x_test = test.loc[:, train.columns != "class_edible"]

In [5]:
print("train", train.shape)
print("test", test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)
print("x_train", x_train.shape)
print("x_test", x_test.shape)

train (6499, 23)
test (1625, 23)
y_train (6499,)
y_test (1625,)
x_train (6499, 22)
x_test (1625, 22)


### Loop per verificare l'accuracy al variare di alcuni parametri

In [6]:
from sklearn.tree import DecisionTreeClassifier

train_score=[]
test_score=[]
for i in range(1,10):
    tree = DecisionTreeClassifier(max_depth=i,random_state=0)
    tree.fit(x_train, y_train)
    print("Max_depth:", i)
    print("Accuracy on training set: {:.8f}".format(tree.score(x_train, y_train)))
    train_score.append(tree.score(x_train, y_train))
    print("Accuracy on test set: {:.8f}".format(tree.score(x_test, y_test)))
    test_score.append(tree.score(x_test, y_test))

Max_depth: 1
Accuracy on training set: 0.78704416
Accuracy on test set: 0.80307692
Max_depth: 2
Accuracy on training set: 0.91075550
Accuracy on test set: 0.91138462
Max_depth: 3
Accuracy on training set: 0.95799354
Accuracy on test set: 0.96123077
Max_depth: 4
Accuracy on training set: 0.97722727
Accuracy on test set: 0.98030769
Max_depth: 5
Accuracy on training set: 0.97891983
Accuracy on test set: 0.98092308
Max_depth: 6
Accuracy on training set: 0.99492230
Accuracy on test set: 0.99569231
Max_depth: 7
Accuracy on training set: 1.00000000
Accuracy on test set: 1.00000000
Max_depth: 8
Accuracy on training set: 1.00000000
Accuracy on test set: 1.00000000
Max_depth: 9
Accuracy on training set: 1.00000000
Accuracy on test set: 1.00000000


### Modello della soluzione

In [7]:
tree = DecisionTreeClassifier(max_depth=7,random_state=0)
tree.fit(x_train, y_train)
print(tree.n_features_)
print("Feature importances:")
print(tree.feature_importances_)

22
Feature importances:
[0.         0.         0.00327674 0.02842191 0.00411628 0.
 0.         0.12509582 0.3337668  0.02417605 0.04953058 0.
 0.0026288  0.         0.         0.         0.         0.02095324
 0.         0.20302631 0.18995692 0.01505056]


In [8]:
print("Accuracy on training set: {:.8f}".format(tree.score(x_train, y_train)))
print("Accuracy on test set: {:.8f}".format(tree.score(x_test, y_test)))

Accuracy on training set: 1.00000000
Accuracy on test set: 1.00000000
