In [104]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [105]:
df = pd.read_csv("mushrooms.tsv", sep = "\t", header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
1,p,x,y,n,t,p,f,c,n,n,...,s,w,w,p,w,o,p,n,v,g
2,e,b,y,w,t,a,f,c,b,w,...,s,w,w,p,w,o,p,n,n,m
3,e,b,s,w,t,l,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m
4,e,x,y,y,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


In [106]:
#data.nunique()

### get_dummies(), train_test_split

In [107]:
dummies = pd.get_dummies(df, drop_first = True)

In [108]:
dummies

Unnamed: 0,0_p,1_c,1_f,1_k,1_s,1_x,2_g,2_s,2_y,3_c,...,21_n,21_s,21_v,21_y,22_g,22_l,22_m,22_p,22_u,22_w
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7252,1,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
7253,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7254,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7255,1,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [109]:
dummies.shape

(7257, 96)

In [110]:
data, data_test = train_test_split(dummies, test_size=0.1,random_state=13)

In [111]:
X = data.drop('0_p', axis=1)
y = data['0_p']

### Naive Bayes

In [112]:
model = GaussianNB()

In [113]:
model.fit(X, y)

In [114]:
print('Edible: %.2f' % model.class_prior_[0])
print('Poisonous: %.2f' % model.class_prior_[1])

Edible: 0.52
Poisonous: 0.48


In [115]:
x_test = data_test.iloc[:, 1:]
y_predicted = model.predict(x_test)
y_expected = data_test["0_p"]
score = model.score(X, y)
print("Model Score: %.3f" % score)
error = mean_squared_error(y_expected, y_predicted)
print("Mean squared error: %.3f" % error)
precision, recall, fscore, support = precision_recall_fscore_support(y_expected, y_predicted, average="weighted")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F-score: {fscore:.3f}")

Model Score: 0.952
Mean squared error: 0.054
Precision: 0.952
Recall: 0.946
F-score: 0.946


### Część zaawansowana

In [130]:
new_data = df.drop(axis =1, columns = [1,2,3,4,5,6,8,9,10,11,12,14,15,16,17,18,20,21,22])

In [131]:
dummies3 = pd.get_dummies(new_data, drop_first = True, columns = [0,13, 7, 19])

In [132]:
data3, data_test3 = train_test_split(dummies3, test_size=0.1,random_state=13)

In [133]:
X3 = data3.drop(["0_p"], axis=1)
y3 = data3["0_p"]

In [134]:
model3= GaussianNB()
model3.fit(X3, y3)

In [136]:
x_test3 = data_test3.iloc[:, 1:]
y_predicted3 = model3.predict(x_test3)
y_expected3 = data_test3["0_p"]
score3 = model3.score(X3, y3)

In [137]:
print("Model Score: %.3f" % score)
error3 = mean_squared_error(y_expected3, y_predicted3)
print("Mean squared error: %.3f" % error3)
precision, recall, fscore, support = precision_recall_fscore_support(y_expected, y_predicted, average="weighted")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F-score: {fscore:.3f}")

Model Score: 0.952
Mean squared error: 0.218
Precision: 0.952
Recall: 0.946
F-score: 0.946


In [138]:
print("Skuteczność klasyfikatora wytrenowanego na wszystkich cechach: %.3f" % accuracy_score(y_expected, y_predicted))
print("Skuteczność klasyfikatora wytrenowanego na wszystkich cechach: %.3f" % accuracy_score(y_expected3, y_predicted3))

Skuteczność klasyfikatora wytrenowanego na wszystkich cechach: 0.946
Skuteczność klasyfikatora wytrenowanego na wszystkich cechach: 0.782
