In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import spearmanr
from future_encoders import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import precision_score, recall_score, r2_score

In [2]:
FULL_DATASET_PATH = os.path.join("datasets", "mushrooms", "mushrooms.csv")

In [3]:
full_dataset = pd.read_csv(FULL_DATASET_PATH)

In [4]:
full_dataset.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
mushrooms = full_dataset.copy()
mushrooms_labels = full_dataset["class"]
mushrooms_labels.replace(to_replace=dict(e=1, p=0), inplace=True)
mushrooms.drop(["class"],axis=1,inplace=True)

dictvectoraiser = DictVectorizer(sparse=False)
mushrooms_transformed = dictvectoraiser.fit_transform(mushrooms.to_dict("records"))
dictvectoraiser.categories_

In [6]:
cat_encoder = OneHotEncoder(sparse=False)
mushrooms_transformed = cat_encoder.fit_transform(mushrooms)
mushrooms_transformed.shape

(8124, 117)

In [7]:
mushrooms_names = list(mushrooms)
mushrooms_names_zipped = zip(mushrooms_names, cat_encoder.categories_)

final_names = []
for mushroom_name_zipped in mushrooms_names_zipped:
    names = list(mushroom_name_zipped[1])
    for name in names:
        final_name = """{}_{}""".format(mushroom_name_zipped[0],name)
        final_names.append(final_name)     
print('')




In [8]:
mushrooms_dict = dict(zip(final_names,mushrooms_transformed.transpose()))
mushrooms_encoded = pd.DataFrame(mushrooms_dict)

In [9]:
mushrooms_encoded_with_labels = mushrooms_encoded.copy()
mushrooms_encoded_with_labels["labels"] = mushrooms_labels
mushrooms_encoded_with_labels.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,labels
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1


In [10]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.8,random_state=42)

for train_index,test_index in split.split(mushrooms_encoded_with_labels,mushrooms_encoded_with_labels["labels"]):
    strat_train_set_with_labels = mushrooms_encoded_with_labels.loc[train_index]
    strat_test_set_with_labels = mushrooms_encoded_with_labels.loc[test_index]

In [11]:
strat_train_set_labels = strat_train_set_with_labels["labels"]
strat_train_set = strat_train_set_with_labels.drop(["labels"],axis=1)
strat_test_set_labels = strat_test_set_with_labels["labels"]
strat_test_set = strat_test_set_with_labels.drop(["labels"],axis=1)

In [12]:
corr_matrix = strat_train_set_with_labels.corr()
corr_matrix["labels"].sort_values(ascending=False)

labels                        1.000000
odor_n                        0.783692
gill-size_b                   0.541321
ring-type_p                   0.505293
stalk-surface-above-ring_s    0.465498
bruises_t                     0.460678
spore-print-color_n           0.413759
stalk-surface-below-ring_s    0.356376
gill-spacing_w                0.350180
spore-print-color_k           0.348614
gill-color_n                  0.262851
stalk-color-above-ring_g      0.260086
stalk-color-below-ring_g      0.254976
odor_l                        0.223934
stalk-color-above-ring_w      0.221577
population_n                  0.219636
population_a                  0.219636
ring-number_t                 0.215665
gill-color_w                  0.215166
odor_a                        0.213798
stalk-color-below-ring_w      0.209367
stalk-root_c                  0.209156
gill-color_u                  0.198998
stalk-root_e                  0.183806
cap-surface_f                 0.183295
habitat_g                

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [14]:
sgd_cls = SGDClassifier()
sgd_cls.fit(strat_train_set,strat_train_set_labels)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [15]:
labels_pred_sgd = sgd_cls.predict(strat_test_set)
len(labels_pred_sgd)
strat_test_set_with_labels["labels_pred_sgd"] = labels_pred_sgd

In [16]:
print("Precision: {:.2f}%".format(100 * precision_score(strat_test_set_labels, labels_pred_sgd)))
print("Recall: {:.2f}%".format(100 * recall_score(strat_test_set_labels, labels_pred_sgd)))

print(sgd_cls.coef_)

Precision: 99.94%
Recall: 100.00%
[[-2.19322294e+00  0.00000000e+00 -1.09661147e+00  1.94602531e-16
   1.09661147e+00  2.19322294e+00  2.19322294e+00 -1.09661147e+00
  -2.19322294e+00  1.09661147e+00 -3.28983441e+00  3.28983441e+00
   1.09661147e+00 -2.19322294e+00  1.09661147e+00 -2.19322294e+00
   2.19322294e+00  1.09661147e+00 -3.28983441e+00  2.19322294e+00
   1.09661147e+00 -1.09661147e+00  7.67628029e+00 -7.67628029e+00
  -9.86950324e+00  7.67628029e+00 -1.09661147e+00  1.53525606e+01
  -7.67628029e+00 -3.28983441e+00 -1.09661147e+00  1.09661147e+00
  -1.09661147e+00 -5.48305735e+00  5.48305735e+00  7.67628029e+00
  -7.67628029e+00 -4.38644588e+00  0.00000000e+00 -2.19322294e+00
   1.09661147e+00  1.09661147e+00  4.38644588e+00  0.00000000e+00
   2.19322294e+00 -1.09661147e+00  1.09661147e+00 -1.94602531e-15
  -2.19322294e+00 -3.28983441e+00  3.28983441e+00  2.19322294e+00
  -8.77289176e+00  4.38644588e+00  1.22599595e-14  2.19322294e+00
   1.09661147e+00 -4.38644588e+00  4.38644

In [17]:
log_reg = LogisticRegression()
log_reg.fit(strat_train_set,strat_train_set_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
labels_pred_log = log_reg.predict(strat_test_set)
len(labels_pred_log)
strat_test_set_with_labels["labels_pred_log"] = labels_pred_log
strat_test_set_with_labels.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,labels,labels_pred_sgd,labels_pred_log
2465,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1
5815,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
3652,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
7066,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
2021,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1


In [19]:
print("Precision: {:.2f}%".format(100 * precision_score(strat_test_set_labels, labels_pred_log)))
print("Recall: {:.2f}%".format(100 * recall_score(strat_test_set_labels, labels_pred_log)))
print("")
print("R^2: {:.2f}%".format(100 * r2_score(strat_test_set_labels, labels_pred_log)))
print("R")

Precision: 99.91%
Recall: 100.00%

R^2: 99.82%
R


In [20]:
attribs = list(strat_train_set)
selected_attribs = attribs.copy()
attribs_to_delete = []

corr_matrix = strat_train_set_with_labels.corr()

for attrib in attribs:
    if abs(corr_matrix["labels"][attrib]) < 0.13:
        attribs_to_delete.append(attrib)
attribs_to_delete += ['odor_n','odor_a','odor_l','odor_f','odor_y','odor_s']

In [21]:
for attrib in attribs_to_delete:
    selected_attribs.remove(attrib)

In [22]:
class DataFrameSelector (BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

In [23]:
selector = DataFrameSelector(selected_attribs)
selected_strat_train_set = selector.fit_transform(strat_train_set)
selected_strat_test_set = selector.fit_transform(strat_test_set)

In [24]:
log_reg.fit(selected_strat_train_set,strat_train_set_labels)
test_labels_pred_log = log_reg.predict(selected_strat_test_set)
train_labels_pred_log = log_reg.predict(selected_strat_train_set)
len(labels_pred_log)
strat_test_set_with_labels["labels_pred_log"] = test_labels_pred_log
print("train:")
print("Precision: {:.2f}%".format(100 * precision_score(strat_train_set_labels, train_labels_pred_log)))
print("Recall: {:.2f}%".format(100 * recall_score(strat_train_set_labels, train_labels_pred_log)))
print("R^2: {:.2f}%".format(100 * r2_score(strat_train_set_labels, train_labels_pred_log)))
print("\ntest:")
print("Precision: {:.2f}%".format(100 * precision_score(strat_test_set_labels, test_labels_pred_log)))
print("Recall: {:.2f}%".format(100 * recall_score(strat_test_set_labels, test_labels_pred_log)))
print("R^2: {:.2f}%".format(100 * r2_score(strat_test_set_labels, test_labels_pred_log)))
print("\ncoef:")
print(len(selected_attribs))
print(log_reg.coef_)
strat_test_set_with_labels

train:
Precision: 99.88%
Recall: 99.41%
R^2: 98.52%

test:
Precision: 99.67%
Recall: 99.32%
R^2: 97.90%

coef:
59
[[-0.44357515  0.         -0.11333517  0.89864972  0.11847982 -0.09687502
  -4.8489364  -2.70255332 -1.6842211   1.7058259   1.56288234 -1.54127754
  -3.41733148 -0.23389536  0.36238761  0.50057149  0.50788962  3.2402571
   1.17166961  0.37945341  1.16965415 -1.8541888   1.2435911   1.27689627
  -0.81590728  0.33621836 -0.42733159  0.92659967 -0.29238865  0.36702783
  -0.15229348 -0.00669187 -0.32785347  0.72093879  0.82048825  0.36702783
  -0.19686581 -0.39812038  0.0216048  -0.06046476  0.74705747  0.07641731
  -1.05588424 -1.66033241 -0.71144373 -2.61145323  1.83924741  2.23194361
  -0.35623449  0.12374203 -1.10426659  0.89274293 -0.77691137 -0.69172104
   1.57801885 -0.3822863  -0.20802198 -0.12662205  1.17653519]]


Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,labels,labels_pred_sgd,labels_pred_log
2465,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1
5815,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
3652,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
7066,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
2021,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1
6256,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
947,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1,1
2728,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1
991,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1,1
4188,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [25]:
selected_dict = dict(zip(selected_attribs,selected_strat_train_set.transpose()))
selected = pd.DataFrame(selected_dict)
selected_corr = selected.corr()
print(selected_corr)

                            cap-shape_b  cap-shape_c  cap-shape_k  \
cap-shape_b                    1.000000          NaN    -0.080075   
cap-shape_c                         NaN          NaN          NaN   
cap-shape_k                   -0.080075          NaN     1.000000   
cap-surface_f                 -0.062816          NaN    -0.169202   
bruises_f                     -0.075823          NaN     0.230879   
bruises_t                      0.075823          NaN    -0.230879   
odor_c                        -0.041816          NaN    -0.057072   
odor_p                        -0.048257          NaN    -0.065863   
gill-spacing_c                -0.089982          NaN    -0.003351   
gill-spacing_w                 0.089982          NaN     0.003351   
gill-size_b                    0.138393          NaN    -0.288936   
gill-size_n                   -0.138393          NaN     0.288936   
gill-color_b                  -0.123714          NaN     0.392225   
gill-color_h                  -0.0