# Is the mushroom edible ?

## Problem Statement

We want to know whether or not the mushroom is edible

In [1]:
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv("data/mushrooms.csv")
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [46]:
# We see that all the columns are categorical, therefore we can not use it with our algorythms.
df.dtypes

class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                    int64
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

## Solution 1: One hot encoding

In [4]:
from sklearn.preprocessing import LabelEncoder
lbe = LabelEncoder()

In [12]:
X = df[list(set(df.columns)-set(["class"]))].apply(LabelEncoder().fit_transform).values
y= df["class"]

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
one = OneHotEncoder()
X_enc = one.fit_transform(X)

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
clf = RandomForestClassifier(max_depth=5)

In [20]:
from sklearn.model_selection import cross_val_score

In [22]:
cross_val_score(clf,X_enc,y,scoring="accuracy").mean()

0.8518914019323951

In [23]:
len(X[0])

22

In [48]:
# The problem is that we create too many columns and we create a dimensionality problem 
# (curse of dimensionality)
len(X_enc.toarray()[0])

117

## Solution 2: Applying weights

In [49]:
df2.groupby("cap-shape").sum()["class"]

cap-shape
0          0
4          4
48        48
600      600
1556    1556
1708    1708
Name: class, dtype: int64

In [50]:
df2 = df.copy()
df2["class"]=LabelEncoder().fit_transform(df2["class"])

In [51]:
for column in list(set(df.columns)-set(["class"])):
    weight = df2.groupby(column).sum()["class"].reset_index()
    weight.columns = [column,"weight"]
    df2[column] = df2[column].reset_index().merge(weight).sort_values("index")["weight"].values
df2.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,1708,1412,1020,624,256,3898,3804,2224,64,...,1536,1712,1680,3916,3908,3808,816,224,368,272
1,0,1708,1412,672,624,0,3898,3804,1692,64,...,1536,1712,1680,3916,3908,3808,816,224,0,740
2,0,48,1412,320,624,0,3898,3804,1692,112,...,1536,1712,1680,3916,3908,3808,816,224,0,36
3,1,1708,1740,320,624,256,3898,3804,2224,112,...,1536,1712,1680,3916,3908,3808,816,224,368,272
4,0,1708,1412,808,3292,120,3898,112,1692,64,...,1536,1712,1680,3916,3908,3808,1768,224,0,740


In [52]:
len(df2.columns)

23

In [53]:
X = df2[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']]
y = df2["class"]

In [55]:
# And we even obtain a better score
cross_val_score(clf,X,y,scoring="accuracy").mean()

0.9058037798343318