In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
df = pd.read_csv("./mushrooms.csv")

In [18]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
col_cat = {}
for c in df.columns:
    col_cat[c] = {}
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c])
    col_cat[c] = {le.classes_[i]  : i for i in range(len(le.classes_))}

In [21]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [24]:
col_cat

{'class': {'e': 0, 'p': 1},
 'cap-shape': {'b': 0, 'c': 1, 'f': 2, 'k': 3, 's': 4, 'x': 5},
 'cap-surface': {'f': 0, 'g': 1, 's': 2, 'y': 3},
 'cap-color': {'b': 0,
  'c': 1,
  'e': 2,
  'g': 3,
  'n': 4,
  'p': 5,
  'r': 6,
  'u': 7,
  'w': 8,
  'y': 9},
 'bruises': {'f': 0, 't': 1},
 'odor': {'a': 0,
  'c': 1,
  'f': 2,
  'l': 3,
  'm': 4,
  'n': 5,
  'p': 6,
  's': 7,
  'y': 8},
 'gill-attachment': {'a': 0, 'f': 1},
 'gill-spacing': {'c': 0, 'w': 1},
 'gill-size': {'b': 0, 'n': 1},
 'gill-color': {'b': 0,
  'e': 1,
  'g': 2,
  'h': 3,
  'k': 4,
  'n': 5,
  'o': 6,
  'p': 7,
  'r': 8,
  'u': 9,
  'w': 10,
  'y': 11},
 'stalk-shape': {'e': 0, 't': 1},
 'stalk-root': {'?': 0, 'b': 1, 'c': 2, 'e': 3, 'r': 4},
 'stalk-surface-above-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3},
 'stalk-surface-below-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3},
 'stalk-color-above-ring': {'b': 0,
  'c': 1,
  'e': 2,
  'g': 3,
  'n': 4,
  'o': 5,
  'p': 6,
  'w': 7,
  'y': 8},
 'stalk-color-below-ring': {'b': 0,
  '

In [25]:
def entropy(arr):
    val, freq = np.unique(arr, return_counts = True)
    ent = 0.0
    for i in range(len(freq)):
        p = freq[i]/freq.sum()
        ent += (-p*np.log2(p))
    return ent

In [42]:
def divide(data, fkey, fval):
    left_data = data[data[fkey] < fval]
    right_data = data[data[fkey] >= fval]
    left_data.reset_index(drop = True, inplace = True)
    right_data.reset_index(drop = True, inplace = True)
    return left_data, right_data

def information_gain(data, fkey, fval):
    left_data, right_data = divide(data, fkey, fval)
    init_ent = entropy(data)
    fin_ent = (left_data.shape[0]/data.shape[0])*entropy(left_data) + (right_data.shape[0]/data.shape[0])*entropy(right_data)
    return init_ent - fin_ent

In [64]:
class DecisionTree:
    def __init__(self, curr_height = 0, max_height = 5):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.target = None
        self.height = curr_height
        self.max_height = max_height
        
    def train(self, data, xcols, ycol):
        info_gain = []
        for c in xcols:
            info_gain.append(information_gain(data, c, data[c].mean()))
        self.fkey = xcols[np.argmax(info_gain)]
        self.fval = df[self.fkey].mean()
        print("Feature Selected: ", self.fkey, "curr_height: ", self.height)
        
        left_data, right_data = divide(data, self.fkey, self.fval)
        
        un_val, freq = np.unique(data[ycol], return_counts = True)
        self.target = un_val[np.argmax(freq)]
        
        if len(left_data) == 0 or len(right_data) == 0 or self.height > self.max_height:
            return
        
        self.left = DecisionTree(self.height+1, self.max_height)
        self.left.train(left_data, xcols, ycol)
        
        self.right = DecisionTree(self.height+1, self.max_height)
        self.right.train(right_data, xcols, ycol)
        
        return
    
    def predict(self, x_test):
        if x_test[self.fkey] <= self.fval:
            if self.left !=  None:
                return self.left.predict(x_test)
            return self.target
        if self.right !=  None:
            return self.right.predict(x_test)
        return self.target

In [94]:
model = DecisionTree(max_height = 7)

In [95]:
xcols = df.columns[1:]
ycol = df.columns[0]
print(xcols)
print(ycol)

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')
class


In [96]:
from sklearn.model_selection import train_test_split

In [97]:
df_train , df_test, Y_train, Y_test = train_test_split(df, df["class"],test_size = 0.3)

In [98]:
model.train(df_train, xcols, ycol)

Feature Selected:  spore-print-color curr_height:  0
Feature Selected:  population curr_height:  1
Feature Selected:  bruises curr_height:  2
Feature Selected:  gill-attachment curr_height:  3
Feature Selected:  gill-color curr_height:  4
Feature Selected:  cap-color curr_height:  4
Feature Selected:  cap-color curr_height:  5
Feature Selected:  odor curr_height:  5
Feature Selected:  cap-color curr_height:  6
Feature Selected:  gill-color curr_height:  6
Feature Selected:  gill-color curr_height:  7
Feature Selected:  gill-color curr_height:  7
Feature Selected:  gill-size curr_height:  3
Feature Selected:  stalk-shape curr_height:  4
Feature Selected:  cap-color curr_height:  5
Feature Selected:  gill-color curr_height:  6
Feature Selected:  cap-color curr_height:  6
Feature Selected:  cap-color curr_height:  5
Feature Selected:  gill-color curr_height:  6
Feature Selected:  cap-shape curr_height:  7
Feature Selected:  habitat curr_height:  8
Feature Selected:  habitat curr_height:  

In [104]:
pred = []
for i in range(len(df_train)):
    pred.append(model.predict(df_train.iloc[i, :]))
pred = np.array(pred)
y = df_train["class"].to_numpy()
(y == pred).sum()/y.shape[0]

0.9333450580372845

In [105]:
pred = []
for i in range(len(df_test)):
    pred.append(model.predict(df_test.iloc[i, :]))
pred = np.array(pred)
y = df_test["class"].to_numpy()
(y == pred).sum()/y.shape[0]

0.938884331419196

In [106]:
from sklearn.tree import DecisionTreeClassifier

In [111]:
model = DecisionTreeClassifier()
model.fit(df_train.iloc[:, 1:], df_train.iloc[:, 0])
print(model.score(df_train.iloc[:, 1:], df_train.iloc[:, 0]))
print(model.score(df_test.iloc[:, 1:], df_test.iloc[:, 0]))

1.0
1.0


In [112]:
from sklearn.ensemble import RandomForestClassifier

In [114]:
model = RandomForestClassifier()
model.fit(df_train.iloc[:, 1:], df_train.iloc[:, 0])
print(model.score(df_train.iloc[:, 1:], df_train.iloc[:, 0]))
print(model.score(df_test.iloc[:, 1:], df_test.iloc[:, 0]))

1.0
1.0


In [115]:
from sklearn.ensemble import AdaBoostClassifier

In [116]:
model = AdaBoostClassifier()
model.fit(df_train.iloc[:, 1:], df_train.iloc[:, 0])
print(model.score(df_train.iloc[:, 1:], df_train.iloc[:, 0]))
print(model.score(df_test.iloc[:, 1:], df_test.iloc[:, 0]))

1.0
1.0
