In [7]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [8]:
# Load the mushroom dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./mushrooms.csv")
# Quick sanity check of the dimensions and column names before previewing rows.
print(df.shape, df.columns)
df.head()

(8124, 23) Index(['type', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [9]:
### encode categorical data to numerical data

In [15]:
# NOTE: one LabelEncoder instance is refit on every column in turn, so after
# this cell `le` only holds the mapping of the last column processed. The
# integer codes stored in `ds` are still computed independently per column.
le = LabelEncoder()

# applies transformation for each column
ds = df.apply(le.fit_transform)

# Work with a plain integer matrix from here on.
data = ds.values
print(data.shape, data[:5,])

# Column 0 is the target ('type'); the remaining columns are the features.
data_x = data[:, 1:]
data_y = data[:, 0]

(8124, 23) [[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


In [14]:
## Split the data into train and test sets

In [17]:
# Fix the seed so the split (and therefore every metric computed from it) is
# reproducible across kernel restarts. Numbers recorded from an earlier
# unseeded run may differ slightly.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.2, random_state=42
)

In [19]:
# Confirm the split sizes: features and labels must have matching row counts.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train)) # distinct class labels present in the training set

(6499, 22) (1625, 22) (6499,) (1625,)
[0 1]


In [20]:
### building our classifier

In [27]:
# Toy label vector used to sanity-check the counting logic below:
# comparing against a label gives a boolean mask, and summing it counts matches.
a = np.array([0,0,0,1,1,1,1,0,1])
np.sum(a==1)

5

In [28]:
def prior_prob(y_train, label):
    """Return the prior probability P(y = label).

    The prior is estimated as the fraction of entries in ``y_train`` that are
    equal to ``label``.

    Parameters
    ----------
    y_train : numpy array of class labels (its first dimension is the number
        of examples).
    label : the class value whose relative frequency is wanted.

    Returns
    -------
    float
        Number of matching examples divided by the total number of examples.
    """
    is_label = (y_train == label)
    n_examples = float(y_train.shape[0])

    return np.sum(is_label) / n_examples

In [29]:
# Sanity check on the toy vector: 4 of the 9 entries are 0, so expect 4/9.
prior_prob(a, 0)

0.4444444444444444

In [57]:
def cond_prob(x_train, y_train, feature_col, feature_val, label, alpha=0.0):
    """Estimate P(x[feature_col] == feature_val | y == label) from the training set.

    Parameters
    ----------
    x_train : 2-D numpy array, one row per training example.
    y_train : 1-D numpy array of class labels aligned with the rows of ``x_train``.
    feature_col : int
        Column index of the feature being queried.
    feature_val : value of that feature for the point being classified.
    label : class on which the probability is conditioned.
    alpha : float, optional
        Additive (Laplace) smoothing strength. The default ``0.0`` reproduces
        the plain relative-frequency estimate. With ``alpha > 0`` the estimate
        becomes ``(count + alpha) / (n_class + alpha * n_values)``, where
        ``n_values`` is the number of distinct values the feature takes in
        ``x_train``. A value never seen together with ``label`` then no longer
        forces the whole likelihood product to zero.

    Returns
    -------
    float
        The (optionally smoothed) conditional probability.
    """
    class_mask = (y_train == label)
    # Values of the requested feature restricted to examples of this class.
    feature_in_class = x_train[class_mask, feature_col]

    numerator = np.sum(feature_in_class == feature_val) + alpha
    denominator = float(np.sum(class_mask))
    if alpha:
        # Smoothing spreads `alpha` pseudo-counts over every observed category.
        n_values = np.unique(x_train[:, feature_col]).shape[0]
        denominator += alpha * n_values

    return numerator / denominator

In [58]:
### Compute the posterior probability for each example and make predictions

In [68]:
def predict(x_train, y_train, xtest):
    """Predict the class label of a single test point with Naive Bayes.

    For every class present in ``y_train`` the unnormalised posterior
    ``P(class) * prod_f P(xtest[f] | class)`` is computed from the training
    data, and the class with the largest value is returned.

    Parameters
    ----------
    x_train : 2-D numpy array, one row per training example.
    y_train : 1-D numpy array of class labels aligned with the rows of ``x_train``.
    xtest : 1-D sequence holding one value per feature for the point to classify.

    Returns
    -------
    The class label (an element of ``np.unique(y_train)``) with the highest
    posterior. Ties go to the first such class in ``np.unique`` order.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    post_probs = []  # unnormalised posterior of each class for this test point

    # compute posterior for each class
    for label in classes:
        # post_c = likelihood * prior
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= cond_prob(x_train, y_train, f, xtest[f], label)

        prior = prior_prob(y_train, label)
        post_probs.append(likelihood * prior)

    # np.argmax gives a position in `classes`, not a label; map it back so the
    # result is correct even when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]

In [72]:
# Spot-check a single test point: print the predicted label, then the true label.
output = predict(x_train, y_train, x_test[1])
print(output)
print(y_test[1])

0
0


In [70]:
def score(x_train, y_train, x_test, y_test):
    """Return the classification accuracy on a test set.

    Each row of ``x_test`` is classified with ``predict`` using the given
    training data; the result is the fraction of predictions equal to the
    corresponding entry of ``y_test``.

    Parameters
    ----------
    x_train, y_train : training features and labels passed through to ``predict``.
    x_test : 2-D numpy array of test points, one per row.
    y_test : 1-D numpy array of true labels aligned with the rows of ``x_test``.

    Returns
    -------
    float
        Number of correct predictions divided by the number of test labels.
    """
    predictions = np.array(
        [predict(x_train, y_train, sample) for sample in x_test]
    )
    n_correct = np.sum(predictions == y_test)

    return n_correct / y_test.shape[0]

In [71]:
# Accuracy on the held-out test set (fraction of correctly classified points).
print(score(x_train, y_train, x_test, y_test))

0.9944615384615385
