In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Naive Bayes --- Mushroom Classifier

#### Load the Dataset

In [3]:
data = pd.read_csv('../dataset/mushrooms.csv')

In [4]:
print(data.shape)

(8124, 23)


In [5]:
print(data.isnull().values.sum())

0


In [6]:
data.head(3)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


#### Converting the categorical data into numerical values

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()

In [9]:
numerical = data.apply(le.fit_transform)

In [10]:
print(numerical.shape)

(8124, 23)


In [11]:
numerical.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [12]:
x = numerical.values[:, 1:]
y = numerical.values[:, 0]

In [13]:
print(x.shape)
print(y.shape)

(8124, 22)
(8124,)


#### Train/test split

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing. A fixed random_state makes the split
# (and therefore every accuracy figure computed from it) reproducible across
# kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=42)

In [15]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(4874, 22) (4874,)
(3250, 22) (3250,)


In [16]:
print(x[:5, :])

[[5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


#### Building our classifier

In [17]:
def prior_prob(y, label):
    """Return the prior P(label): the fraction of entries in `y` equal to `label`.

    Parameters
    ----------
    y : np.ndarray
        1-D array of class labels, one per sample.
    label : scalar
        The class whose relative frequency is wanted.

    Returns
    -------
    float
        (number of entries equal to `label`) / (number of entries in `y`).
    """
    n_samples = float(y.shape[0])
    n_matching = (y == label).sum()
    return n_matching / n_samples

def conditional_prob(x, y, fcol, fval, label):
    """Return P(feature `fcol` == `fval` | class == `label`) estimated by counting.

    Parameters
    ----------
    x : np.ndarray
        2-D feature matrix, one row per sample.
    y : np.ndarray
        1-D array of class labels aligned with the rows of `x`.
    fcol : int
        Index of the feature column to inspect.
    fval : scalar
        Feature value whose frequency is wanted.
    label : scalar
        Class to condition on.

    Returns
    -------
    float
        Among the rows whose label is `label`, the fraction whose column
        `fcol` equals `fval`.
    """
    in_class = (y == label)
    # Boolean-mask the rows and pick the column in a single indexing step.
    class_column = x[in_class, fcol]
    matches = np.sum(class_column == fval)
    class_size = np.sum(in_class)
    return matches / class_size

def predict(x, y, test):
    """Classify one sample with a categorical Naive Bayes model built by counting.

    For every class present in `y`, the score is

        P(class) * product over features f of P(test[f] | class)

    where each probability is the plain relative frequency in the training
    data. The class with the highest score is returned.

    Parameters
    ----------
    x : np.ndarray
        2-D training feature matrix, one row per sample.
    y : np.ndarray
        1-D array of training labels aligned with the rows of `x`.
    test : np.ndarray
        1-D feature vector of the sample to classify. Only the first
        `test.shape[0]` columns of `x` are used.

    Returns
    -------
    scalar
        The predicted class label, i.e. an element of `np.unique(y)`. This is
        the label itself rather than its position among the sorted labels,
        so the result is correct even when the classes are not 0..k-1.

    Raises
    ------
    ValueError
        If `y` is empty (there is no class to choose from).
    IndexError
        If `test` has more features than `x` has columns.
    """
    labels = np.unique(y)
    if labels.size == 0:
        raise ValueError("predict() needs at least one training sample")

    n_features = test.shape[0]
    if n_features > x.shape[1]:
        raise IndexError(
            "test has %d features but x only has %d columns" % (n_features, x.shape[1])
        )

    n_samples = float(y.shape[0])
    post_prob = []

    for label in labels:
        # Select this class's rows once, instead of once per feature.
        in_class = (y == label)
        class_rows = x[in_class]

        # Column-wise fraction of class rows whose value matches the test
        # sample: one conditional probability per feature.
        per_feature = np.mean(class_rows[:, :n_features] == test, axis=0)
        likelihood = np.prod(per_feature)

        prior = np.sum(in_class) / n_samples
        post_prob.append(likelihood * prior)

    # argmax gives a position in `labels`; map it back to the actual label.
    return labels[np.argmax(post_prob)]
        


In [18]:
prior_prob(np.array([0, 0, 1]), 0)

0.6666666666666666

In [19]:
conditional_prob(np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1], [1, 1, 1]]), np.array([0, 1, 1, 1, 0]), 0, 0, 0)

0.5

In [24]:
predict(x_train, y_train, x_test[52])

0

In [25]:
x_test[52]

array([2, 3, 4, 1, 3, 1, 0, 0, 5, 0, 4, 2, 3, 7, 7, 0, 2, 1, 4, 3, 3, 4])

In [22]:
y_test[52]

0

In [23]:
# Classify every held-out sample against the training split.
predictions = [predict(x_train, y_train, sample) for sample in x_test]

In [26]:
import sklearn.metrics as metrics

In [27]:
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth first.
metrics.accuracy_score(y_test, predictions)

0.996

### Gaussian Naive Bayes -- Handling Continuous Data

In [48]:
# Toy 2x3 array used by the NumPy demo cells that follow.
# NOTE: this rebinds `data`, which earlier in the notebook held the mushroom
# DataFrame read from CSV; cells above will not work if re-run after this one.
data = np.array([[1, 2, 4], [2, 3, 7]])

In [49]:
np.std(data, axis = 0)

array([0.5, 0.5, 1.5])

In [50]:
data

array([[1, 2, 4],
       [2, 3, 7]])

In [53]:
a = np.array([1, 2, 3])
b = np.array([3, 4, 5])

In [54]:
a/b

array([0.33333333, 0.5       , 0.6       ])

In [60]:
np.exp(a)

array([ 2.71828183,  7.3890561 , 20.08553692])