# Naive Bayes Implementation
- Goal: predict the class of a mushroom (the `type` column) from its categorical features using Naive Bayes classification.

In [25]:
# Read the mushrooms CSV into a DataFrame and preview its first rows
import numpy as np
import pandas as pd

MUSHROOMS_CSV='../../Datasets/Mushrooms/mushrooms.csv'  # path is relative to this notebook
df=pd.read_csv(MUSHROOMS_CSV)
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [26]:
# Dataset dimensions as (number of rows, number of columns)
print(df.shape)

(8124, 23)


- The data is not numerical; every column holds categorical values.
#### Encode Categorical Data into Numerical Data

In [27]:
#Handling Categorical data using sklearn Label Encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# A single encoder object is reused: DataFrame.apply calls le.fit_transform once
# per column, so every column is mapped independently to integer codes.
# NOTE: `le` is refit on each call, so after this cell it only holds the mapping
# of the last column processed; the per-column letter<->code mappings are not kept.
le=LabelEncoder()
ds=df.apply(le.fit_transform)
# Confirm the result is still a DataFrame and preview the encoded values
print(type(ds))
print(ds.head())

<class 'pandas.core.frame.DataFrame'>
   type  cap_shape  cap_surface  cap_color  bruises  odor  gill_attachment  \
0     1          5            2          4        1     6                1   
1     0          5            2          9        1     0                1   
2     0          0            2          8        1     3                1   
3     1          5            3          8        1     6                1   
4     0          5            2          3        0     5                1   

   gill_spacing  gill_size  gill_color  ...  stalk_surface_below_ring  \
0             0          1           4  ...                         2   
1             0          0           4  ...                         2   
2             0          0           5  ...                         2   
3             0          1           5  ...                         2   
4             1          0           4  ...                         2   

   stalk_color_above_ring  stalk_color_below_ring  vei

In [28]:
# Pull the encoded values out as a plain NumPy array so that rows/columns can be
# selected by position in the following cells
data=ds.values
print(data.shape,type(data))

(8124, 23) <class 'numpy.ndarray'>


In [29]:
#Break data into train and test set
# Column 0 holds the encoded class label (`type`); the remaining columns are features.
data_X=data[:,1:]
data_Y=data[:,0]
# A fixed random_state makes the 80/20 split -- and every number computed from it
# further down -- identical on each "Restart & Run All".
x_train,x_test,y_train,y_test=train_test_split(data_X,data_Y,test_size=0.2,random_state=42)
print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)
# Sanity check: list the class labels present in the training split
print(np.unique(y_train))

(6499, 22) (6499,) (1625, 22) (1625,)
[0 1]


## Mushroom Classifier : Prior and Conditional Probability

In [30]:
def prior_prob(y_train,label):
    """Estimate the prior P(class == label) as a relative frequency.

    Parameters
    ----------
    y_train : array of class labels (must expose ``.shape``).
    label : the class value whose prior is requested.

    Returns
    -------
    Number of entries of ``y_train`` equal to ``label`` divided by the total
    number of entries.
    """
    n_total=float(y_train.shape[0])
    n_matching=np.sum(y_train==label)
    return n_matching/n_total

In [31]:
#Example: priors on a small hand-made label vector
y=np.array([0,5,5,1,1,1,1,0,0,0])
print(y.shape[0])
# One prior per distinct class, in the order 0, 1, 5
for label in (0,1,5):
    print(prior_prob(y,label))

10
0.4
0.4
0.2


In [32]:
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    """Estimate P(feature == feature_val | class == label) from counts.

    Parameters
    ----------
    x_train : 2-D array of features, one row per training sample.
    y_train : array of class labels aligned with the rows of ``x_train``.
    feature_col : column index of the feature being examined.
    feature_val : value of that feature whose probability is requested.
    label : class to condition on.

    Returns
    -------
    Among the rows whose label equals ``label``, the fraction whose
    ``feature_col`` entry equals ``feature_val``. No smoothing is applied, so a
    value never seen within the class yields 0.
    """
    in_class=(y_train==label)
    # Feature column restricted to the rows of the requested class
    class_column=x_train[in_class,feature_col]
    n_hits=np.sum(class_column==feature_val)
    class_size=np.sum(in_class)
    return n_hits/float(class_size)

## Mushroom classifier : Prediction using Posterior probability
- Compute posterior probability for each test sample and make prediction

In [33]:
def predict(x_train,y_train,xtest):
    """Predict the class label of one sample with categorical Naive Bayes.

    For every class present in ``y_train`` the unnormalised posterior
    ``P(class) * prod_f P(xtest[f] | class)`` is built from training-set
    frequencies (``prior_prob`` and ``cond_prob``); the class with the largest
    score is returned.

    Parameters
    ----------
    x_train : 2-D array of training features, one row per sample.
    y_train : array of training labels aligned with the rows of ``x_train``.
    xtest : a single test sample, indexable by feature position.

    Returns
    -------
    The winning class label, i.e. an element of ``np.unique(y_train)``.
    """
    classes=np.unique(y_train)
    n_features=x_train.shape[1]

    # One unnormalised posterior per class, in the same order as `classes`
    post_probs=[]

    for label in classes:
        # Naive independence assumption: multiply the per-feature conditionals
        likelihood=1.0
        for f in range(n_features):
            likelihood*=cond_prob(x_train,y_train,f,xtest[f],label)

        # post_c = likelihood * prior
        post_probs.append(likelihood*prior_prob(y_train,label))

    # np.argmax gives a position inside `classes`, not a label; map it back so
    # the result is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]

In [34]:
# Spot-check one test sample: predicted label first, true label second
sample_idx=2
output=predict(x_train,y_train,x_test[sample_idx])
print(output)
print(y_test[sample_idx])

0
0


In [35]:
#Calculate accuracy of naive bayes
def score(x_train,y_train,x_test,y_test):
    """Accuracy of the Naive Bayes classifier on a labelled test set.

    Each row of ``x_test`` is classified with ``predict`` using
    ``x_train``/``y_train`` as the training data.

    Returns
    -------
    Number of predictions equal to the corresponding entry of ``y_test``
    divided by ``y_test.shape[0]``.
    """
    n_test=x_test.shape[0]
    predicted=np.array([predict(x_train,y_train,x_test[i]) for i in range(n_test)])
    n_correct=np.sum(predicted==y_test)
    return n_correct/y_test.shape[0]

In [36]:
# Accuracy of the classifier on the held-out test samples
score(x_train,y_train,x_test,y_test)

0.9987692307692307