## Naive Bayes - Mushroom Classification
The goal is to predict the class of each mushroom (the `type` column) from its other categorical features.


In [97]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [98]:
# Load the mushroom table from a CSV file located in the working directory.
df=pd.read_csv("mushrooms.csv")

In [99]:
# Preview the first ten rows to see the raw single-letter category codes.
df.head(10)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [100]:
# Sanity-check the table size as (rows, columns).
print(df.shape)

(8124, 23)


#### Encode the Categorical Data into Numerical Data

In [101]:
# Replace each column's category letters with integer codes, one column at a time.
# NOTE: the same `le` object is refit for every column, so once this cell has run
# it only holds the classes of the last column it saw and cannot decode the others.
le=LabelEncoder()
ds=df.apply(le.fit_transform)
print(ds.shape)
ds.head()

(8124, 23)


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [102]:
# Drop the pandas wrapper: the classifier below works on a plain 2-D NumPy array.
data = ds.to_numpy()

In [103]:
# Confirm that `data` is a plain NumPy array rather than a DataFrame.
print(type(data))

<class 'numpy.ndarray'>


In [104]:
# Column 0 holds the encoded `type` label; every remaining column is a feature.
X, Y = data[:, 1:], data[:, 0]

In [105]:
# The feature matrix and the label vector must have the same number of rows.
print(X.shape, Y.shape, sep="\n")

(8124, 22)
(8124,)


In [106]:
# Inspect the encoded feature matrix (NumPy abbreviates the middle rows and columns).
X

array([[5, 2, 4, ..., 2, 3, 5],
       [5, 2, 9, ..., 3, 2, 1],
       [0, 2, 8, ..., 3, 2, 3],
       ...,
       [2, 2, 4, ..., 0, 1, 2],
       [3, 3, 4, ..., 7, 4, 2],
       [5, 2, 4, ..., 4, 1, 2]])

In [107]:
# Inspect the encoded label vector (NumPy abbreviates the middle entries).
Y

array([1, 0, 0, ..., 0, 1, 0])

In [112]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=5)

In [113]:
# Show the sizes of both partitions: training first, then the held-out test set.
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


### Naive Bayes Algorithm

In [114]:
def prior(Y_train, label):
    """Return the prior probability P(y = label).

    The prior is the fraction of entries in ``Y_train`` equal to ``label``.

    Parameters
    ----------
    Y_train : numpy.ndarray
        Training labels, one entry per training sample.
    label : scalar
        Class whose prior is requested.

    Returns
    -------
    float
        Number of samples carrying ``label`` divided by the total number of
        training samples.
    """
    matches = (Y_train == label)
    n_samples = float(Y_train.shape[0])
    return np.sum(matches) / n_samples

def cond_prob(x_train, y_train, feature_col, feature_val, label):
    """Return the conditional probability P(x[feature_col] = feature_val | y = label).

    Only the training rows whose label equals ``label`` are considered; the
    result is the share of those rows whose value in column ``feature_col``
    equals ``feature_val``. No smoothing is applied, so a value never seen
    together with ``label`` yields exactly 0.

    Parameters
    ----------
    x_train : numpy.ndarray
        2-D training feature matrix (samples x features).
    y_train : numpy.ndarray
        Training labels aligned with the rows of ``x_train``.
    feature_col : int
        Index of the feature column to inspect.
    feature_val : scalar
        Feature value whose frequency is measured.
    label : scalar
        Class to condition on.

    Returns
    -------
    float
        Relative frequency of ``feature_val`` among the rows of class ``label``.
    """
    # Build the class mask once and reuse it for both the count and the total.
    in_class = (y_train == label)
    class_column = x_train[in_class][:, feature_col]
    n_matching = np.sum(class_column == feature_val)
    n_in_class = np.sum(in_class)
    return n_matching / float(n_in_class)

def predict(x_train, y_train, xtest):
    """Classify one sample with a categorical Naive Bayes model.

    For every class present in ``y_train`` the unnormalised posterior
    ``P(label) * prod_i P(x_i = xtest[i] | label)`` is computed from the
    training data, and the class with the largest value is returned.

    Parameters
    ----------
    x_train : numpy.ndarray
        2-D training feature matrix (samples x features).
    y_train : numpy.ndarray
        Training labels aligned with the rows of ``x_train``.
    xtest : numpy.ndarray
        1-D feature vector of the sample to classify; it is indexed once per
        column of ``x_train``.

    Returns
    -------
    scalar
        The predicted class label, taken from the values found in ``y_train``.
        Ties go to the smallest label, because ``np.unique`` sorts the classes
        and ``np.argmax`` returns the first maximum.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]

    post_prob = []
    for label in classes:
        # Naive independence assumption: multiply the per-feature conditionals.
        likelihood = 1.0
        for i in range(n_features):
            likelihood *= cond_prob(x_train, y_train, i, xtest[i], label)
        post_prob.append(prior(y_train, label) * likelihood)

    # argmax yields a position inside `classes`, not a label. Map it back so
    # the result is also correct when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_prob)]
    
    

### Prediction

In [115]:
# Spot-check one held-out sample: the predicted label is printed first,
# the true label second.
output = predict(X_train, Y_train, X_test[1])
print(output, Y_test[1], sep="\n")

1
1


### Accuracy

In [116]:
def score(x_train, y_train, x_test, y_test):
    """Return the accuracy of the Naive Bayes classifier on a test set.

    Each row of ``x_test`` is classified with ``predict`` using the given
    training data, and the predictions are compared with ``y_test``.

    Parameters
    ----------
    x_train : numpy.ndarray
        2-D training feature matrix.
    y_train : numpy.ndarray
        Training labels aligned with ``x_train``.
    x_test : numpy.ndarray
        2-D matrix of samples to classify, one per row.
    y_test : numpy.ndarray
        True labels aligned with the rows of ``x_test``.

    Returns
    -------
    float
        Fraction of test samples whose predicted label equals the true label.
    """
    predicted = np.array([predict(x_train, y_train, sample) for sample in x_test])
    n_correct = np.sum(y_test == predicted)
    return n_correct / float(y_test.shape[0])

In [117]:
# Accuracy on the held-out test set. Each test row triggers a full pass over
# the training data inside predict, so this cell is the slow one.
print(score(X_train,Y_train,X_test,Y_test))

0.9975384615384615
