In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the mushroom dataset from CSV and preview its first rows.
df = pd.read_csv("datasets/mushrooms.csv")
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# (rows, columns) of the raw frame.
df.shape

(8124, 23)

In [4]:
## convert the categorical columns to numeric codes
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [5]:
# Encode every column of `df` as integer codes, one column at a time.
# NOTE: the same encoder object is refit for each column, so once this cell
# has run `le` only remembers the mapping of the last column it processed.
le = LabelEncoder()
ds = df.apply(lambda column: le.fit_transform(column))

In [6]:
ds.head()
# "type" is the target (output) column; every other column is an input feature.

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [7]:
# Work with the encoded table as a plain NumPy array from here on.
data = ds.values # NumPy array holding the encoded values
data

array([[1, 5, 2, ..., 2, 3, 5],
       [0, 5, 2, ..., 3, 2, 1],
       [0, 0, 2, ..., 3, 2, 3],
       ...,
       [0, 2, 2, ..., 0, 1, 2],
       [1, 3, 3, ..., 7, 4, 2],
       [0, 5, 2, ..., 4, 1, 2]])

In [8]:
# Column 0 is the encoded "type" target; all remaining columns are features.
y = data[:, 0]
X = data[:, 1:]

In [9]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((8124, 22), (8124,))

In [10]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### __Custom Naive Bayes__

In [11]:
class CustomNB:
    """Naive Bayes classifier for categorical (integer-coded) features.

    ``fit`` only stores the training data; every probability is estimated by
    counting over that data at prediction time. No smoothing is applied, so a
    feature value that never occurs together with a class in the training data
    rules that class out completely.
    """

    def fit(self, x, y):
        """Store the training data.

        Parameters
        ----------
        x : array-like, 2-D (one row per example, one column per feature)
        y : array-like, 1-D (one class label per row of ``x``)
        """
        # np.asarray keeps ndarrays as they are and lets plain lists work too
        # (boolean masking below needs real arrays).
        self.x_train = np.asarray(x)
        self.y_train = np.asarray(y)

    def prior_prob(self, label):
        """Return P(y = label): the fraction of training examples with this label."""
        total = self.y_train.shape[0]
        class_examples = np.sum(self.y_train == label)
        return class_examples / total

    def conditional_prob(self, feature_col, feature_val, label):
        """Return P(x[feature_col] = feature_val | y = label).

        Estimated as the share of training examples of class ``label`` whose
        feature ``feature_col`` equals ``feature_val``. Returns 0.0 when the
        training data contains no example of ``label`` (instead of 0/0 = NaN).
        """
        in_class = self.x_train[self.y_train == label]  # all examples of this class
        denominator = len(in_class)
        if denominator == 0:
            return 0.0
        numerator = np.sum(in_class[:, feature_col] == feature_val)
        return numerator / denominator

    def predict_point(self, x_test):
        """Return the most probable class label for a single example.

        ``x_test`` is one feature vector, indexed by feature position.
        Ties are resolved in favour of the smallest label.
        """
        classes = np.unique(self.y_train)
        n_features = self.x_train.shape[1]
        log_posteriors = []

        # Sum log-probabilities instead of multiplying probabilities: a long
        # product of values < 1 underflows to 0.0 for every class and makes
        # the argmax meaningless. log(0) = -inf is the intended score for an
        # impossible class, so the divide-by-zero warning is silenced.
        with np.errstate(divide="ignore"):
            for label in classes:
                log_post = np.log(self.prior_prob(label))
                for feature in range(n_features):
                    cond = self.conditional_prob(feature, x_test[feature], label)
                    log_post += np.log(cond)
                log_posteriors.append(log_post)

        # argmax is a position in `classes`; map it back to the actual label so
        # label sets other than 0..k-1 are predicted correctly.
        return classes[np.argmax(log_posteriors)]

    def predict(self, x_test):
        """Return an array with the predicted label for every row of ``x_test``."""
        return np.array([self.predict_point(point) for point in x_test])

    def score(self, x_test, y_test):
        """Return the accuracy: fraction of rows of ``x_test`` predicted as ``y_test``."""
        return (self.predict(x_test) == y_test).mean()
    

In [12]:
# Train the custom classifier on the training split, then predict the
# first 10 test rows as a quick spot check.
model = CustomNB()
model.fit(x_train, y_train)
model.predict(x_test[:10]) # predicted labels

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0])

In [13]:
y_test[:10] # true labels for the same 10 test rows

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0])

In [14]:
model.score(x_test, y_test) # accuracy on the held-out test split

0.9973890339425587