In [45]:
"""
with help from https://www.youtube.com/watch?v=BqUmKsfSWho&t=607s
"""
import numpy as np
class NaiveBayes():
    
    
    def fit(self, X, y):
        """Fit the Naive Bayse model"""
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        
        self.mean = np.zeros((n_classes, n_features), dtype=np.float64) 
        self.var = np.zeros((n_classes, n_features), dtype=np.float64)
        self.priors = np.zeros(n_classes, dtype = np.float64)
        
        for i, c in enumerate(self.classes):
            X_c = X[y==c]
            self.mean[i, :] = X_c.mean(axis=0)
            self.var[i, :] = X_c.var(axis=0)
            self.priors[i] = X_c.shape[0] / float(n_samples)
        
        

        print("Model fitted")
    
    
    def pdf(self, class_i, x):
        """Gaussian PDF"""
        mean = self.mean[class_i]
        var = self.var[class_i]
        num = np.exp( - (x - mean)**2 / (2 * var))
        den = np.sqrt(2 * np.pi * var)
        return num / den
    
    def predict_single(self, x):
        """Predict a single new row"""
        posteriors = []
        for i, c in enumerate(self.classes):
            prior = np.log(self.priors[i])
            posterior = np.sum(np.log(self.pdf(i, x)))
            posterior = posterior + prior
            posteriors.append(posterior)
            
        return self.classes[np.argmax(posteriors)]
    
    def predict(self, X):
        y_pred = [self.predict_single(x) for x in X]
        return np.array(y_pred)

In [46]:

NB = NaiveBayes()
X = np.array([[1,2,3,4,5],[1,3,4,5,6],[9,7,7,8,6]])
y = np.array([1,2,2])
NB.fit(X,y)

Model fitted


In [47]:
from sklearn import datasets
import pandas as pd
X, y = datasets.load_wine(return_X_y = True)
pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

NB.fit(X_train, y_train)
np.unique(y)


Model fitted


array([0, 1, 2])

In [51]:
y_pred = NB.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_pred, y_test))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.96      1.00      0.98        23
           2       1.00      0.93      0.97        15

    accuracy                           0.98        54
   macro avg       0.99      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54

