In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Naive Bayes Model Implimentation

In [2]:
class GaussianNB:
    def fit(self, X, y):
        self.C,counts = np.unique(y,return_counts=True)
        self.k = len(self.C)
        self.prior = counts/len(y)
        self.mean = []
        self.cov = []
        
        for i in range(self.k):
            X_class = X[y == self.C[i]]
            self.mean.append(np.mean(X_class, axis=0))
            self.cov.append(np.cov(X_class.T))

    def predict(self, X):
        posterior = np.zeros((X.shape[0],self.k))
        for j in range(self.k):
            likelihood_j = multivariate_normal.pdf(X, mean=self.mean[j], cov=self.cov[j])
            posterior[:,j] = np.exp(np.log(likelihood_j*self.prior[j])) # a_k
        # Normalizing the posterior by evidence
        posterior/= np.sum(posterior,axis=1,keepdims=True)
        return self.C[np.argmax(posterior,axis=1)]

## Train with sample training data set stored in a .CSV file and compute the accuracy with a few test data sets.

In [3]:
data = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/seeds.csv")

In [4]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [5]:
X,y = data.iloc[:,:-1],data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=20,shuffle=True)

In [6]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)
preds = gnb.predict(X_test)

  posterior[:,j] = np.exp(np.log(likelihood_j*self.prior[j])) # a_k


In [7]:
print("Missclassification:",np.sum(y_test != preds),"out off",len(y_test))
print("Model Accuracy:",accuracy_score(y_test,preds))

Missclassification: 2 out off 42
Model Accuracy: 0.9523809523809523


In [8]:
# Comparing with sklearn implementation
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)
preds = gnb.predict(X_test)
print("Missclassification:",np.sum(y_test != preds),"out off",len(y_test))
print("Model Accuracy:",accuracy_score(y_test,preds))

Missclassification: 5 out off 42
Model Accuracy: 0.8809523809523809


# Multinomial Naive Bayes Implimentation

In [17]:
class MultinomialNBDOC:
    def fit(self, X, y,alpha=1):
        self.C,counts = np.unique(y,return_counts=True)
        self.k = len(self.C)
        self.priors = counts/len(y)
        self.priors = np.log(self.priors)
        
        self.vectorizer = CountVectorizer()
        X = self.vectorizer.fit_transform(X)
        
        word_counts = np.zeros((self.k, X.shape[1]))
        for i in range(self.k):
            X_class = X[y == self.C[i]]
            word_counts[i, :] = np.sum(X_class, axis=0)
        total_counts = word_counts.sum(axis=1, keepdims=True)
        
        self.log_likelihood_ratio = np.log((word_counts + alpha) / (total_counts + alpha*X.shape[1])) - np.log((total_counts + alpha).sum(axis=1, keepdims=True))
        
    def predict(self, X):
        X = self.vectorizer.transform(X)
        log_probs = self.priors + X @ self.log_likelihood_ratio.T
        return self.C[np.argmax(log_probs, axis=1)]        

# To classify a set of documents and measure the accuracy, precision, and recall.

In [10]:
from sklearn.datasets import fetch_20newsgroups

In [11]:
docs = fetch_20newsgroups()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(docs.data, docs.target, test_size=0.2, shuffle=True,random_state=410)

In [18]:
doc_clf = MultinomialNBDOC()
doc_clf.fit(X_train,y_train)
preds = doc_clf.predict(X_test)

In [19]:
print("Missclassification:",np.sum(y_test != preds),"out off",len(y_test))
print(f"Accuracy Score: {accuracy_score(y_test,preds):.3f}")
print(f"Precision Score: {precision_score(y_test,preds,average='weighted'):.3f}")
print(f"Recall Score: {recall_score(y_test,preds,average='weighted'):.3f}")

Missclassification: 764 out off 2263
Accuracy Score: 0.662
Precision Score: 0.839
Recall Score: 0.662


In [16]:
# Comparing with sklearn implementation
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer(stop_words='english')
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)
y_pred = clf.predict(X_test_counts)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print("Missclassification:",np.sum(y_test != y_pred),"out off",len(y_test))
print(f"Accuracy Score: {accuracy_score(y_test,y_pred):.3f}")
print(f"Precision Score: {precision_score(y_test,y_pred,average='weighted'):.3f}")
print(f"Recall Score: {recall_score(y_test,y_pred,average='weighted'):.3f}")

Missclassification: 320 out off 2263
Accuracy Score: 0.859
Precision Score: 0.878
Recall Score: 0.859
