In [None]:
# reading input data
import pandas as pd
import numpy as np

# Load the raw data set from the CSV file (path is relative to the notebook's
# working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/data_email.csv")

In [None]:
# parsing input data_X into separate words and labels into [0,1]; defining X and c
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

from sklearn.preprocessing import LabelEncoder

# Bag-of-words features: one row per document, one column per vocabulary word,
# each entry a word count. The sparse result is densified with .toarray().
# NOTE(review): the vocabulary is fitted on every row of `df`, i.e. before the
# train/test split performed later in the notebook.
cv = CountVectorizer()
X = cv.fit_transform(df['data_X']).toarray()

# Encode the labels as integers 0..n_classes-1 in a 1-D vector.
# LabelEncoder is used instead of LabelBinarizer(...).ravel(): for two classes
# both give the same 0/1 codes, but for three or more classes LabelBinarizer
# returns a one-hot matrix whose ravel() no longer has one entry per sample.
lb = LabelEncoder()
c = lb.fit_transform(df['data_c'])



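Naive Bayes classifier: predict the class $c_i$ that maximizes the logarithm of the joint probability of the sample and the class, $\arg \max_{1\leq i\leq n} \left\{ \log \left( \prod_{k=1}^m P(x^{(k)}|c_i) \cdot P(c_i) \right) \right\} = \arg \max_{1\leq i\leq n} \left\{ \sum_{k=1}^m \log \left(  P(x^{(k)}|c_i) \right) + \log \left( P(c_i) \right) \right\}$, where $x^{(k)}$ is the $k$-th of the $m$ features of the sample, $c_i$ is the $i$-th of the $n$ classes, and $P(c_i)$ is the class prior. Taking the logarithm does not change the $\arg\max$, but it turns the product into a sum, which avoids numerical underflow when many small probabilities are multiplied.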



In [None]:
# implement Naive Bayes for discrete variables
class NaiveBayes:
    """Multinomial Naive Bayes classifier for count features.

    For a sample ``x`` the predicted class is the one maximising
    ``log P(c_i) + sum_k x[k] * log P(feature_k | c_i)``, where the per-class
    feature probabilities are estimated from summed counts with additive
    smoothing.

    Parameters
    ----------
    alpha : float, default=1.0
        Additive smoothing constant added to every per-class feature count.
        Must be positive so that no log-probability becomes ``-inf``.

    Raises
    ------
    ValueError
        If ``alpha`` is not positive.
    """

    def __init__(self, alpha=1.0):
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = float(alpha)

    # training
    def fit(self, X, c):
        """Estimate log-priors and smoothed log-likelihoods from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Non-negative feature counts.
        c : array-like of shape (n_samples,)
            Class label of every row of ``X``.
        """
        X = np.asarray(X)
        # Flatten so that a column vector of labels also yields a 1-D row mask.
        c = np.asarray(c).ravel()
        self.n_samples, self.n_features = X.shape
        self._classes, self._classCounts = np.unique(c, return_counts=True)
        self.n_classes = len(self._classes)
        # log P(c_i): relative frequency of each class in the training labels.
        self._priors = np.log(self._classCounts / self.n_samples)
        # Row i holds log P(feature_k | c_i) for every feature k.
        self._P_words_feature = np.zeros((self.n_classes, self.n_features))
        for idx, c_i in enumerate(self._classes):
            # Total count of every feature over the samples of class c_i.
            column_sum = X[c == c_i].sum(axis=0)
            # Smoothed estimate; the denominator makes each row sum to one.
            self._P_words_feature[idx] = np.log(
                (column_sum + self.alpha)
                / (column_sum.sum() + self.alpha * self.n_features)
            )

    # application
    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature counts with the same columns as used in ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels, taken from the labels seen in ``fit`` and
            therefore of the same dtype (ties go to the first class).
        """
        X = np.asarray(X)
        # (n_samples, n_classes): count-weighted log-likelihoods plus log-prior.
        joint_log = X @ self._P_words_feature.T + self._priors
        # Index the label array directly so labels keep their original dtype
        # instead of being written into a float buffer.
        return self._classes[np.argmax(joint_log, axis=1)]

In [None]:
# split input data in training and testing data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, c_train, c_test = train_test_split(
    X,
    c,
    test_size=0.2,
    random_state=123,
)

In [None]:
# training and testing Naive Bayes
# Fit the hand-written classifier on the training split, then label the
# held-out rows.
nb = NaiveBayes()
nb.fit(X=X_train, c=c_train)
predictions = nb.predict(X=X_test)

In [None]:
# implementing the same in scikit learn
from sklearn.naive_bayes import MultinomialNB

# Reference run with scikit-learn's multinomial Naive Bayes on the same split.
# Note: this rebinds `nb`, replacing the hand-written model created earlier.
nb = MultinomialNB().fit(X_train, c_train)  # fit() returns the estimator itself
predictions_sk = nb.predict(X_test)

In [None]:
# comparing accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, one per entry of ``y_true``.

    Returns
    -------
    numpy.float64
        Share of positions where both inputs agree (``nan`` for empty input).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    # Coerce to flat arrays: comparing two plain lists with == yields a single
    # bool, and comparing (n, 1) with (n,) would broadcast to an (n, n) matrix.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must contain the same number of labels"
        )
    return np.mean(y_true == y_pred)

# Report the share of correctly labelled test rows for both models.
print("accuracy = ", accuracy(y_true=c_test, y_pred=predictions))
print("accuracy (scikit learn) = ", accuracy(y_true=c_test, y_pred=predictions_sk))

In [None]:
# comparing confusion matrix
from sklearn.metrics import confusion_matrix

# Print the confusion matrix of each model on the test split
# (rows: true labels, columns: predicted labels).
print('confusion matrix:')
print(confusion_matrix(y_true=c_test, y_pred=predictions))
print('confusion matrix (scikit learn):')
print(confusion_matrix(y_true=c_test, y_pred=predictions_sk))