# Example of Naive Bayes

In [1]:
# import necessary modules
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show several results at once (e.g. `df.head(5)` and `df.shape`).
InteractiveShell.ast_node_interactivity = "all"

# Render matplotlib figures inline in the notebook, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Reference notebook: https://github.com/randerson112358/Python/blob/master/Email_Spam_Detection/Email_Spam_Detection.ipynb

Data Source: https://www.kaggle.com/balakishan77/spam-or-ham-email-classification/data

Read data as DataFrame

In [2]:
# Load the raw e-mail corpus, then preview the first rows and the (rows, columns) shape.
df = pd.read_csv('emails.csv')
df.head()  # head() shows 5 rows by default
df.shape

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


(5728, 2)

In [3]:
# Remove duplicate rows.
# Rebind `df` to the de-duplicated copy instead of passing `inplace=True`: the result is the
# same, but the frame object displayed by the previous cell is not mutated behind its back.
df = df.drop_duplicates()
df.shape

(5695, 2)

Encode text using `CountVectorizer`

In [4]:
# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Two toy documents that share part of their vocabulary
message0 = 'hello world hello hello world play'
message1 = 'test test test test one hello'

# Learn the vocabulary and encode both documents in a single step
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform([message0, message1])

# Show the stored (row, column) counts, followed by the container type
print(bow, type(bow), sep='\n')

  (0, 0)	3
  (0, 4)	2
  (0, 2)	1
  (1, 0)	1
  (1, 3)	4
  (1, 1)	1
<class 'scipy.sparse.csr.csr_matrix'>


As you can see, `CountVectorizer` returns a sparse matrix encoding our texts with the number of times a particular word occurs.

In [5]:
# Invert the fitted mapping (term -> column index) into column index -> term
vocabulary = {column: term for term, column in vectorizer.vocabulary_.items()}
# List the terms in column order so they line up with the columns of the dense matrix below
[vocabulary[column] for column in sorted(vocabulary)]
bow.toarray()

['hello', 'one', 'play', 'test', 'world']

array([[3, 0, 1, 0, 2],
       [1, 1, 0, 4, 0]])

Note how the encoding is stored in the sparse matrix: for `message0` (row 0), the column indices [0, 4, 2] hold the counts [3, 2, 1].

If we set `binary=True` when encoding messages, our encoder only records whether each word is present or not, ignoring the number of occurrences. For our first implementation of `NaiveBayes`, we will simply encode the presence of each word.

In [6]:
# binary=True records only whether a term occurs in a document (1) or not (0),
# instead of how many times it occurs.
vectorizer_b = CountVectorizer(binary=True)
# Encode the same two toy messages used for the count-based example
bow_b = vectorizer_b.fit_transform([message0, message1])
# Sparse view: one (row, column) entry per term present in each message
print(bow_b)
# Dense view of the same 0/1 matrix
bow_b.toarray()

  (0, 0)	1
  (0, 4)	1
  (0, 2)	1
  (1, 0)	1
  (1, 3)	1
  (1, 1)	1


array([[1, 0, 1, 0, 1],
       [1, 1, 0, 1, 0]])

## Experiment with MultinomialNB from sklearn

Before implementing our own learner, let's check the performance of `MultinomialNB` from sklearn. Here we remove the stop_words when vectorizing our text.

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words term counts for every e-mail, with English stop words removed.
# NOTE(review): the vocabulary is fitted on the whole corpus before the split below,
# so words that only occur in test e-mails are part of the feature space.
messages_bow = CountVectorizer(stop_words='english').fit_transform(df['text'])

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow, df['spam'],
    test_size=0.20, random_state=0, stratify=df['spam'],
)

messages_bow.shape

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Evaluate the model on the training split first, then on the held-out test split
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    pred = classifier.predict(X_split)
    print(classification_report(y_split, pred))
    print('Confusion Matrix: \n', confusion_matrix(y_split, pred))
    print()
    print('Accuracy: ', accuracy_score(y_split, pred))

(5695, 36996)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3462
           1       0.99      1.00      0.99      1094

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556

Confusion Matrix: 
 [[3450   12]
 [   3 1091]]

Accuracy:  0.9967076382791923
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       865
           1       0.97      1.00      0.98       274

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139

Confusion Matrix: 
 [[856   9]
 [  1 273]]

Accuracy:  0.9912203687445127


It achieved pretty decent accuracy using the default parameters. Now let's check how it performs if we only encode the presence of the words in our text corpus.

In [8]:
# Same corpus, but binary=True keeps only the presence/absence of each term
messages_bow_b = CountVectorizer(stop_words='english', binary=True).fit_transform(df['text'])

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow_b, df['spam'],
    test_size=0.20, random_state=0, stratify=df['spam'],
)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Evaluate the model on the training split first, then on the held-out test split
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    pred = classifier.predict(X_split)
    print(classification_report(y_split, pred))
    print('Confusion Matrix: \n', confusion_matrix(y_split, pred))
    print()
    print('Accuracy: ', accuracy_score(y_split, pred))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3462
           1       1.00      0.99      0.99      1094

    accuracy                           1.00      4556
   macro avg       1.00      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556

Confusion Matrix: 
 [[3458    4]
 [   8 1086]]

Accuracy:  0.9973661106233538
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       865
           1       0.97      0.99      0.98       274

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139

Confusion Matrix: 
 [[856   9]
 [  3 271]]

Accuracy:  0.9894644424934153


We didn't observe a large performance drop between the two encoding methods. Now let's check how our own implementation performs compared to sklearn's.

In [9]:
from naive_bayes import NaiveBayes_v0

# Our own learner is fed dense arrays, so each sparse split is densified on the way in.
# X_train / X_test are whatever the most recent split cell produced.
clf = NaiveBayes_v0()
clf.fit(X_train.toarray(), y_train)

# Report on the training split first, then on the held-out test split
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    pred = clf.predict(X_split.toarray())
    print(classification_report(y_split, pred))
    print('Confusion Matrix: \n', confusion_matrix(y_split, pred))
    print()
    print('Accuracy: ', accuracy_score(y_split, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      3462
           1       0.98      1.00      0.99      1094

    accuracy                           0.99      4556
   macro avg       0.99      1.00      0.99      4556
weighted avg       0.99      0.99      0.99      4556

Confusion Matrix: 
 [[3441   21]
 [   4 1090]]

Accuracy:  0.9945127304653204
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       865
           1       0.94      1.00      0.97       274

    accuracy                           0.98      1139
   macro avg       0.97      0.99      0.98      1139
weighted avg       0.98      0.98      0.98      1139

Confusion Matrix: 
 [[847  18]
 [  1 273]]

Accuracy:  0.9833187006145742


Wow, we achieved similar performance with our naive implementation of Naive Bayes. Let's see whether our algorithm can generalize to a dataset other than spam detection.

Reference notebook: https://github.com/aishajv/Unfolding-Naive-Bayes-from-Scratch/blob/master/%23%20Unfolding%20Na%C3%AFve%20Bayes%20from%20Scratch!%20Take-2%20%F0%9F%8E%AC.ipynb

In [29]:
# Read the two tab-separated review files: the labelled training data first, then the test data
training_set, testing_set = (
    pd.read_csv(path, sep='\t')
    for path in ('./labeledTrainData.tsv', './testData.tsv')
)
training_set.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [27]:
# Summarise the data: the distinct class labels and the size of each review set
for caption, value in (
    ("Unique Classes: ", np.unique(training_set['sentiment'])),
    ("Total Number of Training Examples: ", training_set['review'].shape),
    ("Total Number of Testing Examples: ", testing_set['review'].shape),
):
    print(caption, value)

Unique Classes:  [0 1]
Total Number of Training Examples:  (25000,)
Total Number of Testing Examples:  (25000,)


In [53]:
# Learn a binary (presence/absence) bag-of-words vocabulary from the labelled reviews
vectorizer = CountVectorizer(stop_words='english', binary=True)
train_bow_b = vectorizer.fit_transform(training_set['review'])
train_bow_b.shape

# The Kaggle test file is already loaded as `testing_set`, so it is not read from disk again.
# `test_set` is kept as an alias only so that the name stays defined.
test_set = testing_set
# Encode the test reviews with the vocabulary fitted above (transform only, no refit)
test_bow_b = vectorizer.transform(testing_set['review'])
test_bow_b.shape

(25000, 74538)

(25000, 74538)

In [54]:
# Stratified 80/20 hold-out split of the labelled reviews, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    train_bow_b,
    training_set['sentiment'],
    stratify=training_set['sentiment'],
    test_size=0.20,
    random_state=0,
)

In [55]:
# Train a fresh instance of our own learner on the review data (dense arrays are passed in)
clf = NaiveBayes_v0()
clf.fit(X_train.toarray(), y_train)

# Report on the training split first, then on the held-out test split
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    pred = clf.predict(X_split.toarray())
    print(classification_report(y_split, pred))
    print('Confusion Matrix: \n', confusion_matrix(y_split, pred))
    print()
    print('Accuracy: ', accuracy_score(y_split, pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92     10000
           1       0.91      0.93      0.92     10000

    accuracy                           0.92     20000
   macro avg       0.92      0.92      0.92     20000
weighted avg       0.92      0.92      0.92     20000

Confusion Matrix: 
 [[9072  928]
 [ 658 9342]]

Accuracy:  0.9207
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      2500
           1       0.84      0.89      0.87      2500

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

Confusion Matrix: 
 [[2080  420]
 [ 268 2232]]

Accuracy:  0.8624


In [61]:
# Predict a label for every Kaggle test review (densified before prediction)
test_pred = clf.predict(test_bow_b.toarray())

# Write the results to CSV for uploading to Kaggle.
# The frame is built column by column: stacking the string ids and the integer predictions
# into one NumPy array forces a single common dtype, so `sentiment` would not stay an
# integer column. np.ravel flattens the predictions to 1-D; the CSV written is unchanged.
kaggle_df = pd.DataFrame({
    "id": testing_set["id"].values,
    "sentiment": np.ravel(test_pred.astype(int)),
})
kaggle_df.to_csv("./naive_bayes_model_take1.csv",index=False)
print ('Predcitions Generated and saved to naive_bayes_model_take1.csv')

Predcitions Generated and saved to naive_bayes_model_take1.csv


![](./kaggle1.png)

Wow, we can submit our results to Kaggle. Not bad!