# Naive Bayes Application Examples

In [1]:
# import necessary modules
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import HTML
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from evaluation import test
from utils import load_data
from naive_bayes import NaiveBayes

## Email Spam Filter

The best-known application of Naive Bayes is spam e-mail classification. Let's see it in practice.

In [2]:
# Load the e-mail corpus. In the `spam` column, 1 marks a spam message.
emails = load_data('emails.csv')
emails.head()   # first five rows (the default)
emails.shape

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


(5728, 2)

In [3]:
# Remove duplicate rows. Rebinding the name to the returned frame (instead of
# inplace=True) avoids mutating the originally loaded DataFrame object.
emails = emails.drop_duplicates()
emails.shape

(5695, 2)

For more examples of how to vectorize text data, see [this post](https://nickyfoto.github.io/blog/entries/vectorizing-text-data).

In [4]:
# Bag-of-words word counts for every e-mail, with English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english')
messages_bow = count_vectorizer.fit_transform(emails['text'])

# Stratified 80/20 split so both parts keep the same spam ratio.
spam_labels = emails['spam']
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow, spam_labels,
    test_size=0.20, random_state=0, stratify=spam_labels,
)
messages_bow.shape
test(NaiveBayes(), X_train, X_test, y_train, y_test)

(5695, 36996)

testing NaiveBayes(alpha=1.0)
Confusion Matrix: 
 [[3450   12]
 [   3 1091]]

Training Accuracy:  0.9967076382791923
Confusion Matrix: 
 [[856   9]
 [  1 273]]

Testing Accuracy:  0.9912203687445127


NaiveBayes(alpha=1.0)

It achieved a pretty decent accuracy. Now let's check how it performs if we encode only the presence of each word in our text corpus, rather than its count.

In [5]:
# binary=True makes each feature record only whether a word occurs in an
# e-mail, not how many times it occurs.
presence_vectorizer = CountVectorizer(stop_words='english', binary=True)
messages_bow_b = presence_vectorizer.fit_transform(emails['text'])

# Same stratified 80/20 split settings (random_state=0) as for the count features.
spam_labels = emails['spam']
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow_b, spam_labels,
    test_size=0.20, random_state=0, stratify=spam_labels,
)
test(NaiveBayes(), X_train, X_test, y_train, y_test)

testing NaiveBayes(alpha=1.0)
Confusion Matrix: 
 [[3458    4]
 [   8 1086]]

Training Accuracy:  0.9973661106233538
Confusion Matrix: 
 [[856   9]
 [  3 271]]

Testing Accuracy:  0.9894644424934153


NaiveBayes(alpha=1.0)

We didn't observe a large performance drop between these two encoding methods. Let's see whether our algorithm generalizes to datasets other than spam detection.

## Movie review

In [6]:
# Both movie-review files are tab-separated; load the labelled reviews and the
# Kaggle test reviews with the same settings.
training_set, testing_set = [
    load_data(filename, sep='\t')
    for filename in ('labeledTrainData.tsv', 'testData.tsv')
]
training_set.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [7]:
# Report the label values and the number of reviews in each file.
summary = (
    ("Unique Classes: ", np.unique(training_set['sentiment'])),
    ("Total Number of Training Examples: ", training_set['review'].shape),
    ("Total Number of Testing Examples: ", testing_set['review'].shape),
)
for caption, value in summary:
    print(caption, value)

Unique Classes:  [0 1]
Total Number of Training Examples:  (25000,)
Total Number of Testing Examples:  (25000,)


In [8]:
# Learn the vocabulary from the labelled reviews only, then reuse the same
# fitted vectorizer for the Kaggle test reviews so both matrices share columns.
vectorizer = CountVectorizer(stop_words='english')
train_bow_b = vectorizer.fit_transform(training_set['review'])
train_bow_b.shape
# The Kaggle test reviews are already loaded in `testing_set`, so testData.tsv
# does not need to be read a second time here.
test_bow_b = vectorizer.transform(testing_set['review'])
test_bow_b.shape

(25000, 74538)

(25000, 74538)

In [9]:
# Hold out a stratified 20% of the labelled reviews for evaluation.
sentiment_labels = training_set['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    train_bow_b, sentiment_labels,
    test_size=0.20, random_state=0, stratify=sentiment_labels,
)

In [None]:
# Evaluate a fresh model on the review split; whatever `test` returns is kept
# as `clf` and used for the Kaggle predictions.
review_model = NaiveBayes()
clf = test(review_model, X_train, X_test, y_train, y_test)

testing NaiveBayes(alpha=1.0)
Confusion Matrix: 
 [[9401  599]
 [1139 8861]]

Training Accuracy:  0.9131


In [None]:
# NOTE(review): .toarray() materialises the whole test matrix densely, which
# can require a lot of memory. If NaiveBayes.predict accepts sparse input
# (TODO confirm), pass test_bow_b directly instead.
test_pred = clf.predict(test_bow_b.toarray())

# Build the submission frame column by column so each column keeps its own
# dtype; stacking ids and predictions into one array first would coerce both
# to a single common dtype. ravel() keeps (n,) and (n, 1) predictions working.
kaggle_df = pd.DataFrame({
    "id": testing_set["id"].values,
    "sentiment": np.asarray(test_pred).ravel().astype(int),
})
# Uncomment to write the submission file for uploading to Kaggle:
#kaggle_df.to_csv("./naive_bayes_model_take1.csv",index=False)
#print ('Predictions Generated and saved to naive_bayes_model_take1.csv')

Wow, we can submit our results to Kaggle. Not bad!

![](./images/kaggle1.png)

## 20 News Group

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four of the newsgroups.
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

# Both splits are fetched with identical options; only `subset` differs.
fetch_options = dict(categories=categories, shuffle=True, random_state=42)
twenty_train = fetch_20newsgroups(subset='train', **fetch_options)
twenty_test = fetch_20newsgroups(subset='test', **fetch_options)

twenty_train.target_names
len(twenty_train.data)
len(twenty_train.filenames)

# Fit the vocabulary on the training posts only and reuse it for the test posts.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(twenty_train.data)
X_test = vectorizer.transform(twenty_test.data)
y_train, y_test = twenty_train.target, twenty_test.target

test(NaiveBayes(), X_train, X_test, y_train, y_test)

What modifications do we need to make in order to fit and predict on a multi-class dataset? It's similar to what we do for binary classification.

## References

- https://github.com/randerson112358/Python/blob/master/Email_Spam_Detection/Email_Spam_Detection.ipynb
- https://github.com/aishajv/Unfolding-Naive-Bayes-from-Scratch/blob/master/%23%20Unfolding%20Na%C3%AFve%20Bayes%20from%20Scratch!%20Take-2%20%F0%9F%8E%AC.ipynb

Data Source: https://www.kaggle.com/balakishan77/spam-or-ham-email-classification/data

In [None]:


# Render a button that shows or hides the notebook's code inputs. The script
# toggles every `div.input` element through `$` (jQuery, assumed to be provided
# by the page that renders the notebook; TODO confirm). It also runs once when
# the document is ready, so the code starts out hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')