# Naive Bayes

# Library Import

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data

In [23]:
email_spam = pd.read_csv('../Data/email_spam.csv')
email_spam

Unnamed: 0,Category,Message
0,non_spam,"Go until jurong point, crazy.. Available only ..."
1,non_spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,non_spam,U dun say so early hor... U c already then say...
4,non_spam,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,non_spam,Will ü b going to esplanade fr home?
5569,non_spam,"Pity, * was in mood for that. So...any other s..."
5570,non_spam,The guy did some bitching but I acted like i'd...


In [7]:
email_spam.Category.value_counts()

non_spam    4825
spam         747
Name: Category, dtype: int64

## Encode Category text

In [10]:
email_spam['spam'] = email_spam['Category'].apply(lambda x: 1 if x=='spam' else 0)
email_spam.drop(['Category'], axis=1, inplace=True)
email_spam.head()

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(email_spam['Message'], email_spam['spam'], test_size=0.2)

# Types of Naive Bayes

* Bernoulli Naive Bayes: assumes all our features are binary only having 2 values
* Multinomial Naive Bayes: used for discrete data e.g movie ratings(from 1 to 5)
* Gaussian Naive Bayes: features are continuous e.g iris dataset

# Count Vectorizer
Counts the occurence of each word for each email and lists the words as features

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
countVectorizer = CountVectorizer()
X_train_count = countVectorizer.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
from sklearn.naive_bayes import MultinomialNB
multinomialNB = MultinomialNB()
multinomialNB.fit(X_train_count, y_train)

MultinomialNB()

## Prediction

In [20]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = countVectorizer.transform(emails)
multinomialNB.predict(emails_count)

array([0, 1])

In [22]:
X_test_count = countVectorizer.transform(X_test)
multinomialNB.score(X_test_count, y_test)

0.9883408071748879

# Pipeline
Pipeline allows to easily transform our data: by first converting our data into a count vectorizer then fitting the naive bayes classifier in one step instead of creating a new data object then fitting the model

In [27]:
from sklearn.pipeline import Pipeline
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [28]:
classifier.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [26]:
classifier.score(X_train, y_train)

0.9930446488669509

In [29]:
classifier.predict(emails)

array([0, 1])