In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sms-spam-collection-dataset/spam.csv


In [4]:
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
df_spam = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv")

### Let us use only 2 columns

In [6]:


df_spam_data = df_spam.loc[:,["v1","v2"]]

df_spam_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Make Spam Ham Labels as numeric

- Spam as 1
- Ham as 0

In [7]:
df_spam_data['v1'] = df_spam_data['v1'].map({"spam": 1,"ham":0})
df_spam_data['v1']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: int64

## Preprocessing of data
1. Removal of stopwords

In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [9]:
def all_stop():
    stop_words = stopwords.words('english')
    stop_words.extend([w.capitalize() for w in stop_words])
    return stop_words

### Let us remove from in the data set

In [10]:
def remove_stop_words(sent):
    words = sent.split()
    stop_words = all_stop()
    for w in words:
        if w in stop_words:
            sent = sent.replace(w,'')
    return sent

In [11]:
df_spam_data['v2'] = df_spam_data['v2'].apply(remove_stop_words)

df_spam_data['v2'].head()

0    Go  jurong pot, crazy.. Available   bugis n gr...
1                        Ok lar... Joking wif u oni...
2    Free entry  2  wkly comp  w FA Cup fl tkts 21s...
3          U dun say  early hor... U c already  say...
4      Nah   think  goes  usf,  lives around re though
Name: v2, dtype: object

### Let us now build a naive bayes model on top of the data

#### First let us split the data into training and test set

In [12]:
X_train,X_test,y_train,y_test = train_test_split( df_spam_data["v2"],  df_spam_data["v1"],random_state=100,train_size=0.75)

### Let us convert the train set into bag of words

1. First we will try CountVectorizer
2. Then we will try TfIDfVectorizer
3. Gauzzian Naive Bayes and Bernoulli Naive Bayes 

In [13]:
cv = CountVectorizer()

x_bag = cv.fit_transform(X_train)

x_test_bag = cv.transform(X_test)

In [14]:
x_bag = x_bag.toarray()
x_test_bag  = x_test_bag.toarray()


In [15]:
mn = MultinomialNB()
mn = mn.fit(x_bag,y_train)

y_test_pred = mn.predict(x_test_bag)




### Lets evaluate accuracy 

In [16]:
def get_metrics(cf):
    acc = (cf[0,0]+ cf[1,1])/(cf[0,0]+ cf[1,1]+ cf[0,1] + cf[1,0])
    sen = cf[1,1]/(cf[1,1]+cf[1,0])
    spe = cf[0,0]/(cf[0,0] + cf[0,1])
    return acc,sen,spe

### Let us evaluate test set 

In [17]:
cf = confusion_matrix(y_test,y_test_pred)
print(cf)

print(" accuracy:{0} \n sensitivity: {1}\n specificity: {2}".format(get_metrics(cf)[0],get_metrics(cf)[1],get_metrics(cf)[2]))

[[1204   11]
 [  12  166]]
 accuracy:0.9834888729361091 
 sensitivity: 0.9325842696629213
 specificity: 0.9909465020576131


### Evaluation of train set

In [18]:
y_train_pred = mn.predict(x_bag)
cf = confusion_matrix(y_train,y_train_pred)
print(cf)

print(" accuracy:{0} \n sensitivity: {1}\n specificity: {2}".format(get_metrics(cf)[0],get_metrics(cf)[1],get_metrics(cf)[2]))

[[3604    6]
 [   8  561]]
 accuracy:0.9966499162479062 
 sensitivity: 0.9859402460456942
 specificity: 0.9983379501385041


### Let us repeat same using tfidf vectorizer

In [19]:
tfidf = TfidfVectorizer()

x_bag = tfidf.fit_transform(X_train)

x_test_bag = tfidf.transform(X_test)


x_bag = x_bag.toarray()
x_test_bag  = x_test_bag.toarray()

x_bag

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Predictions on train and test set

In [20]:
mn = MultinomialNB()
mn = mn.fit(x_bag,y_train)

y_train_pred = mn.predict(x_bag)

y_test_pred = mn.predict(x_test_bag)

In [21]:

print("Training----------------------------------------------------------------------------\n")
cf = confusion_matrix(y_train,y_train_pred)
print(cf)

print(" accuracy:{0} \n sensitivity: {1}\n specificity: {2}".format(get_metrics(cf)[0],get_metrics(cf)[1],get_metrics(cf)[2]))


print("Testing----------------------------------------------------------------------------\n")
cf = confusion_matrix(y_test,y_test_pred)
print(cf)

print(" accuracy:{0} \n sensitivity: {1}\n specificity: {2}".format(get_metrics(cf)[0],get_metrics(cf)[1],get_metrics(cf)[2]))

Training----------------------------------------------------------------------------

[[3610    0]
 [  84  485]]
 accuracy:0.9798994974874372 
 sensitivity: 0.8523725834797891
 specificity: 1.0
Testing----------------------------------------------------------------------------

[[1214    1]
 [  47  131]]
 accuracy:0.9655419956927495 
 sensitivity: 0.7359550561797753
 specificity: 0.9991769547325103


### The count vectorizer does well while applyiing naive bayes

#### Now let us try bernoulli naive bayes. 

The bag of words in bernoulli naive bayes

- In bernoulli Naive bayes the frequency of the words doesnt matter
- If the word is present it will consider as 1 no matter how many times the word is present in text


In [22]:
bn = BernoulliNB()

cv = CountVectorizer()

x_bag_bernoulli = cv.fit_transform(X_train)
x_bag_bernoulli = x_bag_bernoulli.toarray()

x_text_bag_bern = cv.transform(X_test)
x_text_bag_bern = x_text_bag_bern.toarray()

##### Fitting the model

bernoulli_model = bn.fit(x_bag_bernoulli,y_train)

y_train_pred = bn.predict(x_bag_bernoulli)
y_test_pred = bn.predict(x_text_bag_bern)
print("train metrics\n",confusion_matrix(y_train,y_train_pred))
print("sensitivity")
print("train metrics\n",confusion_matrix(y_test,y_test_pred))

train metrics
 [[3606    4]
 [  65  504]]
sensitivity
train metrics
 [[1214    1]
 [  33  145]]


In [23]:
cf = confusion_matrix(y_train,y_train_pred)

si = cf[1,1]/(cf[1,0]+ cf[1,1])

sp = cf[0,0]/(cf[0,0]+ cf[0,1])

acc = (cf[1,1] + cf[0,0])/(cf[0,0]+ cf[0,1] +cf[1,0]+ cf[1,1])

print("sensitivity = {0}\nSpecificity = {1}\nAccuracy = {2}".format(si,sp,acc))

sensitivity = 0.8857644991212654
Specificity = 0.9988919667590028
Accuracy = 0.9834888729361091


In [24]:
#### Evalyuating on test set 
cf = confusion_matrix(y_test,y_test_pred)

si = cf[1,1]/(cf[1,0]+ cf[1,1])

sp = cf[0,0]/(cf[0,0]+ cf[0,1])

acc = (cf[1,1] + cf[0,0])/(cf[0,0]+ cf[0,1] +cf[1,0]+ cf[1,1])

print("sensitivity = {0}\nSpecificity = {1}\nAccuracy = {2}".format(si,sp,acc))

sensitivity = 0.8146067415730337
Specificity = 0.9991769547325103
Accuracy = 0.9755922469490309


## We want to predict spam messages correctly 
1. Here sensitivity is very important
2. As a result we will go with bernoulliNB
3. The Sensitivity is higher