# **Naive Bayes Model**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [8]:
df = pd.read_csv('https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv', encoding = 'iso-8859-1')

In [9]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [10]:
data = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [11]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Steps to clean the text for classification task

1. Remove all the punctuation marks
2. Lower case all the letters
3. Remove all the stopwords
4. Lemmetize the data
4. find a way to convert the text data to numerical data
5. pass it to the machine learning model

In [12]:
data['v2'][435]

'The message sent is askin for  &lt;#&gt; dollars. Shoul i pay  &lt;#&gt;  or  &lt;#&gt; ?'

In [13]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [14]:
# 1) removing puncuation (i.e. special characters)

df['v2'] = df['v2'].apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))

In [15]:
df['v2'][435]

'The message sent is askin for  ltgt dollars Shoul i pay  ltgt  or  ltgt '

In [16]:
# 2) making all as lower case

df['v2'] = df['v2'].apply(lambda x: x.lower())

In [17]:
# remove the stopwords (we can use nltk library or spacu library)

# Stop words from nltk library

import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
stop = set(stopwords.words('english'))
print(stop)

{"isn't", 'do', 'had', 'while', 'don', 'your', 'nor', "couldn't", 'shan', 'you', 'that', 'has', 'mustn', 'once', 's', "don't", "you've", 'these', 'through', 'to', 've', "should've", 'ourselves', 'been', 'at', 'm', 'haven', "hadn't", 'be', 'ma', 'as', "hasn't", "didn't", 'an', 'yourselves', 'needn', 'ours', 'other', 'her', "haven't", "shan't", 'shouldn', 'into', 'any', 'mightn', 'there', 'are', 'y', 'where', 'both', 'aren', 'were', 'won', "she's", 'down', 'themselves', 'again', 'wasn', 'about', 'out', "shouldn't", 'ain', 'doesn', 'being', 'd', 'hers', 'its', 'above', 'll', 'so', 'under', 'or', 'have', 'off', 'and', 'him', 'with', 'theirs', 'then', 'we', 'was', 'weren', "you'd", 'such', 'against', 'couldn', 'but', 'more', 'they', 'up', 'i', 'their', 'no', 'himself', 'our', 'who', "mightn't", 'his', 'should', 'a', 'only', 'of', 'o', 'hasn', 'here', 'further', 'when', 'than', "needn't", "weren't", 'this', 'over', 'before', 'all', 'it', 'did', 'some', 'because', 'own', 'myself', 'yourself',

In [18]:
# Stop words from spacy library

from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'towards', 'next', 'us', 'everything', 'noone', 'done', 'various', 'might', 'alone', 'these', 'to', 'regarding', 'whereas', 'latter', 'mostly', 'ours', 'her', 'twenty', 'nobody', 'part', 'down', 'themselves', 'out', 'under', 'within', 'him', 'we', 'toward', 'against', 'mine', 'they', 'up', 'no', 'himself', 'anything', 'a', 'together', 'everywhere', 'than', '‘ll', 'fifty', 'name', 'this', 'meanwhile', 'back', 'all', 'did', 'also', 'because', 'seeming', 'my', 'across', 'twelve', 'full', 'except', 'six', 'would', 'thereby', 'herself', 'them', 'those', 'below', 'whither', 'yours', 'still', 'everyone', 'n’t', 'seems', 'fifteen', 'serious', 'n‘t', 'doing', 'perhaps', 'side', 'from', 'too', 'between', "'ve", 'had', 'while', 'your', 'less', 'nor', 'becoming', 'put', 'sixty', 'has', 'anywhere', 'therein', 'somehow', 'third', 'least', 'via', 'at', 'be', 'though', 'as', 'ca', 'amount', 'never', 'an', 'yourselves', '’ll', 'else', 'first', 'where', 'again', 'although', 'whereupon', 'without', 'any

In [19]:
# I am using the spacy library because it is more advanced and have more stopwords compared to nltk

In [20]:
df['v2'] = df['v2'].apply(lambda x: ' '.join([i for i in x.split() if i not in STOP_WORDS]))

In [21]:
df['v2'][2]

'free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s'

In [22]:
# 3)Limmetization

In [23]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manoj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [25]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Just some examples how lemmitizer is working

lemmatizer.lemmatize('going', wordnet.VERB)

# lemmatizer.lemmatize('working', wordnet.VERB)

# lemmatizer.lemmatize('swim', wordnet.VERB)

'go'

In [26]:
import nltk
nltk.download('averaged_perceptron_tagger')

nltk.pos_tag(['we', 'are','going'])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\manoj\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('we', 'PRP'), ('are', 'VBP'), ('going', 'VBG')]

In [28]:
'We are doing good'.split()

['We', 'are', 'doing', 'good']

In [29]:
nltk.pos_tag('we are doing good'.split())

[('we', 'PRP'), ('are', 'VBP'), ('doing', 'VBG'), ('good', 'JJ')]

In [30]:
df['v2'] = df['v2'].apply(lambda x: ' '.join([lemmatizer.lemmatize(i[0], get_wordnet_pos(i[1])) for i in nltk.pos_tag(x.split())]))

In [31]:
# get_wordnet_pos(i[1] is mapping with function

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(max_features = 4000)

tfidf_vectors = vec.fit_transform(df['v2'])

In [33]:
tfidf_vectors

<5572x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 37741 stored elements in Compressed Sparse Row format>

In [35]:
# To see how the data is there in tfidf_vectors 

new_df = pd.DataFrame(tfidf_vectors.toarray(), columns = vec.get_feature_names())
new_df.head()

Unnamed: 0,008704050406,01223585334,020603,0207,02073162414,020903,021,050703,0578,071104,...,yun,yunny,yuo,yup,zed,zoe,åð,ìll,ìï,ûò
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
new_df.shape

(5572, 4000)

Curse of dimensionality will not happen

Note: If we directly apply one hot encoding to any column then it will result in curse of dimensionality so we are using 

1) reducing the stop words
2) using limmetization

# Actual model (i.e Navies Bayes Model)

In [39]:
# taking the y column i.e. making ham to 0 and spam to 1

y = df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})

In [40]:
y.value_counts()

0    4825
1     747
Name: v1, dtype: int64

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors, y, test_size = 0.2)

In [42]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train.toarray(), y_train)

GaussianNB()

In [43]:
y_pred = nb_model.predict(X_test.toarray())

from sklearn.metrics import accuracy_score
score = accuracy_score(y_pred, y_test)
print(score)

0.8771300448430494


In [44]:
from sklearn.naive_bayes import MultinomialNB
nb_model1 = MultinomialNB()
nb_model1.fit(X_train.toarray(), y_train)

MultinomialNB()

In [46]:
y_pred_m = nb_model1.predict(X_test.toarray())

from sklearn.metrics import accuracy_score
score1 = accuracy_score(y_pred_m, y_test)
print(score1)

0.9766816143497757
