In [11]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer  
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split  

In [12]:
# Load the labelled messages into a DataFrame; the CSV is decoded as latin-1.
mails_dataset = pd.read_csv('Dataset/trial_spam.csv', encoding = 'latin-1')
mails_dataset.head()           #show first 5 rows

Unnamed: 0,sno,v1,v2,Unnamed: 3,Unnamed: 4
0,1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,
1,2,spam,FreeMsg Hey there darling it's been 3 week's n...,,
2,3,spam,WINNER!! As a valued network customer you have...,,
3,4,spam,Had your mobile 11 months or more? U R entitle...,,
4,5,spam,"SIX chances to win CASH! From 100 to 20,000 po...",,


In [13]:
# Drop undesirable columns: 'Unnamed: 3' and 'Unnamed: 4' are empty in the preview above.
# NOTE: inplace=True mutates mails_dataset, so running this cell a second time
# raises KeyError because the columns are already gone.
mails_dataset.drop(['Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
mails_dataset.head()

Unnamed: 0,sno,v1,v2
0,1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,2,spam,FreeMsg Hey there darling it's been 3 week's n...
2,3,spam,WINNER!! As a valued network customer you have...
3,4,spam,Had your mobile 11 months or more? U R entitle...
4,5,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [14]:
# Rename the columns to make them easy to read and manipulate:
#   sno -> docID, v1 -> label (spam/ham), v2 -> message (the text).
mails_dataset.rename(columns = {'sno': 'docID', 'v1': 'label', 'v2': 'message'}, inplace = True)
mails_dataset.head()

Unnamed: 0,docID,label,message
0,1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,2,spam,FreeMsg Hey there darling it's been 3 week's n...
2,3,spam,WINNER!! As a valued network customer you have...
3,4,spam,Had your mobile 11 months or more? U R entitle...
4,5,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [15]:
# Class balance check: number of messages carrying each label.
mails_dataset['label'].value_counts()  #count number of each label

spam    100
ham     100
Name: label, dtype: int64

In [16]:
# Total number of messages (rows) in the dataset.
total_mails = mails_dataset.shape[0]
total_mails

200

In [17]:
# Encode the class labels numerically: ham -> 0, spam -> 1.
mails_dataset['label'] = mails_dataset['label'].map({'ham': 0, 'spam': 1})

# Series.map turns every value that is not a key of the dict into NaN. That includes
# the already-encoded 0/1 values if this cell is executed twice, so fail loudly here
# rather than carry NaN labels into training.
assert mails_dataset['label'].notna().all(), \
    "label column contains values other than 'ham'/'spam' (was this cell run twice?)"

mails_dataset.head()

Unnamed: 0,docID,label,message
0,1,1,Free entry in 2 a wkly comp to win FA Cup fina...
1,2,1,FreeMsg Hey there darling it's been 3 week's n...
2,3,1,WINNER!! As a valued network customer you have...
3,4,1,Had your mobile 11 months or more? U R entitle...
4,5,1,"SIX chances to win CASH! From 100 to 20,000 po..."


In [18]:
# Lowercase every message (the slang dictionary used later has lowercase keys).
mails_dataset['message'] = mails_dataset['message'].str.lower()
mails_dataset.head()

Unnamed: 0,docID,label,message
0,1,1,free entry in 2 a wkly comp to win fa cup fina...
1,2,1,freemsg hey there darling it's been 3 week's n...
2,3,1,winner!! as a valued network customer you have...
3,4,1,had your mobile 11 months or more? u r entitle...
4,5,1,"six chances to win cash! from 100 to 20,000 po..."


In [19]:
# Remove all digits.
# regex=True must be passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every digit in place.
# The raw string avoids the invalid '\d' escape-sequence warning in a normal literal.
mails_dataset['message'] = mails_dataset['message'].str.replace(r'\d+', '', regex=True)
mails_dataset.head()

Unnamed: 0,docID,label,message
0,1,1,free entry in a wkly comp to win fa cup final...
1,2,1,freemsg hey there darling it's been week's no...
2,3,1,winner!! as a valued network customer you have...
3,4,1,had your mobile months or more? u r entitled ...
4,5,1,"six chances to win cash! from to , pounds txt..."


### Remove all punctuation
- ^   : Not these characters.
- \w  :  Word characters.
- \s :  Space characters.

Replace any character that is neither a word character nor a space character with nothing (i.e. delete it).

In [20]:
# Delete every character that is neither a word character (\w) nor whitespace (\s).
# regex=True must be passed explicitly: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, so no punctuation would be removed.
# The raw string keeps the backslashes intact without escape-sequence warnings.
mails_dataset['message'] = mails_dataset['message'].str.replace(r'[^\w\s]', '', regex=True)
mails_dataset.head()

Unnamed: 0,docID,label,message
0,1,1,free entry in a wkly comp to win fa cup final...
1,2,1,freemsg hey there darling its been weeks now ...
2,3,1,winner as a valued network customer you have b...
3,4,1,had your mobile months or more u r entitled t...
4,5,1,six chances to win cash from to pounds txt c...


In [21]:
# Inspect the first message in full after lowercasing, digit and punctuation removal.
sample_mail = mails_dataset.iloc[0]
sample_mail['message']

'free entry in  a wkly comp to win fa cup final tkts st may  text fa to  to receive entry questionstd txt ratetcs apply overs'

In [101]:
# Dictionary mapping slang / abbreviated words to their expanded form.
# NOTE(review): digits are stripped from the messages before this mapping is applied,
#   so the 'f9' key presumably can never match - confirm whether it is still needed.
# NOTE(review): 'st' also matches the leftover of ordinals once their digits are gone
#   (the sample message ends up with "such that may") - confirm this is intended.
slang_list = {
    'u': 'you',
    'r': 'are',
    'd': 'the',
    'urs': 'yours',
    'wkly': 'weekly',
    'st': 'such that',
    'txt': 'text',
    'comp': 'competition',
    'prctc': 'practice',
    'dffrnc': 'difference',
    'y': 'why',
    'f9': 'fine',
    'tkts': 'tickets',
    'csh': 'cash',
    'phn': 'phone',
    'im': 'i am',
    'm': 'am',
    'spcl': 'special',
}


In [102]:
# Try the slang expansion on the first message before applying it to every row.
sample_mail = mails_dataset.iloc[0]
message = sample_mail['message']

# Words found in slang_list are swapped for their expansion; all others pass through.
expanded_words = [slang_list.get(word, word) for word in message.split()]
new_message = ' '.join(expanded_words)
new_message

'free entry in a weekly competition to win fa cup final tickets such that may text fa to to receive entry questionstd text ratetcs apply overs'

In [103]:
#applying to all rows
def convert_slangs(row, slang_map=None):
    """Expand slang words in ``row['message']`` to their full form.

    The message is split on whitespace, each word that is a key of the slang
    mapping is replaced by its expansion, and the words are re-joined with
    single spaces (so runs of whitespace collapse to one space).

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'message'`` (e.g. a DataFrame row) whose
        value is the message string.
    slang_map : dict, optional
        Slang-to-expansion mapping. Defaults to the notebook-level
        ``slang_list`` so ``DataFrame.apply(convert_slangs, axis=1)`` keeps
        working unchanged.

    Returns
    -------
    str
        The message with slang words expanded.
    """
    if slang_map is None:
        slang_map = slang_list
    # dict.get(word, word) does one lookup and falls back to the word itself.
    return ' '.join(slang_map.get(word, word) for word in row['message'].split())

# Run convert_slangs on every row (axis=1) and overwrite the message column with the result.
mails_dataset['message'] = mails_dataset.apply(convert_slangs, axis=1)
mails_dataset.head()

Unnamed: 0,docID,label,message,tokens
0,1,1,free entry in a weekly competition to win fa c...,"[free, entry, weekly, competition, win, fa, cu..."
1,2,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, darling, week, word, back, id, ..."
2,3,1,winner as a valued network customer you have b...,"[winner, valued, network, customer, selected, ..."
3,4,1,had your mobile months or more you are entitle...,"[mobile, month, entitled, update, latest, colo..."
4,5,1,six chances to win cash from to pounds text ca...,"[six, chance, win, cash, pound, text, cash, se..."


## Tokenise every message

In [104]:
# Tokenise the first message as a dry run.
sample_mail = mails_dataset.iloc[0]
message = sample_mail['message']
tokens = word_tokenize(message)
# Keep only tokens made up entirely of alphabetic characters.
token_words = [w for w in tokens if w.isalpha()]
token_words

['free',
 'entry',
 'in',
 'a',
 'weekly',
 'competition',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tickets',
 'such',
 'that',
 'may',
 'text',
 'fa',
 'to',
 'to',
 'receive',
 'entry',
 'questionstd',
 'text',
 'ratetcs',
 'apply',
 'overs']

In [105]:
#applying to all rows
def identify_tokens(row):
    """Tokenise ``row['message']`` and keep only the alphabetic tokens.

    The message is split with NLTK's ``word_tokenize``; any token for which
    ``str.isalpha()`` is false is discarded. Returns the surviving tokens as
    a list, in their original order.
    """
    alphabetic_tokens = []
    for token in word_tokenize(row['message']):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

# Tokenise every row and store the token lists in a new 'tokens' column.
mails_dataset['tokens'] = mails_dataset.apply(identify_tokens, axis=1)
mails_dataset.head()

Unnamed: 0,docID,label,message,tokens
0,1,1,free entry in a weekly competition to win fa c...,"[free, entry, in, a, weekly, competition, to, ..."
1,2,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, there, darling, its, been, week..."
2,3,1,winner as a valued network customer you have b...,"[winner, as, a, valued, network, customer, you..."
3,4,1,had your mobile months or more you are entitle...,"[had, your, mobile, months, or, more, you, are..."
4,5,1,six chances to win cash from to pounds text ca...,"[six, chances, to, win, cash, from, to, pounds..."


## Stemming

In [106]:
# Stemming is disabled: the code below sits inside a string literal, so it is never
# executed (the cell merely echoes the string). Lemmatization is used instead.
'''
stemming = PorterStemmer()
sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['tokens']
stemmed_list = [stemming.stem(word) for word in tokens]
stemmed_list
'''

"\nstemming = PorterStemmer()\nsample_mail = mails_dataset.iloc[0]\ntokens = sample_mail['tokens']\nstemmed_list = [stemming.stem(word) for word in tokens]\nstemmed_list\n"

## Lemmatization

In [107]:
# One shared lemmatizer instance, reused by lemmatize_tokens further down.
lemmatizer = WordNetLemmatizer() 

# Dry run on the tokens of the first message.
sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['tokens']
lemmatize_list = [lemmatizer.lemmatize(word) for word in tokens]
lemmatize_list

['free',
 'entry',
 'in',
 'a',
 'weekly',
 'competition',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'ticket',
 'such',
 'that',
 'may',
 'text',
 'fa',
 'to',
 'to',
 'receive',
 'entry',
 'questionstd',
 'text',
 'ratetcs',
 'apply',
 'over']

In [108]:
def lemmatize_tokens(row):
    """Return the lemma of every token in ``row['tokens']``, order preserved.

    Each token is passed to the notebook-level ``lemmatizer`` with its
    default arguments; the result is a new list of the same length.
    """
    return list(map(lemmatizer.lemmatize, row['tokens']))

# Lemmatize the tokens of every row, overwriting the 'tokens' column.
mails_dataset['tokens'] = mails_dataset.apply(lemmatize_tokens, axis=1)
mails_dataset.head()

Unnamed: 0,docID,label,message,tokens
0,1,1,free entry in a weekly competition to win fa c...,"[free, entry, in, a, weekly, competition, to, ..."
1,2,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, there, darling, it, been, week,..."
2,3,1,winner as a valued network customer you have b...,"[winner, a, a, valued, network, customer, you,..."
3,4,1,had your mobile months or more you are entitle...,"[had, your, mobile, month, or, more, you, are,..."
4,5,1,six chances to win cash from to pounds text ca...,"[six, chance, to, win, cash, from, to, pound, ..."


## Remove stop words

In [109]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Dry run: drop the stop words from the first message's tokens.
sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['tokens']
filtered_sentence = list(filter(lambda token: token not in stop_words, tokens))
filtered_sentence

['free',
 'entry',
 'weekly',
 'competition',
 'win',
 'fa',
 'cup',
 'final',
 'ticket',
 'may',
 'text',
 'fa',
 'receive',
 'entry',
 'questionstd',
 'text',
 'ratetcs',
 'apply']

In [110]:
def remove_stopwords(row, stopword_set=None):
    """Return the tokens of ``row['tokens']`` that are not stop words.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'tokens'`` (e.g. a DataFrame row) whose value
        is an iterable of token strings.
    stopword_set : collection of str, optional
        Words to filter out. Defaults to the notebook-level ``stop_words`` so
        ``DataFrame.apply(remove_stopwords, axis=1)`` keeps working unchanged.

    Returns
    -------
    list
        The remaining tokens, in their original order (duplicates kept).

    Notes
    -----
    Matching is exact. Tokens already altered by an earlier step (for example
    the 'ha' tokens visible in the test-set preview) are not caught.
    """
    if stopword_set is None:
        stopword_set = stop_words
    return [w for w in row['tokens'] if w not in stopword_set]

# Strip stop words from the tokens of every row, overwriting the 'tokens' column.
mails_dataset['tokens'] = mails_dataset.apply(remove_stopwords , axis=1)
mails_dataset.head()

Unnamed: 0,docID,label,message,tokens
0,1,1,free entry in a weekly competition to win fa c...,"[free, entry, weekly, competition, win, fa, cu..."
1,2,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, darling, week, word, back, id, ..."
2,3,1,winner as a valued network customer you have b...,"[winner, valued, network, customer, selected, ..."
3,4,1,had your mobile months or more you are entitle...,"[mobile, month, entitled, update, latest, colo..."
4,5,1,six chances to win cash from to pounds text ca...,"[six, chance, win, cash, pound, text, cash, se..."


#### >>>>Now our data is clean and ready for training

## Training and Testing Sets

We will randomly split our dataset in an 80–20 ratio: 80% of the data will be used as the training set and the remaining 20% as the test set.

In [117]:
# Target: the numeric label column. Features: every other column.
y = mails_dataset['label']
X = mails_dataset.drop(columns='label')

# A fixed random_state makes the shuffle reproducible, so every run gives the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [118]:
# Row counts of both splits; train_size is reused later when computing the priors.
test_size, train_size = X_test.shape[0], X_train.shape[0]
print("Number of instance in :\n Training set = ", train_size, "\n Test set = ", test_size)

Number of instance in :
 Training set =  160 
 Test set =  40


In [119]:
# Preview the training features; the index still carries the original (shuffled) row numbers.
X_train.head()

Unnamed: 0,docID,message,tokens
134,135,yeah you should i think you can use your gt at...,"[yeah, think, use, gt, atm, register, sure, an..."
66,67,upgrdcentre orange customer you may now claim ...,"[upgrdcentre, orange, customer, may, claim, fr..."
26,27,customer service annoncement you have a new ye...,"[customer, service, annoncement, new, year, de..."
113,114,hi its kate how is your evening i hope i can s...,"[hi, kate, evening, hope, see, tomorrow, bit, ..."
168,169,iûm parked next to a mini when are you coming ...,"[iûm, parked, next, mini, coming, today, think]"


In [153]:
# y_train is a pandas Series named 'label'; turn it into a one-column DataFrame
# so it can be concatenated with the X_train columns.
labels_train = y_train.to_frame(name='label')
labels_train.head()

Unnamed: 0,label
134,0
66,1
26,1
113,0
168,0


In [154]:
# We have X_train, y_train, X_test, y_test.
# From these we assemble two non-overlapping datasets:
#     1. training set
#     2. testing set

# Training set: place features and labels side by side (rows are aligned on the
# shared index), then swap the shuffled index for a fresh 0..n-1 one.
train_with_labels = pd.concat([X_train, labels_train], axis=1)
train_set = train_with_labels.reset_index(drop=True)
train_set.head()

Unnamed: 0,docID,message,tokens,label
0,135,yeah you should i think you can use your gt at...,"[yeah, think, use, gt, atm, register, sure, an...",0
1,67,upgrdcentre orange customer you may now claim ...,"[upgrdcentre, orange, customer, may, claim, fr...",1
2,27,customer service annoncement you have a new ye...,"[customer, service, annoncement, new, year, de...",1
3,114,hi its kate how is your evening i hope i can s...,"[hi, kate, evening, hope, see, tomorrow, bit, ...",0
4,169,iûm parked next to a mini when are you coming ...,"[iûm, parked, next, mini, coming, today, think]",0


In [155]:
# The training set is ready; build the test set the same way.

# One-column DataFrame of the test labels (y_test is a Series named 'label').
labels_test = y_test.to_frame(name='label')

# Features and labels side by side, then a fresh 0..n-1 index.
test_with_labels = pd.concat([X_test, labels_test], axis=1)
test_set = test_with_labels.reset_index(drop=True)
test_set.head()

Unnamed: 0,docID,message,tokens,label
0,19,you are a winner you have been specially selec...,"[winner, specially, selected, receive, å, holi...",1
1,171,anyway i am going shopping on my own now cos m...,"[anyway, going, shopping, co, si, done, yet, d...",0
2,108,hi finally i completed the course,"[hi, finally, completed, course]",0
3,99,hi i am sue i am years old and work as a lapda...,"[hi, sue, year, old, work, lapdancer, love, se...",1
4,178,night has ended for another day morning has co...,"[night, ha, ended, another, day, morning, ha, ...",0


#### >>>>Our training and test sets are ready, let's create our model

# Multinomial Naive Bayes Classifier

### Prior Probability of each class

The prior probability of class c is computed as:

    P(c) = (examples with label c) / (total examples)
    

In [175]:
# Number of distinct class labels present in the training set.
total_label = train_set['label'].nunique()
total_label

2

In [179]:
# Rows of the training set whose class label is 1 (spam).
y1 = train_set[train_set.label == 1]

# Number of training instances with class label 1.
len(y1)

82

In [180]:
# Now calculate the prior probability of each label/class:
#     P(c) = (training examples with label c) / (total training examples)

prior = {}          # maps class label -> prior probability

# Iterate over the label values actually present instead of range(total_label), which
# is only correct while the classes happen to be encoded as exactly 0..k-1.
# .tolist() yields plain Python scalars, so the dict keys display as 0 and 1.
for ci in sorted(train_set['label'].unique().tolist()):
    count_label = len(train_set[train_set.label == ci])
    # Divide by the size of train_set itself rather than a variable from another cell.
    prior[ci] = count_label / len(train_set)

prior

{0: 0.4875, 1: 0.5125}

### Conditional Probability of each term

The conditional probability of term t given class c is computed as:

    P(t | c) = (occurrences of t in examples with label c) / (total term occurrences in examples with label c)
  