In [233]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer  
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split  

In [234]:
# Load the sample CSV; the file is decoded as latin-1.
mails_dataset = pd.read_csv(
    'Dataset/trial_spam.csv',
    encoding='latin-1',
)
mails_dataset.head()  # preview the first 5 rows

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,
1,spam,FreeMsg Hey there darling it's been 3 week's n...,,
2,spam,WINNER!! As a valued network customer you have...,,
3,spam,Had your mobile 11 months or more? U R entitle...,,
4,spam,"SIX chances to win CASH! From 100 to 20,000 po...",,


In [235]:
# Discard the 'Unnamed: 2' and 'Unnamed: 3' columns, modifying the frame in place.
mails_dataset.drop(
    columns=['Unnamed: 2', 'Unnamed: 3'],
    inplace=True,
)
mails_dataset.head()

Unnamed: 0,v1,v2
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,spam,FreeMsg Hey there darling it's been 3 week's n...
2,spam,WINNER!! As a valued network customer you have...
3,spam,Had your mobile 11 months or more? U R entitle...
4,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [236]:
# Give the generic v1/v2 columns descriptive names (in place) so later
# cells can refer to 'label' and 'message'.
mails_dataset.rename(
    columns={
        'v1': 'label',
        'v2': 'message',
    },
    inplace=True,
)
mails_dataset.head()

Unnamed: 0,label,message
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,spam,FreeMsg Hey there darling it's been 3 week's n...
2,spam,WINNER!! As a valued network customer you have...
3,spam,Had your mobile 11 months or more? U R entitle...
4,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [237]:
# Tally how many messages carry each label.
mails_dataset.loc[:, 'label'].value_counts()

spam    100
ham     100
Name: label, dtype: int64

In [238]:
# Number of rows (one per message) in the dataset.
total_mails = len(mails_dataset)
total_mails

200

In [239]:
# Encode the text labels numerically: ham -> 0, spam -> 1.
# Values missing from the mapping become NaN, so running this cell a second
# time on already-encoded labels would turn the whole column into NaN.
mails_dataset['label'] = mails_dataset['label'].map(
    {
        'ham': 0,
        'spam': 1,
    }
)
mails_dataset.head()

Unnamed: 0,label,message
0,1,Free entry in 2 a wkly comp to win FA Cup fina...
1,1,FreeMsg Hey there darling it's been 3 week's n...
2,1,WINNER!! As a valued network customer you have...
3,1,Had your mobile 11 months or more? U R entitle...
4,1,"SIX chances to win CASH! From 100 to 20,000 po..."


In [240]:
# Normalise case: convert every message to lowercase.
mails_dataset['message'] = (
    mails_dataset['message']
    .str.lower()
)
mails_dataset.head()

Unnamed: 0,label,message
0,1,free entry in 2 a wkly comp to win fa cup fina...
1,1,freemsg hey there darling it's been 3 week's n...
2,1,winner!! as a valued network customer you have...
3,1,had your mobile 11 months or more? u r entitle...
4,1,"six chances to win cash! from 100 to 20,000 po..."


In [241]:
# Remove every run of digits from the messages.
# - regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
#   the pattern as a literal string by default, which would search for the text
#   "\d+" and leave all digits in place.
# - The raw string keeps the backslash intact without relying on Python's
#   deprecated handling of the invalid '\d' escape sequence.
mails_dataset['message'] = mails_dataset['message'].str.replace(r'\d+', '', regex=True)
mails_dataset.head()

Unnamed: 0,label,message
0,1,free entry in a wkly comp to win fa cup final...
1,1,freemsg hey there darling it's been week's no...
2,1,winner!! as a valued network customer you have...
3,1,had your mobile months or more? u r entitled ...
4,1,"six chances to win cash! from to , pounds txt..."


### Remove all punctuation
- `^`  : inside `[...]`, negates the set — match any character NOT listed.
- `\w` : word characters (letters, digits and underscore).
- `\s` : whitespace characters.

Replace every character that is neither a word character nor whitespace with an empty string.

In [242]:
# Delete every character that is neither a word character nor whitespace.
# - regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
#   the pattern as a literal string by default, so the character class would
#   never match and punctuation would be kept.
# - The raw string avoids the invalid '\w' / '\s' escape sequences of a plain
#   string literal.
mails_dataset['message'] = mails_dataset['message'].str.replace(r'[^\w\s]', '', regex=True)
mails_dataset.head()

Unnamed: 0,label,message
0,1,free entry in a wkly comp to win fa cup final...
1,1,freemsg hey there darling its been weeks now ...
2,1,winner as a valued network customer you have b...
3,1,had your mobile months or more u r entitled t...
4,1,six chances to win cash from to pounds txt c...


In [243]:
# Look at the cleaned text of the first message.
sample_mail = mails_dataset.iloc[0]
sample_mail.loc['message']

'free entry in  a wkly comp to win fa cup final tkts st may  text fa to  to receive entry questionstd txt ratetcs apply overs'

In [281]:
# Lookup table: slang / abbreviated word -> expanded replacement text.
# NOTE(review): digits are stripped from the messages in an earlier cell, so
# the 'f9' key presumably can no longer match any word — TODO confirm.
slang_list = {
    'u': 'you',
    'r': 'are',
    'd': 'the',
    'urs': 'yours',
    'wkly': 'weekly',
    'st': 'such that',
    'txt': 'text',
    'comp': 'competition',
    'prctc': 'practice',
    'dffrnc': 'difference',
    'y': 'why',
    'f9': 'fine',
    'tkts': 'tickets',
    'csh': 'cash',
    'phn': 'phone',
}


In [282]:
# Try the slang expansion on the first message before applying it to all rows.
sample_mail = mails_dataset.iloc[0]
message = sample_mail['message']

# Each whitespace-separated word is swapped for its slang_list entry when one
# exists; other words pass through unchanged.
new_message = ' '.join(slang_list.get(word, word) for word in message.split())
new_message

'free entry in a weekly competition to win fa cup final tickets such that may text fa to to receive entry questionstd text ratetcs apply overs'

In [283]:
# Row-wise helper so the slang expansion can be applied to the whole dataset.
def convert_slangs(row):
    """Return the row's message with known slang words expanded.

    Parameters
    ----------
    row : mapping-like
        Must provide a 'message' entry holding a string.

    Returns
    -------
    str
        The message rebuilt with single spaces between words, where every
        whitespace-separated word that is a key of ``slang_list`` has been
        replaced by its mapped term.
    """
    expanded_words = []
    for word in row['message'].split():
        # Fall back to the word itself when it is not a known slang term.
        expanded_words.append(slang_list.get(word, word))
    return ' '.join(expanded_words)

# Run the slang expansion over every row and store the result back in 'message'.
mails_dataset['message'] = mails_dataset.apply(convert_slangs, axis='columns')
mails_dataset.head()

Unnamed: 0,label,message,tokens
0,1,free entry in a weekly competition to win fa c...,"[free, entry, weekly, competition, win, fa, cu..."
1,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, darling, week, word, back, id, ..."
2,1,winner as a valued network customer you have b...,"[winner, valued, network, customer, selected, ..."
3,1,had your mobile months or more you are entitle...,"[mobile, month, entitled, update, latest, colo..."
4,1,six chances to win cash from to pounds text ca...,"[six, chance, win, cash, pound, text, csh, sen..."


## Tokenise every message

In [284]:
# Tokenise the first message and keep only the purely alphabetic tokens.
sample_mail = mails_dataset.iloc[0]
message = sample_mail['message']
tokens = word_tokenize(message)
token_words = list(filter(str.isalpha, tokens))
token_words

['free',
 'entry',
 'in',
 'a',
 'weekly',
 'competition',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tickets',
 'such',
 'that',
 'may',
 'text',
 'fa',
 'to',
 'to',
 'receive',
 'entry',
 'questionstd',
 'text',
 'ratetcs',
 'apply',
 'overs']

In [285]:
# Row-wise helper so tokenisation can be applied to the whole dataset.
def identify_tokens(row):
    """Tokenise the row's message and keep only alphabetic tokens.

    Parameters
    ----------
    row : mapping-like
        Must provide a 'message' entry holding a string.

    Returns
    -------
    list
        Tokens produced by ``word_tokenize`` for which ``isalpha()`` is true,
        in their original order.
    """
    alphabetic_tokens = []
    for token in word_tokenize(row['message']):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

# Tokenise every message; the resulting lists go into a 'tokens' column.
mails_dataset['tokens'] = mails_dataset.apply(identify_tokens, axis='columns')
mails_dataset.head()

Unnamed: 0,label,message,tokens
0,1,free entry in a weekly competition to win fa c...,"[free, entry, in, a, weekly, competition, to, ..."
1,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, there, darling, its, been, week..."
2,1,winner as a valued network customer you have b...,"[winner, as, a, valued, network, customer, you..."
3,1,had your mobile months or more you are entitle...,"[had, your, mobile, months, or, more, you, are..."
4,1,six chances to win cash from to pounds text ca...,"[six, chances, to, win, cash, from, to, pounds..."


## Stemming

In [286]:
# Stemming experiment kept for reference only: the code below sits inside a
# triple-quoted string literal, so none of it is executed — the cell merely
# echoes the string as its output.
'''
stemming = PorterStemmer()
sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['tokens']
stemmed_list = [stemming.stem(word) for word in tokens]
stemmed_list
'''

"\nstemming = PorterStemmer()\nsample_mail = mails_dataset.iloc[0]\ntokens = sample_mail['tokens']\nstemmed_list = [stemming.stem(word) for word in tokens]\nstemmed_list\n"

## Lemmatization

In [287]:
# Shared lemmatizer instance, also used by the row-wise helper further down.
lemmatizer = WordNetLemmatizer()

# Lemmatize the tokens of the first message as a quick check.
sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['tokens']
lemmatize_list = list(map(lemmatizer.lemmatize, tokens))
lemmatize_list

['free',
 'entry',
 'in',
 'a',
 'weekly',
 'competition',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'ticket',
 'such',
 'that',
 'may',
 'text',
 'fa',
 'to',
 'to',
 'receive',
 'entry',
 'questionstd',
 'text',
 'ratetcs',
 'apply',
 'over']

In [288]:
def lemmatize_tokens(row):
    """Lemmatize every token of a row.

    Parameters
    ----------
    row : mapping-like
        Must provide a 'tokens' entry holding an iterable of words.

    Returns
    -------
    list
        The result of ``lemmatizer.lemmatize`` for each token, in order.
    """
    lemmas = []
    for token in row['tokens']:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Replace each row's token list with its lemmatized version.
mails_dataset['tokens'] = mails_dataset.apply(lemmatize_tokens, axis='columns')
mails_dataset.head()

Unnamed: 0,label,message,tokens
0,1,free entry in a weekly competition to win fa c...,"[free, entry, in, a, weekly, competition, to, ..."
1,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, there, darling, it, been, week,..."
2,1,winner as a valued network customer you have b...,"[winner, a, a, valued, network, customer, you,..."
3,1,had your mobile months or more you are entitle...,"[had, your, mobile, month, or, more, you, are,..."
4,1,six chances to win cash from to pounds text ca...,"[six, chance, to, win, cash, from, to, pound, ..."


## Remove stop words

In [291]:
# English stop words held in a set for fast membership tests; also used by
# the row-wise helper further down.
stop_words = set(stopwords.words('english'))

# Filter the first message's tokens as a quick check.
sample_mail = mails_dataset.iloc[0]
tokens = sample_mail['tokens']
filtered_sentence = [token for token in tokens if token not in stop_words]
filtered_sentence

['free',
 'entry',
 'weekly',
 'competition',
 'win',
 'fa',
 'cup',
 'final',
 'ticket',
 'may',
 'text',
 'fa',
 'receive',
 'entry',
 'questionstd',
 'text',
 'ratetcs',
 'apply']

In [292]:
def remove_stopwords(row):
    """Drop stop words from a row's token list.

    Parameters
    ----------
    row : mapping-like
        Must provide a 'tokens' entry holding an iterable of words.

    Returns
    -------
    list
        The tokens that are not members of ``stop_words``, in their
        original order (duplicates are kept).
    """
    kept_tokens = []
    for token in row['tokens']:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Strip stop words from every row's token list.
mails_dataset['tokens'] = mails_dataset.apply(remove_stopwords, axis='columns')
mails_dataset.head()

Unnamed: 0,label,message,tokens
0,1,free entry in a weekly competition to win fa c...,"[free, entry, weekly, competition, win, fa, cu..."
1,1,freemsg hey there darling its been weeks now a...,"[freemsg, hey, darling, week, word, back, id, ..."
2,1,winner as a valued network customer you have b...,"[winner, valued, network, customer, selected, ..."
3,1,had your mobile months or more you are entitle...,"[mobile, month, entitled, update, latest, colo..."
4,1,six chances to win cash from to pounds text ca...,"[six, chance, win, cash, pound, text, cash, se..."


## Training and Testing Sets

In [312]:
# Features are the token lists; targets are the numeric labels.
X = mails_dataset['tokens']
y = mails_dataset['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [313]:
# Report how many rows ended up in each partition.
train_size = len(X_train)
test_size = len(X_test)
print(
    "Number of instance in :\n Training set = ",
    train_size,
    "\n Test set = ",
    test_size,
)

Number of instance in :
 Training set =  160 
 Test set =  40
