## STEP 1: IMPORTING LIBRARIES

In [61]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
#from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
from sklearn.datasets import load_files 
#nltk.download('stopwords')  
import pickle  
from nltk.corpus import stopwords
%matplotlib inline  

## STEP 2: LOADING DATASET

Load the required dataset

In [62]:
mails_dataset = pd.read_csv('Dataset/spam.csv', encoding = 'latin-1')
mails_dataset.head()           #show first 5 rows

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## STEP 3: FEATURE SELECTION

Select the relevant features, important for mail classification. We can see that column Unamed are irrelevant for our classifier. Thus, we need to remove them. 

In [63]:
#drop undesirable columns
mails_dataset.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
mails_dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## STEP 4: DATA PREPROCESSING

We need to clean our data for further processing. Emails may contain a lot of undesirable characters like punctuation marks, stop words, digits, etc which may not be helpful in detecting the spam email

In [64]:
#Rename the columns, to make it easy to read and manipulate
mails_dataset.rename(columns = {'v1': 'label', 'v2': 'message'}, inplace = True)
mails_dataset.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
mails_dataset['label'].value_counts()  #count number of each label

ham     4825
spam     747
Name: label, dtype: int64

In [66]:
total_mails = mails_dataset.shape[0]            #total number on instances in our dataset
total_mails

5572

In [67]:
#convert the data into lower case
mails_dataset['message'] =  mails_dataset['message'].str.lower()
mails_dataset.head()

Unnamed: 0,label,message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


## STEP 5: CONVERTING TEXT TO NUMBERS

Assign number 1 to spam and 0 to ham

In [51]:
mails_dataset['label'] = mails_dataset['label'].map({'ham': 0, 'spam': 1})
mails_dataset.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [52]:
#split our data into training and testing data

train_index, test_index = list(), list()
for i in range(total_mails):
    if np.random.uniform(0, 1) < 0.75:
        train_index += [i]
    else:
        test_index += [i]
train_data = mails_dataset.loc[train_index]
test_data = mails_dataset.loc[test_index]

In [53]:
train_data.reset_index(inplace = True)
train_data.drop(['index'], axis = 1, inplace = True)
train_data.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,1,free entry in 2 a wkly comp to win fa cup fina...
2,0,u dun say so early hor... u c already then say...
3,0,"nah i don't think he goes to usf, he lives aro..."
4,1,freemsg hey there darling it's been 3 week's n...


In [54]:
test_data.reset_index(inplace = True)
test_data.drop(['index'], axis = 1, inplace = True)
test_data.head()

Unnamed: 0,label,message
0,0,ok lar... joking wif u oni...
1,0,as per your request 'melle melle (oru minnamin...
2,1,had your mobile 11 months or more? u r entitle...
3,0,i've been searching for the right words to tha...
4,1,"xxxmobilemovieclub: to use your credit, click ..."


In [55]:
#check train data
train_data['label'].value_counts()

0    3629
1     570
Name: label, dtype: int64

In [56]:
#check test data
test_data['label'].value_counts()

0    1196
1     177
Name: label, dtype: int64

In [57]:
spam_words = ' '.join(list(train_data[train_data['label'] == 1]['message']))
#spam_words

In [58]:
ham_words = ' '.join(list(train_data[train_data['label'] == 0]['message']))
#ham_words

In [59]:
train_data.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,1,free entry in 2 a wkly comp to win fa cup fina...
2,0,u dun say so early hor... u c already then say...
3,0,"nah i don't think he goes to usf, he lives aro..."
4,1,freemsg hey there darling it's been 3 week's n...


In [60]:
train_data['message'] =  train_data['message'].str.lower()
train_data.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,1,free entry in 2 a wkly comp to win fa cup fina...
2,0,u dun say so early hor... u c already then say...
3,0,"nah i don't think he goes to usf, he lives aro..."
4,1,freemsg hey there darling it's been 3 week's n...


In [129]:
stop_words = set(stopwords.words('english')) 
#stop_words

In [130]:
mails_dataset.get_dtype_counts()

object    1
int64     1
dtype: int64

In [100]:
dt = mails_dataset['message']
u = dt.str.contains('u')
u[:10]

0     True
1     True
2     True
3     True
4     True
5     True
6    False
7     True
8     True
9     True
Name: message, dtype: bool

In [111]:
#clean the data
mails_dataset['message'] = dt.str.replace('u', 'you')
mails_dataset['message'].head()

0    yoyoyoyoyoyou 
1    yoyoyoyoyoyou 
2    yoyoyoyoyoyou 
3    yoyoyoyoyoyou 
4    yoyoyoyoyoyou 
Name: message, dtype: object