In [112]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
#from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
%matplotlib inline  

In [113]:
mails_dataset = pd.read_csv('Dataset/spam.csv', encoding = 'latin-1')
mails_dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [114]:
#drop undesirable columns
mails_dataset.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1, inplace = True)
mails_dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [115]:
#rename the columns
mails_dataset.rename(columns = {'v1': 'labels', 'v2': 'message'}, inplace = True)
mails_dataset.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [116]:
mails_dataset['labels'].value_counts()  #count number of each label

ham     4825
spam     747
Name: labels, dtype: int64

In [117]:
total_mails = mails_dataset.shape[0]
total_mails

5572

In [118]:
#assign number 1 to spam and 0 to ham
mails_dataset['label'] = mails_dataset['labels'].map({'ham': 0, 'spam': 1})
mails_dataset.head()

Unnamed: 0,labels,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [119]:
mails_dataset.drop(['labels'], axis = 1, inplace = True)
mails_dataset.head()

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [120]:
#split our data into training and testing data

train_index, test_index = list(), list()
for i in range(total_mails):
    if np.random.uniform(0, 1) < 0.75:
        train_index += [i]
    else:
        test_index += [i]
train_data = mails_dataset.loc[train_index]
test_data = mails_dataset.loc[test_index]

In [121]:
train_data.reset_index(inplace = True)
train_data.drop(['index'], axis = 1, inplace = True)
train_data.head()

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,U dun say so early hor... U c already then say...,0
2,"Nah I don't think he goes to usf, he lives aro...",0
3,Even my brother is not like to speak with me. ...,0
4,As per your request 'Melle Melle (Oru Minnamin...,0


In [122]:
test_data.reset_index(inplace = True)
test_data.drop(['index'], axis = 1, inplace = True)
test_data.head()

Unnamed: 0,message,label
0,Ok lar... Joking wif u oni...,0
1,Free entry in 2 a wkly comp to win FA Cup fina...,1
2,FreeMsg Hey there darling it's been 3 week's n...,1
3,WINNER!! As a valued network customer you have...,1
4,"SIX chances to win CASH! From 100 to 20,000 po...",1


In [123]:
#check train data
train_data['label'].value_counts()

0    3596
1     570
Name: label, dtype: int64

In [124]:
#check test data
test_data['label'].value_counts()

0    1229
1     177
Name: label, dtype: int64

In [125]:
spam_words = ' '.join(list(train_data[train_data['label'] == 1]['message']))
#spam_words

In [126]:
ham_words = ' '.join(list(train_data[train_data['label'] == 0]['message']))
#ham_words

In [127]:
train_data.head()

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,U dun say so early hor... U c already then say...,0
2,"Nah I don't think he goes to usf, he lives aro...",0
3,Even my brother is not like to speak with me. ...,0
4,As per your request 'Melle Melle (Oru Minnamin...,0


In [128]:
train_data['message'] =  train_data['message'].str.lower()
train_data.head()

Unnamed: 0,message,label
0,"go until jurong point, crazy.. available only ...",0
1,u dun say so early hor... u c already then say...,0
2,"nah i don't think he goes to usf, he lives aro...",0
3,even my brother is not like to speak with me. ...,0
4,as per your request 'melle melle (oru minnamin...,0


In [129]:
stop_words = set(stopwords.words('english')) 
#stop_words

In [130]:
mails_dataset.get_dtype_counts()

object    1
int64     1
dtype: int64

In [100]:
dt = mails_dataset['message']
u = dt.str.contains('u')
u[:10]

0     True
1     True
2     True
3     True
4     True
5     True
6    False
7     True
8     True
9     True
Name: message, dtype: bool

In [111]:
#clean the data
mails_dataset['message'] = dt.str.replace('u', 'you')
mails_dataset['message'].head()

0    yoyoyoyoyoyou 
1    yoyoyoyoyoyou 
2    yoyoyoyoyoyou 
3    yoyoyoyoyoyou 
4    yoyoyoyoyoyou 
Name: message, dtype: object