# Notebook Imports

In [1]:
import pandas as pd

%matplotlib inline

### Constants

In [2]:
spam_file_paths = "spam.csv"

In [15]:
data = pd.read_csv(spam_file_paths)

### Data Cleaning

In [16]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [17]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [18]:
data['v2']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [19]:
type(data)

pandas.core.frame.DataFrame

In [20]:
data.drop('Unnamed: 2', inplace = True, axis = 1)
data.drop('Unnamed: 3', inplace = True, axis = 1)
data.drop('Unnamed: 4', inplace = True, axis = 1)

In [24]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Modifying the DataSet to modify all the words present to their original content

In [25]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [37]:
new_data = []

stop_words = stopwords.words('english')
for i in range(0, len(data)):
    newString = '';
    string = data['v2'][i]
    words = word_tokenize(string)
    for word in words:
        word = str(word.lower())
        if word not in stop_words and word.isalnum():
            word = ps.stem(word)
            newString = newString+word+' '
    new_data.append(newString)

In [38]:
new_data

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat ',
 'ok lar joke wif u oni ',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18 ',
 'u dun say earli hor u c alreadi say ',
 'nah think goe usf live around though ',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send rcv ',
 'even brother like speak treat like aid patent ',
 'per request mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun ',
 'winner valu network custom select receivea prize reward claim call 09061701461 claim code kl341 valid 12 hour ',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030 ',
 'gon na home soon want talk stuff anymor tonight k cri enough today ',
 'six chanc win cash 100 pound txt csh11 send 87575 cost 6day tsandc appli repli hl 4 info ',
 'urgent 1 week free membership prize jackpo

In [39]:
data['content'] = new_data

In [40]:
data

Unnamed: 0,v1,v2,content
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u pound prize 2 claim e...
5568,ham,Will Ì_ b going to esplanade fr home?,b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...


In [41]:
data.drop(['v2'], axis = 1, inplace=True)

In [42]:
data

Unnamed: 0,v1,content
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though
...,...,...
5567,spam,2nd time tri 2 contact u pound prize 2 claim e...
5568,ham,b go esplanad fr home
5569,ham,piti mood suggest
5570,ham,guy bitch act like interest buy someth els nex...


### Splitting The Training and Testing Data

In [44]:
type(data['v1'])

pandas.core.series.Series

In [45]:
import numpy as np
from sklearn.model_selection import train_test_split

In [46]:
X = pd.Series(data['content'])
X = X.to_numpy()

y = pd.Series(data['v1'])
y = y.to_numpy()

In [47]:
type(y)
print(y)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [48]:
def map_func(array, mapped_array):
    for val in array:
        mapped_array.append(1 if val == 'spam' else 0)

In [49]:
#mapping the spam to 1 and non spam as 0
mapped_array = []
map_func(y, mapped_array)
y = np.array(mapped_array)

In [50]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)

In [54]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
## done splitting the data now make the dictionary of values

### Making The Dictionary of Unique Words After applying NLP to each String


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [21]:
def unique_words_in_spam(string, vocab_words_spam):
    words = word_tokenize(string)
    for word in words:
        word = str(word.lower())
        if (word not in vocab_words_spam):
            vocab_words_spam.append(word)

In [22]:
vocab_words_spam = [] # dictionary of spam words present in the dataset

for i in range(0, len(X_train)):
    if y_train[i] == 1:
        unique_words_in_spam(str(X_train[i]), vocab_words_spam)

In [23]:
vocab_words_spam

['ve',
 '4',
 'costa',
 'del',
 'sol',
 'holiday',
 'await',
 'collect',
 'call',
 '09050090044',
 'toclaim',
 'sae',
 'tc',
 'pobox334',
 'stockport',
 'sk38xh',
 'max10min',
 'congrat',
 '2',
 'mobil',
 '3g',
 'videophon',
 'r',
 'call',
 '09063458130',
 'videochat',
 'wid',
 'mate',
 'play',
 'java',
 'game',
 'dload',
 'polyph',
 'music',
 'nolin',
 'rentl',
 'we',
 'tri',
 'contact',
 'repli',
 'offer',
 'video',
 'handset',
 '750',
 'anytim',
 'network',
 'min',
 'unlimit',
 'text',
 'camcord',
 'repli',
 'call',
 '08000930705',
 'contact',
 'date',
 'servic',
 'someon',
 'know',
 'to',
 'find',
 'call',
 'land',
 'line',
 '09050000878',
 'pobox45w2tg150p',
 'loan',
 'ani',
 'purpos',
 'homeown',
 'tenant',
 'welcom',
 'previous',
 'refus',
 'we',
 'still',
 'help',
 'call',
 'free',
 '0800',
 '1956669',
 'text',
 'back',
 'get',
 'garden',
 'readi',
 'summer',
 'free',
 'select',
 'summer',
 'bulb',
 'seed',
 'worth',
 'onli',
 'scotsman',
 'thi',
 'saturday',
 'to',
 'stop',
 '

In [24]:
vocab_word_spam = list(dict.fromkeys(vocab_words_spam))
vocab_word_spam

['ve',
 '4',
 'costa',
 'del',
 'sol',
 'holiday',
 'await',
 'collect',
 'call',
 '09050090044',
 'toclaim',
 'sae',
 'tc',
 'pobox334',
 'stockport',
 'sk38xh',
 'max10min',
 'congrat',
 '2',
 'mobil',
 '3g',
 'videophon',
 'r',
 '09063458130',
 'videochat',
 'wid',
 'mate',
 'play',
 'java',
 'game',
 'dload',
 'polyph',
 'music',
 'nolin',
 'rentl',
 'we',
 'tri',
 'contact',
 'repli',
 'offer',
 'video',
 'handset',
 '750',
 'anytim',
 'network',
 'min',
 'unlimit',
 'text',
 'camcord',
 '08000930705',
 'date',
 'servic',
 'someon',
 'know',
 'to',
 'find',
 'land',
 'line',
 '09050000878',
 'pobox45w2tg150p',
 'loan',
 'ani',
 'purpos',
 'homeown',
 'tenant',
 'welcom',
 'previous',
 'refus',
 'still',
 'help',
 'free',
 '0800',
 '1956669',
 'back',
 'get',
 'garden',
 'readi',
 'summer',
 'select',
 'bulb',
 'seed',
 'worth',
 'onli',
 'scotsman',
 'thi',
 'saturday',
 'stop',
 'go2',
 'hmv',
 'bonu',
 'special',
 '500',
 'pound',
 'genuin',
 'voucher',
 'answer',
 'easi',
 'que

In [28]:
# find the number of times a unique word occur in the spam emails
dict_spam = {}

for w in vocab_word_spam:
    emails_with_w = 0
    for sentence in X_train:
        for word in sentence:
            word = word.lower()
            word = ps.stem(word)
            if word.isalnum() and word == w:
                emails_with_w+=1
    
    dict_spam[w] = emails_with_w

In [29]:
dict_spam

{'ve': 0,
 '4': 825,
 'costa': 0,
 'del': 0,
 'sol': 0,
 'holiday': 0,
 'await': 0,
 'collect': 0,
 'call': 0,
 '09050090044': 0,
 'toclaim': 0,
 'sae': 0,
 'tc': 0,
 'pobox334': 0,
 'stockport': 0,
 'sk38xh': 0,
 'max10min': 0,
 'congrat': 0,
 '2': 1350,
 'mobil': 0,
 '3g': 0,
 'videophon': 0,
 'r': 14247,
 '09063458130': 0,
 'videochat': 0,
 'wid': 0,
 'mate': 0,
 'play': 0,
 'java': 0,
 'game': 0,
 'dload': 0,
 'polyph': 0,
 'music': 0,
 'nolin': 0,
 'rentl': 0,
 'we': 0,
 'tri': 0,
 'contact': 0,
 'repli': 0,
 'offer': 0,
 'video': 0,
 'handset': 0,
 '750': 0,
 'anytim': 0,
 'network': 0,
 'min': 0,
 'unlimit': 0,
 'text': 0,
 'camcord': 0,
 '08000930705': 0,
 'date': 0,
 'servic': 0,
 'someon': 0,
 'know': 0,
 'to': 0,
 'find': 0,
 'land': 0,
 'line': 0,
 '09050000878': 0,
 'pobox45w2tg150p': 0,
 'loan': 0,
 'ani': 0,
 'purpos': 0,
 'homeown': 0,
 'tenant': 0,
 'welcom': 0,
 'previous': 0,
 'refus': 0,
 'still': 0,
 'help': 0,
 'free': 0,
 '0800': 0,
 '1956669': 0,
 'back': 0,
 'g