# Notebook Imports

In [3]:
import pandas as pd

%matplotlib inline

### Constants

In [4]:
spam_file_paths = "spam.csv"

In [5]:
data = pd.read_csv(spam_file_paths)

### Data Cleaning

In [6]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [7]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
data['v2']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [9]:
type(data)

pandas.core.frame.DataFrame

In [10]:
data.drop('Unnamed: 2', inplace = True, axis = 1)
data.drop('Unnamed: 3', inplace = True, axis = 1)
data.drop('Unnamed: 4', inplace = True, axis = 1)

In [11]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Modifying the DataSet to modify all the words present to their original content

In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [14]:
new_data = []

stop_words = stopwords.words('english')
for i in range(0, len(data)):
    newString = '';
    string = data['v2'][i]
    words = word_tokenize(string)
    for word in words:
        word = str(word.lower())
        if word not in stop_words and word.isalnum():
            word = ps.stem(word)
            newString = newString+word+' '
    new_data.append(newString)

In [15]:
new_data

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat ',
 'ok lar joke wif u oni ',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18 ',
 'u dun say earli hor u c alreadi say ',
 'nah think goe usf live around though ',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send rcv ',
 'even brother like speak treat like aid patent ',
 'per request mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun ',
 'winner valu network custom select receivea prize reward claim call 09061701461 claim code kl341 valid 12 hour ',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030 ',
 'gon na home soon want talk stuff anymor tonight k cri enough today ',
 'six chanc win cash 100 pound txt csh11 send 87575 cost 6day tsandc appli repli hl 4 info ',
 'urgent 1 week free membership prize jackpo

In [16]:
data['content'] = new_data

In [17]:
data

Unnamed: 0,v1,v2,content
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u pound prize 2 claim e...
5568,ham,Will Ì_ b going to esplanade fr home?,b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...


In [18]:
data.drop(['v2'], axis = 1, inplace=True)

In [19]:
data

Unnamed: 0,v1,content
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though
...,...,...
5567,spam,2nd time tri 2 contact u pound prize 2 claim e...
5568,ham,b go esplanad fr home
5569,ham,piti mood suggest
5570,ham,guy bitch act like interest buy someth els nex...


### Splitting The Training and Testing Data

In [20]:
type(data['v1'])

pandas.core.series.Series

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split

In [22]:
X = pd.Series(data['content'])
X = X.to_numpy()

y = pd.Series(data['v1'])
y = y.to_numpy()

In [23]:
type(y)
print(y)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [24]:
type(X)
print(X)

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat '
 'ok lar joke wif u oni '
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18 '
 ... 'piti mood suggest '
 'guy bitch act like interest buy someth els next week gave us free '
 'rofl true name ']


In [25]:
def map_func(array, mapped_array):
    for val in array:
        mapped_array.append(1 if val == 'spam' else 0)

In [26]:
#mapping the spam to 1 and non spam as 0
mapped_array = []
map_func(y, mapped_array)
y = np.array(mapped_array)

In [27]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)

In [29]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
## done splitting the data now make the dictionary of values

In [31]:
count_arr = np.bincount(y_train)

In [34]:
count_arr[1]

600

In [None]:
# Calculating the probability of spam

In [36]:
prob_spam = (count_arr[0]/len(y_train))
prob_ham = (count_arr[1]/len(y_train))

In [37]:
prob_spam

0.8653803006506618

In [38]:
prob_ham

0.13461969934933812

In [40]:
probability = {}
probability['spam'] = prob_spam
probability['ham'] = prob_ham

In [41]:
probability

{'spam': 0.8653803006506618, 'ham': 0.13461969934933812}

In [70]:
len(X_train)

4457

In [71]:
len(X_test)

1115

### Making The Dictionary of Unique Words After applying NLP to each String


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

### Making the Frequency Table of all the words present in the spam mails

In [53]:
def unique_words_in_spam(string, vocab_words_spam):
    words = word_tokenize(string)
    for word in words:
        word = str(word.lower())
        vocab_words_spam.append(word)

In [54]:
vocab_words_spam = [] # dictionary of spam words present in the dataset

for i in range(0, len(X_train)):
    if y_train[i] == 1:
        unique_words_in_spam(str(X_train[i]), vocab_words_spam)

In [55]:
vocab_words_spam

['email',
 'alertfrom',
 'jeri',
 'stewarts',
 '2kbsubject',
 'prescripiton',
 'drvgsto',
 'listen',
 'email',
 'call',
 '123',
 'u',
 'r',
 'winner',
 'u',
 'ave',
 'special',
 'select',
 '2',
 'receiv',
 'cash',
 '4',
 'holiday',
 'flight',
 'inc',
 'speak',
 'live',
 'oper',
 '2',
 'claim',
 '18',
 'want',
 'funk',
 'ur',
 'fone',
 'weekli',
 'new',
 'tone',
 'repli',
 'tones2u',
 '2',
 'text',
 'origin',
 'n',
 'best',
 'tone',
 '3gbp',
 'network',
 'oper',
 'rate',
 'appli',
 'free',
 '1st',
 'week',
 'entri',
 '2',
 'textpod',
 '4',
 'chanc',
 '2',
 'win',
 '40gb',
 'ipod',
 'cash',
 'everi',
 'wk',
 'txt',
 'vpod',
 '81303',
 'ts',
 'cs',
 'custcar',
 '08712405020',
 'eastend',
 'tv',
 'quiz',
 'flower',
 'dot',
 'compar',
 'violet',
 'tulip',
 'lili',
 'txt',
 'e',
 'f',
 '84025',
 '4',
 'chanc',
 '2',
 'win',
 'cash',
 'xma',
 'prize',
 'draw',
 'tri',
 'contact',
 'today',
 'draw',
 'show',
 'prize',
 'guarante',
 'call',
 '09058094565',
 'land',
 'line',
 'valid',
 '12hr',
 

In [56]:
vocab_words_spam_set = list(dict.fromkeys(vocab_words_spam))
vocab_words_spam_set

['email',
 'alertfrom',
 'jeri',
 'stewarts',
 '2kbsubject',
 'prescripiton',
 'drvgsto',
 'listen',
 'call',
 '123',
 'u',
 'r',
 'winner',
 'ave',
 'special',
 'select',
 '2',
 'receiv',
 'cash',
 '4',
 'holiday',
 'flight',
 'inc',
 'speak',
 'live',
 'oper',
 'claim',
 '18',
 'want',
 'funk',
 'ur',
 'fone',
 'weekli',
 'new',
 'tone',
 'repli',
 'tones2u',
 'text',
 'origin',
 'n',
 'best',
 '3gbp',
 'network',
 'rate',
 'appli',
 'free',
 '1st',
 'week',
 'entri',
 'textpod',
 'chanc',
 'win',
 '40gb',
 'ipod',
 'everi',
 'wk',
 'txt',
 'vpod',
 '81303',
 'ts',
 'cs',
 'custcar',
 '08712405020',
 'eastend',
 'tv',
 'quiz',
 'flower',
 'dot',
 'compar',
 'violet',
 'tulip',
 'lili',
 'e',
 'f',
 '84025',
 'xma',
 'prize',
 'draw',
 'tri',
 'contact',
 'today',
 'show',
 'guarante',
 '09058094565',
 'land',
 'line',
 'valid',
 '12hr',
 'congratul',
 'thank',
 'good',
 'friend',
 'easi',
 '08718726971',
 '10p',
 'per',
 'minut',
 'freemsg',
 '86888',
 'reward',
 '3',
 'hour',
 'talk

In [59]:
# find the number of times a unique word occur in the spam emails
dict_spam = {}

# for w in vocab_word_spam:
#     emails_with_w = 0
#     for sentence in X_train:
# #         print(sentence)
#         sentence = word_tokenize(sentence)
#         for word in sentence:
# #             print(word)
#             word = word.lower()
#             if word == w:
#                 emails_with_w+=1
    
#     dict_spam[w] = emails_with_w

len_vocab_spam = len(vocab_words_spam)

for word in vocab_words_spam_set:
    emails_counter = 0
    for w in vocab_words_spam:
        if word == w:
            emails_counter+=1
    dict_spam[word] = (emails_counter/len_vocab_spam)

In [60]:
dict_spam

{'email': 0.00021452322213879653,
 'alertfrom': 0.00010726161106939826,
 'jeri': 0.00010726161106939826,
 'stewarts': 0.00010726161106939826,
 '2kbsubject': 0.00010726161106939826,
 'prescripiton': 0.00010726161106939826,
 'drvgsto': 0.00010726161106939826,
 'listen': 0.0003217848332081948,
 'call': 0.031213128821194893,
 '123': 0.00010726161106939826,
 'u': 0.012227823661911401,
 'r': 0.002788801887804355,
 'winner': 0.0017161857771103722,
 'ave': 0.0003217848332081948,
 'special': 0.0015016625549715756,
 'select': 0.002896063498873753,
 '2': 0.016303764882548536,
 'receiv': 0.0033251099431513462,
 'cash': 0.005148557331331117,
 '4': 0.010297114662662234,
 'holiday': 0.0022524938324573634,
 'flight': 0.0007508312774857878,
 'inc': 0.0008580928885551861,
 'speak': 0.0009653544996245844,
 'live': 0.0031105867210125494,
 'oper': 0.0012871393328327792,
 'claim': 0.009975329829454038,
 '18': 0.002037970610318567,
 'want': 0.0024670170545961602,
 'funk': 0.00010726161106939826,
 'ur': 0.013

In [61]:
len(dict_spam)

2046

### Making the Frequency Table of all the words present in the non-spam mails

In [62]:
def unique_words_in_ham(string, vocab_words_ham):
    words = word_tokenize(string)
    for word in words:
        word = str(word.lower())
        vocab_words_ham.append(word)

In [63]:
vocab_words_ham = [] # dictionary of non spam words present in the dataset

for i in range(0, len(X_train)):
    if y_train[i] == 0:
        unique_words_in_ham(str(X_train[i]), vocab_words_ham)

In [64]:
vocab_words_ham_set = list(dict.fromkeys(vocab_words_ham))
vocab_words_ham_set

['signific',
 'mind',
 'ask',
 'happen',
 'dont',
 'say',
 'uncomfort',
 'wuld',
 'without',
 'babi',
 'thought',
 'alon',
 'mite',
 'break',
 'donåõt',
 'wan',
 'na',
 'go',
 'crazi',
 'everyboy',
 'need',
 'ladi',
 'xxxxxxxx',
 'wrong',
 'phone',
 'answer',
 'one',
 'assum',
 'peopl',
 'well',
 'haha',
 'angri',
 'take',
 'practic',
 'real',
 'thing',
 'mon',
 'oki',
 'lor',
 'best',
 'cheap',
 'n',
 'gd',
 'food',
 'la',
 'ex',
 'oso',
 'depend',
 'whether',
 'wana',
 'eat',
 'western',
 'chines',
 'den',
 'u',
 'prefer',
 'hey',
 'guy',
 'know',
 'breath',
 'neck',
 'get',
 'bud',
 'anyway',
 'abl',
 'half',
 'track',
 'usf',
 'tonight',
 'ha',
 'good',
 'joke',
 'girl',
 'situat',
 'seeker',
 'hope',
 'went',
 'remind',
 'still',
 'c',
 'littl',
 'left',
 'loung',
 'oh',
 'fun',
 'care',
 'da',
 'car',
 'park',
 'pleas',
 'like',
 'hi',
 'babe',
 'uawak',
 'feellikw',
 'via',
 'alett',
 'thatmum',
 'gotmarri',
 'ourback',
 'åð',
 'fuckinnic',
 'selfish',
 'cours',
 'make',
 'stink

In [81]:
dict_ham = {}

len_vocab_ham = len(vocab_words_ham)

for word in vocab_words_ham_set:
    emails_counter = 0
    for w in vocab_words_ham:
        if word == w:
            emails_counter+=1
    dict_ham[word] = (emails_counter/len_vocab_ham)

In [82]:
dict_ham

{'signific': 6.579813133307015e-05,
 'mind': 0.0011514672983287274,
 'ask': 0.003224108435320437,
 'happen': 0.001447558889327543,
 'dont': 0.003289906566653507,
 'say': 0.003651796288985393,
 'uncomfort': 3.2899065666535074e-05,
 'wuld': 6.579813133307015e-05,
 'without': 0.0007237794446637715,
 'babi': 0.000921173838662982,
 'thought': 0.000954072904329517,
 'alon': 0.0003289906566653507,
 'mite': 9.869719699960521e-05,
 'break': 0.00039478878799842083,
 'donåõt': 0.00016449532833267535,
 'wan': 0.0021055402026582447,
 'na': 0.002434530859323595,
 'go': 0.010790893538623503,
 'crazi': 0.0002302934596657455,
 'everyboy': 3.2899065666535074e-05,
 'need': 0.00473746545598105,
 'ladi': 0.0002302934596657455,
 'xxxxxxxx': 3.2899065666535074e-05,
 'wrong': 0.0003289906566653507,
 'phone': 0.00240163179365706,
 'answer': 0.0005263850506645612,
 'one': 0.004211080405316489,
 'assum': 0.00016449532833267535,
 'peopl': 0.0012501644953283326,
 'well': 0.0030925121726542966,
 'haha': 0.001480457

### Saving the Dictionaries as JSON objects

In [83]:
import json

In [84]:
with open("spam_dict.json", "w") as outfile:
    json.dump(dict_spam, outfile)

In [85]:
with open("non_spam_dict.json", "w") as outfile:
    json.dump(dict_ham, outfile)

In [86]:
# Saving the probability as json file
with open("probability.json", "w") as outfile:
    json.dump(probability, outfile)

In [87]:
type(X_test)

numpy.ndarray

In [None]:
# Saving the Testing Data to Json Objects

In [88]:
with open("testing\X_test.json", "w") as outfile:
    json.dump(X_test.tolist(), outfile)

In [89]:
with open("testing\Y_test.json", "w") as outfile:
    json.dump(y_test.tolist(), outfile)