# Notebook Imports

In [1]:
import pandas as pd

%matpltlib inline

### Constants

In [2]:
spam_file_paths = "spam.csv"

In [3]:
data = pd.read_csv(spam_file_paths)

### Data Cleaning

In [4]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [5]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
data['v2']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [7]:
type(data)

pandas.core.frame.DataFrame

In [8]:
data.drop('Unnamed: 2', inplace = True, axis = 1)
data.drop('Unnamed: 3', inplace = True, axis = 1)
data.drop('Unnamed: 4', inplace = True, axis = 1)

In [9]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Splitting The Training and Testing Data

In [24]:
type(data['v1'])

pandas.core.series.Series

In [25]:
import numpy as np
from sklearn.model_selection import train_test_split

In [95]:
X = pd.Series(data['v2'])
X = X.to_numpy()

y = pd.Series(data['v1'])
y = y.to_numpy()

In [96]:
type(y)
print(y)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [97]:
def map_func(array, mapped_array):
    for val in array:
        mapped_array.append(1 if val == 'spam' else 0)

In [98]:
#mapping the spam to 1 and non spam as 0
mapped_array = []
map_func(y, mapped_array)
y = np.array(mapped_array)

In [99]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)

In [102]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
## done splitting the data now make the dictionary of values

### Making The Dictionary of Unique Words After applying NLP to each String


In [10]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()  #object for porterStemmer

In [134]:
def unique_words_in_spam(string, vocab_words_spam):
    words = word_tokenize(string)
    for word in words:
        word = ps.stem(word)
        if (word not in stopwords.words('english')) and word.isalnum():
            word = str(word.lower())
            vocab_words_spam.append(word)

In [135]:
vocab_words_spam = [] # dictionary of spam words present in the dataset

for i in range(0, len(X_train)):
    if y_train[i] == 1:
        unique_words_in_spam(str(X_train[i]), vocab_words_spam)

In [136]:
vocab_words_spam

['clair',
 'havin',
 'borin',
 'time',
 'alon',
 'u',
 'wan',
 'na',
 'cum',
 '2nite',
 'chat',
 '09099725823',
 'hope',
 '2',
 'c',
 'u',
 'luv',
 'clair',
 'xx',
 '1',
 'new',
 'voicemail',
 'pleas',
 'call',
 '08719181513',
 'ask',
 '3mobil',
 'if',
 '0870',
 'chatlin',
 'inclu',
 'in',
 'free',
 'min',
 'india',
 'cust',
 'serv',
 'sed',
 'ye',
 'l8er',
 'got',
 'mega',
 'bill',
 '3',
 'dont',
 'giv',
 'a',
 'shit',
 'bailiff',
 'due',
 'in',
 'day',
 'i',
 'o',
 '3',
 'want',
 'doubl',
 'min',
 'doubl',
 'txt',
 'price',
 'linerent',
 'latest',
 'orang',
 'bluetooth',
 'mobil',
 'call',
 'mobileupd8',
 'veri',
 'latest',
 'offer',
 '08000839402',
 'freemsg',
 'whi',
 'repli',
 'text',
 'i',
 'randi',
 'sexi',
 'femal',
 'live',
 'local',
 'luv',
 'hear',
 'netcollex',
 'ltd',
 '08700621170150p',
 'per',
 'msg',
 'repli',
 'stop',
 'end',
 'got',
 'take',
 '2',
 'take',
 'part',
 'wrc',
 'ralli',
 'oz',
 'u',
 'lucozad',
 'energi',
 'text',
 'ralli',
 'le',
 '61200',
 '25p',
 'see'

In [138]:
vocab_word_spam = list(dict.fromkeys(vocab_words_spam))
vocab_word_spam

['clair',
 'havin',
 'borin',
 'time',
 'alon',
 'u',
 'wan',
 'na',
 'cum',
 '2nite',
 'chat',
 '09099725823',
 'hope',
 '2',
 'c',
 'luv',
 'xx',
 '1',
 'new',
 'voicemail',
 'pleas',
 'call',
 '08719181513',
 'ask',
 '3mobil',
 'if',
 '0870',
 'chatlin',
 'inclu',
 'in',
 'free',
 'min',
 'india',
 'cust',
 'serv',
 'sed',
 'ye',
 'l8er',
 'got',
 'mega',
 'bill',
 '3',
 'dont',
 'giv',
 'a',
 'shit',
 'bailiff',
 'due',
 'day',
 'i',
 'o',
 'want',
 'doubl',
 'txt',
 'price',
 'linerent',
 'latest',
 'orang',
 'bluetooth',
 'mobil',
 'mobileupd8',
 'veri',
 'offer',
 '08000839402',
 'freemsg',
 'whi',
 'repli',
 'text',
 'randi',
 'sexi',
 'femal',
 'live',
 'local',
 'hear',
 'netcollex',
 'ltd',
 '08700621170150p',
 'per',
 'msg',
 'stop',
 'end',
 'take',
 'part',
 'wrc',
 'ralli',
 'oz',
 'lucozad',
 'energi',
 'le',
 '61200',
 '25p',
 'see',
 'pack',
 'itcould',
 'xclusiv',
 'clubsaisai',
 '2morow',
 'soire',
 'special',
 'zouk',
 'nichol',
 'rose',
 'ladi',
 'info',
 '10',
 '

In [None]:
# find the number of times a unique word occur in the spam emails