# Notebook Imports

In [2]:
import pandas as pd

%matplotlib inline

### Constants

In [3]:
spam_file_paths = "spam.csv"

In [4]:
data = pd.read_csv(spam_file_paths)

### Data Cleaning

In [5]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [6]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
data['v2']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [8]:
type(data)

pandas.core.frame.DataFrame

In [9]:
data.drop('Unnamed: 2', inplace = True, axis = 1)
data.drop('Unnamed: 3', inplace = True, axis = 1)
data.drop('Unnamed: 4', inplace = True, axis = 1)

In [10]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Splitting The Training and Testing Data

In [11]:
type(data['v1'])

pandas.core.series.Series

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
X = pd.Series(data['v2'])
X = X.to_numpy()

y = pd.Series(data['v1'])
y = y.to_numpy()

In [14]:
type(y)
print(y)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [15]:
def map_func(array, mapped_array):
    for val in array:
        mapped_array.append(1 if val == 'spam' else 0)

In [16]:
#mapping the spam to 1 and non spam as 0
mapped_array = []
map_func(y, mapped_array)
y = np.array(mapped_array)

In [17]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)

In [19]:
y_train

array([1, 1, 0, ..., 0, 0, 0])

In [20]:
## done splitting the data now make the dictionary of values

### Making The Dictionary of Unique Words After applying NLP to each String


In [21]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()  #object for porterStemmer

In [22]:
def unique_words_in_spam(string, vocab_words_spam):
    words = word_tokenize(string)
    for word in words:
        word = ps.stem(word)
        if (word not in stopwords.words('english')) and word.isalnum():
            word = str(word.lower())
            vocab_words_spam.append(word)

In [23]:
vocab_words_spam = [] # dictionary of spam words present in the dataset

for i in range(0, len(X_train)):
    if y_train[i] == 1:
        unique_words_in_spam(str(X_train[i]), vocab_words_spam)

In [24]:
vocab_words_spam

['someon',
 'ha',
 'conact',
 'date',
 'servic',
 'enter',
 'phone',
 'becaus',
 'fanci',
 'to',
 'find',
 'call',
 'landlin',
 '09111030116',
 'pobox12n146tf15',
 'recpt',
 'order',
 'rington',
 'order',
 'process',
 'ringtonek',
 '84484',
 'gr8',
 'new',
 'servic',
 'live',
 'sex',
 'video',
 'chat',
 'mob',
 'see',
 'sexiest',
 'dirtiest',
 'girl',
 'live',
 'ur',
 'phone',
 '4',
 'detail',
 'text',
 'horni',
 '89070',
 'cancel',
 'send',
 'stop',
 '89070',
 '88066',
 '88066',
 'lost',
 '3pound',
 'help',
 'dear',
 'matthew',
 'pleas',
 'call',
 '09063440451',
 'landlin',
 'complimentari',
 '4',
 'lux',
 'tenerif',
 'holiday',
 'cash',
 'await',
 'collect',
 'ppm150',
 'sae',
 't',
 'cs',
 'box334',
 'sk38xh',
 'pleas',
 'call',
 '08712402902',
 'immedi',
 'urgent',
 'messag',
 'wait',
 'well',
 'done',
 '4',
 'costa',
 'del',
 'sol',
 'holiday',
 'await',
 'collect',
 'call',
 '09050090044',
 'toclaim',
 'sae',
 'tc',
 'pobox334',
 'stockport',
 'sk38xh',
 'max10min',
 'congrat',
 

In [25]:
vocab_word_spam = list(dict.fromkeys(vocab_words_spam))
vocab_word_spam

['someon',
 'ha',
 'conact',
 'date',
 'servic',
 'enter',
 'phone',
 'becaus',
 'fanci',
 'to',
 'find',
 'call',
 'landlin',
 '09111030116',
 'pobox12n146tf15',
 'recpt',
 'order',
 'rington',
 'process',
 'ringtonek',
 '84484',
 'gr8',
 'new',
 'live',
 'sex',
 'video',
 'chat',
 'mob',
 'see',
 'sexiest',
 'dirtiest',
 'girl',
 'ur',
 '4',
 'detail',
 'text',
 'horni',
 '89070',
 'cancel',
 'send',
 'stop',
 '88066',
 'lost',
 '3pound',
 'help',
 'dear',
 'matthew',
 'pleas',
 '09063440451',
 'complimentari',
 'lux',
 'tenerif',
 'holiday',
 'cash',
 'await',
 'collect',
 'ppm150',
 'sae',
 't',
 'cs',
 'box334',
 'sk38xh',
 '08712402902',
 'immedi',
 'urgent',
 'messag',
 'wait',
 'well',
 'done',
 'costa',
 'del',
 'sol',
 '09050090044',
 'toclaim',
 'tc',
 'pobox334',
 'stockport',
 'max10min',
 'congrat',
 'nokia',
 '3650',
 'camera',
 '09066382422',
 'cost',
 '150ppm',
 'ave',
 '3min',
 'vari',
 'mobil',
 'close',
 '300603',
 'post',
 'bcm4284',
 'ldn',
 'wc1n3xx',
 '2',
 '3g'

In [26]:
# find the number of times a unique word occur in the spam emails