In [10]:
#Libraries

#Dataframe manipulation
import pandas as pd

#Arrays/numerical functions
import numpy as np

#String manipulation/Word Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

#Neural Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

#Data manipulation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#Load Dataframe
data=pd.read_csv(r'spam_or_not_spam.csv')
data.head()
data.dropna()


Unnamed: 0,email,label
0,mike bostock said received from trackingNUMBE...,0
1,no i was just a little confused because i m r...,0
2,this is just an semi educated guess if i m wro...,0
3,jm URL justin mason writes except for NUMBER t...,0
4,i just picked up razor sdk NUMBER NUMBER and N...,0
...,...,...
1495,abc s good morning america ranks it the NUMBE...,1
1496,hyperlink hyperlink hyperlink let mortgage le...,1
1497,thank you for shopping with us gifts for all ...,1
1498,the famous ebay marketing e course learn to s...,1


In [11]:
#Split the columns
X = data['email'].values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Remove any float values from X_test
X_test=X_test.tolist()
X_test=list(map(str,X_test))
X_test=np.array(X_test)


In [12]:
#Split the text of the email to words
t = Tokenizer()
t.fit_on_texts(X_train)


In [13]:
#Encode word strings of each email to ints 
encoded_train = t.texts_to_sequences(X_train)
encoded_test = t.texts_to_sequences(X_test)
print(encoded_train[0:2])



[[7, 7, 44, 23, 149, 2068, 962, 12, 6, 1593, 4, 58, 1, 3173, 2884, 6449, 14, 59, 9404, 9405, 1168, 5, 278, 9406, 3, 74, 9407, 468, 5142, 9408, 5143, 3626, 13, 557, 12, 72, 5144, 875, 2391, 9409, 25, 6, 1593, 4, 641, 469, 1169, 6449, 10, 152, 65, 2226, 34, 759, 740, 17, 13, 302, 5, 67, 10, 108, 3, 117, 1039, 6, 9410, 1120, 125, 1, 203, 1, 218, 1441, 1, 1, 7, 1, 7, 1, 7], [2885, 38, 296, 523, 64, 18, 41, 143, 352, 15, 41, 79, 2069, 315, 18, 8, 774, 16, 193, 19, 42, 68, 11, 344, 95, 2, 485, 4, 6, 479, 2070, 694, 30, 35, 2886, 1519, 2885, 404, 1121, 3, 16, 1918, 73, 24, 2071, 60, 17, 16, 118, 191, 25, 4170, 267, 2072, 6450, 1783, 396, 98, 72, 1594, 290, 364, 45, 524, 6451, 150, 36, 248, 1, 1, 1, 1, 509, 524, 13, 12, 6, 49, 62, 85, 8, 31, 23, 21, 5145, 226]]


In [14]:
max_length = 500
#Add padding
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')
print(padded_train)

[[    7     7    44 ...     0     0     0]
 [ 2885    38   296 ...     0     0     0]
 [13025 13026 13027 ...  1443  4171  4172]
 ...
 [    7     7    44 ...     0     0     0]
 [   89    12   539 ...   684   685     7]
 [  668     3   913 ...     0     0     0]]


In [15]:
vocab_size = len(t.word_index) + 1
#Sequential Neural network model
model = Sequential()
model.add(Embedding(vocab_size, 24, input_length=max_length))
model.add(Flatten())
model.add(Dense(500, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
#Binary result (spam/not spam)
model.add(Dense(1, activation='sigmoid'))
#Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 24)           485328    
_________________________________________________________________
flatten_1 (Flatten)          (None, 12000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 500)               6000500   
_________________________________________________________________
dense_5 (Dense)              (None, 200)               100200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                

In [16]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

#Train the model
model.fit(x=padded_train,y=y_train,epochs=50,validation_data=(padded_test, y_test), verbose=1,callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: early stopping


<keras.callbacks.History at 0x1ff88db41c0>

In [17]:
#Propabilities above 0.5 return 1 (binary results)
preds = (model.predict(padded_test) > 0.5).astype("int32")


In [18]:
#Classification Report
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       240
           1       0.98      0.96      0.97       135

    accuracy                           0.98       375
   macro avg       0.98      0.98      0.98       375
weighted avg       0.98      0.98      0.98       375

