In [87]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.regularizers import L1L2
from keras import backend as K
from keras.utils import plot_model
import nltk
# Fetch the NLTK resources used later in this notebook: the English stop-word
# list (stopwords.words('english')) and the WordNet data behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [88]:
# Point both paths at the location of the csv files stored in your directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [89]:
# Preview the first rows of the training frame (label column `spam`, raw `text`).
train_df.head()

Unnamed: 0,spam,text
0,0,subject institute international finance annual...
1,1,subject mortgage even worst credit zwzm detail...
2,1,subject partnership mr edward moko independenc...
3,1,subject de la part de enfants ama rue de marty...
4,0,subject synfuel option valuation lenny believe...


In [90]:
# Summarise the training frame: row count, per-column non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   spam    5000 non-null   int64 
 1   text    5000 non-null   object
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [91]:
# Preview the first rows of the held-out test frame.
test_df.head()

Unnamed: 0,spam,text
0,1,subject perfect logo charset koi r thinking br...
1,0,subject storage model security stinson added t...
2,1,subject wall street micro news report homeland...
3,1,subject logo stationer website design much lt ...
4,0,subject video conference ross mcintyre vince r...


In [92]:
# Summarise the test frame: row count, per-column non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   spam    226 non-null    int64 
 1   text    226 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.7+ KB


In [93]:
lemmatizer = WordNetLemmatizer()
sw = stopwords.words('english')
# Frozen snapshot of `sw` for O(1) membership tests; the list itself is kept
# so any code that reads `sw` still sees the original object.
_STOPWORD_SET = frozenset(sw)
# Translation table mapping each stripped punctuation character to one space,
# so punctuation is removed in a single pass over the text.
_PUNCT_TO_SPACE = str.maketrans(
    {ch: " " for ch in '!"#$%&()\'*+,-./:;?@[]^_`{|}~'}
)

def clean(text):
  """Normalise one email body for tokenisation.

  Steps: lowercase the text, replace the listed punctuation characters with
  spaces, split on whitespace, drop English stop words, lemmatise every
  remaining token and re-join the tokens with single spaces.

  Args:
    text: the raw email text (a `str`).

  Returns:
    The cleaned text as a single space-separated string.
  """
  # The text is lowercased once up front, so tokens need no further lowering.
  text = text.lower().translate(_PUNCT_TO_SPACE)
  tokens = [
      lemmatizer.lemmatize(word)
      for word in text.split()
      if word not in _STOPWORD_SET
  ]
  return " ".join(tokens)

In [94]:
def preprocess(df):
  """Return a copy of `df` whose `text` column has been passed through `clean`.

  The input frame is left untouched, so the raw text stays available to the
  caller and calling this function does not change its argument.

  Args:
    df: a DataFrame with a string `text` column.

  Returns:
    A new DataFrame with the same columns and the cleaned `text` column.
  """
  return df.assign(text=df['text'].apply(clean))

In [95]:
# Run both splits through the same text-cleaning step.
train_df, test_df = (preprocess(frame) for frame in (train_df, test_df))

In [96]:
# Passed as `num_words` to the Tokenizer and as the first argument of the Embedding layer.
MAX_NUM_WORDS = 5000
# Second argument of the Embedding layer (size of each word vector).
EMBEDDING_DIM = 100
# `maxlen` used when padding sequences, and the length of the model's input.
MAX_SEQUENCE_LENGTH = 300

In [97]:
def tokenize(tokenizer, texts):
  """Turn texts into padded integer sequences of length MAX_SEQUENCE_LENGTH.

  Args:
    tokenizer: an already fitted tokenizer exposing `texts_to_sequences`.
    texts: the texts to convert.

  Returns:
    The result of `pad_sequences` applied to the tokenizer's sequences.
  """
  return pad_sequences(
      tokenizer.texts_to_sequences(texts),
      maxlen=MAX_SEQUENCE_LENGTH,
  )

In [98]:
# Fit the tokenizer on the training texts only, passing MAX_NUM_WORDS as `num_words`.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['text'].values)

# Features are the padded integer sequences; targets are the `spam` column cast to int.
X, y = (
    tokenize(tokenizer, train_df['text'].values),
    train_df['spam'].values.astype('int'),
)

In [99]:
# Hold out 30% of the training rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# The test texts go through the tokenizer that was fitted on the training texts.
X_test, y_test = (
    tokenize(tokenizer, test_df['text'].values),
    test_df['spam'].values.astype('int'),
)

In [100]:
# Binary spam classifier: embedding -> bidirectional LSTM -> small dense head -> sigmoid.
# `shape` must be a tuple: `(MAX_SEQUENCE_LENGTH)` without the trailing comma is just an
# int, which newer Keras releases refuse to interpret as a shape.
inp = Input(shape=(MAX_SEQUENCE_LENGTH,))

# The sequence length comes from `inp`, so the deprecated `input_length` argument is omitted.
x = Embedding(MAX_NUM_WORDS, EMBEDDING_DIM)(inp)
x = Bidirectional(LSTM(units=64))(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: the output is read as the probability of the `spam` label being 1.
out = Dense(1, activation='sigmoid')(x)

model = Model(inputs=inp, outputs=out)
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.AdamW(1e-3), metrics=['accuracy'])
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 300)]             0         
                                                                 
 embedding_6 (Embedding)     (None, 300, 100)          500000    
                                                                 
 bidirectional_6 (Bidirecti  (None, 128)               84480     
 onal)                                                           
                                                                 
 dense_12 (Dense)            (None, 32)                4128      
                                                                 
 dropout_6 (Dropout)         (None, 32)                0         
                                                                 
 dense_13 (Dense)            (None, 1)                 33        
                                                           

In [101]:
# Training on train data and getting training history object
# Fit on the training split, evaluating on the validation split after each epoch;
# the object returned by `fit` is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=128,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [102]:
# Round the predicted probabilities to hard 0/1 labels and report per-class metrics.
Y_probas = model.predict(X_val)
Y_pred = Y_probas.round().astype('int').flatten()
print(
    "validation set accuracy: ",
    classification_report(y_val, Y_pred),
    sep="\n",
)

validation set accuracy: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       895
           1       1.00      1.00      1.00       605

    accuracy                           1.00      1500
   macro avg       1.00      1.00      1.00      1500
weighted avg       1.00      1.00      1.00      1500



In [103]:
# Same evaluation as for the validation split, this time on the held-out test set.
Y_probas_test = model.predict(X_test)
Y_pred_test = Y_probas_test.round().astype('int').flatten()
print(
    "test set accuracy: ",
    classification_report(y_test, Y_pred_test),
    sep="\n",
)

test set accuracy: 
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       113
           1       0.98      0.97      0.98       113

    accuracy                           0.98       226
   macro avg       0.98      0.98      0.98       226
weighted avg       0.98      0.98      0.98       226



In [104]:
# Overall fraction of test emails whose predicted label matches the true label.
print("test accuracy: ", accuracy_score(y_test, Y_pred_test), sep="\n")

test accuracy: 
0.9778761061946902


In [105]:
# Work on a copy of the test frame: overwrite `spam` with the model's predictions
# and add a human-readable `prediction` column derived from it.
predicted_df = test_df.copy()
predicted_df['spam'] = Y_pred_test
predicted_df['prediction'] = predicted_df['spam'].map(
    lambda label: 'Spam' if label == 1 else 'Ham'
)
predicted_df

Unnamed: 0,spam,text,prediction
0,1,subject perfect logo charset koi r thinking br...,Spam
1,0,subject storage model security stinson added t...,Ham
2,1,subject wall street micro news report homeland...,Spam
3,1,subject logo stationer website design much lt ...,Spam
4,0,subject video conference ross mcintyre vince r...,Ham
...,...,...,...
221,0,subject sorry see hyatt lobby vince j kaminski...,Ham
222,1,subject yyyy know hgh difference hello jm netn...,Spam
223,1,subject try ouut hello welcome pharmon content...,Spam
224,1,subject department energy deploying corporate ...,Spam


In [106]:
# Write every other email among the first 50 rows, together with its predicted label,
# to a text file, echoing each entry to the cell output as well.
n_rows = min(50, len(predicted_df))  # do not run past the end of a shorter frame
# Explicit UTF-8 so the written file does not depend on the platform's default encoding.
with open("lstm_formatted_example_email_spam_predictions.txt", "w", encoding="utf-8") as predictions_file:
  for i in range(0, n_rows, 2):
    # .iloc selects by position, so the loop does not rely on the frame's index labels.
    row = predicted_df.iloc[i]
    pred = f"Email: {row['text']}.\nPrediction: This is a {row['prediction']} email.\n"
    print(pred)
    predictions_file.write(pred + '\n')

Email: subject perfect logo charset koi r thinking breathing new life business start revamping front end logo visuai identity loqodentity offer creative custom design logo stationery web site careful hand powerfui marketinq toois wiii bring breath fresh air business make stand among competitor click away future success click see sample artwork check price hot offer.
Prediction: This is a Spam email.

Email: subject wall street micro news report homeland security investment terror attack united state september changed security landscape foreseeable future physical logical security become paramount industry segment especially banking national resource government sector according giga wholly owned subsidiary forrester research worldwide demand information security product service set eclipse b homeland security investment newsletter dedicated providing reader information pertaining investment opportunity lucrative sector know event related homeland security happen lightning speed investor