# Orange SPAM detector

One of the main pain points that AT\&T users are facing is constant exposure to SPAM messages.

AT\&T has been able to manually flag spam messages for some time, but they are looking for an automated way of detecting spam to protect their users.

Your goal is to build a spam detector that can automatically flag spam messages as they come in, based solely on the SMS content.

In [None]:

# Import Tensorflow & Pathlib librairies
import tensorflow as tf 
import pathlib 
import pandas as pd 
import os
import io
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# NOTE(review): the dataset below is read from the relative path "spam.csv",
# not from the mounted drive - confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import dataset with Pandas.
# The file is read as ISO-8859-1 and malformed rows are skipped instead of
# aborting the load. `on_bad_lines="skip"` replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
dataset = pd.read_csv("spam.csv", on_bad_lines="skip", encoding="ISO-8859-1")
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Basic statistics about the raw dataset.
# (The original cell started with a bare `dataset.describe(...)` whose result was
# discarded because it was not the last expression; it has been removed.)
print("Number of rows : {}".format(dataset.shape[0]))
print()

# Share of missing values per column, in percent
print("Percentage of missing values: ")
display(100 * dataset.isnull().sum() / dataset.shape[0])

# Last expression of the cell: rendered as the cell output
dataset.describe(include="all")

Number of rows : 5572

Percentage of missing values: 


v1             0.000000
v2             0.000000
Unnamed: 2    99.102656
Unnamed: 3    99.784637
Unnamed: 4    99.892319
dtype: float64

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [None]:
# Class balance: number of messages for each label of the target column v1
dataset.groupby("v1")["v1"].count()

v1
ham     4825
spam     747
Name: v1, dtype: int64

In [None]:
# Keep only the useful columns: the label (v1) and the message text (v2)
useful_columns = ["v1", "v2"]
dataset = dataset.loc[:, useful_columns]
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Encode the target as a binary class: 'ham' -> 0, anything else -> 1
dataset["v1_transformed"] = (dataset["v1"] != "ham").astype("int64")
dataset.head()

Unnamed: 0,v1,v2,v1_transformed
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Download the spaCy English model en_core_web_sm; it is loaded in the next
# cell and used to lemmatize the messages
!python -m spacy download en_core_web_sm -q

2022-12-19 10:08:32.699494: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[K     |████████████████████████████████| 12.8 MB 7.3 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Import the spaCy English model and load it.
# `nlp` is the pipeline applied to each message further down to get token lemmas.
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
# Import Stop words 
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Create a new clean column to make word management easier.

# 1) Keep only alphanumeric characters and spaces
dataset["v2_clean"] = dataset["v2"].apply(
    lambda x: "".join(ch for ch in x if ch.isalnum() or ch == " ")
)

# 2) Collapse runs of spaces, lowercase and trim.
#    A regex replacement is required: str.replace(" +", " ") only matches the
#    literal two-character string " +", so repeated spaces were never collapsed.
dataset["v2_clean"] = (
    dataset["v2_clean"]
    .str.replace(r" +", " ", regex=True)
    .str.lower()
    .str.strip()
)

# 3) Lemmatize with spaCy and drop stop words; a token is removed when either
#    its lemma or its raw text is in STOP_WORDS
dataset["v2_clean"] = dataset["v2_clean"].apply(
    lambda x: " ".join(
        token.lemma_
        for token in nlp(x)
        if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
    )
)

dataset.head()


Unnamed: 0,v1,v2,v1_transformed,v2_clean
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think usf live


In [None]:
# Instantiate the Keras tokenizer (num_words=1000) and fit it on the cleaned messages
import numpy as np
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(dataset["v2_clean"])

# Encode every message as a list of word indices and record its length
dataset["v2_encoded"] = tokenizer.texts_to_sequences(dataset["v2_clean"])
dataset["len_v2"] = dataset["v2_encoded"].apply(len)

# Discard the messages whose encoded sequence is empty
dataset = dataset[dataset["len_v2"] != 0]

dataset.head()

Unnamed: 0,v1,v2,v1_transformed,v2_clean,v2_encoded,len_v2
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...,"[230, 444, 460, 943, 35, 51, 204, 944, 79, 945...",11
1,ham,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni,"[9, 193, 289, 1]",4
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...,"[11, 300, 3, 532, 655, 33, 849, 420, 20, 157, ...",13
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c,"[1, 124, 149, 1, 84]",5
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think usf live,"[705, 22, 656, 127]",4


In [None]:
# Pad the encoded messages at the end ("post") so that all sequences have the same length
v2_pad = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=dataset["v2_encoded"],
    padding="post",
)

In [None]:
# Build a tf.data.Dataset of (padded sequence, binary label) pairs
tensors = (v2_pad, dataset["v1_transformed"].values)
full_ds = tf.data.Dataset.from_tensor_slices(tensors)

In [None]:
# Train / test split: the first 70% of the examples go to the training set
# (shuffled, then batched by 64); the remaining examples form the test set
TAKE_SIZE = int(0.7 * dataset.shape[0])

train_data = full_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(64)
test_data = full_ds.skip(TAKE_SIZE).batch(64)

In [None]:
 # Take a look of one batch 
for batch_sequences, batch_labels in train_data.take(1):
  # Print the padded sequences and the labels of the first training batch
  print(batch_sequences, batch_labels)

tf.Tensor(
[[705   2   0 ...   0   0   0]
 [  1 481 729 ...   0   0   0]
 [ 76 580 455 ...   0   0   0]
 ...
 [263  18  52 ...   0   0   0]
 [556  17  73 ...   0   0   0]
 [ 63  36   2 ...   0   0   0]], shape=(64, 47), dtype=int32) tf.Tensor(
[0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0], shape=(64,), dtype=int64)


# Simple RNN

In [None]:
# Let's try with Simple RNN method
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, GRU, LSTM

# The tokenizer was built with num_words=1000, so it only emits indices in
# [1, num_words - 1]; 0 is the padding value. The embedding therefore needs
# exactly `num_words` rows. len(tokenizer.word_index) counts every word seen
# during fitting (kept or not) and, since word indices start at 1, is also one
# short of a safe upper bound when no num_words cap is applied.
vocab_size = tokenizer.num_words
model = tf.keras.Sequential([
    # Word embedding layer; mask_zero=True makes the recurrent layers skip the
    # zero padding appended by pad_sequences(padding="post")
    Embedding(vocab_size, 64, mask_zero=True, name="embedding"),
    # Stacked recurrent layers
    SimpleRNN(units=64, return_sequences=True),   # maintains the sequential nature
    SimpleRNN(units=32, return_sequences=False),  # returns the last output
    # Dense layers once the data is flat
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    # Output layer: a single unit with sigmoid activation (binary target)
    Dense(1, activation="sigmoid"),
])

In [None]:
# Print the layers, output shapes and parameter counts of the Simple RNN model
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          524032    
                                                                 
 simple_rnn_16 (SimpleRNN)   (None, None, 64)          8256      
                                                                 
 simple_rnn_17 (SimpleRNN)   (None, 32)                3104      
                                                                 
 dense_24 (Dense)            (None, 16)                528       
                                                                 
 dense_25 (Dense)            (None, 8)                 136       
                                                                 
 dense_26 (Dense)            (None, 1)                 9         
                                                                 
Total params: 536,065
Trainable params: 536,065
Non-tr

In [None]:
# Compile the Simple RNN model: Adam optimizer, binary cross-entropy loss,
# binary accuracy as the monitored metric
optimizer = tf.keras.optimizers.Adam()

model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Number of remaining messages per encoded class (0 = ham, 1 = spam).
# The previous version subtracted 1 from the labels first, which displayed the
# classes as -1 / 0 and did not match the {0, 1} keys of the class weights.
dataset["v1_transformed"].value_counts()

-1    4667
 0     745
Name: v1_transformed, dtype: int64

In [None]:
# The classes do not have the same frequency in the dataset, so each class gets
# a weight inversely proportional to its count: (1 / count) * n_samples / 2
class_counts = dataset["v1_transformed"].value_counts()
weights = (1 / class_counts) * len(dataset) / 2
# Convert the Series to a {class: weight} dict
weights = dict(zip(weights.index, weights.values))
weights

{0: 0.5798157274480394, 1: 3.632214765100671}

In [None]:
# Train the Simple RNN for 100 epochs, validating on the test batches;
# the class weights computed above are applied to the loss
history = model.fit(
    x=train_data,
    validation_data=test_data,
    epochs=100,
    class_weight=weights,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Scores for 100 epochs :


    loss : 0.0018 - val_loss : 0.3580
    binary_accuracy : 0.9995 - val_binary_accuracy : 0.9495


In [None]:
# Save the trained Simple RNN model to an .h5 file.
# NOTE(review): the evaluation section reloads it from "/content/model_simpleRNN.h5",
# so this relative path assumes the working directory is /content - confirm.
model.save("model_simpleRNN.h5")

In [None]:
import json

# Persist the per-epoch training history of the Simple RNN as JSON.
# The context manager guarantees the file is flushed and closed; the original
# passed a bare open(...) to json.dump and never closed the handle.
with open("/content/simpleRNN_history.json", "w") as history_file:
    json.dump(model.history.history, history_file)

# GRU

In [None]:
# Let's try GRU now
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, GRU, LSTM

# The tokenizer was built with num_words=1000, so it only emits indices in
# [1, num_words - 1]; 0 is the padding value. The embedding therefore needs
# exactly `num_words` rows, not len(tokenizer.word_index), which counts every
# word seen during fitting whether it is kept or not.
vocab_size = tokenizer.num_words
model_gru = tf.keras.Sequential([
    # Word embedding layer; mask_zero=True makes the recurrent layers skip the
    # zero padding appended by pad_sequences(padding="post")
    Embedding(vocab_size, 64, mask_zero=True, name="embedding"),
    GRU(units=64, return_sequences=True),   # maintains the sequential nature
    GRU(units=32, return_sequences=False),  # returns the last output
    # Dense layers once the data is flat
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    # Output layer: a single unit with sigmoid activation (binary target)
    Dense(1, activation="sigmoid"),
])

In [None]:
# Print the layers, output shapes and parameter counts of the GRU model
model_gru.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          524032    
                                                                 
 gru (GRU)                   (None, None, 64)          24960     
                                                                 
 gru_1 (GRU)                 (None, 32)                9408      
                                                                 
 dense_27 (Dense)            (None, 16)                528       
                                                                 
 dense_28 (Dense)            (None, 8)                 136       
                                                                 
 dense_29 (Dense)            (None, 1)                 9         
                                                                 
Total params: 559,073
Trainable params: 559,073
Non-tr

In [None]:
# Compile the GRU model: Adam optimizer, binary cross-entropy loss,
# binary accuracy as the monitored metric
optimizer = tf.keras.optimizers.Adam()

model_gru.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train the GRU model for 100 epochs, validating on the test batches and
# applying the class weights to the loss
model_gru.fit(
    x=train_data,
    validation_data=test_data,
    epochs=100,
    class_weight=weights,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f05142c9b50>

Scores for 100 epochs :


    loss : 0.6921 - val_loss : 0.6938
    binary_accuracy : 0.1381 - val_binary_accuracy : 0.1392

In [None]:
# Save the trained GRU model to an .h5 file.
# NOTE(review): it is reloaded later from "/content/model_gru.h5", so this
# relative path assumes the working directory is /content - confirm.
model_gru.save("model_gru.h5")

In [None]:
# Persist the per-epoch training history of the GRU model as JSON.
# The context manager guarantees the file is flushed and closed; the original
# passed a bare open(...) to json.dump and never closed the handle.
with open("/content/GRU_history.json", "w") as history_file:
    json.dump(model_gru.history.history, history_file)

# LSTM

In [None]:
# We try LSTM now
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, GRU, LSTM

# The tokenizer was built with num_words=1000, so it only emits indices in
# [1, num_words - 1]; 0 is the padding value. The embedding therefore needs
# exactly `num_words` rows, not len(tokenizer.word_index), which counts every
# word seen during fitting whether it is kept or not.
vocab_size = tokenizer.num_words
model_lstm = tf.keras.Sequential([
    # Word embedding layer; mask_zero=True makes the recurrent layers skip the
    # zero padding appended by pad_sequences(padding="post")
    Embedding(vocab_size, 64, mask_zero=True, name="embedding"),
    LSTM(units=64, return_sequences=True),   # maintains the sequential nature
    LSTM(units=32, return_sequences=False),  # returns the last output
    # Dense layers once the data is flat
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    # Output layer: a single unit with sigmoid activation (binary target)
    Dense(1, activation="sigmoid"),
])

In [None]:
# Print the layers, output shapes and parameter counts of the LSTM model
model_lstm.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          524032    
                                                                 
 lstm_4 (LSTM)               (None, None, 64)          33024     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_34 (Dense)            (None, 16)                528       
                                                                 
 dense_35 (Dense)            (None, 8)                 136       
                                                                 
 dense_36 (Dense)            (None, 1)                 9         
                                                                 
Total params: 570,145
Trainable params: 570,145
Non-t

In [None]:
# Compile the LSTM model: Adam optimizer, binary cross-entropy loss,
# binary accuracy as the monitored metric
optimizer = tf.keras.optimizers.Adam()

model_lstm.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train the LSTM model for 100 epochs, validating on the test batches and
# applying the class weights to the loss
model_lstm.fit(
    x=train_data,
    validation_data=test_data,
    epochs=100,
    class_weight=weights,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f051cf66100>

Scores for 100 epochs :


    loss : 0.2199 - val_loss : 0.2697
    binary_accuracy : 0.9659 - val_binary_accuracy : 0.9403

In [None]:
# Save the trained LSTM model to an .h5 file.
# NOTE(review): it is reloaded later from "/content/model_lstm.h5", so this
# relative path assumes the working directory is /content - confirm.
model_lstm.save("model_lstm.h5")

In [None]:
# Persist the per-epoch training history of the LSTM model as JSON.
# The context manager guarantees the file is flushed and closed; the original
# passed a bare open(...) to json.dump and never closed the handle.
with open("/content/LSTM_history.json", "w") as history_file:
    json.dump(model_lstm.history.history, history_file)

## Model Evaluation

### SimpleRNN

In [None]:
# We load our models history.
# The file is opened with a context manager so the handle is closed after
# reading (the original left the file object open).
with open("/content/simpleRNN_history.json", "r") as history_file:
    simpleRNN_history = json.load(history_file)

In [None]:
# Reload the saved Simple RNN model from its .h5 file
model_simpleRNN = tf.keras.models.load_model("/content/model_simpleRNN.h5")

In [None]:
# Make a graph to show the Simple RNN model loss: training loss vs validation
# loss, one point per epoch
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=simpleRNN_history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=simpleRNN_history["val_loss"],
                    mode='lines',
                    name='val_loss'))
# Title and axis labels so the figure can be read on its own
fig.update_layout(title="Simple RNN - training vs validation loss",
                  xaxis_title="epoch",
                  yaxis_title="loss")
fig.show()


#### This model has a good score, but let's look at the others before making a judgement

### GRU

In [None]:
# Reload the GRU training history and the saved GRU model.
# The JSON file is opened with a context manager so the handle is closed after
# reading (the original left the file object open).
with open("/content/GRU_history.json", "r") as history_file:
    GRU_history = json.load(history_file)
model_gru = tf.keras.models.load_model("/content/model_gru.h5")

In [None]:
# Plot the GRU training loss vs validation loss, one point per epoch
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=GRU_history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=GRU_history["val_loss"],
                    mode='lines',
                    name='val_loss'))
# Title and axis labels so the figure can be read on its own
fig.update_layout(title="GRU - training vs validation loss",
                  xaxis_title="epoch",
                  yaxis_title="loss")
fig.show()

##### The GRU model has the lowest score

### LSTM

In [None]:
# Reload the LSTM training history and the saved LSTM model.
# The JSON file is opened with a context manager so the handle is closed after
# reading (the original left the file object open).
with open("/content/LSTM_history.json", "r") as history_file:
    LSTM_history = json.load(history_file)
model_lstm = tf.keras.models.load_model("/content/model_lstm.h5")

In [None]:
# Plot the LSTM training loss vs validation loss, one point per epoch
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=LSTM_history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=LSTM_history["val_loss"],
                    mode='lines',
                    name='val_loss'))
# Title and axis labels so the figure can be read on its own
fig.update_layout(title="LSTM - training vs validation loss",
                  xaxis_title="epoch",
                  yaxis_title="loss")
fig.show()

## LSTM also overfits rapidly, even though it seems to behave better; still, we can observe that the best model to use for this kind of data is the Simple RNN