In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [3]:
#Load data into Pandas dataframe

df = pd.read_csv('spam.csv',delimiter=',',encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# drop coloumns not needed for neural network
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


# **See Data_Engineering_and_Logistic_Regression.ipynb for data distribution research

In [5]:
# Create input and output vectors
# Process the labels
# LabelEncoder() is used to encode labels with value between 0 and n_classes-1

X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [6]:
#Split data into training & test
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2, stratify=Y) #stratify = Y to represent the disproportion of ham & spam in the data

In [7]:
print(len(X_train))
print(len(X_test))
print(len(Y_train))
print(len(Y_test))
print(Y_test)

4457
1115
4457
1115
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [8]:
# Process the data via 1) Tokenize the data and convert the text to sequences; 2) Add padding to ensure that all the sequences have the same shape.
# There are many ways of taking the max_len and here an arbitrary length of 150 is chosen.

max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [9]:
sequences_matrix

array([[  0,   0,   0, ..., 871, 315, 231],
       [  0,   0,   0, ...,  49, 761, 232],
       [  0,   0,   0, ..., 872,  11, 938],
       ...,
       [  0,   0,   0, ...,   3, 346, 724],
       [  0,   0,   0, ...,   2, 251, 642],
       [  0,   0,   0, ...,  17,   4,  22]], dtype=int32)

# LSTM

In [10]:
# Define LSTM structure
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [11]:
# Call the function & compile the model

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)  

In [12]:
# Fit on the training data
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10, validation_data=(sequences_matrix,Y_train),
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Instructions for updating:
Use tf.cast instead.
Train on 4457 samples, validate on 4457 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x7f8fb85b5b38>

# The model performs well on the validation set and this configuration is chosen as the final model.

In [13]:
# Process the test data set
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [14]:
# Evaluate model on test set
accr = model.evaluate(test_sequences_matrix,Y_test)



In [15]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.141
  Accuracy: 0.962


In [42]:
# this goves me 0 for precision; not sure why

from sklearn.metrics import precision_score, recall_score
import keras.callbacks

class Metrics(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self._data = []

    def on_epoch_end(self, batch, logs={}):
        X_val, y_val = self.validation_data[0], self.validation_data[1]
        y_predict = np.asarray(model.predict(X_val))

        y_val = np.argmax(y_val, axis=1)
        y_predict = np.argmax(y_predict, axis=1)

        self._data.append({
            'val_recall': recall_score(y_val, y_predict),
            'val_precision': precision_score(y_val, y_predict),
        })
        return

    def get_data(self):
        return self._data


metrics = Metrics()
history = model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10, validation_data=(sequences_matrix,Y_train),
          validation_split=0.2,callbacks=[metrics])
metrics.get_data()

Train on 4457 samples, validate on 4457 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[{'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0},
 {'val_recall': 0.0, 'val_precision': 0.0}]

In [46]:
from sklearn.metrics import classification_report

y_pred = model.predict(test_sequences_matrix)
print(classification_report(Y_test, y_pred.round()))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.93      0.92      0.93       149

   micro avg       0.98      0.98      0.98      1115
   macro avg       0.96      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

