# Deep Learning: hyperparameter experiments for a bidirectional-GRU spam/ham text classifier

In [None]:
pip install -q -U keras-tuner

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from,
# and the training log is written to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,  CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras
import tensorflow as tf
from tensorflow import keras

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw spam/ham dataset from the mounted Google Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/spam_x.csv')

In [None]:
# 'file_name' was only kept to check that the files were loaded correctly;
# it and 'volume' are removed here, before any model training.
df = df.drop(['file_name', 'volume'], axis='columns')

In [None]:
# Rename the label column, then encode it: 'ham' -> 0, 'spam' -> 1.
# The encoding is restricted to the label column on purpose: a frame-wide
# df.replace() would also turn any subject/body cell that is exactly 'ham' or
# 'spam' into an integer, which breaks the later string join of body and subject.
label_codes = {'ham': 0, 'spam': 1}
df = df.rename(columns={'ham/spam': 'spm-lbl-trgt'})  # choose a unique name
# Values that are not in label_codes (including already-encoded 0/1 on a re-run)
# are passed through unchanged.
df['spm-lbl-trgt'] = df['spm-lbl-trgt'].map(lambda label: label_codes.get(label, label))

In [None]:
# Fill missing subjects/bodies with the placeholder 'unknown'.
# The result is assigned back to df instead of calling fillna(inplace=True) on
# a column selection: that chained in-place form is deprecated and, under
# pandas copy-on-write, only modifies a temporary and leaves df unchanged.
df = df.fillna({'subject': 'unknown', 'body': 'unknown'})

In [None]:
# Build one text field per row as "<body> <subject>", then drop the two source
# columns. Series.str.cat is vectorised, unlike a row-wise apply(" ".join, axis=1).
df["body-subject"] = df["body"].str.cat(df["subject"], sep=" ")
df = df.drop(columns=['body', 'subject'])

In [None]:
from sklearn.model_selection import train_test_split

# Features: the combined text column as a 1-D array of strings.
# Selecting it by name (instead of "every column except the label") keeps X
# one-dimensional, so X.tolist() yields plain strings. With a 2-D X every sample
# becomes a one-element list, which the Keras Tokenizer treats as an
# already-tokenised document -- the whole text would count as a single token.
X = df['body-subject'].values
# Target: 0 = ham, 1 = spam.
y = df['spm-lbl-trgt'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)

In [None]:
# Convert all four NumPy splits into plain Python lists.
X_train, X_test, y_train, y_test = (
    split.tolist() for split in (X_train, X_test, y_train, y_test)
)

In [None]:
## some config values
embed_size = 100 # dimensionality of each word embedding vector
max_feature = 50000 # vocabulary size: how many unique words to use (i.e. number of rows in the embedding matrix)
max_len = 2000 # number of tokens every text is padded/truncated to (also the model's input length)

In [None]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# Fit the vocabulary on the training texts only; max_feature (set in the config
# cell above) caps how many words are kept.
# NOTE(review): X_train / X_test must be lists of strings -- the Tokenizer
# treats an element that is itself a list as an already-tokenised document.
tokenizer = Tokenizer(num_words=max_feature)
tokenizer.fit_on_texts(X_train)

# Keep the encoded texts as plain lists of integer lists. They differ in length,
# and wrapping such ragged data in np.array() is deprecated and raises
# ValueError on NumPy >= 1.24; pad_sequences accepts the lists directly.
x_train_features = tokenizer.texts_to_sequences(X_train)
x_test_features = tokenizer.texts_to_sequences(X_test)

  x_test_features = np.array(tokenizer.texts_to_sequences(X_test))


In [None]:
# Pad or truncate every encoded text to max_len tokens.
x_train_features, x_test_features = (
    pad_sequences(encoded, maxlen=max_len)
    for encoded in (x_train_features, x_test_features)
)

In [None]:
from keras import backend as K
# define metrics
#https://datascience.stackexchange.com/a/45166

def recall_m(y_true, y_pred):
    """Recall of the rounded predictions over the tensors it is given.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores; rounded to the nearest integer after
            clipping to [0, 1].

    Returns:
        true positives / (actual positives + K.epsilon()); the epsilon keeps
        the division defined when there are no positive labels.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions over the tensors it is given.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores; rounded to the nearest integer after
            clipping to [0, 1].

    Returns:
        true positives / (predicted positives + K.epsilon()); the epsilon
        keeps the division defined when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m on the same inputs.

    K.epsilon() in the denominator keeps the result defined when precision
    and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
embed_size = 100
def model_builder(unit_dense, rate_dropout, learning_rate):
    """Build and compile the bidirectional-GRU binary classifier.

    Layer stack: embedding -> bidirectional CuDNNGRU (64 units, returning the
    full sequence) -> global max pooling -> dropout -> dense ReLU -> dropout
    -> single sigmoid unit.

    Args:
        unit_dense: number of units in the hidden dense layer.
        rate_dropout: dropout rate used both before and after the dense layer.
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss; accuracy,
        F1, precision and recall as metrics).

    Reads the module-level ``max_len``, ``max_feature`` and ``embed_size``.
    """
    token_ids = Input(shape=(max_len,))

    # Layers in the order they are applied to the input.
    stack = [
        Embedding(max_feature, embed_size),
        Bidirectional(CuDNNGRU(64, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(rate_dropout),
        Dense(unit_dense, activation="relu"),
        Dropout(rate_dropout),
        Dense(1, activation="sigmoid"),
    ]

    tensor = token_ids
    for layer in stack:
        tensor = layer(tensor)

    classifier = Model(inputs=token_ids, outputs=tensor)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy', f1_m, precision_m, recall_m],
    )
    return classifier

In [None]:
from keras.callbacks import CSVLogger

# Early stopping watches the validation loss with a patience of 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)

# Append the per-epoch results to a CSV on Drive so they survive a crash.
csv_logger = CSVLogger(
    '/content/drive/MyDrive/log2.csv',
    append=True,
    separator=';',
)

In [None]:
import itertools

units = [512, 256, 128, 64, 32]
dropouts = [0.3,0.5]
learning_rates = [1e-2, 1e-3, 1e-4]

# Convert the label lists once instead of on every fit() call.
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

# Per-epoch metrics of every run, keyed by (unit_dense, rate_dropout,
# learning_rate), so the runs can be compared after the search. The shared CSV
# log does not record which configuration a row belongs to.
histories = {}

# perform hyperparameter grid search
# itertools.product iterates in the same order as the equivalent nested loops
# (units outermost, learning_rates innermost).
for unit_dense, rate_dropout, learning_rate in itertools.product(units, dropouts, learning_rates):
    print(" unit_dense ", str(unit_dense), " rate_dropout ", str(rate_dropout), " learning_rate ", str(learning_rate))

    # Drop the state Keras keeps for previously built models; without this,
    # building many models in one loop keeps accumulating memory.
    K.clear_session()

    model = model_builder(unit_dense, rate_dropout, learning_rate)

    # NOTE(review): the held-out test split is used as validation data, so both
    # early stopping and the choice of hyperparameters see it -- consider a
    # separate validation split.
    history = model.fit(
        x_train_features,
        y_train_arr,
        batch_size=512,
        epochs=20,
        validation_data=(x_test_features, y_test_arr),
        callbacks=[stop_early, csv_logger],
    )
    histories[(unit_dense, rate_dropout, learning_rate)] = history.history



 unit_dense  512  rate_dropout  0.3  learning_rate  0.01
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 unit_dense  512  rate_dropout  0.3  learning_rate  0.001
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 unit_dense  512  rate_dropout  0.3  learning_rate  0.0001
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
 unit_dense  512  rate_dropout  0.5  learning_rate  0.01
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 unit_dense  512  rate_dropout  0.5  learning_rate  0.001
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 unit_dense  512  rate_dropout  0.5  learning_rate  0.0001
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
 unit_dense  256  rate_dropout  0.3  learning_rate  0.01
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
 unit_dense  256  rate_dropout  0.3  learn