In [1]:
import tensorflow as tf
from tensorflow.keras import layers, preprocessing, Input, Model, optimizers, callbacks
import pandas as pd
import os
from sklearn.metrics import classification_report

In [2]:
train_dataset = pd.read_csv("train_clean_set.csv")
test_dataset = pd.read_csv("test_clean_set.csv")

In [3]:
train_dataset = train_dataset[train_dataset['word_count'] <= 150]

In [4]:
train_dataset.describe()

Unnamed: 0,label,word_count,length,avg_word
count,34085.0,34085.0,34085.0,34085.0
mean,0.477101,57.119349,262.530703,3.635064
std,0.499483,36.262552,168.013332,0.465427
min,0.0,0.0,0.0,0.0
25%,0.0,28.0,128.0,3.347826
50%,0.0,53.0,242.0,3.583333
75%,1.0,82.0,373.0,3.857143
max,1.0,150.0,897.0,9.0


In [5]:
MAX_LEN = 150
MAX_WORDS = 8000

In [6]:
X = train_dataset.search_text.astype(str)
Y = train_dataset.label
vocab = preprocessing.text.Tokenizer(num_words=MAX_WORDS, oov_token='UNK')
vocab.fit_on_texts(X)
X_vector = vocab.texts_to_sequences(X)
X_matrix = preprocessing.sequence.pad_sequences(X_vector, maxlen=MAX_LEN)
print(len(vocab.word_index))

29346


In [7]:
print(vocab.texts_to_sequences(['httpsbitlyvn32wl']))

[[1]]


In [8]:
def RNN():
    inputs = Input(shape=[MAX_LEN])
    layer = layers.Embedding(MAX_WORDS, 300, input_length=MAX_LEN)(inputs)
    layer = layers.CuDNNLSTM(64)(layer)
    layer = layers.Dense(256)(layer)
    layer = layers.Activation('relu')(layer)
    layer = layers.Dropout(0.2)(layer)
    layer = layers.Dense(1)(layer)
    layer = layers.Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [32]:
model = RNN()
model.summary()
model.compile(optimizer=optimizers.RMSprop(), loss='binary_crossentropy', metrics=['accuracy','Precision','Recall'])

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 300)          2400000   
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 64)                93696     
_________________________________________________________________
dense_4 (Dense)              (None, 256)               16640     
_________________________________________________________________
activation_4 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257 

In [33]:
checkpoint_path = "training1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)


ES = callbacks.EarlyStopping(monitor='val_loss',min_delta=0.0001)
CP = callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True, monitor="val_acc", mode="max", save_best_only=True, verbose=1)

cplist = [CP]

In [34]:
model.fit(X_matrix, Y, batch_size=128, epochs=20, validation_split=0.2, callbacks=cplist)

Train on 27268 samples, validate on 6817 samples
Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.92035, saving model to training1/cp.ckpt
Epoch 2/20
Epoch 00002: val_acc improved from 0.92035 to 0.93927, saving model to training1/cp.ckpt
Epoch 3/20
Epoch 00003: val_acc improved from 0.93927 to 0.94162, saving model to training1/cp.ckpt
Epoch 4/20
Epoch 00004: val_acc improved from 0.94162 to 0.94176, saving model to training1/cp.ckpt
Epoch 5/20
Epoch 00005: val_acc improved from 0.94176 to 0.94235, saving model to training1/cp.ckpt
Epoch 6/20
Epoch 00006: val_acc did not improve from 0.94235
Epoch 7/20
Epoch 00007: val_acc did not improve from 0.94235
Epoch 8/20
Epoch 00008: val_acc did not improve from 0.94235
Epoch 9/20
Epoch 00009: val_acc did not improve from 0.94235
Epoch 10/20
Epoch 00010: val_acc did not improve from 0.94235
Epoch 11/20
Epoch 00011: val_acc did not improve from 0.94235
Epoch 12/20
Epoch 00012: val_acc did not improve from 0.94235
Epoch 13/20
Epoch 00013:

<tensorflow.python.keras.callbacks.History at 0x7f39e81d6390>

In [35]:
X_test = test_dataset.search_text.astype(str)
Y_test = test_dataset.label
X_test_matrix = preprocessing.sequence.pad_sequences(vocab.texts_to_sequences(X_test), maxlen=MAX_LEN)

In [36]:
accr = model.evaluate(X_test_matrix, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Recall: {:0.3f}\n  Precision: {:0.3f}'.format(accr[0],accr[1],accr[2],accr[3]))

Test set
  Loss: 0.628
  Accuracy: 0.909
  Recall: 0.905
  Precision: 0.912


In [37]:
f1_score = (2 * accr[2] * accr[3]) / (accr[2] + accr[3])
print('F1 Score: {:0.3f}'.format(f1_score))

F1 Score: 0.909


In [38]:
Y_predicted = model.predict(X_test_matrix).transpose()[0].round()
print(classification_report(Y_test.values, Y_predicted, digits=4))

              precision    recall  f1-score   support

           0     0.9133    0.9059    0.9096      5049
           1     0.9048    0.9123    0.9086      4951

    accuracy                         0.9091     10000
   macro avg     0.9091    0.9091    0.9091     10000
weighted avg     0.9091    0.9091    0.9091     10000



In [39]:
model.load_weights(checkpoint_path)
Y_predicted = model.predict(X_test_matrix).transpose()[0].round()
print(classification_report(Y_test.values, Y_predicted, digits=4))

              precision    recall  f1-score   support

           0     0.9215    0.9228    0.9221      5049
           1     0.9211    0.9198    0.9205      4951

    accuracy                         0.9213     10000
   macro avg     0.9213    0.9213    0.9213     10000
weighted avg     0.9213    0.9213    0.9213     10000

