In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [2]:
from google.colab import files
uploaded = files.upload()

Saving clickbait_data.csv to clickbait_data (1).csv


In [3]:
import io

data = pd.read_csv(io.BytesIO(uploaded['clickbait_data.csv']))

In [4]:
data.sample(10)

Unnamed: 0,headline,clickbait
5589,19 Photos You'll Appreciate If You Love Rainy ...,1
617,Which Person Should Write You A Haiku For Vale...,1
12036,Donald Trump Gave Us His Best Dad Moves For A ...,1
19827,Political pundits debate Fred Thompson's young...,0
13351,Justin Bieber Revealed Why He Cried At The VMAs,1
7903,17 Beautiful Rooms For The Book-Loving Soul,1
23803,Super Tuesday 2012: 'Joe the Plumber' wins GOP...,0
16632,ITMS Canada launched,0
7325,17 Thoughts Women Have Had During Sex That Are...,1
12003,"24 ""Back To The Future"" Tattoos That Will Blow...",1


In [5]:
 X = data['headline']
 y = data['clickbait']

In [6]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y)

### Tokenizer

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
tokenizer = Tokenizer(num_words=5000)

tokenizer.fit_on_texts(X)

In [9]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [10]:
X_train = pad_sequences(X_train, maxlen=500)
X_test = pad_sequences(X_test, maxlen=500)

### **Training Model**

In [11]:
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [12]:
model = Sequential()

In [13]:
model.add(Embedding(5000, 32, input_length=500))
model.add(LSTM(32, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 500, 32)           8320      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 168,353
Trainable params: 168,353
Non-trainable params: 0
_________________________________________________________________


In [14]:
callbacks = [
    EarlyStopping(
        monitor='val_accuracy',
        min_delta=1e-4,
        patience=3,
        verbose=1
    ),
    ModelCheckpoint(
        filepath='weights.h5',
        monitor='val_accuracy', 
        mode='max', 
        save_best_only=True,
        save_weights_only=True,
        verbose=1
    )
]

In [15]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, batch_size=512, validation_data=(X_test, y_test), epochs=20, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 24000 samples, validate on 8000 samples
Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.95375, saving model to weights.h5
Epoch 2/20

Epoch 00002: val_accuracy improved from 0.95375 to 0.96688, saving model to weights.h5
Epoch 3/20

Epoch 00003: val_accuracy did not improve from 0.96688
Epoch 4/20

Epoch 00004: val_accuracy improved from 0.96688 to 0.97300, saving model to weights.h5
Epoch 5/20

Epoch 00005: val_accuracy improved from 0.97300 to 0.97588, saving model to weights.h5
Epoch 6/20

Epoch 00006: val_accuracy improved from 0.97588 to 0.97663, saving model to weights.h5
Epoch 7/20

Epoch 00007: val_accuracy improved from 0.97663 to 0.97750, saving model to weights.h5
Epoch 8/20

Epoch 00008: val_accuracy did not improve from 0.97750
Epoch 9/20

Epoch 00009: val_accuracy did not improve from 0.97750
Epoch 10/20

Epoch 00010: val_accuracy did not improve from 0.97750
Epoch 00010: early stopping


In [16]:
y_pred = [round(i[0]) for i in model.predict(X_test)]

In [17]:
cm = confusion_matrix(y_test, y_pred)

tn, fp, fn, tp = cm.ravel()

precision = tp/(tp+fp)
recall = tp/(tp+fn)

print("Recall of the model is {:.2f}".format(recall))
print("Precision of the model is {:.2f}".format(precision))

Recall of the model is 0.97
Precision of the model is 0.98
