In [101]:
import sklearn
assert sklearn.__version__ >= "0.20"
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras import layers
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## Parsing Dataset

In [102]:
!rm -r text

In [103]:
!mkdir text
!mkdir text/clickbait
!mkdir text/normal

In [104]:
df = pd.read_csv('clickbait_data.csv')

In [105]:
normal = df.loc[df['clickbait']==0]['headline'].values
clickbait = df.loc[df['clickbait']==1]['headline'].values

In [106]:
def generateTensorflowTextDir(data,folderName):
    for i,line in enumerate(data):
        filename = f'{i}_{folderName}'
        with open(f"text/{folderName}/{filename}.txt", "w") as outfile:
            outfile.write(line)

In [107]:
generateTensorflowTextDir(normal,'normal')

In [108]:
generateTensorflowTextDir(clickbait,'clickbait')

In [79]:
# Batch Size -- we set to 1 to just read it in
BATCH_SIZE = 512
DATASET_SIZE = 32000
train_size = 32000 - 6400
test_size =6400
seed = 49

In [80]:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'text',
    batch_size=BATCH_SIZE,
    subset='training',
    validation_split=0.2, 
    seed=seed
)

Found 32000 files belonging to 2 classes.
Using 25600 files for training.


In [81]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to clickbait
Label 1 corresponds to normal


In [82]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
   'text/',
    batch_size=BATCH_SIZE,
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

Found 32000 files belonging to 2 classes.
Using 6400 files for validation.


In [77]:
for x,y in raw_train_ds.take(1):
    print(x[0:10],y[0:10])

tf.Tensor(
[b'This Time, Treasuries Push Markets Higher'
 b'Tunisian ATR-72 plane crash on 6 August caused by incorrect fuel gauge'
 b'Sri Lankan Rebel Commander Also Served as a Cult Figure'
 b'News services and web companies increase Farsi services in light of Iranian political situation'
 b'Chemical firm LyondellBasell collapses'
 b'Earnings Drop 40% for European Plane Maker'
 b'As the Eurovision entrants return home, the home crowds weigh in'
 b'Lewis Hamilton wins 2008 British Grand Prix'
 b'Five dead in Calgary murder-suicide'
 b'Senate Committee hears bailout proposal'], shape=(10,), dtype=string) tf.Tensor([1 1 1 1 1 1 0 1 1 1], shape=(10,), dtype=int32)


In [83]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return input_data
    return lowercase

In [84]:
max_features = 5000
sequence_length = 500

vectorize_layer = TextVectorization(
#     standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [85]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [86]:
def vectorize_text(text, label):
    #text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [87]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)

In [88]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [89]:
embedding_dim = 32

## Models

In [90]:
keras.backend.clear_session()
model = tf.keras.Sequential([
  layers.Embedding(max_features, embedding_dim,input_length=sequence_length),
  LSTM(32, return_sequences=True),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1,activation='sigmoid')])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 500, 32)           8320      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 168,353
Trainable params: 168,353
Non-trainable params: 0
_________________________________________________________________


In [96]:
keras.backend.clear_session()
model = tf.keras.Sequential([
  layers.Embedding(max_features, embedding_dim),
  LSTM(32, return_sequences=True),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(units=32, activation='relu'),
  layers.BatchNormalization(),
  layers.Dropout(0.2),
  layers.Dense(1,activation='sigmoid')])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          160000    
_________________________________________________________________
lstm (LSTM)                  (None, None, 32)          8320      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
batch_normalization (BatchNo (None, 32)                128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0

In [97]:
#[tf.metrics.BinaryAccuracy(threshold=0.5]
optimizer = keras.optimizers.Adam(lr=0.0001)
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])

In [98]:
epochs = 100
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
 9/50 [====>.........................] - ETA: 24s - loss: 0.0947 - accuracy: 0.9801

KeyboardInterrupt: 