In [90]:
import sklearn
assert sklearn.__version__ >= "0.20"
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras import layers
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

### Generating Text Files

In [2]:
!rm -r text

In [3]:
!mkdir text
!mkdir text/clickbait
!mkdir text/normal

In [77]:
df = pd.read_csv('clickbait_data.csv')

In [80]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.1,random_state=seed)

In [81]:
train.to_csv('clickbait_train.csv',index=False)
test.to_csv('clickbait_test.csv',index=False)

In [82]:
df = pd.read_csv('clickbait_train.csv')

In [83]:
normal = df.loc[df['clickbait']==0]['headline'].values
clickbait = df.loc[df['clickbait']==1]['headline'].values

In [84]:
def generateTensorflowTextDir(data,folderName):
    for i,line in enumerate(data):
        filename = f'{i}_{folderName}'
        with open(f"text/{folderName}/{filename}.txt", "w") as outfile:
            outfile.write(line)

In [85]:
generateTensorflowTextDir(normal,'normal')

In [86]:
generateTensorflowTextDir(clickbait,'clickbait')

### Generating Test Data

In [21]:
testData = pd.read_csv('clickbait_test.csv')
testText = testData['headline'].values
testLabels = testData['clickbait'].values

In [23]:
tweetTestData = pd.read_csv('tweets.csv')
tweetTestText = [' '.join(map(lambda x: x.strip("\n;[]\\"), l.split(' '))) for l in tweetTestData.postText]

tweetTestLabels = np.round(tweetTestData['isClickbait'].values)

### Evaluation Helper functions

In [22]:
from sklearn.metrics import accuracy_score
def getAccuracy(labels,predictions):
    return accuracy_score(labels,np.round(predictions))

In [45]:
def export_model(model):
    ouput = tf.keras.Sequential([
      vectorize_layer,
      model,
    ])
    return ouput

In [59]:
def runEvaluation(model):
    model_predictions= model.predict(testText)
    model_accuracy = getAccuracy(testLabels,model_predictions)
    model_tweet_predictions = model.predict(tweetTestText)
    model_tweet_accuracy = getAccuracy(tweetTestLabels,model_tweet_predictions)
    return model_accuracy,model_tweet_accuracy
    

### Parsing Text Files

In [2]:
# Batch Size -- we set to 1 to just read it in
BATCH_SIZE = 512
DATASET_SIZE = 32000
train_size = 32000 - 6400
test_size =6400
seed = 49

In [3]:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'text',
    batch_size=BATCH_SIZE,
    class_names=['normal','clickbait'],
    subset='training',
    validation_split=0.2, 
    seed=seed
)

Found 51083 files belonging to 2 classes.
Using 40867 files for training.


In [4]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to normal
Label 1 corresponds to clickbait


In [5]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
   'text/',
    batch_size=BATCH_SIZE,
    class_names=['normal','clickbait'],
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

Found 51083 files belonging to 2 classes.
Using 10216 files for validation.


In [6]:
for x,y in raw_train_ds.take(1):
    print(x[0:10],y[0:10])

tf.Tensor(
[b"Don't Try To Make Your Own Homemade Creme Eggs Because It's More Trouble Than It's Worth"
 b"Alleged 'rights group' involved with removal of anti-Scientology videos from YouTube doesn't exist; says EFF"
 b'Several groups seek to purchase Saturn auto brand'
 b'US military admits to accidentally killing Iraqi child'
 b'A Familiar Path in Months Before Fatal Shooting'
 b'Two politicians jailed for life over Rwandan genocide'
 b"Brazilian President: not continuing to use biofuels would be a 'crime against humanity'"
 b'15 Bloody Delicious Afternoon Teas You Must Eat Before You Die'
 b"We Need To Talk About Justin Bieber's Hair"
 b'What Does GOP Even Stand For'], shape=(10,), dtype=string) tf.Tensor([1 0 0 0 0 0 0 1 1 1], shape=(10,), dtype=int32)


In [7]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return input_data
    return lowercase

In [8]:
max_features = 5000
sequence_length = 500

vectorize_layer = TextVectorization(
#     standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [9]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [10]:
def vectorize_text(text, label):
    #text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [11]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)

In [12]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [13]:
embedding_dim = 32

## Experiment 1: Simple NN Performance vs Traditional ML

### Base ML Models

In [14]:
def dsToNumpy(ds):
    xList=[]
    yList=[]
    for x,y in ds.unbatch():
        xList.append(x.numpy())
        yList.append(y.numpy())
    xList=np.array(xList)
    yList=np.array(yList)
    return xList,yList
x_train,y_train = dsToNumpy(train_ds)
x_val,y_val = dsToNumpy(val_ds)

In [25]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=10, random_state=seed).fit(x_train, y_train)
predictions=rf.predict(x_val)
predictions

array([1, 0, 1, ..., 1, 1, 0], dtype=int32)

In [26]:
getAccuracy(y_val,predictions)

0.8307556773688332

### Simple LSTM Model

In [36]:
def LSTM_nn():
    model = tf.keras.Sequential([
      layers.Embedding(max_features, embedding_dim,input_length=sequence_length),
      LSTM(32, return_sequences=True),
      layers.GlobalAveragePooling1D(),
      layers.Dropout(0.2),
      layers.Dense(1,activation='sigmoid')])
    return model

In [37]:
lstm_model = LSTM_nn()
optimizer = keras.optimizers.Adam(lr=0.0001)
lstm_model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])
epochs = 5
history = lstm_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [60]:
export_lstm_model = export_model(lstm_model)
lstm_accuracy,lstm_tweet_accuracy = runEvaluation(export_lstm_model)

In [62]:
print('same dataset evaluation accuracy:')
print(lstm_accuracy)
print('twitter dataset evaluation accuracy:')
print(lstm_tweet_accuracy)

same dataset evaluation accuracy:
0.98
twitter dataset evaluation accuracy:
0.5421


## Experiment 2: NN Architecture Comparisons + Ensemble

### Simple GRU Model

In [54]:
def GRU_nn():
    model = tf.keras.Sequential([
      layers.Embedding(max_features, embedding_dim,input_length=sequence_length),
      layers.GRU(32, return_sequences=True),
      layers.GlobalAveragePooling1D(),
      layers.Dropout(0.2),
      layers.Dense(1,activation='sigmoid')])
    return model

In [55]:
gru_model = GRU_nn()
optimizer = keras.optimizers.Adam(lr=0.0001)
gru_model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])
epochs = 5
history = gru_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [63]:
export_gru_model = export_model(gru_model)
gru_accuracy,gru_tweet_accuracy = runEvaluation(export_gru_model)

In [64]:
print('same dataset evaluation accuracy:')
print(gru_accuracy)
print('twitter dataset evaluation accuracy:')
print(gru_tweet_accuracy)

same dataset evaluation accuracy:
0.9740625
twitter dataset evaluation accuracy:
0.5544


### BERT Model

In [80]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer

In [81]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8' 

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [82]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

In [84]:
def BERT_nn():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [89]:
bert_model = BERT_nn()
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')
bert_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)
history = bert_model.fit(
    raw_train_ds,
    validation_data=raw_val_ds,
    epochs=epochs)









Epoch 1/5
 5/80 [>.............................] - ETA: 1:10:47 - loss: 0.8643 - binary_accuracy: 0.4123

KeyboardInterrupt: 

## Experiment 2: Performance of Label Smoothing

In [67]:
keras.backend.clear_session()
model = tf.keras.Sequential([
  layers.Embedding(max_features, embedding_dim,input_length=sequence_length),
  LSTM(32, return_sequences=True),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1,activation='sigmoid')])

model.summary()
#[tf.metrics.BinaryAccuracy(threshold=0.5]
optimizer = keras.optimizers.Adam(lr=0.0001)
model.compile(loss=losses.BinaryCrossentropy(from_logits=True,label_smoothing=.1),
              optimizer='adam',
              metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 500, 32)           8320      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 168,353
Trainable params: 168,353
Non-trainable params: 0
_________________________________________________________________


In [68]:

epochs = 5
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
14/80 [====>.........................] - ETA: 34s - loss: 0.3120 - accuracy: 0.9523

KeyboardInterrupt: 

In [69]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
])

In [70]:
predictions = export_model.predict(testText)

In [71]:
getAccuracy(testLabels,predictions)

0.963125

In [72]:
predictions = export_model.predict(tweetTestText)

In [73]:
getAccuracy(tweetTestLabels,predictions)

0.558