In [1]:
import sklearn
assert sklearn.__version__ >= "0.20"
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import feature_column
from tensorflow.keras.layers import Embedding, LSTM
from tensorflow.keras import layers
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

### Generating Text Files

In [2]:
!rm -r text

In [3]:
!mkdir text
!mkdir text/clickbait
!mkdir text/normal

In [77]:
df = pd.read_csv('clickbait_data.csv')

In [80]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.1,random_state=seed)

In [81]:
train.to_csv('clickbait_train.csv',index=False)
test.to_csv('clickbait_test.csv',index=False)

In [82]:
df = pd.read_csv('clickbait_train.csv')

In [83]:
normal = df.loc[df['clickbait']==0]['headline'].values
clickbait = df.loc[df['clickbait']==1]['headline'].values

In [84]:
def generateTensorflowTextDir(data,folderName):
    for i,line in enumerate(data):
        filename = f'{i}_{folderName}'
        with open(f"text/{folderName}/{filename}.txt", "w") as outfile:
            outfile.write(line)

In [85]:
generateTensorflowTextDir(normal,'normal')

In [86]:
generateTensorflowTextDir(clickbait,'clickbait')

### Generating Test Data

In [17]:
testData = pd.read_csv('clickbait_test.csv')
testText = testData['headline'].values
testLabels = testData['clickbait'].values

In [163]:
predictions = export_model.predict(testText)

In [164]:
from sklearn.metrics import accuracy_score
def getAccuracy(labels,predictions):
    return accuracy_score(labels,np.round(predictions))

In [63]:
tweetTestData = pd.read_csv('tweets.csv')
tweetTestText = [' '.join(map(lambda x: x.strip("\n;[]\\"), l.split(' '))) for l in tweetTestData.postText]

tweetTestLabels = np.round(tweetTestData['isClickbait'].values)

### Parsing Text Files

In [2]:
# Batch Size -- we set to 1 to just read it in
BATCH_SIZE = 512
DATASET_SIZE = 32000
train_size = 32000 - 6400
test_size =6400
seed = 49

In [3]:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'text',
    batch_size=BATCH_SIZE,
    class_names=['normal','clickbait'],
    subset='training',
    validation_split=0.2, 
    seed=seed
)

Found 51083 files belonging to 2 classes.
Using 40867 files for training.


In [4]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to normal
Label 1 corresponds to clickbait


In [5]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
   'text/',
    batch_size=BATCH_SIZE,
    class_names=['normal','clickbait'],
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

Found 51083 files belonging to 2 classes.
Using 10216 files for validation.


In [6]:
for x,y in raw_train_ds.take(1):
    print(x[0:10],y[0:10])

tf.Tensor(
[b"Don't Try To Make Your Own Homemade Creme Eggs Because It's More Trouble Than It's Worth"
 b"Alleged 'rights group' involved with removal of anti-Scientology videos from YouTube doesn't exist; says EFF"
 b'Several groups seek to purchase Saturn auto brand'
 b'US military admits to accidentally killing Iraqi child'
 b'A Familiar Path in Months Before Fatal Shooting'
 b'Two politicians jailed for life over Rwandan genocide'
 b"Brazilian President: not continuing to use biofuels would be a 'crime against humanity'"
 b'15 Bloody Delicious Afternoon Teas You Must Eat Before You Die'
 b"We Need To Talk About Justin Bieber's Hair"
 b'What Does GOP Even Stand For'], shape=(10,), dtype=string) tf.Tensor([1 0 0 0 0 0 0 1 1 1], shape=(10,), dtype=int32)


In [7]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return input_data
    return lowercase

In [8]:
max_features = 5000
sequence_length = 500

vectorize_layer = TextVectorization(
#     standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [9]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [10]:
def vectorize_text(text, label):
    #text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [11]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)

In [12]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [13]:
embedding_dim = 32

## Models

In [14]:
keras.backend.clear_session()
model = tf.keras.Sequential([
  layers.Embedding(max_features, embedding_dim,input_length=sequence_length),
  LSTM(32, return_sequences=True),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1,activation='sigmoid')])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 500, 32)           8320      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 168,353
Trainable params: 168,353
Non-trainable params: 0
_________________________________________________________________


In [15]:
#[tf.metrics.BinaryAccuracy(threshold=0.5]
optimizer = keras.optimizers.Adam(lr=0.0001)
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])

In [16]:
epochs = 5
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
])

In [57]:
predictions = export_model.predict(testText)

In [61]:
getAccuracy(testLabels,predictions)

0.975625

In [64]:
predictions = export_model.predict(tweetTestText)

In [65]:
getAccuracy(tweetTestLabels,predictions)

0.5084

### Label Smoothing

In [67]:
keras.backend.clear_session()
model = tf.keras.Sequential([
  layers.Embedding(max_features, embedding_dim,input_length=sequence_length),
  LSTM(32, return_sequences=True),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1,activation='sigmoid')])

model.summary()
#[tf.metrics.BinaryAccuracy(threshold=0.5]
optimizer = keras.optimizers.Adam(lr=0.0001)
model.compile(loss=losses.BinaryCrossentropy(from_logits=True,label_smoothing=.1),
              optimizer='adam',
              metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 500, 32)           8320      
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 168,353
Trainable params: 168,353
Non-trainable params: 0
_________________________________________________________________


In [68]:

epochs = 5
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
14/80 [====>.........................] - ETA: 34s - loss: 0.3120 - accuracy: 0.9523

KeyboardInterrupt: 

In [69]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
])

In [70]:
predictions = export_model.predict(testText)

In [71]:
getAccuracy(testLabels,predictions)

0.963125

In [72]:
predictions = export_model.predict(tweetTestText)

In [73]:
getAccuracy(tweetTestLabels,predictions)

0.558