# **CLICKBAIT CLASSIFIER**
***

[Dataset](https://www.kaggle.com/datasets/amananandrai/clickbait-dataset?select=clickbait_data.csv)

In [84]:
# ML-related external modules
import nltk
import keras.layers
import numpy as np
import pandas as pd
# NOTE(review): `plt` conventionally aliases matplotlib.pyplot, not matplotlib itself — confirm intent.
import matplotlib as plt
from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense, Embedding, GlobalAvgPool1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from keras.backend import clear_session
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from numpy import asarray
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# non-ML related external modules
import contractions
import latexify

# built-in modules
from collections import Counter
from importlib import reload
import time
import re
import random
import string
from string import punctuation
import math
import csv

#local files
import preprocessor
import grapher
import encode
import df_helper

In [85]:
# important paths
# Forward slashes are accepted by Python's file APIs on Windows as well as on
# Linux/macOS, so the notebook is no longer tied to Windows path separators.
# Both constants keep their trailing separator so `DATASETS + 'name.csv'` still works.
DATASETS = 'Datasets/'
SAVED_MODELS = 'Saved Models/'

## LOAD DATA
- create aggregate dataset from all datasets for model training

In [86]:
# load data from csv into dataframe
# All paths are built from DATASETS so the data directory is configured in one place.
file1 = DATASETS + 'clickbait_data.csv'
file2 = DATASETS + 'clickbait_data2.csv'
file3 = DATASETS + 'news_data.csv'

f1_content = pd.read_csv(file1)
f2_content = pd.read_csv(file2, usecols=['title'])
# Read two rows fewer than f2_content holds.
# NOTE(review): presumably this offsets a two-row label imbalance in file1 so the
# aggregated dataset ends up with equal class counts — confirm against value_counts().
f3_content = pd.read_csv(file3, usecols=['headline_text'], nrows=f2_content.shape[0] - 2)

# Bring every frame onto the shared 'headline' / 'label' column naming.
# read_csv already returns DataFrames, and rename() returns a new frame, so the
# raw f*_content frames are left untouched (no in-place mutation, no aliasing).
df1 = f1_content.rename(columns={'clickbait': 'label'})
df2 = f2_content.rename(columns={'title': 'headline'})
df3 = f3_content.rename(columns={'headline_text': 'headline'})

In [87]:
# (rows, columns) of each raw frame, in load order
tuple(raw_frame.shape for raw_frame in (f1_content, f2_content, f3_content))

((32000, 2), (4978, 1), (4976, 1))

In [88]:
# add label columns to df2 and df3: every df2 row is labelled 1, every df3 row 0
for frame, label_value in ((df2, 1), (df3, 0)):
    frame['label'] = label_value

In [89]:
# aggregate data: stack the three frames and renumber rows from 0
frames_to_stack = [df1, df2, df3]
agg_df = pd.concat(frames_to_stack, ignore_index=True)
agg_df.shape

(41954, 2)

In [90]:
# check for missing entries: number of null headlines in the aggregate
missing_headlines = agg_df['headline'].isnull()
missing_headlines.sum()

0

In [91]:
# check distribution of labels: row count per label value
label_counts = agg_df['label'].value_counts()
label_counts

1    20977
0    20977
Name: label, dtype: int64

In [92]:
# save the uncleaned aggregate to disk, then preview it
aggregate_csv_path = DATASETS + 'aggregate_data.csv'
# to_csv() returns None when it is given a path, so uncleaned_data holds None, not the data.
uncleaned_data = agg_df.to_csv(aggregate_csv_path, encoding='utf-8')
agg_df

Unnamed: 0,headline,label
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
41949,port kembla steelworks fire extinguished,0
41950,portrait of terror sheikh as a young man,0
41951,protesters target pm at centre funding announc...,0
41952,push on for leagues club changes,0


In [93]:
# define label column: the targets as 32-bit floats
y = agg_df['label'].astype('float32')

## **PRE-PROCESSING**
- clean data
    - convert to lowercase
    - expand contractions (handle exceptions)
    - remove punctuation (keep hyphens)
    - remove stop words *(optional)*
    - adjust spacing as needed

In [None]:
# Clean the headlines with the local preprocessor, dropping stop words (remove_sw=True).
# The previous `cleaned: DataFrame = ...` annotation referenced a bare `DataFrame`
# name that is never imported; annotations on simple names are evaluated at cell
# scope, so the cell raised NameError before any cleaning ran.
# NOTE(review): the result is assigned to a single column and later fed to
# CountVectorizer, so it is presumably a Series of strings — confirm in preprocessor.
cleaned = preprocessor.clean_data(agg_df, remove_sw=True)
# Overwrite the headline column so agg_df carries the cleaned text from here on.
agg_df['headline'] = cleaned
cleaned.head()

In [None]:
# save cleaned dataset
# cleaned_data = cleaned.to_csv(DATASETS + 'cleaned_data.csv', encoding='utf-8')

## Splitting the Dataset
- split cleaned data into training and test sets

In [None]:
# Hold out 20% of the cleaned headlines for testing; random_state pins the shuffle.
split_parts = train_test_split(cleaned, y, test_size=0.2, random_state=3)
train_data, test_data = split_parts[0], split_parts[1]
# Labels are cast to float32 on both sides of the split.
y_tr = split_parts[2].astype(np.float32)
y_te = split_parts[3].astype(np.float32)

### **ENCODING**
- obtain doc-term matrix of corpus
- obtain vocab list

In [None]:
# X_tr / X_te are dense document-term matrices: one row per headline, one column
# per vocabulary term. The vocabulary is fitted on the training split only.
reload(encode)  # re-import the local helper module so edits to it are picked up
cv = CountVectorizer()
cv.fit(train_data)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back on older releases. list() keeps `vocab` a plain
# list in both cases, as it was before.
if hasattr(cv, 'get_feature_names_out'):
    vocab = list(cv.get_feature_names_out())
else:
    vocab = cv.get_feature_names()
# transform() returns a sparse matrix; it is densified before being fed to the model.
X_tr = cv.transform(train_data).toarray()
X_te = cv.transform(test_data).toarray()
vocab, X_tr.shape, X_te.shape

## Building & Training the Model

In [None]:
# ReLU activation written out as plain Python: returns x when x >= 0, otherwise 0.
# It is wrapped by latexify.with_latex, presumably so the formula is rendered as
# LaTeX when the bare `relu` on the last line is displayed as the cell output.
# The Keras model below selects its activation by the string 'relu'; this function
# does not appear to be called anywhere in the visible cells.
@latexify.with_latex
def relu(x):
    if x >= 0:
        return x
    else:
        return 0
relu

In [None]:
# activation function
# Logistic sigmoid 1 / (1 + e^-x), wrapped by latexify.with_latex and displayed
# via the bare `sigmoid` on the last line.
# `e` was not defined in any visible cell, so evaluating the function body would
# raise NameError. Binding it here leaves the function source (and therefore
# whatever latexify renders from it) unchanged while making the body evaluable.
e = math.e

@latexify.with_latex
def sigmoid(x):
    return 1 / (1 + e ** -x)
sigmoid

In [None]:
# Two-layer dense classifier over the document-term features:
# a 10-unit ReLU hidden layer followed by a single sigmoid output unit.
model = Sequential([
    Dense(10, activation='relu', input_dim=X_tr.shape[1]),
    Dense(1, activation='sigmoid'),
])
model.summary()
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# EarlyStopping comes from tensorflow.keras (imported in the top import cell) so the
# callback belongs to the same Keras implementation as the Sequential model.
# It stops training once val_accuracy has gone 5 epochs without improving.
callback = EarlyStopping(monitor='val_accuracy', patience=5)

# Set to True to train with early stopping; False reproduces the plain 20-epoch run.
USE_EARLY_STOPPING = False

# NOTE(review): validation_data is the held-out test split, so enabling early
# stopping would tune the stopping point on the test set — consider a separate
# validation split before relying on the reported test accuracy.
history = model.fit(X_tr, y_tr,
                    epochs=20,
                    verbose=True,
                    validation_data=(X_te, y_te),
                    batch_size=128,
                    callbacks=[callback] if USE_EARLY_STOPPING else None)

# Plot the per-epoch training curves with the local grapher helper.
grapher.plot_history(history)

In [None]:
# Score the fitted model on the training split; evaluate() yields loss then accuracy.
loss, accuracy = model.evaluate(X_tr, y_tr, verbose=True)
print(f"Training Accuracy: {accuracy:.4f}")

In [None]:
# Score the fitted model on the held-out test split; evaluate() yields loss then accuracy.
loss, accuracy = model.evaluate(X_te, y_te, verbose=True)
print(f"Testing Accuracy:  {accuracy:.4f}")

In [None]:
# save model
# model.save('Saved Models\\trained_agg')

In [None]:
# load model
#trained_model = keras.models.load_model('Saved Models\\trained_2')

In [94]:
def clickbait_classifier(headline=None):
    """Classify a single headline as clickbait or not and print the verdict.

    Args:
        headline: Text to classify. When omitted (None) the headline is read
            interactively with input(), which is the original behaviour; passing
            a string lets the cell be re-run reproducibly without a prompt.

    Returns:
        The first row of model.predict(...) for the encoded headline. A value
        >= 0.5 is reported as clickbait.

    Relies on the notebook-level names `encode`, `model` and `train_data`.
    """
    from_user = input() if headline is None else headline
    # NOTE(review): the raw text is encoded without going through
    # preprocessor.clean_data — presumably to_doc_term_matrix handles any cleaning
    # needed to match the training vocabulary; confirm in encode.
    doc_term_matrix = encode.to_doc_term_matrix(from_user, train_data=train_data)
    pred = model.predict(doc_term_matrix, verbose=False)[0]
    verdict = 'CLICKBAIT!' if pred >= 0.5 else 'Not clickbait.'
    print('"{}": {}\n'.format(from_user, verdict))
    return pred

In [95]:
# Prompt for a headline, print the verdict, then print the value the classifier
# returned (the first row of model.predict for that headline).
result = clickbait_classifier()
print(result)

"15 shocking life hacks that will leave you speechless": CLICKBAIT!

[0.99999994]
