# **Clickbait Detection**

## **Preparation & Preprocess**

In [None]:
# Install dependencies
!pip install transformers datasets evaluate
!pip install transformers[torch]
!pip install kora -q

In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [3]:
# Download data dari Google Drive
from kora import drive
from urllib.request import urlretrieve
import sys

def download_folder(folder_id):
    # authenticate
    from google.colab import auth
    auth.authenticate_user()
    # get folder_name
    from googleapiclient.discovery import build
    service = build('drive', 'v3')
    folder_name = service.files().get(fileId=folder_id).execute()['name']
    # install library
    url = 'https://github.com/segnolin/google-drive-folder-downloader/raw/master/download.py'
    path = 'download.py'
    urlretrieve(url, path)
    # recursive download
    import download
    download.download_folder(service, folder_id, './', folder_name)
    return folder_name

folder_name = download_folder('1IDfO0fwT6xUor6EG7qlBjUJ7EPiYDH-A')  # id folder penyimpanan dataset dan model
sys.path.insert(0, f"/content/{folder_name}")

df_train = pd.read_csv(f'/content/{folder_name}/train.csv')
df_valid = pd.read_csv(f'/content/{folder_name}/valid.csv')

1kXInsWxf-n7oCM3CRQk4H8rgt4_mQ-mb test.csv text/csv (1/3)
15gZ_1j4-PCHjxvE8O4Cf-MYc6kxhSLdi train.csv text/csv (2/3)
1LmUR_8ORIZKeP47rgczrUko--vXAhIhi valid.csv text/csv (3/3)


In [4]:
df_train

Unnamed: 0,label,title,text
0,news,China and Economic Reform: Xi Jinping’s Track ...,Economists generally agree: China must overhau...
1,news,Trade to Be a Big Topic in Theresa May’s U.S. ...,LONDON—British Prime Minister Theresa May said...
2,clickbait,"The Top Beaches In The World, According To Nat...",Beaches come in all sorts of shapes and sizes ...
3,clickbait,Sheriff’s Report Provides New Details on Tamir...,"A timeline of what happened after Tamir Rice, ..."
4,news,Surgeon claiming he will transplant volunteer'...,An Italian neurosurgeon who has claimed for mo...
...,...,...,...
24866,other,,Because the success of the individual is very ...
24867,news,"Unable to Enter U.S., and Still Stranded Abroa...","AMSTERDAM — Pedram Paragomi, a Iranian medical..."
24868,news,"Calais Migrant Camp Will Be Demolished Soon, F...",(AFP) — The total dismantling of the “Jungle” ...
24869,news,Twitter’s NFL Deal No Cure for User and Advert...,Gaining the worldwide rights to stream 10 NFL ...


In [5]:
# Fungsi preproses dataset
# Preproses berupa: menghapus data yang mengandung N/A, penyeimbangan data positif dan negatif (undersampling)
def preprocess(df, contain_label=True, balance_out=False):
  result = None
  if contain_label:
    result = df[(df['label'] == 'news') | (df['label'] == 'clickbait')].dropna()

    if balance_out:
      news_count = len(result.loc[result['label'] == 'news'])
      clickbait_count = len(result.loc[result['label'] == 'clickbait'])
      each_count = min(news_count, clickbait_count)

      news = shuffle(shuffle(result.loc[result['label'] == 'news']).head(each_count))
      clickbait = shuffle(shuffle(result.loc[result['label'] == 'clickbait']).head(each_count))

      result = shuffle(pd.concat([news, clickbait]))
  else:
    result = df.dropna()

  X = result[['title', 'text']].to_numpy()
  if contain_label:
    y = (result['label'] == 'clickbait').to_numpy().astype(int)
    return X, y
  else:
    return X

In [6]:
# Preproses data latih
X_train, y_train = preprocess(df_train, balance_out=True)

In [7]:
# Preproses data validasi
X_valid, y_valid = preprocess(df_valid)

In [8]:
# Create dataset
from datasets import Dataset

ds_train = Dataset.from_dict({
    'title': X_train[:, 0].tolist(),
    'text': X_train[:, 1].tolist(),
    'label': y_train.tolist()
})
ds_validation = Dataset.from_dict({
    'title': X_valid[:, 0].tolist(),
    'text': X_valid[:, 1].tolist(),
    'label': y_valid.tolist()
})

In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Tokenizer
pretrained_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

# Fungsi preproses tokenisasi
def preprocess_title_only(examples):
  return tokenizer(examples['title'], truncation=True)

def preprocess_title_text_pair(examples):
  return tokenizer(examples['title'], examples['text'], truncation=True)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Map id ke label dan sebaliknya
id2label = {0: 'news', 1: 'clickbait'}
label2id = {'news': 0, 'clickbait': 1}

In [11]:
# Metrik evaluasi
import evaluate

accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load('f1')

def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)

  accuracy = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']
  precision = precision_metric.compute(predictions=predictions, references=labels)['precision']
  recall = recall_metric.compute(predictions=predictions, references=labels)['recall']
  f1 = f1_metric.compute(predictions=predictions, references=labels)['f1']

  return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

##**Fungsi Training (PyTorch)**

In [27]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Fungsi training
def train(save_path, tokenized_train, tokenized_validation):
  # Load pretrained
  model = AutoModelForSequenceClassification.from_pretrained(
      pretrained_name, num_labels=2, id2label=id2label, label2id=label2id
  )

  # Set argumen training
  training_args_title_only = TrainingArguments(
      output_dir=save_path,
      learning_rate=2e-5,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      num_train_epochs=5,
      weight_decay=0.01,
      evaluation_strategy='epoch',
      save_strategy='epoch',
      load_best_model_at_end=True,
  )

  # Membuat trainer
  trainer = Trainer(
      model=model,
      args=training_args_title_only,
      train_dataset=tokenized_train_title_only,
      eval_dataset=tokenized_validation_title_only,
      data_collator=data_collator,
      compute_metrics=compute_metrics
  )

  # Melakukan training
  trainer.train()

  # Save model
  trainer.save_model(save_path)
  tokenizer.save_pretrained(save_path)

## **Training: Title Only**

In [13]:
# Tokenisasi
tokenized_train_title_only = ds_train.map(preprocess_title_only, batched=True)
tokenized_validation_title_only = ds_validation.map(preprocess_title_only, batched=True)

Map:   0%|          | 0/7448 [00:00<?, ? examples/s]

Map:   0%|          | 0/2624 [00:00<?, ? examples/s]

In [14]:
# Training
train('./model_title_only', tokenized_train_title_only, tokenized_validation_title_only)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.499554,0.76753,0.452349,0.625232,0.524922
2,0.572100,0.672673,0.68064,0.369204,0.782931,0.501784
3,0.433000,0.730884,0.724848,0.401719,0.693878,0.508844
4,0.257700,0.974143,0.743902,0.420168,0.649351,0.510204
5,0.144500,1.191071,0.734375,0.40814,0.651206,0.501787


## **Training: Title + Text Pair**

In [15]:
# Tokenisasi
tokenized_train_title_text_pair = ds_train.map(preprocess_title_text_pair, batched=True)
tokenized_validation_title_text_pair = ds_validation.map(preprocess_title_text_pair, batched=True)

Map:   0%|          | 0/7448 [00:00<?, ? examples/s]

Map:   0%|          | 0/2624 [00:00<?, ? examples/s]

In [16]:
# Training
train('./model_title_text_pair', tokenized_train_title_text_pair, tokenized_validation_title_text_pair)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.485917,0.785442,0.480952,0.562152,0.518392
2,0.574500,0.637177,0.672256,0.360556,0.769944,0.491124
3,0.444400,0.755905,0.715701,0.392968,0.705009,0.504648
4,0.273100,0.950414,0.724848,0.401081,0.688312,0.506831
5,0.166400,1.119589,0.728659,0.403567,0.671614,0.504178


In [17]:
# Contoh prediksi
from transformers import pipeline

classifier = pipeline('text-classification', model='./model_title_text_pair')

In [19]:
for i in range(20):
  title = ds_validation['title'][i]
  text = ds_validation['text'][i]
  prediction = classifier(title + ' [SEP] ' + text, truncation=True)
  print(f"Prediction: {prediction[0]['label']}| Actual: {id2label[ds_validation['label'][i]]}| Title: {title}")

Prediction: news| Actual: news| Title: Trump says he is releasing something 'phenomenal in terms of tax' in 2 to 3 weeks
Prediction: news| Actual: news| Title: Fidel Castro's ashes make their final journey across Cuba
Prediction: news| Actual: news| Title: Obama Administration Sending $500 Million to Global Climate Change Fund
Prediction: news| Actual: news| Title: Insurers Are Worried About The House GOP Health Care Bill
Prediction: news| Actual: news| Title: Kobe Bryant and Nike Form Youth Basketball 'Mamba League' in Los Angeles
Prediction: news| Actual: news| Title: Footprints suggest possibility of surprisingly tall early human ancestor
Prediction: news| Actual: clickbait| Title: Facebook Targets 30,000 Fake French Accounts Before Election
Prediction: clickbait| Actual: clickbait| Title: My boss made me think I was going mad and suffering from maternity paranoia 
Prediction: clickbait| Actual: news| Title: Finding a Home on the Tree of Life for a Tentacled Ice Cream Cone With a Li