## Classify spoiler-type using BLOOM


In [None]:
from classification_using_bloom import load_data, classify_spoiler_type, calculate_metrics

In [None]:
# Load the data; load_data() yields two objects, bound here as the train and
# test splits (presumably pandas DataFrames, going by the names — confirm in
# classification_using_bloom).
train_df, test_df = load_data()

In [None]:
# Run the BLOOM-based spoiler-type classifier with examples_per_class=2
# (presumably the number of few-shot examples per class — confirm in
# classification_using_bloom).
# NOTE(review): train_df/test_df are not passed in, so the function presumably
# loads its own data; verify the predictions line up row-for-row with test_df.
predicted_class = classify_spoiler_type(examples_per_class=2)

In [None]:
# Score the BLOOM predictions against the test split and keep the result in
# `metrics` for inspection.
metrics = calculate_metrics(test_df, predicted_class)

## Classify spoiler type using spoilers generated by BLOOM

In [None]:
import pandas as  pd
import numpy as np

from bloom_generate_spoilers import generate_spoilers
from dataset_class import Dataset


In [None]:
# Build one question/context/answer prompt per validation post.
val_dataset = Dataset.from_jsonl("data/validation.jsonl")

question = val_dataset.df["postText"]
# Join the article paragraphs and keep only the first 2500 characters.
context = val_dataset.df["targetParagraphs"].apply(
    lambda paragraphs: ". ".join(paragraphs)[:2500]
)

val_df = pd.DataFrame(
    {"text": "Question: \n" + question + "\nContext: \n" + context + "\nAnswer: \n"}
)

In [None]:
# Generate spoilers with BLOOM for the validation prompts in val_df.
# NOTE(review): the exact meaning of examples_per_class and
# number_of_generated_spoilers is defined in bloom_generate_spoilers — confirm
# there (e.g. whether 10 limits the number of rows processed).
spoilers = generate_spoilers(val_df, examples_per_class=2, number_of_generated_spoilers=10)

In [None]:
# Persist the generated spoilers; single-space placeholders are turned into
# NaN before the frame is written to CSV.
spoilers_df = pd.DataFrame(spoilers, columns=["spoiler"]).replace(" ", np.nan)
spoilers_df.to_csv("data/spoilers.csv", index=False)

In [None]:
from datasets import Dataset
from dataset_class import Dataset as MyDataset
import pandas as pd
from sklearn.model_selection import train_test_split


# --- Training pairs: clickbait post followed by its reference spoiler. ---
datapath = "data/train.jsonl"
dataset = MyDataset.from_jsonl(datapath)

df = pd.DataFrame()
df["text"] = (
    "Clickbait: \n " + dataset.df["postText"] + " Spoiler: \n" + dataset.df["spoiler"]
)
df["tags"] = dataset.df["tags"]

# --- Validation pairs: same prompt, but with the spoilers read back from
# data/spoilers.csv (missing values are restored to a single space). ---
# NOTE(review): the string addition aligns rows by index, so this assumes
# val_dataset.df carries the same default integer index as the CSV — confirm.
val_dataset = MyDataset.from_jsonl("data/validation.jsonl")
spoilers_df = pd.read_csv("data/spoilers.csv").fillna(" ")
val = pd.DataFrame()
val["text"] = (
    "Clickbait: \n " + val_dataset.df["postText"] + " Spoiler: \n" + spoilers_df["spoiler"]
)
val["tags"] = val_dataset.df["tags"]

# Rows from position 300 onward are held out as the test set. Take an explicit
# copy so that adding columns to `test` later never writes to a slice of `val`.
test = val.iloc[300:].copy()

# Train/validation pool: first 2500 training rows plus first 300 validation rows.
df = pd.concat([df.iloc[:2500], val.iloc[:300]])
df_train, df_val = train_test_split(df, test_size=0.2, stratify=df["tags"])

# Integer class ids for the three spoiler types.
labels = {"phrase": 0, "passage": 1, "multi": 2}


def _encode_labels(frame):
    """Return a copy of *frame* with "tags" renamed to "label" and mapped to ids.

    Working on the frame returned by ``rename`` (a new object) avoids the
    in-place edits on the train_test_split outputs, which can trigger pandas
    chained-assignment warnings. Unknown tags still raise ``KeyError``.
    """
    encoded = frame.rename(columns={"tags": "label"})
    encoded["label"] = encoded["label"].apply(lambda tag: labels[tag])
    return encoded


df_train = _encode_labels(df_train)
df_val = _encode_labels(df_val)

# From here on `val_dataset` is the Hugging Face dataset used for evaluation
# during training (it replaces the raw validation loader bound above).
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)

In [None]:
from finetune_training import train, preprocess_function

# Candidate base checkpoints; the last one (roberta-base) is the one trained.
ckpts = [
    "microsoft/deberta-base",
    "distilbert-base-uncased",
    "albert-base-v2",
    "roberta-base",
]
base_checkpoint = ckpts[3]
# Name used by later cells to load the tokenizer.
model_checkpoint = f"{base_checkpoint}-finetuned"
trainer = train(base_checkpoint, train_dataset, val_dataset, batch_size=8, lr=2e-6)

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Add integer labels to the held-out rows. `assign` returns a new frame, so
# nothing is written to a slice of the validation frame `test` was cut from.
test = test.assign(label=test["tags"].apply(lambda tag: labels[tag]))

test_dataset = Dataset.from_pandas(test)
# Pass the tokenizer explicitly, as the article-based section of this notebook
# does; the tokenizer loaded above was otherwise never used.
test_dataset = test_dataset.map(
    lambda batch: preprocess_function(batch, tokenizer), batched=True
)

In [None]:
# Run the trained model on the held-out test set; later cells read
# pred.predictions and pred.label_ids from the result.
pred = trainer.predict(test_dataset)

In [None]:
from sklearn.metrics import balanced_accuracy_score, classification_report

# Pick the highest-scoring class per row, then compute balanced accuracy
# against the reference label ids.
scores = pred.predictions
predictions = scores.argmax(1)
acc = balanced_accuracy_score(y_true=pred.label_ids, y_pred=predictions)

In [None]:
# Per-class precision/recall/F1 for the spoiler-based classifier.
report = classification_report(pred.label_ids, predictions)
print(report)

## Finetune model using the clickbait post and the linked web page

In [None]:
from datasets import Dataset
from dataset_class import Dataset as MyDataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Integer class ids for the three spoiler types.
labels = {"phrase": 0, "passage": 1, "multi": 2}

# Load the training split into `dataset`, the name the prompt construction
# below reads from, so this cell no longer depends on a variable left over
# from an earlier cell.
dataset = MyDataset.from_jsonl("data/train.jsonl")

# Start from a fresh frame. Reusing the `df` left over from the spoiler-based
# section would write the new columns onto that section's concatenated
# train+validation frame instead of a clean copy of the training data.
df = pd.DataFrame()
df["text"] = (
    "Clickbait\n"
    + dataset.df["postText"]
    + "\nArticle\n"
    # Join the article paragraphs and keep only the first 2000 characters.
    + dataset.df["targetParagraphs"].apply(lambda x: ". ".join(x)[:2000])
)
df["tags"] = dataset.df["tags"]

# Stratified 80/20 split into training and validation rows.
df_train, df_val = train_test_split(df, test_size=0.2, stratify=df["tags"])

# Rename "tags" to "label" and map the tag names to integer ids. `rename`
# returns a new frame, so the split outputs are not modified in place.
df_train = df_train.rename(columns={"tags": "label"})
df_train["label"] = df_train["label"].apply(lambda x: labels[x])
train_dataset = Dataset.from_pandas(df_train)

df_val = df_val.rename(columns={"tags": "label"})
df_val["label"] = df_val["label"].apply(lambda x: labels[x])
val_dataset = Dataset.from_pandas(df_val)

In [None]:
from finetune_training import train, preprocess_function

# Candidate base checkpoints; the last one (roberta-base) is the one trained.
ckpts = ["microsoft/deberta-base", "distilbert-base-uncased", "albert-base-v2", "roberta-base"]
# Name used by later cells to load the tokenizer.
model_checkpoint = f"{ckpts[3]}-finetuned"
# Fine-tune from the base checkpoint, matching the spoiler-based section of
# this notebook. Passing `model_checkpoint` here would start from the
# "-finetuned" name instead of the pretrained model.
# NOTE(review): if continuing from the earlier fine-tuned weights was
# intentional, revert the first argument to model_checkpoint.
trainer = train(ckpts[3], train_dataset, val_dataset, batch_size=6, lr=2e-6)

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# The validation split serves as the test set for this experiment.
datapath = "data/validation.jsonl"
test = MyDataset.from_jsonl(datapath)

test_df = pd.DataFrame()
# Use exactly the prompt format this section trains on ("Clickbait\n" ...
# "\nArticle\n" ...), so the model is evaluated on inputs shaped like its
# training data. The article text is cut to its first 2000 characters.
test_df["text"] = (
    "Clickbait\n"
    + test.df["postText"]
    + "\nArticle\n"
    + test.df["targetParagraphs"].apply(lambda x: ". ".join(x)[:2000])
)
# Map tag names to the integer class ids in `labels`.
test_df["label"] = test.df["tags"].apply(lambda x: labels[x])

test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

In [None]:
# Run the trained model on the test set built from the validation split;
# later cells read pred.predictions and pred.label_ids from the result.
pred = trainer.predict(test_dataset)

In [None]:
from sklearn.metrics import balanced_accuracy_score, classification_report

# Pick the highest-scoring class per row, then compute balanced accuracy
# against the reference label ids.
scores = pred.predictions
predictions = scores.argmax(1)
acc = balanced_accuracy_score(y_true=pred.label_ids, y_pred=predictions)

In [None]:
# Per-class precision/recall/F1 for the article-based classifier.
report = classification_report(pred.label_ids, predictions)
print(report)