The purpose of this notebook is to finetune BERT model for article spoiling task. <br>
Use the conda env: eda_env

In [122]:
# Data processing
import pandas as pd
import numpy as np
import ast
import accelerate
import nltk

# Visualisation
import matplotlib.pyplot as plt

# BERT model
from transformers import RobertaTokenizer, RobertaForQuestionAnswering,  BertTokenizer, BertModel, logging, RobertaTokenizerFast
import torch
import nltk
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [123]:
# VARIABLES
nltk.download('punkt')
RANDOM_STATE = 42 # Random state for reproducibility
logging.set_verbosity_error()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wojom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Loading data and splitting data.

In [124]:
spoil_df = pd.read_csv("../data/spoiling_data.csv", sep=";")
print(spoil_df.shape)

(3358, 6)


In [125]:
spoil_df.columns

Index(['targetTitle', 'targetParagraphs', 'humanSpoiler', 'spoiler', 'tags',
       'spoilerPositions'],
      dtype='object')

In [126]:
if type(spoil_df.iloc[0]["targetParagraphs"]) != str:
    spoil_df["targetParagraphs"] = spoil_df["targetParagraphs"].apply(ast.literal_eval)
if type(spoil_df.iloc[0]["targetParagraphs"]) != str:
    spoil_df["tags"] = spoil_df["tags"].apply(ast.literal_eval)
    spoil_df["tags"] = spoil_df["tags"].apply(lambda x: x[0])
if type(spoil_df.iloc[0]["spoilerPositions"]) == str:
    spoil_df["spoilerPositions"] = spoil_df["spoilerPositions"].apply(ast.literal_eval)
if type(spoil_df.iloc[0]["spoiler"]) != str:
    spoil_df["spoiler"] = spoil_df["spoiler"].apply(ast.literal_eval)

In [127]:
x_train, x_test, y_train, y_test = train_test_split(
    spoil_df.drop(columns=["humanSpoiler", "spoiler"]), 
    spoil_df[["humanSpoiler", "spoiler"]],
    test_size=0.2, 
    random_state=RANDOM_STATE
)

x_test, x_val, y_test, y_val = train_test_split(
    x_test, 
    y_test,
    test_size=0.5,  # 50% of the original x_test size for validation
    random_state=RANDOM_STATE
)

### Loading models.

In [128]:
tokenizer_bert_uncased = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert_uncased = BertModel.from_pretrained("bert-base-uncased")

def calculate_bert_similarity(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer_bert_uncased(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model_bert_uncased(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
        embeddings.append(embedding)
    
    similarity = np.dot(embeddings[0], embeddings[1]) / (np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]))
    return similarity

In [129]:
x_train.head()

Unnamed: 0,targetTitle,targetParagraphs,tags,spoilerPositions
2597,Why people like Edward Snowden say they will b...,Googles new messenger app is stirring up a deb...,passage,"[[[2, 0], [2, 78]]]"
1957,Why NASA Is Building An $18 Billion Rocket To ...,One piece of NASA’s massive new rocket NASA / ...,phrase,"[[[8, 33], [8, 44]]]"
1926,Justin Bieber Kicked Out Of Hotel In Argentina...,Justin Bieber was allegedly kicked out of a ho...,passage,"[[[1, 0], [1, 149]]]"
1991,"Two players meet in No Man’s Sky, guess what h...",No Mans Sky is finally released in the UK and ...,phrase,"[[[6, 112], [6, 135]]]"
2807,Ayvani Hope Perez Kidnapping: Teens Mom Was Ar...,Authorities in Georgia have revealed a new lin...,passage,"[[[1, 88], [1, 133]]]"


In [130]:
y_test.head()

Unnamed: 0,humanSpoiler,spoiler
1213,Your heart,heart
321,Public = celebrities not government or society,Robin Roberts The Good Morning America anchor ...
2441,In Detroit,Abraham Pearson
2478,Debating climate change with @TheScienceGuy an...,Meet the Press was singled out as failing to o...
2295,So where are you going for college next year?,"So, where are you going for college next year?"


### Q&A bm25 with roberta-base

In [131]:
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = RobertaForQuestionAnswering.from_pretrained('roberta-base')

In [132]:
def calculate_bm25(query, context):
    sentences = nltk.sent_tokenize(context)
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in sentences]
    bm25 = BM25Okapi(tokenized_corpus)

    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)

    scores_idx = [[score, idx] for idx, score in enumerate(scores)]
    scores_idx.sort(reverse=True, key=lambda x: x[0])
    sentences = pd.DataFrame({"sentences": sentences})
    important_sentences = sentences.iloc[[arr[1] for arr in scores_idx[:10]],:]
    important_sentences_list = important_sentences["sentences"].tolist()
    return " ".join(important_sentences_list)

In [133]:
def generate_spoiler(query, context, model=model_roberta, tokenizer=tokenizer_roberta):
    inputs = tokenizer(query, context, truncation=True, padding=True, max_length=512, return_tensors="pt")
    print(len(inputs["input_ids"][0]))
    print(inputs["input_ids"][0])
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        start_index = torch.argmax(outputs.start_logits)
        end_index = torch.argmax(outputs.end_logits)

    answer = tokenizer.decode(inputs['input_ids'][0][start_index:end_index + 1])
    return answer

In [None]:
x_train["important_sentences"] = x_train.apply(lambda x: calculate_bm25(x["targetTitle"], x["targetParagraphs"]), axis=1)

In [None]:
x_train["spoiler_bm25"] = x_train.apply(lambda x: generate_spoiler(x["targetTitle"], x["important_sentences"]), axis=1)

In [134]:
#x_train.to_csv("../data/train_spoiler_bm25.csv", sep=";")
x_train = pd.read_csv("../data/train_spoiler_bm25.csv", sep=";")
x_train.set_index('Unnamed: 0', inplace=True)

In [137]:
df_train = pd.concat([x_train, y_train], axis=1)
df_bm25 = df_train.loc[x_train["spoiler_bm25"] != ""]
df_bm25.shape # around 1000 rows with empty spoilers 

(2686, 8)

In [None]:
df_bm25["bert_sim"] = df_bm25.apply(lambda x: calculate_bert_similarity([
    x["spoiler_bm25"],
    x["spoiler"]
]), axis=1)

In [None]:
df_bm25["bert_sim"].mean()

### Q&A chunking with roberta-base

In [263]:
def chunk_text(sentences, max_len=512, tokenizer=tokenizer_roberta):
    chunks = []
    chunk = []
    token_count = 0
    
    for sentence in sentences:
        tokens = tokenizer.encode(sentence, add_special_tokens=False)
        token_count += len(tokens)
        
        if token_count > max_len:
            chunks.append(chunk)
            chunk = [sentence]
            token_count = len(tokens)
        else:
            chunk.append(sentence)
    
    if chunk:
        chunks.append(chunk)
    
    return chunks

In [58]:
def generate_chuncked_spoiler(query, context, model=model_roberta, tokenizer=tokenizer_roberta):
    sentences = nltk.sent_tokenize(context)
    chunks = chunk_text(sentences)

    answers = []
    for chunk in chunks:
        context_chunk = " ".join(chunk)
        inputs = tokenizer_roberta(query, context_chunk, truncation=True, padding=True, return_tensors="pt")
        
        model_roberta.eval()
        with torch.no_grad():
            outputs = model_roberta(**inputs)
            start_index = torch.argmax(outputs.start_logits)
            end_index = torch.argmax(outputs.end_logits)
            
            answer_tokens = inputs['input_ids'][0][start_index:end_index + 1]
            answer = tokenizer_roberta.decode(answer_tokens)
            answers.append(answer)

    final_answer = " ".join(answers)
    return final_answer

In [None]:
x_train["spoiler_chuncked"] = x_train.apply(lambda x: generate_chuncked_spoiler(x["targetTitle"], x["targetParagraphs"]), axis=1)

In [None]:
y_train_chuncked = y_train.loc[x_train["spoiler_chuncked"] != ""]
x_train_chuncked = x_train.loc[x_train["spoiler_chuncked"] != ""]
df_chuncked = pd.concat([x_train_chuncked, y_train_chuncked], axis=1)
df_chuncked.shape

In [None]:
df_chuncked["bert_sim"] = df_chuncked.apply(lambda x: calculate_bert_similarity([
    x["spoiler_chuncked"],
    x["spoiler"]
]), axis=1)
df_chuncked["bert_sim"].mean()

### Roberta finetuning task

In [37]:
tokenizer_roberta_fast = RobertaTokenizerFast.from_pretrained('roberta-base')
def head_tail_turncation(text, max_length=512, head_length=128, tail_length=382, tokenizer=tokenizer_roberta_fast):
    tokenized = tokenizer(text, truncation=False, return_offsets_mapping=True)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    offsets = tokenized["offset_mapping"]

    if len(input_ids) > max_length:
        truncated_input_ids = input_ids[:head_length] + input_ids[-tail_length:]
        truncated_attention_mask = attention_mask[:head_length] + attention_mask[-tail_length:]
        truncated_offsets = offsets[:head_length] + offsets[-tail_length:]
    else:
        truncated_input_ids = input_ids
        truncated_attention_mask = attention_mask
        truncated_offsets = offsets

    return {
        "input_ids": truncated_input_ids,
        "attention_mask": truncated_attention_mask,
        "offset_mapping": truncated_offsets,
    }

In [None]:
# x_train
title = x_train["targetTitle"].iloc[0]
paragraph = x_train["targetParagraphs"].iloc[0]
spoiler = y_train["spoiler"].iloc[0]
x_train

In [42]:
truncated = head_tail_turncation(title + paragraph)

In [None]:
print("Truncated Input IDs:", truncated["input_ids"])
print("Truncated Offset Mapping:", truncated["offset_mapping"])
print(truncated["attention_mask"])

## Finetuning T5Large

32GB RAM memory is barely sufficient to start fine-tuning!

In [8]:
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset

In [9]:
tokenizer = AutoTokenizer.from_pretrained("t5-large")

In [10]:
prefix = "Spoiler for this text: "
max_input_length = 800
max_target_length = 128
df_train = pd.concat([x_train, y_train], axis=1)
df_test = pd.concat([x_test, y_test], axis=1)
df_val = pd.concat([x_val, y_val], axis=1)
df_train = Dataset.from_pandas(df_train)
df_test = Dataset.from_pandas(df_test)
df_val = Dataset.from_pandas(df_val)

In [11]:
def preprocess_function(examples):
    inputs = [
        f"Extract spoiler from article. Title: {str(title)}; Tag: {str(tag)}; Content: {' '.join(paragraphs)}"
        for title, tag, paragraphs in zip(examples["targetTitle"], examples["tags"], examples["targetParagraphs"])
    ]
    
    model_inputs = tokenizer(
        inputs,
        return_tensors="pt",
        max_length=800,
        padding="max_length",
        truncation=True
    )
    
    spoilers = [spoiler[0] for spoiler in examples["spoiler"]]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            spoilers,
            return_tensors="pt",
            max_length=128,
            padding="max_length",
            truncation=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
tokenized_dataset_train = df_train.map(preprocess_function, batched=True)
tokenized_dataset_test = df_test.map(preprocess_function, batched=True)
tokenized_dataset_val = df_val.map(preprocess_function, batched=True)

In [13]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

In [None]:
batch_size = 4
model_name = "t5-large"
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False
)

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()