# Prepare SQuAD_tiny Dataset for Assignment 2

This code prepare SQuAD_tiny from the SQuAD dataset. 

# 0. Import libraries

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from rouge_score import rouge_scorer
from tqdm import tqdm

In [2]:
# Set seed for reproducibility
torch.manual_seed(99)

<torch._C.Generator at 0x11000feb0>

# 1. Load and preprocess SQuAD dataset

In [5]:
# 1. Load and preprocess SQuAD dataset
dataset = load_dataset("squad")

In [10]:
# Take subsets to avoid overload
#train_dataset = dataset["train"].select(range(10000))
#val_dataset = dataset["validation"].select(range(1000))
#test_dataset = dataset["validation"].select(range(1000, 2000))  # No official SQuAD test set

In [6]:
train_dataset = dataset["train"].select(range(1000))
val_dataset = dataset["validation"].select(range(100))
test_dataset = dataset["validation"].select(range(100, 200))

In [7]:
from datasets import DatasetDict

SQuAD_tiny = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [8]:
# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Preprocessing

In [9]:
Max_input_length = 512
Max_output_length = 128

In [25]:
# Preprocessing function
#def preprocess(example):
#    input_text = f"question: {example['question']} context: {example['context']}"
#    target_text = example["answers"]["text"][0]
#    input_enc = tokenizer(input_text, padding="max_length", truncation=True, max_length=Max_input_length)
#    target_enc = tokenizer(target_text, padding="max_length", truncation=True, max_length=Max_output_length)
#    input_enc["labels"] = target_enc["input_ids"]
#    return input_enc

In [87]:
from transformers import logging as transformers_logging

def encode_question_and_context(question, context):
    return f"question: {question} context: {context}"

def extract_example_parts(example):
    context = example["context"]
    question = example["question"]
    answer = example["answers"]["text"][0]
    question_with_context = encode_question_and_context(question,context)
    return (question_with_context, question, answer)

def preprocess(example):
    question_with_context, question, answer = extract_example_parts(example)

    old_level = transformers_logging.get_verbosity()
    transformers_logging.set_verbosity_error()

    input_enc = tokenizer(question_with_context, question, padding = "max_length",
                             truncation = True, max_length = Max_input_length)
    target_enc = tokenizer(answer, padding = "max_length", truncation = True,
                           max_length = Max_output_length)
    transformers_logging.set_verbosity(old_level)

    input_enc["labels"] = np.array(target_enc["input_ids"])

    return input_enc


In [17]:
# Preprocess the datasets
train_enc = train_dataset.map(preprocess, batched=False)
val_enc = val_dataset.map(preprocess, batched=False)
test_enc = test_dataset.map(preprocess, batched=False)

In [29]:
print("Number of training examples:", len(train_dataset))
train_dataset[0]

Number of training examples: 1000


{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

## Loading the Model

In [21]:
try:
    del model
except NameError:
    pass

model = T5ForConditionalGeneration.from_pretrained("t5-small")

## Fine-Tuning the Model

In [23]:
columns = ["input_ids", "attention_mask", "labels"]
train_enc.set_format(type = "torch", columns = columns)
val_enc.set_format(type = "torch", columns = columns)
test_enc.set_format(type = "torch", columns = columns)

In [37]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 3,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10
)

model.train()

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer)
)

trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
def display_evaluation(setname, results):
    print(f"{setname} Set Loss:", round(results["eval_loss"], 3))

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

In [43]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 3,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-5,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10
)
model.train()

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer)
)

trainer.train()

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss
1,0.1724,0.173281
2,0.1468,0.133859
3,0.1382,0.124995


TrainOutput(global_step=375, training_loss=0.16634305795033774, metrics={'train_runtime': 1423.6643, 'train_samples_per_second': 2.107, 'train_steps_per_second': 0.263, 'total_flos': 406025404416000.0, 'train_loss': 0.16634305795033774, 'epoch': 3.0})

In [49]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 2,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 3e-4,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10
)

model.train()

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer)
)

trainer.train()

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss
1,0.0231,0.015123
2,0.0138,0.014601


TrainOutput(global_step=250, training_loss=0.025663533329963684, metrics={'train_runtime': 1041.048, 'train_samples_per_second': 1.921, 'train_steps_per_second': 0.24, 'total_flos': 270683602944000.0, 'train_loss': 0.025663533329963684, 'epoch': 2.0})

## Finetune batch size

In [36]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-5,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10,
    )

model.train()
    
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer),
)

trainer.train()

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss
1,0.1446,0.106525
2,0.1362,0.100698
3,0.1357,0.099714


TrainOutput(global_step=189, training_loss=0.13934119983955665, metrics={'train_runtime': 1704.127, 'train_samples_per_second': 1.76, 'train_steps_per_second': 0.111, 'total_flos': 406025404416000.0, 'train_loss': 0.13934119983955665, 'epoch': 3.0})

In [47]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 3,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-5,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10,
    )
model.train()
    
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer),
)

trainer.train()

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss
1,0.2885,0.195706
2,0.2082,0.169136
3,0.1952,0.158253


TrainOutput(global_step=750, training_loss=0.3546682445208232, metrics={'train_runtime': 3578.3697, 'train_samples_per_second': 0.838, 'train_steps_per_second': 0.21, 'total_flos': 406025404416000.0, 'train_loss': 0.3546682445208232, 'epoch': 3.0})

## Weight_decay

In [214]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-5,
    weight_decay = 0.1,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10
)

model.train()

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    processing_class = tokenizer,
    data_collator = DataCollatorForSeq2Seq(tokenizer)
)

trainer.train()

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss
1,0.0904,0.067105
2,0.0807,0.066562
3,0.0791,0.064834
4,0.0763,0.063535
5,0.072,0.061008
6,0.0698,0.058313
7,0.0688,0.05653
8,0.0681,0.05589
9,0.0705,0.053894
10,0.0643,0.053453


Training Set Loss: 0.048
Testing Set Loss: 0.054


In [220]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-5,
    weight_decay = 0.001,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10
)

model.train()

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    processing_class = tokenizer,
    data_collator = DataCollatorForSeq2Seq(tokenizer)
)

trainer.train()

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss
1,0.0637,0.051315
2,0.0583,0.049013
3,0.0588,0.048228


Training Set Loss: 0.046
Testing Set Loss: 0.049


## Early Stopping to find the optimal epochs

In [28]:
from transformers import EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 20,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-5,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10,

    metric_for_best_model = "loss",
    load_best_model_at_end = True
    )

model.train()
    
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer),
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1845,0.143823
2,0.1723,0.132937
3,0.1623,0.127384
4,0.1505,0.119286
5,0.1437,0.10928
6,0.1316,0.100075
7,0.1277,0.087492
8,0.1229,0.079679
9,0.1216,0.071966
10,0.1009,0.066151


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=882, training_loss=0.12902785955913484, metrics={'train_runtime': 2409.2144, 'train_samples_per_second': 8.301, 'train_steps_per_second': 0.523, 'total_flos': 1894785220608000.0, 'train_loss': 0.12902785955913484, 'epoch': 14.0})

In [67]:
model.train()
    
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer),
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0019,0.027137
2,0.002,0.024793


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=250, training_loss=0.0021449310518801214, metrics={'train_runtime': 539.6679, 'train_samples_per_second': 3.706, 'train_steps_per_second': 0.463, 'total_flos': 270683602944000.0, 'train_loss': 0.0021449310518801214, 'epoch': 2.0})

## Saving the Model

In [96]:
trainer.save_model("my_model")

## Model Evaluation

In [30]:
model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Training Set Loss: 0.076
Testing Set Loss: 0.069


In [182]:
from itertools import batched

def generate_response(tokenizer, model, question):
    tokenized = tokenizer(question, return_tensors = "pt", padding = True, truncation = True,
                          max_length = Max_output_length).to(model.device)

    with torch.no_grad():
        outputs = model.generate(**tokenized)

    outputs = tokenizer.batch_decode(outputs, skip_special_tokens = True)
    return outputs


def generate_answers(tokenizer, model, dataset, use_context = True, limit = None):
    if limit is not None:
        dataset = dataset.select(range(limit))

    questions = []
    inputs = []
    references = []

    for example in dataset:
        question_with_context, question, answer = extract_example_parts(example)

        if use_context:
            inputs.append(question_with_context)

        else:
            inputs.append(question)

        questions.append(question)
        
        references.append(answer)

    outputs = []
    for examples in batched(inputs, 128):
        responses = generate_response(tokenizer, model, list(examples))

        outputs.extend(responses)

    #assert (len(outputs) == len(references))
    return outputs, references, questions

In [184]:
answers_ctx, refs_ctx, questions_ctx = generate_answers(tokenizer, model, test_dataset, use_context = True, limit = 100)
answers_noctx, refs_noctx, questions_noctx = generate_answers(tokenizer, model, test_dataset, use_context = False, limit = 100)

In [206]:
def display_answer_and_references(question, answer, reference):
    print("Question", i+1,":", question)
    print("Generated answer:", answer) 
    print("Reference answer:", reference)

print("a. With context")
print()

for i in range(5):
    display_answer_and_references(questions_ctx[i], answers_ctx[i], refs_ctx[i])
    print()

print()

print("b. Without context")
print()

for i in range(5):
    display_answer_and_references(questions_noctx[i], answers_noctx[i], refs_noctx[i])
    print()

a. With context

Question 1 : Who were special guests for the Super Bowl halftime show?
Generated answer: Beyoncé and Bruno Mars
Reference answer: Beyoncé and Bruno Mars

Question 2 : Which Super Bowl halftime show did Beyoncé headline?
Generated answer: Super Bowl XLVIII
Reference answer: Super Bowl XLVII

Question 3 : What was the cost for a half minute ad?
Generated answer: $5 million
Reference answer: $5 million

Question 4 : Who lead the Super Bowl 50 halftime performance?
Generated answer: Beyoncé and Bruno Mars
Reference answer: Coldplay

Question 5 : What other two famous performers were part of the Super Bowl 50 halftime?
Generated answer: Beyoncé and Bruno Mars
Reference answer: Beyoncé and Bruno Mars


b. Without context

Question 1 : Who were special guests for the Super Bowl halftime show?
Generated answer: Wer waren besondere guests für die Super Bowl halftime Show?
Reference answer: Beyoncé and Bruno Mars

Question 2 : Which Super Bowl halftime show did Beyoncé headline?

## Model Evaluation using ROUGE

In [168]:
def compute_average_score(scores, metric, key):
    total = 0
    for i in range(len(scores)):
        total += getattr(scores[i][metric], key)
    return total / len(scores)

def compute_rouge(predictions, references):
    metrics = ["rouge1", "rouge2", "rougeL"]

    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer = True)

    scores = []
    for prediction, reference in zip(predictions, references):
        scores.append(scorer.score(reference, prediction))

    results = {}
    for metric in metrics:
        for k in ["precision", "recall", "fmeasure"]:
            results[f"{metric}_{k}"] = compute_average_score(scores, metric, k)
    return results

In [194]:
print("ROUGE with context:", compute_rouge(answers_ctx, refs_ctx))
print()
print("ROUGE without context:", compute_rouge(answers_noctx, refs_noctx))

ROUGE with context: {'rouge1_precision': 0.7720833333333332, 'rouge1_recall': 0.7713333333333334, 'rouge1_fmeasure': 0.7644102564102565, 'rouge2_precision': 0.4442857142857143, 'rouge2_recall': 0.45631578947368423, 'rouge2_fmeasure': 0.4446863799283154, 'rougeL_precision': 0.7720833333333332, 'rougeL_recall': 0.7713333333333334, 'rougeL_fmeasure': 0.7644102564102565}

ROUGE without context: {'rouge1_precision': 0.017189255189255192, 'rouge1_recall': 0.06333333333333334, 'rouge1_fmeasure': 0.02669069819069819, 'rouge2_precision': 0.007687802393684746, 'rouge2_recall': 0.04, 'rouge2_fmeasure': 0.01274188676820256, 'rougeL_precision': 0.017189255189255192, 'rougeL_recall': 0.06333333333333334, 'rougeL_fmeasure': 0.02669069819069819}


## Task 5.5

In [174]:
from  transformers  import  AutoTokenizer, AutoModelWithLMHead, pipeline

model_name = "MaRiOrOsSi/t5-base-finetuned-question-answering"
tokenizer_2 = AutoTokenizer.from_pretrained(model_name)
model_2 = AutoModelWithLMHead.from_pretrained(model_name)

In [190]:
answers_2, refs_2, questions_2 = generate_answers(
    tokenizer_2, model_2, test_dataset, True, 100)
answers_n2, refs_n2, questions_n2 = generate_answers(
    tokenizer_2, model_2, test_dataset, False, 100)

In [210]:
print("a. With context")
print()

for i in range(5):
    display_answer_and_references(questions_2[i], answers_2[i], refs_2[i])
    print()

print()

print("b. Without context")
print()

for i in range(5):
    display_answer_and_references(questions_n2[i], answers_n2[i], refs_n2[i])
    print()

a. With context

Question 1 : Who were special guests for the Super Bowl halftime show?
Generated answer: Beyonce and Bruno Mars
Reference answer: Beyoncé and Bruno Mars

Question 2 : Which Super Bowl halftime show did Beyoncé headline?
Generated answer: Super Bowl 50
Reference answer: Super Bowl XLVII

Question 3 : What was the cost for a half minute ad?
Generated answer: $5 million
Reference answer: $5 million

Question 4 : Who lead the Super Bowl 50 halftime performance?
Generated answer: Coldplay
Reference answer: Coldplay

Question 5 : What other two famous performers were part of the Super Bowl 50 halftime?
Generated answer: Beyonce and Bruno Mars
Reference answer: Beyoncé and Bruno Mars


b. Without context

Question 1 : Who were special guests for the Super Bowl halftime show?
Generated answer: AJ, Nick, and Sean Conner
Reference answer: Beyoncé and Bruno Mars

Question 2 : Which Super Bowl halftime show did Beyoncé headline?
Generated answer: Super Bowl Halftime show
Reference

In [196]:
print("ROUGE with context:", compute_rouge(answers_2, refs_2))
print()
print("ROUGE without context:", compute_rouge(answers_n2, refs_n2))

ROUGE with context: {'rouge1_precision': 0.7459444444444445, 'rouge1_recall': 0.7480000000000001, 'rouge1_fmeasure': 0.7368809523809523, 'rouge2_precision': 0.41609689330277566, 'rouge2_recall': 0.4324561403508772, 'rouge2_fmeasure': 0.4154628879892038, 'rougeL_precision': 0.7459444444444445, 'rougeL_recall': 0.7480000000000001, 'rougeL_fmeasure': 0.7368809523809523}

ROUGE without context: {'rouge1_precision': 0.11548412698412697, 'rouge1_recall': 0.098, 'rouge1_fmeasure': 0.09732539682539683, 'rouge2_precision': 0.06124999999999999, 'rouge2_recall': 0.04833333333333333, 'rouge2_fmeasure': 0.04966666666666668, 'rougeL_precision': 0.11298412698412698, 'rougeL_recall': 0.0975, 'rougeL_fmeasure': 0.09649206349206349}
