# Prepare SQuAD_tiny Dataset for Assignment 2

This code prepare SQuAD_tiny from the SQuAD dataset. 

# 0. Import libraries

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from rouge_score import rouge_scorer
from tqdm import tqdm

In [3]:
# Set seed for reproducibility
torch.manual_seed(99)

<torch._C.Generator at 0x10850beb0>

# 1. Load and preprocess SQuAD dataset

In [5]:
# 1. Load and preprocess SQuAD dataset
dataset = load_dataset("squad")

In [15]:
# Take subsets to avoid overload
train_dataset = dataset["train"].select(range(10000))
val_dataset = dataset["validation"].select(range(1000))
test_dataset = dataset["validation"].select(range(1000, 2000))  # No official SQuAD test set

In [7]:
# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [17]:
Max_input_length = 512
Max_output_length = 128

## Preprocessing

In [19]:
# Preprocessing function
def preprocess(example):
    input_text = f"question: {example['question']} context: {example['context']}"
    target_text = example["answers"]["text"][0]
    input_enc = tokenizer(input_text, padding="max_length", truncation=True, max_length=Max_input_length)
    target_enc = tokenizer(target_text, padding="max_length", truncation=True, max_length=Max_output_length)
    input_enc["labels"] = target_enc["input_ids"]
    return input_enc

In [21]:
# Preprocess the datasets
train_enc = train_dataset.map(preprocess, batched=False)
val_enc = val_dataset.map(preprocess, batched=False)
test_enc = test_dataset.map(preprocess, batched=False)

In [23]:
print("Number of training examples:", len(train_dataset))
train_dataset[0]

Number of training examples: 10000


{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

## Loading the Model

In [25]:
try:
    del model
except NameError:
    pass

model = T5ForConditionalGeneration.from_pretrained("t5-small")

## Fine-Tuning the Model

In [25]:
columns = ["input_ids", "attention_mask", "labels"]
train_enc.set_format(type = "torch", columns = columns)
val_enc.set_format(type = "torch", columns = columns)
test_enc.set_format(type = "torch", columns = columns)

In [27]:
def display_evaluation(setname, results):
    print(f"{setname} Set Loss:", round(results["eval_loss"], 6))

In [31]:
from transformers import DataCollatorWithPadding, EarlyStoppingCallback

batch_sizes = [8, 16, 32]
learning_rates = [1e-5, 2e-5, 3e-5]
weights_decay = [0.1, 0.01, 0.001]

for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        for weight_decay in weights_decay:
            try:
                del model
            except NameError:
                pass
            
            model = T5ForConditionalGeneration.from_pretrained ("t5-small")

            print("Parameters testing: lr = ", learning_rate, "batch size = ", batch_size, "weight_decay = ", weight_decay)
            training_args = TrainingArguments(
                output_dir = "./results",
                num_train_epochs = 20,
                per_device_train_batch_size = batch_size,
                per_device_eval_batch_size = 32,
                eval_strategy = "epoch",
                save_strategy = "epoch",
                learning_rate = learning_rate,
                weight_decay = weight_decay,
                save_total_limit = 2,
                logging_dir = "./logs",
                logging_steps = 10,
            
                metric_for_best_model = "loss",
                load_best_model_at_end = True
            )
            model.train()
                
            trainer = Trainer(
                model = model,
                args = training_args,
                train_dataset = train_enc,
                eval_dataset = val_enc,
                processing_class = tokenizer,
                data_collator = DataCollatorForSeq2Seq(tokenizer),
                callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
            )
                
            trainer.train()
            
            model.eval()
            
            display_evaluation("Training", trainer.evaluate(train_enc))
            display_evaluation("Testing", trainer.evaluate(test_enc))

Parameters testing: lr =  1e-05 batch size =  8 weight_decay =  0.1


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Saving the Model

Best parameters:
1. learning_rate = 1e-5
2. batch_size = 4
3. weight_decay = 0.1
4. num_train_epochs = 6

In [51]:
try:
    del model
except NameError:
    pass

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 6,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 32,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 1e-5,
    weight_decay = 0.1,
    save_total_limit = 2,
    logging_dir = "./logs",
    logging_steps = 10
)

model.train()

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_enc,
    eval_dataset = val_enc,
    data_collator = DataCollatorForSeq2Seq(tokenizer)
)

trainer.train()

model.eval()

display_evaluation("Training", trainer.evaluate(train_enc))
display_evaluation("Testing", trainer.evaluate(test_enc))

Epoch,Training Loss,Validation Loss
1,1.1338,1.120884




KeyboardInterrupt: 

In [23]:
trainer.save_model("my_model")

## Qualitative Model Evaluation

In [None]:
from itertools import batched

def encode_question_and_context(question, context):
    return f"question: {question} context: {context}"

def extract_sample_parts(sample):
    context = sample["context"]
    question = sample["question"]
    answer = sample["answers"]["text"][0]
    question_with_context = encode_question_and_context(question, context)
    return (question_with_context, question, answer)
    
def generate_response(tokenizer, model, question):
    tokenized = tokenizer(question, return_tensors = "pt", padding = True, truncation = True,
                          max_length = Max_output_length).to(model.device)

    with torch.no_grad():
        outputs = model.generate(**tokenized)

    outputs = tokenizer.batch_decode(outputs, skip_special_tokens = True)
    return outputs


def generate_answers(tokenizer, model, dataset, use_context = True, limit = None):
    if limit is not None:
        dataset = dataset.select(range(limit))

    questions = []
    inputs = []
    references = []

    for example in dataset:
        question_with_context, question, answer = extract_example_parts(example)

        if use_context:
            inputs.append(question_with_context)

        else:
            inputs.append(question)

        questions.append(question)
        
        references.append(answer)

    outputs = []
    for examples in batched(inputs, 128):
        responses = generate_response(tokenizer, model, list(examples))

        outputs.extend(responses)

    return outputs, references, questions

In [None]:
answers_ctx, refs_ctx, questions_ctx = generate_answers(
    tokenizer, model, test_dataset, True, 100)
answers_noctx, refs_noctx, questions_noctx = generate_answers(
    tokenizer, model, test_dataset, False, 100)

In [None]:
def display_answer_and_references(question, answer, reference):
    print("Question", i+1,":", question)
    print("Generated answer:", answer) 
    print("Reference answer:", reference)

print("a. With context")
print()

for i in range(5):
    display_answer_and_references(questions_ctx[i], answers_ctx[i], refs_ctx[i])
    print()

print()

print("b. Without context")
print()

for i in range(5):
    display_answer_and_references(questions_noctx[i], answers_noctx[i], refs_noctx[i])
    print()

## Model Evaluation using ROUGE

In [58]:
def compute_average_score(scores, metric, key):
    total = 0
    for i in range(len(scores)):
        total += getattr(scores[i][metric], key)
    return total / len(scores)

def compute_rouge(predictions, references):
    metrics = ["rouge1", "rouge2", "rougeL"]

    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer = True)

    scores = []
    for prediction, reference in zip(predictions, references):
        scores.append(scorer.score(reference, prediction))

    results = {}
    for metric in metrics:
        for k in ["precision", "recall", "fmeasure"]:
            results[f"{metric}_{k}"] = compute_average_score(scores, metric, k)
    return results

In [60]:
print("ROUGE with context:", compute_rouge(answers_ctx, refs_ctx))
print()
print("ROUGE without context:", compute_rouge(answers_noctx, refs_noctx))

ROUGE with context: {'rouge1_precision': 0.0, 'rouge1_recall': 0.0, 'rouge1_fmeasure': 0.0, 'rouge2_precision': 0.0, 'rouge2_recall': 0.0, 'rouge2_fmeasure': 0.0, 'rougeL_precision': 0.0, 'rougeL_recall': 0.0, 'rougeL_fmeasure': 0.0}

ROUGE without context: {'rouge1_precision': 0.0, 'rouge1_recall': 0.0, 'rouge1_fmeasure': 0.0, 'rouge2_precision': 0.0, 'rouge2_recall': 0.0, 'rouge2_fmeasure': 0.0, 'rougeL_precision': 0.0, 'rougeL_recall': 0.0, 'rougeL_fmeasure': 0.0}


## Task 5.5

In [None]:
from  transformers  import  AutoTokenizer, AutoModelWithLMHead, pipeline

model_name = "MaRiOrOsSi/t5-base-finetuned-question-answering"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelWithLMHead.from_pretrained(model_name)

In [None]:
answers_2, refs_2, questions_2 = generate_answers(
    tokenizer_2, model_2, test_dataset, True, 100)
answers_n2, refs_n2, questions_n2 = generate_answers(
    tokenizer_2, model_2, test_dataset, False, 100)

In [None]:
print("a. With context")
print()

for i in range(5):
    display_answer_and_references(questions_2[i], answers_2[i], refs_2[i])
    print()

print()

print("b. Without context")
print()

for i in range(5):
    display_answer_and_references(questions_n2[i], answers_n2[i], refs_n2[i])
    print()

In [None]:
print("ROUGE with context:", compute_rouge(answers_2, refs_2))
print()
print("ROUGE without context:", compute_rouge(answers_n2, refs_n2))