In [1]:
import json

In [2]:
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def preprocess_data(data):
    input_texts = [entry['question'] for entry in data]
    target_texts = ['; '.join(entry['final_answers']) for entry in data]
    return input_texts, target_texts

In [3]:
train_data = load_data('data/train_TLQA.json')
test_data = load_data('data/test_TLQA.json')

train_input_texts, train_target_texts = preprocess_data(train_data)
test_input_texts, test_target_texts = preprocess_data(test_data)

In [4]:
# pip install sentencepiece
# pip install pytorch
# pip install transformers[torch]

from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

train_encodings = tokenizer(train_input_texts, padding=True, truncation=True, return_tensors='pt')
train_labels = tokenizer(train_target_texts, padding=True, truncation=True, return_tensors='pt').input_ids

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

torch.cuda.empty_cache()

class TLQADataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item



In [6]:

model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

train_dataset = TLQADataset(train_encodings, train_labels)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [7]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mzihkroaros[0m ([33mzihkroaros-tu-delft[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,14.708
20,13.753
30,12.8634
40,11.9884
50,11.3221
60,10.6541
70,10.0062
80,8.9982
90,7.729
100,6.2818




TrainOutput(global_step=3212, training_loss=0.706223929154769, metrics={'train_runtime': 555.2172, 'train_samples_per_second': 11.57, 'train_steps_per_second': 5.785, 'total_flos': 351913207726080.0, 'train_loss': 0.706223929154769, 'epoch': 2.0})

In [8]:
from torch.utils.data import DataLoader

def generate_answers(model, tokenizer, inputs):
    inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    inputs = inputs.to(model.device)
    outputs = model.generate(inputs.input_ids, max_length=256, num_beams=5)
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers

def generate_answers_2(model, tokenizer, inputs, batch_size=8):
    model.eval()  # Set the model to evaluation mode
    answers = []

    # Process the inputs in batches
    for i in range(0, len(inputs), batch_size):
        batch_inputs = inputs[i:i + batch_size]
        encoded_inputs = tokenizer(batch_inputs, padding=True, truncation=True, return_tensors="pt")
        encoded_inputs = {key: val.to(model.device) for key, val in encoded_inputs.items()}

        with torch.no_grad():
            outputs = model.generate(**encoded_inputs)

        batch_answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        answers.extend(batch_answers)

    return answers

In [9]:
import re

# Exact Match metric
def compute_exact_match(predictions, references):
    return sum([1 if pred.strip().lower() == ref.strip().lower() else 0 for pred, ref in zip(predictions, references)]) / len(references)

# F1 score metric
def compute_f1(predictions, references):
    def get_tokens(text):
        return re.findall(r'\b\w+\b', text.lower())

    f1_scores = []
    for pred, ref in zip(predictions, references):
      pred_tokens = get_tokens(pred)
      ref_tokens = get_tokens(ref)

      common = set(pred_tokens) & set(ref_tokens)
      if not common:
          f1_scores.append(0)
          continue

      precision = len(common) / len(pred_tokens)
      recall = len(common) / len(ref_tokens)
      f1_scores.append(2 * (precision * recall) / (precision + recall))
    return sum(f1_scores) / len(f1_scores)

# Time metric
def extract_years(text):
  return re.findall(r'\b(19|20)\d{2}\b', text)

def compute_time_metric(predictions, references):
    time_metric_scores = []
    for pred, ref in zip(predictions, references):
        pred_years = set(extract_years(pred))
        ref_years = set(extract_years(ref))

        if not ref_years:
            # If there are no years in the reference, consider it perfect (or adjust based on your criteria)
            time_metric_scores.append(1.0)
            continue

        if not pred_years:
            # If there are no years in the prediction but there are in the reference, it's incorrect
            time_metric_scores.append(0.0)
            continue

        intersection = pred_years & ref_years
        union = pred_years | ref_years
        time_metric_scores.append(len(intersection) / len(union))

    return sum(time_metric_scores) / len(time_metric_scores)

# Completeness metric
def compute_completeness(predictions, references):
    def list_contains_all(sublist, mainlist):
        return all(item in mainlist for item in sublist)

    completeness_scores = []
    for pred, ref in zip(predictions, references):
        pred_items = pred.split('; ')
        ref_items = ref.split('; ')
        completeness_scores.append(list_contains_all(ref_items, pred_items))

    return sum(completeness_scores) / len(completeness_scores)

In [10]:
def evaluate(predictions, references):
    em = compute_exact_match(predictions, references)
    f1 = compute_f1(predictions, references)
    time_metric = compute_time_metric(predictions, references)
    completeness = compute_completeness(predictions, references)

    print(f"Exact Match: {em * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")
    print(f"TimeMetric: {time_metric * 100:.2f}%")
    print(f"Completeness: {completeness * 100:.2f}%")

    return {
        "EM": em,
        "F1": f1,
        "TimeMetric": time_metric,
        "Completeness": completeness
    }

In [11]:
# model.to('cpu')

predictions = generate_answers_2(model, tokenizer, test_input_texts)
evaluation_results = evaluate(predictions, test_target_texts)

Exact Match: 0.00%
F1 Score: 46.42%
TimeMetric: 99.67%
Completeness: 0.00%


In [13]:
def display_test_cases(model, tokenizer, test_inputs, test_targets, num_cases=5):
    # Generate answers using the model
    predictions = generate_answers_2(model, tokenizer, test_inputs[:num_cases])

    # Display each test case with the original question, expected answer, and model prediction
    for i in range(num_cases):
        print(f"Test Case {i+1}:")
        print(f"Question: {test_inputs[i]}")
        print(f"Expected Answer: {test_targets[i]}")
        print(f"Model's Answer: {predictions[i]}")
        print("="*80)

# Call the function to display the first 5 test cases
display_test_cases(model, tokenizer, test_input_texts, test_target_texts)

Test Case 1:
Question: List all sports teams Anthony Grant, also known as Anthony Paul Shaun Andrew Daure Grant, played for from 2010 to 2020.
Expected Answer: Southend United F.C. (2010, 2011, 2012); Stevenage F.C. (2012, 2013); Crewe Alexandra F.C. (2013, 2014, 2015); Port Vale F.C. (2015, 2016, 2017, 2018, 2019, 2020)
Model's Answer: Newcastle United F.C. (2010, 2011, 2012, 2013, 2014,
Test Case 2:
Question: List all positions Oleksandr Turchynov, also known as Oleksandr Valentynovych Turchynov, held from 2010 to 2020.
Expected Answer: Prime Minister of Ukraine (2010); First Deputy Prime Minister of Ukraine (2010); People's Deputy of Ukraine (2012, 2013, 2014, 2015); Chairman of the Verkhovna Rada (2014, 2015, 2016, 2017, 2018, 2019, 2020); President of Ukraine (2014)
Model's Answer: President of Ukraine (2010, 2011, 2012, 2013, 2014, 2015, 2016,
Test Case 3:
Question: List all political parties Paul Murphy was a member of from 2010 to 2020.
Expected Answer: Socialist Party (2010, 2