In [1]:
import json

In [2]:
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def preprocess_data(data):
    input_texts = [entry['question'] for entry in data]
    target_texts = ['; '.join(entry['final_answers']) for entry in data]
    return input_texts, target_texts

In [3]:
train_data = load_data('data/train_TLQA.json')
test_data = load_data('data/test_TLQA.json')

train_input_texts, train_target_texts = preprocess_data(train_data)
test_input_texts, test_target_texts = preprocess_data(test_data)

In [4]:
# pip install sentencepiece
# pip install pytorch
# pip install transformers[torch]

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')

train_encodings = tokenizer(train_input_texts, padding=True, return_tensors='pt')
train_labels = tokenizer(train_target_texts, padding=True, return_tensors='pt').input_ids

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class TLQADataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item



In [6]:

model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large')

train_dataset = TLQADataset(train_encodings, train_labels)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


In [7]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 194.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 13.06 MiB is free. Process 6869 has 14.73 GiB memory in use. Of the allocated memory 13.38 GiB is allocated by PyTorch, and 1.23 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def generate_answers(model, tokenizer, inputs):
    inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    inputs = inputs.to(model.device)
    outputs = model.generate(inputs.input_ids)
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers

def generate_answers_2(model, tokenizer, inputs, batch_size=8):
    model.eval()  # Set the model to evaluation mode
    answers = []

    # Process the inputs in batches
    for i in range(0, len(inputs), batch_size):
        batch_inputs = inputs[i:i + batch_size]
        encoded_inputs = tokenizer(batch_inputs, padding=True, truncation=True, return_tensors="pt")
        encoded_inputs = {key: val.to(model.device) for key, val in encoded_inputs.items()}

        with torch.no_grad():
            outputs = model.generate(**encoded_inputs)

        batch_answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        answers.extend(batch_answers)

    return answers

In [None]:
import re

# Exact Match metric
def compute_exact_match(predictions, references):
    return sum([1 if pred.strip().lower() == ref.strip().lower() else 0 for pred, ref in zip(predictions, references)]) / len(references)

# F1 score metric
def compute_f1(predictions, references):
    def get_tokens(text):
        return re.findall(r'\b\w+\b', text.lower())

    f1_scores = []
    for pred, ref in zip(predictions, references):
      pred_tokens = get_tokens(pred)
      ref_tokens = get_tokens(ref)

      common = set(pred_tokens) & set(ref_tokens)
      if not common:
          f1_scores.append(0)
          continue

      precision = len(common) / len(pred_tokens)
      recall = len(common) / len(ref_tokens)
      f1_scores.append(2 * (precision * recall) / (precision + recall))
    return sum(f1_scores) / len(f1_scores)

# Time metric
def extract_years(text):
  return re.findall(r'\b(19|20)\d{2}\b', text)

def compute_time_metric(predictions, references):
    time_metric_scores = []
    for pred, ref in zip(predictions, references):
        pred_years = set(extract_years(pred))
        ref_years = set(extract_years(ref))

        if not ref_years:
            # If there are no years in the reference, consider it perfect (or adjust based on your criteria)
            time_metric_scores.append(1.0)
            continue

        if not pred_years:
            # If there are no years in the prediction but there are in the reference, it's incorrect
            time_metric_scores.append(0.0)
            continue

        intersection = pred_years & ref_years
        union = pred_years | ref_years
        time_metric_scores.append(len(intersection) / len(union))

    return sum(time_metric_scores) / len(time_metric_scores)

# Completeness metric
def compute_completeness(predictions, references):
    def list_contains_all(sublist, mainlist):
        return all(item in mainlist for item in sublist)

    completeness_scores = []
    for pred, ref in zip(predictions, references):
        pred_items = pred.split('; ')
        ref_items = ref.split('; ')
        completeness_scores.append(list_contains_all(ref_items, pred_items))

    return sum(completeness_scores) / len(completeness_scores)

In [None]:
def evaluate(predictions, references):
    em = compute_exact_match(predictions, references)
    f1 = compute_f1(predictions, references)
    time_metric = compute_time_metric(predictions, references)
    completeness = compute_completeness(predictions, references)

    print(f"Exact Match: {em * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")
    print(f"TimeMetric: {time_metric * 100:.2f}%")
    print(f"Completeness: {completeness * 100:.2f}%")

    return {
        "EM": em,
        "F1": f1,
        "TimeMetric": time_metric,
        "Completeness": completeness
    }

In [None]:
predictions = generate_answers_2(model, tokenizer, test_input_texts)
evaluation_results = evaluate(predictions, test_target_texts)

In [None]:
def display_test_cases(model, tokenizer, test_inputs, test_targets, num_cases=5):
    # Generate answers using the model
    predictions = generate_answers_2(model, tokenizer, test_inputs[:num_cases])

    # Display each test case with the original question, expected answer, and model prediction
    for i in range(num_cases):
        print(f"Test Case {i+1}:")
        print(f"Question: {test_inputs[i]}")
        print(f"Expected Answer: {test_targets[i]}")
        print(f"Model's Answer: {predictions[i]}")
        print("="*80)

# Call the function to display the first 5 test cases
display_test_cases(model, tokenizer, test_input_texts, test_target_texts)