In [1]:
import json
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.utils.data import Dataset
import re

In [2]:
# Load data
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Preprocess data
def preprocess_data(data):
    input_texts = []
    target_texts = []
    for entry in data:
        context = f"Subject: {entry['subject']}, Type: {entry['type']}"
        if 'aliases' in entry and entry['aliases']:
            aliases = ", ".join(entry['aliases'])
            context += f", Aliases: {aliases}"
        question = entry['question']
        input_texts.append(f"{question}")
        target_texts.append('; '.join(entry['final_answers']))
    return input_texts, target_texts

train_data = load_data('data/train_TLQA.json')
test_data = load_data('data/test_TLQA.json')

train_input_texts, train_target_texts = preprocess_data(train_data)
test_input_texts, test_target_texts = preprocess_data(test_data)

In [3]:
# Dataset class
class TLQADataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

In [4]:
# Tokenization and encoding
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base')
max_length = 1024
train_encodings = tokenizer(train_input_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
train_labels = tokenizer(train_target_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt').input_ids
train_dataset = TLQADataset(train_encodings, train_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Fine-tune the model
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [6]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mzihkroaros[0m ([33mzihkroaros-tu-delft[0m). Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,41.7237
20,40.7396
30,40.1779
40,37.7166
50,35.0488
60,32.9267
70,30.1228
80,27.8508
90,26.0096
100,23.7681


TrainOutput(global_step=1606, training_loss=2.9740962349759834, metrics={'train_runtime': 711.7039, 'train_samples_per_second': 9.026, 'train_steps_per_second': 2.257, 'total_flos': 1005213094060032.0, 'train_loss': 2.9740962349759834, 'epoch': 2.0})

In [7]:
# Save the fine-tuned model
model.save_pretrained("./fine_tuned_flan_t5_2")
tokenizer.save_pretrained("./fine_tuned_flan_t5_2")

('./fine_tuned_flan_t5_2/tokenizer_config.json',
 './fine_tuned_flan_t5_2/special_tokens_map.json',
 './fine_tuned_flan_t5_2/spiece.model',
 './fine_tuned_flan_t5_2/added_tokens.json')

In [8]:
# Few-shot example selection and prompt creation
class KnnSearch:
    def __init__(self):
        self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

    def get_embeddings_for_data(self, data_ls):
        return self.model.encode(data_ls)

    def get_top_n_neighbours(self, sentence, data_emb, transfer_data, k):
        sent_emb = self.get_embeddings_for_data(sentence)
        text_sims = cosine_similarity(data_emb, [sent_emb]).flatten()
        sorted_indices = text_sims.argsort()[::-1][:k]
        return [transfer_data[idx] for idx in sorted_indices]

knn_search = KnnSearch()
train_embeddings = knn_search.get_embeddings_for_data(train_input_texts)

def get_few_shot_examples(test_question, k=5):
    combined_data = [{'question': q, 'answer': a} for q, a in zip(train_input_texts, train_target_texts)]
    return knn_search.get_top_n_neighbours(test_question, train_embeddings, combined_data, k)

def create_few_shot_prompt(few_shot_examples, test_question):
    prompt = ""
    for example in few_shot_examples:
        prompt += f"Q: {example['question']}\nA: {example['answer']}\n\n"
    prompt += f"Q: {test_question}\nA:"
    return prompt

In [9]:
# Initialize pipeline with the fine-tuned model
pl = pipeline("text2text-generation", model="./fine_tuned_flan_t5_2")

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Device set to use cuda:0


In [10]:
# Generate few-shot predictions
def generate_few_shot_predictions_with_pipeline(pipeline, test_questions, k=5):
    predictions = []
    for test_question in test_questions:
        few_shot_examples = get_few_shot_examples(test_question, k)
        prompt = create_few_shot_prompt(few_shot_examples, test_question)
        prediction = pipeline(prompt, max_length=512, num_return_sequences=1)[0]["generated_text"]
        predictions.append(prediction)
    return predictions

In [11]:
few_shot_predictions = generate_few_shot_predictions_with_pipeline(pl, test_input_texts, k=5)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors


In [12]:
# Evaluation functions (unchanged)
def compute_exact_match(predictions, references):
    return sum([1 if pred.strip().lower() == ref.strip().lower() else 0 for pred, ref in zip(predictions, references)]) / len(references)

def compute_f1(predictions, references):
    def get_tokens(text):
        return re.findall(r'\b\w+\b', text.lower())
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = get_tokens(pred)
        ref_tokens = get_tokens(ref)
        common = set(pred_tokens) & set(ref_tokens)
        if not common:
            f1_scores.append(0)
            continue
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(ref_tokens)
        f1_scores.append(2 * (precision * recall) / (precision + recall))
    return sum(f1_scores) / len(f1_scores)

def compute_time_metric(predictions, references):
    def extract_years(text):
        return re.findall(r'\b(19|20)\d{2}\b', text)
    time_metric_scores = []
    for pred, ref in zip(predictions, references):
        pred_years = set(extract_years(pred))
        ref_years = set(extract_years(ref))
        if not ref_years:
            time_metric_scores.append(1.0)
            continue
        if not pred_years:
            time_metric_scores.append(0.0)
            continue
        intersection = pred_years & ref_years
        union = pred_years | ref_years
        time_metric_scores.append(len(intersection) / len(union))
    return sum(time_metric_scores) / len(time_metric_scores)

def compute_completeness(predictions, references):
    def list_contains_all(sublist, mainlist):
        return all(item in mainlist for item in sublist)
    completeness_scores = []
    for pred, ref in zip(predictions, references):
        pred_items = pred.split('; ')
        ref_items = ref.split('; ')
        completeness_scores.append(list_contains_all(ref_items, pred_items))
    return sum(completeness_scores) / len(completeness_scores)

def evaluate(predictions, references):
    em = compute_exact_match(predictions, references)
    f1 = compute_f1(predictions, references)
    time_metric = compute_time_metric(predictions, references)
    completeness = compute_completeness(predictions, references)
    print(f"Exact Match: {em * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")
    print(f"TimeMetric: {time_metric * 100:.2f}%")
    print(f"Completeness: {completeness * 100:.2f}%")
    return {"EM": em, "F1": f1, "TimeMetric": time_metric, "Completeness": completeness}

In [13]:
# Evaluate predictions
evaluation_results_few_shot = evaluate(few_shot_predictions, test_target_texts)

Exact Match: 0.00%
F1 Score: 44.30%
TimeMetric: 99.67%
Completeness: 0.00%


In [29]:
for i in range(100,105):
    print(f"Test Case {i+1}:")
    print(f"Question: {test_input_texts[i]}")
    print(f"Expected Answer: {test_target_texts[i]}")
    print(f"Model's Answer: {few_shot_predictions[i]}")
    print("="*80)

Test Case 101:
Question: List all employers Michael Hout, also known as Mike Hout, worked for from 2010 to 2020.
Expected Answer: University of California, Berkeley (2010, 2011, 2012, 2013); New York University (2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)
Model's Answer: University of California, Berkeley (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020); University of California, Berkeley (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)
Test Case 102:
Question: List all heads of the government of Guinea, also known as Guinea-Conakry, from 2010 to 2020 
Expected Answer: Mamady Youla (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018); Ibrahima Kassory Fofana (2018, 2019, 2020)
Model's Answer: Jean-Pierre Lefebvre (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020); Jean-Pierre Lefebvre (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)
Test Case 103:
Question: List all employers Linda Bauld, also known as Linda C.