In [17]:
import json
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.utils.data import Dataset

In [18]:
# Load data
def load_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Preprocess data
def preprocess_data(data):
    input_texts = []
    target_texts = []
    for entry in data:
        context = f"Subject: {entry['subject']}, Type: {entry['type']}"
        if 'aliases' in entry and entry['aliases']:
            aliases = ", ".join(entry['aliases'])
            context += f", Aliases: {aliases}"
        question = entry['question']
        input_texts.append(f"{question}")
        target_texts.append('; '.join(entry['final_answers']))
    return input_texts, target_texts

train_data = load_data('data/train_TLQA.json')
test_data = load_data('data/test_TLQA.json')

train_input_texts, train_target_texts = preprocess_data(train_data)
test_input_texts, test_target_texts = preprocess_data(test_data)

In [19]:
class TLQADataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

In [20]:
# Tokenization and encoding
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
max_length = 1024
train_encodings = tokenizer(train_input_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
train_labels = tokenizer(train_target_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt').input_ids
train_dataset = TLQADataset(train_encodings, train_labels)

In [21]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
training_args = TrainingArguments(
    output_dir="./results_bart",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_bart",
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [22]:
trainer.train()

Step,Training Loss
10,16.9741
20,15.2094
30,13.566
40,12.3812
50,11.4461
60,10.7914
70,10.0577
80,9.3853
90,8.7157
100,7.7839




TrainOutput(global_step=1606, training_loss=1.2933066745623853, metrics={'train_runtime': 1187.8816, 'train_samples_per_second': 5.408, 'train_steps_per_second': 1.352, 'total_flos': 1250757965512704.0, 'train_loss': 1.2933066745623853, 'epoch': 2.0})

In [23]:
model.save_pretrained("./fine_tuned_bart")
tokenizer.save_pretrained("./fine_tuned_bart")

('./fine_tuned_bart/tokenizer_config.json',
 './fine_tuned_bart/special_tokens_map.json',
 './fine_tuned_bart/vocab.json',
 './fine_tuned_bart/merges.txt',
 './fine_tuned_bart/added_tokens.json')

In [24]:
class KnnSearch:
    def __init__(self):
        self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

    def get_embeddings_for_data(self, data_ls):
        return self.model.encode(data_ls)

    def get_top_n_neighbours(self, sentence, data_emb, transfer_data, k):
        sent_emb = self.get_embeddings_for_data(sentence)
        text_sims = cosine_similarity(data_emb, [sent_emb]).flatten()
        sorted_indices = text_sims.argsort()[::-1][:k]
        return [transfer_data[idx] for idx in sorted_indices]

knn_search = KnnSearch()
train_embeddings = knn_search.get_embeddings_for_data(train_input_texts)

def get_few_shot_examples(test_question, k=5):
    combined_data = [{'question': q, 'answer': a} for q, a in zip(train_input_texts, train_target_texts)]
    return knn_search.get_top_n_neighbours(test_question, train_embeddings, combined_data, k)

def create_few_shot_prompt(few_shot_examples, test_question):
    prompt = ""
    for example in few_shot_examples:
        prompt += f"Q: {example['question']}\nA: {example['answer']}\n\n"
    prompt += f"Q: {test_question}\nA:"
    # Truncate to ensure the prompt does not exceed max_length
    encoded_prompt = tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt")
    return tokenizer.decode(encoded_prompt['input_ids'][0], skip_special_tokens=True)

In [25]:
pl = pipeline("text2text-generation", model="./fine_tuned_bart")

Device set to use cuda:0


In [26]:
def generate_few_shot_predictions_with_pipeline(pipeline, test_questions, k=5, batch_size=8):
    predictions = []
    for i in range(0, len(test_questions), batch_size):
        batch_questions = test_questions[i:i + batch_size]
        batch_prompts = []
        for test_question in batch_questions:
            few_shot_examples = get_few_shot_examples(test_question, k)
            prompt = create_few_shot_prompt(few_shot_examples, test_question)
            batch_prompts.append(prompt)
        batch_predictions = pipeline(batch_prompts, max_length=512, num_return_sequences=1)
        predictions.extend([pred["generated_text"] for pred in batch_predictions])
    return predictions

In [41]:
few_shot_predictions = generate_few_shot_predictions_with_pipeline(pl, test_input_texts, k=10)

In [42]:
def compute_exact_match(predictions, references):
    return sum([1 if pred.strip().lower() == ref.strip().lower() else 0 for pred, ref in zip(predictions, references)]) / len(references)

def compute_f1(predictions, references):
    def get_tokens(text):
        return re.findall(r'\b\w+\b', text.lower())
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = get_tokens(pred)
        ref_tokens = get_tokens(ref)
        common = set(pred_tokens) & set(ref_tokens)
        if not common:
            f1_scores.append(0)
            continue
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(ref_tokens)
        f1_scores.append(2 * (precision * recall) / (precision + recall))
    return sum(f1_scores) / len(f1_scores)

def compute_time_metric(predictions, references):
    def extract_years(text):
        return re.findall(r'\b(19|20)\d{2}\b', text)
    time_metric_scores = []
    for pred, ref in zip(predictions, references):
        pred_years = set(extract_years(pred))
        ref_years = set(extract_years(ref))
        if not ref_years:
            time_metric_scores.append(1.0)
            continue
        if not pred_years:
            time_metric_scores.append(0.0)
            continue
        intersection = pred_years & ref_years
        union = pred_years | ref_years
        time_metric_scores.append(len(intersection) / len(union))
    return sum(time_metric_scores) / len(time_metric_scores)

def compute_completeness(predictions, references):
    def list_contains_all(sublist, mainlist):
        return all(item in mainlist for item in sublist)
    completeness_scores = []
    for pred, ref in zip(predictions, references):
        pred_items = pred.split('; ')
        ref_items = ref.split('; ')
        completeness_scores.append(list_contains_all(ref_items, pred_items))
    return sum(completeness_scores) / len(completeness_scores)

def evaluate(predictions, references):
    em = compute_exact_match(predictions, references)
    f1 = compute_f1(predictions, references)
    time_metric = compute_time_metric(predictions, references)
    completeness = compute_completeness(predictions, references)
    print(f"Exact Match: {em * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")
    print(f"TimeMetric: {time_metric * 100:.2f}%")
    print(f"Completeness: {completeness * 100:.2f}%")
    return {"EM": em, "F1": f1, "TimeMetric": time_metric, "Completeness": completeness}

In [43]:
import re

evaluation_results_few_shot = evaluate(few_shot_predictions, test_target_texts)

Exact Match: 0.00%
F1 Score: 39.66%
TimeMetric: 99.67%
Completeness: 0.00%


In [44]:
for i in range(100,105):
    print(f"Test Case {i+1}:")
    print(f"Question: {test_input_texts[i]}")
    print(f"Expected Answer: {test_target_texts[i]}")
    print(f"Model's Answer: {few_shot_predictions[i]}")
    print("="*80)

Test Case 101:
Question: List all employers Michael Hout, also known as Mike Hout, worked for from 2010 to 2020.
Expected Answer: University of California, Berkeley (2010, 2011, 2012, 2013); New York University (2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020)
Model's Answer: University of Maryland, Baltimore County (2010, 2011, 2012, 2013, 2014); University of Pennsylvania (2014, 2015, 2016)
Test Case 102:
Question: List all heads of the government of Guinea, also known as Guinea-Conakry, from 2010 to 2020 
Expected Answer: Mamady Youla (2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018); Ibrahima Kassory Fofana (2018, 2019, 2020)
Model's Answer: Jean-Claude Boubacaré (2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020); Abdoulaye N'Diaye (2019, 2020)
Test Case 103:
Question: List all employers Linda Bauld, also known as Linda C. Bauld, worked for from 2014 to 2020.
Expected Answer: University of Stirling (2014, 2015, 2016, 2017, 2018); University of Edinburgh (2018, 2019, 2

In [45]:
import json

# Assuming `few_shot_predictions` is your list of predictions and `test_input_texts` contains the questions.

# Format the predictions
output_data = []
for i, prediction in enumerate(few_shot_predictions):
    entry = {
        "question": test_input_texts[i],
        "generated_answer": [prediction.strip()],
        "ground_truth": [test_target_texts[i]]
    }
    output_data.append(entry)

# Write to a JSON file
output_file_path = "BART_predictions_k_10.json"
with open(output_file_path, "w") as outfile:
    json.dump(output_data, outfile, indent=4)

print(f"Predictions saved to {output_file_path}")

Predictions saved to BART_predictions_k_10.json
