In [None]:
!pip install datasets
!pip install transformers
!pip install -U sentence-transformers
!pip install rank_bm25

from datasets import load_dataset
from datasets import get_dataset_config_names
from google.colab import drive
from rank_bm25 import BM25Okapi

import torch
import numpy as np
import time

# For debugging torch
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is presumably set so that CUDA errors are reported at the
# call that caused them; it is expected to slow down GPU execution - confirm it is still wanted
# for full evaluation runs.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Mount Google Drive at /content/drive; later cells read the dataset CSVs and the retriever
# checkpoints from paths under this mount point.
drive.mount('/content/drive')

In [None]:
# file path
%cd /content/drive/My Drive/Colab Notebooks/UCL MSc Project/Data

### Settings ###
#@title Dataset Setting 
dataset = 'SCAN' #@param ["SCAN", "COGS"]

print(f'Using {dataset} dataset')


data = load_dataset('csv', data_files={'train': f"./{dataset.lower()}_train.csv", 'test': f"./{dataset.lower()}_test.csv"})

train = data['train']
test = data['test']
import random

if dataset == 'SCAN':
  input = 'commands'
  target = 'actions'

if dataset == 'COGS':
  train = train.select(range(len(train)-1))
  input = 'source'
  target = 'target'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, GPTJForCausalLM

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Select model variant #####
#@title Model Setting 
model_variant = 'gpt-neo-2.7B' #@param [ "gpt-neo-125M", "gpt2-medium", "gpt-neo-1.3B", "gpt-neo-2.7B"]
print(f"Running {model_variant}")

# GPT-2 checkpoints are loaded by their bare name; every other variant is loaded from the
# EleutherAI namespace.
if "gpt2" in model_variant:
  model_id = model_variant
else:
  model_id = f"EleutherAI/{model_variant}"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

model.to(device)

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # memory_reserved is the current name of the deprecated torch.cuda.memory_cached
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

In [None]:
# Stopping criterion that ends generation as soon as the most recent token is one of the stop tokens
class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords_ids:list):
        # Token ids that terminate generation
        self.keywords = keywords_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only the first sequence of the batch is inspected
        newest_token = input_ids[0][-1]
        return newest_token in self.keywords

# Choose the text that marks the end of one generated answer
stop_words = ['$'] if dataset == 'CFQ' else ['\n']
the_stop_word = stop_words[0]

# Each stop word is represented by the first token id of its encoding
stop_ids = [tokenizer.encode(stop_word)[0] for stop_word in stop_words]
stop_criteria = KeywordsStoppingCriteria(stop_ids)

In [None]:
# Encode data[column] slice by slice (chunk_size rows at a time) and stack the results into one 2-D array
def _encode_in_chunks(encode_fn, data, column, chunk_size):
  return np.vstack([np.asarray(encode_fn(data[start:start + chunk_size][column]))
                    for start in range(0, len(data), chunk_size)])


# For every test row, return the train indices with the top_k smallest cosine distances
# (array of shape len(test_data) x top_k; indices refer to the whole train set)
def _nearest_train_indices(encode_query, encode_passage, train_data, test_data, column, top_k,
                           chunk_size_test=300, chunk_size_train=200):
  from scipy.spatial import distance

  # The train set is encoded once, not once per test chunk
  train_vectors = _encode_in_chunks(encode_passage, train_data, column, chunk_size_train)

  nearest = []
  for g in range(0, len(test_data), chunk_size_test):
    print(f'g is {g}')
    query_vectors = np.asarray(encode_query(test_data[g:g + chunk_size_test][column]))
    # Rank against the complete train set so the argsort yields global train indices and
    # each test row contributes exactly one row to the result
    cosine = distance.cdist(query_vectors, train_vectors, metric='cosine')
    nearest.append(np.argsort(cosine, axis=1)[:, :top_k])

  return np.vstack(nearest)


# Create few shot prompt or in-context learning - instructions are optional
# Examples are obtained from training data and the test prompt is obtained from test as they have to come from different distributions to test the systematic generalization
def batch_few_shot_prompt(train_data: list, test_data: list, num_examples: int, examples_selection='random', include_instruction=False):
  """Build one few-shot prompt per test row.

  Each prompt is the optional instruction, then `num_examples` training examples formatted as
  "<input>:<text> <target>:<text><stop word>", then the test input followed by "<target>:".
  The column names and the stop word come from the notebook globals `input`, `target` and
  `the_stop_word`.

  Args:
    train_data: training split the in-context examples are drawn from. It is sliced and
      indexed by column name, as the splits returned by `load_dataset` are in this notebook.
    test_data: test split; one prompt is produced per row.
    num_examples: number of training examples placed in every prompt.
    examples_selection: 'random' samples examples without replacement; 'nearest' uses the
      public nli-roberta-base-v2 sentence encoder; 'mepr', 'individual' and 'pairwise' use
      retriever checkpoints that must have been trained and saved beforehand.
    include_instruction: prepend a task instruction to every prompt when True.

  Returns:
    A list of prompt strings, in test order.

  Raises:
    ValueError: if `examples_selection` is not one of the supported methods.
  """
  from scipy.spatial import distance

  supported_methods = ('random', 'nearest', 'mepr', 'individual', 'pairwise')
  if examples_selection not in supported_methods:
    raise ValueError(f"Unknown examples_selection {examples_selection!r}; expected one of {supported_methods}")

  batch = []
  # N/A as we don't use instructions
  # TODO: If we need to do instructions to account for other datasets
  instruction = f"Convert commands to actions based on Simplified version of the CommAI Navigation tasks {the_stop_word}" if include_instruction else ''

  print(f'running {examples_selection}')

  # Change model path accordingly
  retriever_dir = '/content/drive/My Drive/Colab Notebooks/UCL MSc Project/dense retriever'

  # Row i holds the train indices selected for test row i (unused for 'random')
  selected_indices = None

  # Pre-calculate cosine distance with an off-the-shelf sentence encoder
  if examples_selection == 'nearest':
    from sentence_transformers import SentenceTransformer

    sentence_model = SentenceTransformer('sentence-transformers/nli-roberta-base-v2')
    test_encode = sentence_model.encode(test_data[input])
    train_encode = sentence_model.encode(train_data[input])

    cosine_distance = distance.cdist(test_encode, train_encode, metric='cosine')
    selected_indices = np.argsort(cosine_distance, axis=1)[:, :num_examples]

    del sentence_model
    torch.cuda.empty_cache()

  # Need to train mepr first: get candidates -> score individual -> train
  elif examples_selection == 'mepr':
    from transformers import BertTokenizer, BertModel
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model_q = BertModel.from_pretrained(f'{retriever_dir}/{dataset}_E_q_mepr').to(device)
    model_p = BertModel.from_pretrained(f'{retriever_dir}/{dataset}_E_p_mepr').to(device)

    def bert_cls_encoder(bert_model):
      # Returns a function mapping a list of texts to their CLS embeddings as a NumPy array
      def encode(texts):
        with torch.no_grad():
          tokens = bert_tokenizer(texts, return_tensors='pt', padding=True).to(device)
          # extract CLS since bert; moved to CPU/NumPy because scipy cannot consume CUDA tensors
          return bert_model(**tokens).last_hidden_state[:, 0].cpu().numpy()
      return encode

    selected_indices = _nearest_train_indices(bert_cls_encoder(model_q), bert_cls_encoder(model_p),
                                              train_data, test_data, input, num_examples)

    del model_q
    del model_p
    torch.cuda.empty_cache()

  # Need to train fpr individual / pairwise first: get candidates -> score -> train
  elif examples_selection in ('individual', 'pairwise'):
    from sentence_transformers import SentenceTransformer

    model_q = SentenceTransformer(f'{retriever_dir}/{dataset}_E_q_{examples_selection}')
    model_p = SentenceTransformer(f'{retriever_dir}/{dataset}_E_p_{examples_selection}')

    selected_indices = _nearest_train_indices(model_q.encode, model_p.encode,
                                              train_data, test_data, input, num_examples)

    del model_q
    del model_p
    torch.cuda.empty_cache()

  # Assign examples for each of the test prompt
  for i in range(len(test_data)):
    print(f'generating test prompt i is {i}')
    # Get training examples indices
    if examples_selection == 'random':
      train_examples_pos = np.random.choice(len(train_data) , num_examples, replace=False)
    else:
      train_examples_pos = selected_indices[i, :]

    few_shot_prompt = instruction

    # .tolist() converts NumPy integers into plain ints before they are used as row indices
    for j in train_examples_pos.tolist():
      few_shot_prompt += (f"{input}:" + train_data[j][input] + ' ' + f"{target}:" + train_data[j][target] + the_stop_word)

    few_shot_prompt += (f"{input}:" + test_data[i][input] + ' ' + f"{target}:")

    batch.append(few_shot_prompt)

  # Sanity check - print one randomly chosen prompt (skipped when there is nothing to show)
  if batch:
    print("++++++++++++++++++++++++++")
    print("++++++ Sanity Check ++++++")
    print(batch[np.random.randint(0, len(batch))])
    print("++++++++++++++++++++++++++")
    print("                          ")

  return batch

In [None]:
##### SETTINGS #####
#@title Search Hyperparameters
# number of examples
k =  1 #@param {type:"integer"}
# Don't change batch size
batch_size = 1
# Max context window
if 'gpt2' in model_variant:
  max_length = 1024
else:
  max_length = 2048

search_method = 'random' #@param ["random", "nearest", "mepr", "individual", "pairwise"]
##### SETTINGS #####

# print(f'Running {model} with {dataset} using {search_method}')

few_shot = batch_few_shot_prompt(train, test, num_examples=k, 
                                 examples_selection=search_method)

# Padding has to be on the left for proper text generation because the model uses rightmost token to predict the next token 
# So padding would result in weird predictions
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Keep track of how many we actually evaluate with how much is matched.
match = 0
total = 0

# Output storage
gold_storage = []
generated_storage = []
predicted_storage = []

for i in range(0, len(test), batch_size):
  prompts = few_shot[i:i+batch_size]
  with torch.no_grad():
    # Reserve room in the context window for the longest gold target of this batch.
    # The token count is read from input_ids: len() of the tokenizer output itself counts its
    # fields (input_ids, attention_mask, ...), not the tokens.
    target_token_count = max(len(tokenizer(test[i+j][target]).input_ids) for j in range(len(prompts)))
    prompt_budget = max_length - target_token_count

    few_shot_encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=prompt_budget)
    # A prompt that fills the whole budget was (or may have been) truncated, so it is skipped
    if few_shot_encoded.input_ids.shape[1] < prompt_budget:
      few_shot_encoded = few_shot_encoded.to(device)
      total += len(prompts)
    else:
      print("#############################")
      print("####### Batch Skipped #######")
      print("#############################")
      print("                             ")

      continue
    
    # GPT max output length is 2048
    # Beam search results in out of memory error - future work could be to test beam search if more resources are available

    # perf_counter measures elapsed wall-clock time; process_time only counts CPU time of this process
    start = time.perf_counter()

    # Greedy decoding is requested explicitly with do_sample=False (replaces temperature=0)
    test_decode = model.generate(**few_shot_encoded,
                                 stopping_criteria=StoppingCriteriaList([stop_criteria]),
                                 do_sample=False,
                                 max_length=max_length)
    
    end = time.perf_counter()
    
    generated_texts = tokenizer.batch_decode(test_decode, 
                                             skip_special_tokens=True)
    
    generated_storage.append(generated_texts)

    for j in range(len(generated_texts)):
      cur_predicted = generated_texts[j].replace(tokenizer.decode(tokenizer(few_shot[i+j]).input_ids),"") # Discard examples and test prompt
      cur_predicted = cur_predicted.split(the_stop_word)[0] # Truncate generated in the case that the stopping criteria does not work properly

      # Gold text is round-tripped through the tokenizer so it is compared in the same form as the decoded prediction
      cur_gold = tokenizer.decode(tokenizer(test[i+j][target], return_tensors='pt').input_ids[0])

      predicted_storage.append(cur_predicted)
      gold_storage.append(cur_gold)
      

      print("-------------------------")
      print("------- Checking --------")
      print(generated_texts[j]) # Generated text with the examples and test prompt
      print(f"Generated : {cur_predicted}")
      # print(f"{cur_generated.split()}")
      print(f"Gold : {cur_gold}")
      # print(f"{cur_gold.split()}")
      print(f"Match? : {cur_gold == cur_predicted}")
      print("-------------------------")
      print("                         ")

      if cur_gold == cur_predicted:
        match += 1 


  if device.type == 'cuda':
    # memory_reserved is the current name of the deprecated torch.cuda.memory_cached
    print('Current cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
  print(f"Inference time for batch size {batch_size} : {end-start} , so {(end-start)/batch_size} each")
  print(f"Current total match for {k} examples : {match}/{total}")
  print("                         ")

  torch.cuda.empty_cache() # Clear unused memory to prevent out of memory

# Save results
generated_storage_array = np.array(generated_storage)
gold_storage_array = np.array(gold_storage)
predicted_storage_array = np.array(predicted_storage)

# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Generated {k} Examples - {search_method}', generated_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Gold {k} Examples - {search_method}', gold_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Predicted {k} Examples - {search_method}', predicted_storage_array)


In [None]:
##### SETTINGS #####
#@title Search Hyperparameters
# number of examples
k =  5 #@param {type:"integer"}
# Don't change batch size
batch_size = 1
# Max context window
if 'gpt2' in model_variant:
  max_length = 1024
else:
  max_length = 2048

# Reuses the search method chosen in the earlier evaluation cell (that cell must have been run)
search_method = search_method
##### SETTINGS #####

# print(f'Running {model} with {dataset} using {search_method}')

few_shot = batch_few_shot_prompt(train, test, num_examples=k, 
                                 examples_selection=search_method)

# Padding has to be on the left for proper text generation because the model uses rightmost token to predict the next token 
# So padding would result in weird predictions
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Keep track of how many we actually evaluate with how much is matched.
match = 0
total = 0

# Output storage
gold_storage = []
generated_storage = []
predicted_storage = []

for i in range(0, len(test), batch_size):
  prompts = few_shot[i:i+batch_size]
  with torch.no_grad():
    # Reserve room in the context window for the longest gold target of this batch.
    # The token count is read from input_ids: len() of the tokenizer output itself counts its
    # fields (input_ids, attention_mask, ...), not the tokens.
    target_token_count = max(len(tokenizer(test[i+j][target]).input_ids) for j in range(len(prompts)))
    prompt_budget = max_length - target_token_count

    few_shot_encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=prompt_budget)
    # A prompt that fills the whole budget was (or may have been) truncated, so it is skipped
    if few_shot_encoded.input_ids.shape[1] < prompt_budget:
      few_shot_encoded = few_shot_encoded.to(device)
      total += len(prompts)
    else:
      print("#############################")
      print("####### Batch Skipped #######")
      print("#############################")
      print("                             ")

      continue
    
    # GPT max output length is 2048
    # Beam search results in out of memory error - future work could be to test beam search if more resources are available

    # perf_counter measures elapsed wall-clock time; process_time only counts CPU time of this process
    start = time.perf_counter()

    # Greedy decoding is requested explicitly with do_sample=False (replaces temperature=0)
    test_decode = model.generate(**few_shot_encoded,
                                 stopping_criteria=StoppingCriteriaList([stop_criteria]),
                                 do_sample=False,
                                 max_length=max_length)
    
    end = time.perf_counter()
    
    generated_texts = tokenizer.batch_decode(test_decode, 
                                             skip_special_tokens=True)
    
    generated_storage.append(generated_texts)

    for j in range(len(generated_texts)):
      cur_predicted = generated_texts[j].replace(tokenizer.decode(tokenizer(few_shot[i+j]).input_ids),"") # Discard examples and test prompt
      cur_predicted = cur_predicted.split(the_stop_word)[0] # Truncate generated in the case that the stopping criteria does not work properly

      # Gold text is round-tripped through the tokenizer so it is compared in the same form as the decoded prediction
      cur_gold = tokenizer.decode(tokenizer(test[i+j][target], return_tensors='pt').input_ids[0])

      predicted_storage.append(cur_predicted)
      gold_storage.append(cur_gold)
      

      print("-------------------------")
      print("------- Checking --------")
      print(generated_texts[j]) # Generated text with the examples and test prompt
      print(f"Generated : {cur_predicted}")
      # print(f"{cur_generated.split()}")
      print(f"Gold : {cur_gold}")
      # print(f"{cur_gold.split()}")
      print(f"Match? : {cur_gold == cur_predicted}")
      print("-------------------------")
      print("                         ")

      if cur_gold == cur_predicted:
        match += 1 


  if device.type == 'cuda':
    # memory_reserved is the current name of the deprecated torch.cuda.memory_cached
    print('Current cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
  print(f"Inference time for batch size {batch_size} : {end-start} , so {(end-start)/batch_size} each")
  print(f"Current total match for {k} examples : {match}/{total}")
  print("                         ")

  torch.cuda.empty_cache() # Clear unused memory to prevent out of memory

# Save results
generated_storage_array = np.array(generated_storage)
gold_storage_array = np.array(gold_storage)
predicted_storage_array = np.array(predicted_storage)

# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Generated {k} Examples - {search_method}', generated_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Gold {k} Examples - {search_method}', gold_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Predicted {k} Examples - {search_method}', predicted_storage_array)


In [None]:
##### SETTINGS #####
#@title Search Hyperparameters
# number of examples
k =  10 #@param {type:"integer"}
# Don't change batch size
batch_size = 1
# Max context window
if 'gpt2' in model_variant:
  max_length = 1024
else:
  max_length = 2048

# Reuses the search method chosen in the earlier evaluation cell (that cell must have been run)
search_method = search_method
##### SETTINGS #####

# print(f'Running {model} with {dataset} using {search_method}')

few_shot = batch_few_shot_prompt(train, test, num_examples=k, 
                                 examples_selection=search_method)

# Padding has to be on the left for proper text generation because the model uses rightmost token to predict the next token 
# So padding would result in weird predictions
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Keep track of how many we actually evaluate with how much is matched.
match = 0
total = 0

# Output storage
gold_storage = []
generated_storage = []
predicted_storage = []

for i in range(0, len(test), batch_size):
  prompts = few_shot[i:i+batch_size]
  with torch.no_grad():
    # Reserve room in the context window for the longest gold target of this batch.
    # The token count is read from input_ids: len() of the tokenizer output itself counts its
    # fields (input_ids, attention_mask, ...), not the tokens.
    target_token_count = max(len(tokenizer(test[i+j][target]).input_ids) for j in range(len(prompts)))
    prompt_budget = max_length - target_token_count

    few_shot_encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=prompt_budget)
    # A prompt that fills the whole budget was (or may have been) truncated, so it is skipped
    if few_shot_encoded.input_ids.shape[1] < prompt_budget:
      few_shot_encoded = few_shot_encoded.to(device)
      total += len(prompts)
    else:
      print("#############################")
      print("####### Batch Skipped #######")
      print("#############################")
      print("                             ")

      continue
    
    # GPT max output length is 2048
    # Beam search results in out of memory error - future work could be to test beam search if more resources are available

    # perf_counter measures elapsed wall-clock time; process_time only counts CPU time of this process
    start = time.perf_counter()

    # Greedy decoding is requested explicitly with do_sample=False (replaces temperature=0)
    test_decode = model.generate(**few_shot_encoded,
                                 stopping_criteria=StoppingCriteriaList([stop_criteria]),
                                 do_sample=False,
                                 max_length=max_length)
    
    end = time.perf_counter()
    
    generated_texts = tokenizer.batch_decode(test_decode, 
                                             skip_special_tokens=True)
    
    generated_storage.append(generated_texts)

    for j in range(len(generated_texts)):
      cur_predicted = generated_texts[j].replace(tokenizer.decode(tokenizer(few_shot[i+j]).input_ids),"") # Discard examples and test prompt
      cur_predicted = cur_predicted.split(the_stop_word)[0] # Truncate generated in the case that the stopping criteria does not work properly

      # Gold text is round-tripped through the tokenizer so it is compared in the same form as the decoded prediction
      cur_gold = tokenizer.decode(tokenizer(test[i+j][target], return_tensors='pt').input_ids[0])

      predicted_storage.append(cur_predicted)
      gold_storage.append(cur_gold)
      

      print("-------------------------")
      print("------- Checking --------")
      print(generated_texts[j]) # Generated text with the examples and test prompt
      print(f"Generated : {cur_predicted}")
      # print(f"{cur_generated.split()}")
      print(f"Gold : {cur_gold}")
      # print(f"{cur_gold.split()}")
      print(f"Match? : {cur_gold == cur_predicted}")
      print("-------------------------")
      print("                         ")

      if cur_gold == cur_predicted:
        match += 1 


  if device.type == 'cuda':
    # memory_reserved is the current name of the deprecated torch.cuda.memory_cached
    print('Current cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
  print(f"Inference time for batch size {batch_size} : {end-start} , so {(end-start)/batch_size} each")
  print(f"Current total match for {k} examples : {match}/{total}")
  print("                         ")

  torch.cuda.empty_cache() # Clear unused memory to prevent out of memory

# Save results
generated_storage_array = np.array(generated_storage)
gold_storage_array = np.array(gold_storage)
predicted_storage_array = np.array(predicted_storage)

# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Generated {k} Examples - {search_method}', generated_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Gold {k} Examples - {search_method}', gold_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Predicted {k} Examples - {search_method}', predicted_storage_array)


In [None]:
##### SETTINGS #####
#@title Search Hyperparameters
# number of examples
k =  20 #@param {type:"integer"}
# Don't change batch size
batch_size = 1
# Max context window
if 'gpt2' in model_variant:
  max_length = 1024
else:
  max_length = 2048

# Reuses the search method chosen in the earlier evaluation cell (that cell must have been run)
search_method = search_method
##### SETTINGS #####

# print(f'Running {model} with {dataset} using {search_method}')

few_shot = batch_few_shot_prompt(train, test, num_examples=k, 
                                 examples_selection=search_method)

# Padding has to be on the left for proper text generation because the model uses rightmost token to predict the next token 
# So padding would result in weird predictions
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Keep track of how many we actually evaluate with how much is matched.
match = 0
total = 0

# Output storage
gold_storage = []
generated_storage = []
predicted_storage = []

for i in range(0, len(test), batch_size):
  prompts = few_shot[i:i+batch_size]
  with torch.no_grad():
    # Reserve room in the context window for the longest gold target of this batch.
    # The token count is read from input_ids: len() of the tokenizer output itself counts its
    # fields (input_ids, attention_mask, ...), not the tokens.
    target_token_count = max(len(tokenizer(test[i+j][target]).input_ids) for j in range(len(prompts)))
    prompt_budget = max_length - target_token_count

    few_shot_encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=prompt_budget)
    # A prompt that fills the whole budget was (or may have been) truncated, so it is skipped
    if few_shot_encoded.input_ids.shape[1] < prompt_budget:
      few_shot_encoded = few_shot_encoded.to(device)
      total += len(prompts)
    else:
      print("#############################")
      print("####### Batch Skipped #######")
      print("#############################")
      print("                             ")

      continue
    
    # GPT max output length is 2048
    # Beam search results in out of memory error - future work could be to test beam search if more resources are available

    # perf_counter measures elapsed wall-clock time; process_time only counts CPU time of this process
    start = time.perf_counter()

    # Greedy decoding is requested explicitly with do_sample=False (replaces temperature=0)
    test_decode = model.generate(**few_shot_encoded,
                                 stopping_criteria=StoppingCriteriaList([stop_criteria]),
                                 do_sample=False,
                                 max_length=max_length)
    
    end = time.perf_counter()
    
    generated_texts = tokenizer.batch_decode(test_decode, 
                                             skip_special_tokens=True)
    
    generated_storage.append(generated_texts)

    for j in range(len(generated_texts)):
      cur_predicted = generated_texts[j].replace(tokenizer.decode(tokenizer(few_shot[i+j]).input_ids),"") # Discard examples and test prompt
      cur_predicted = cur_predicted.split(the_stop_word)[0] # Truncate generated in the case that the stopping criteria does not work properly

      # Gold text is round-tripped through the tokenizer so it is compared in the same form as the decoded prediction
      cur_gold = tokenizer.decode(tokenizer(test[i+j][target], return_tensors='pt').input_ids[0])

      predicted_storage.append(cur_predicted)
      gold_storage.append(cur_gold)
      

      print("-------------------------")
      print("------- Checking --------")
      print(generated_texts[j]) # Generated text with the examples and test prompt
      print(f"Generated : {cur_predicted}")
      # print(f"{cur_generated.split()}")
      print(f"Gold : {cur_gold}")
      # print(f"{cur_gold.split()}")
      print(f"Match? : {cur_gold == cur_predicted}")
      print("-------------------------")
      print("                         ")

      if cur_gold == cur_predicted:
        match += 1 


  if device.type == 'cuda':
    # memory_reserved is the current name of the deprecated torch.cuda.memory_cached
    print('Current cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
  print(f"Inference time for batch size {batch_size} : {end-start} , so {(end-start)/batch_size} each")
  print(f"Current total match for {k} examples : {match}/{total}")
  print("                         ")

  torch.cuda.empty_cache() # Clear unused memory to prevent out of memory

# Save results
generated_storage_array = np.array(generated_storage)
gold_storage_array = np.array(gold_storage)
predicted_storage_array = np.array(predicted_storage)

# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Generated {k} Examples - {search_method}', generated_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Gold {k} Examples - {search_method}', gold_storage_array)
# np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Results/{dataset} {model_variant.upper()} Predicted {k} Examples - {search_method}', predicted_storage_array)
