In [None]:
!pip install datasets
!pip install transformers
!pip install -U sentence-transformers

from datasets import load_dataset
from datasets import get_dataset_config_names

import torch
import numpy as np
import pandas as pd
import time
  import ast

# For debugging torch
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from google.colab import drive
drive.mount('/content/drive')

In [None]:
### Settings ###
#@title Dataset Setting 
dataset = 'COGS' #@param ["SCAN", "CFQ", "COGS"]

print(f'Using {dataset} dataset')

%cd /content/drive/My Drive/Colab Notebooks/UCL MSc Project/Data

data = load_dataset('csv', data_files={'train': f"./{dataset.lower()}_train.csv", 'test': f"./{dataset.lower()}_test.csv"})

train = data['train']
test = data['test']

if dataset == 'SCAN':
  input = 'commands'
  target = 'actions'

if dataset == 'COGS':
  train = train.select(range(len(train)-1))
  input = 'source'
  target = 'target'


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, GPTJForCausalLM

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Select model variant #####
model_variant = 'gpt-neo-2.7B'
print(f"Running {model_variant}")

# Tokenizer and weights are fetched from the EleutherAI organisation on the Hub.
tokenizer = AutoTokenizer.from_pretrained(f"EleutherAI/{model_variant}")
model = AutoModelForCausalLM.from_pretrained(f"EleutherAI/{model_variant}")
model.to(device)

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # memory_reserved() is the current name of the deprecated memory_cached().
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", low_cpu_mem_usage=True)
# model.to(device)
# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

# # The tokenizer does not have padding token
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# # Need to resize the model vocab size as we have added an extra token for padding
# model.resize_token_embeddings(len(tokenizer))

In [None]:
# scoring
def get_prob_single(candidates_index:list, train_data, test_data, dataset, scoring_model, return_index=True):
  """Score every candidate training example as a one-shot demonstration.

  For each index in ``candidates_index`` a prompt is built from that training
  example followed by the test example (gold target included). The score is
  the summed log-probability the model assigns to the trailing target tokens.

  Reads the notebook globals ``tokenizer``, ``device``, ``input`` and
  ``target`` (the last two hold the dataset's column names).

  Args:
    candidates_index: iterable of row indices into ``train_data``; each one is
      cast with ``int``.
    train_data: indexable rows; ``train_data[i]`` must support lookup by the
      ``input`` and ``target`` column names.
    test_data: a single row with the same two columns.
    dataset: unused; kept so existing calls keep working.
    scoring_model: causal LM called with a ``(1, seq_len)`` id tensor; its
      result must expose ``.logits`` of shape ``(1, seq_len, vocab)``.
    return_index: unused; kept so existing calls keep working.

  Returns:
    A ``zip`` iterator of ``(log_likelihood, index)`` tuples, in the order of
    ``candidates_index``. ``log_likelihood`` is a float (not truncated).
  """
  # Number of tokens the gold target occupies at the end of every prompt.
  # NOTE(review): assumes the tokenizer splits the target into the same number
  # of tokens in isolation as right after "<target>:" - confirm for targets
  # that start with punctuation.
  desired_length = len(tokenizer(test_data[target]).input_ids)

  # The test part of the prompt is the same for every candidate.
  test_prompt = f"{input}:" + test_data[input] + ' ' + f"{target}:{test_data[target]}"

  cum_prob = []
  index_list = []
  for index in candidates_index:
    index = int(index)
    example = train_data[index]
    scoring_prompt = f"{input}:" + example[input] + ' ' + f"{target}:" + example[target] + '\n' + test_prompt

    torch.cuda.empty_cache()

    prompt_ids = tokenizer(scoring_prompt, return_tensors='pt').input_ids.to(device)
    with torch.no_grad():
      logits = scoring_model(prompt_ids).logits[0]

    seq_len = prompt_ids.shape[1]
    first_target = seq_len - desired_length

    # logits[t] is the prediction for token t + 1, so the rows that score the
    # target tokens are shifted one position to the left of the tokens.
    log_probs = torch.log_softmax(logits[first_target - 1:seq_len - 1].float(), dim=-1)
    # Labels are taken from the scored sequence itself so they always match
    # what the model actually saw.
    target_ids = prompt_ids[0, first_target:].to(log_probs.device)
    score = log_probs.gather(1, target_ids.unsqueeze(1)).sum()

    cum_prob.append(float(score))
    index_list.append(index)

  return zip(cum_prob, index_list)




In [None]:
# scoring
def get_prob_pair(candidates_index:list, train_data, test_data, dataset, scoring_model, return_index=True):
  """Score every pair of candidate training examples as a two-shot prompt.

  For positions ``i <= j`` in ``candidates_index`` a prompt is built from
  candidate ``i``, then candidate ``j``, then the test example (gold target
  included). The score is the summed log-probability the model assigns to the
  trailing target tokens.

  Reads the notebook globals ``tokenizer``, ``device``, ``input`` and
  ``target`` (the last two hold the dataset's column names).

  Args:
    candidates_index: sequence of row indices into ``train_data``; each one is
      cast with ``int``.
    train_data: indexable rows; ``train_data[i]`` must support lookup by the
      ``input`` and ``target`` column names.
    test_data: a single row with the same two columns.
    dataset: unused; kept so existing calls keep working.
    scoring_model: causal LM called with a ``(1, seq_len)`` id tensor; its
      result must expose ``.logits`` of shape ``(1, seq_len, vocab)``.
    return_index: unused; kept so existing calls keep working.

  Returns:
    A ``zip`` iterator of ``(log_likelihood, members)`` tuples where
    ``members`` is the set of the two training indices used. Pairs of a
    candidate with itself (``j == i``) are included, as the loop bounds
    specify, and yield a one-element set.
  """
  # Number of tokens the gold target occupies at the end of every prompt.
  # NOTE(review): assumes the tokenizer splits the target into the same number
  # of tokens in isolation as right after "<target>:" - confirm for targets
  # that start with punctuation.
  desired_length = len(tokenizer(test_data[target]).input_ids)

  # The test part of the prompt is the same for every pair.
  test_prompt = f"{input}:" + test_data[input] + ' ' + f"{target}:{test_data[target]}"

  # Render each candidate's demonstration once instead of once per pair.
  candidate_ids = [int(index) for index in candidates_index]
  demonstrations = []
  for index in candidate_ids:
    example = train_data[index]
    demonstrations.append(f"{input}:" + example[input] + ' ' + f"{target}:" + example[target] + '\n')

  cum_prob = []
  index_list = []
  for i in range(len(candidate_ids)):
    for j in range(i, len(candidate_ids)):
      scoring_prompt = demonstrations[i] + demonstrations[j] + test_prompt

      torch.cuda.empty_cache()

      prompt_ids = tokenizer(scoring_prompt, return_tensors='pt').input_ids.to(device)
      with torch.no_grad():
        logits = scoring_model(prompt_ids).logits[0]

      seq_len = prompt_ids.shape[1]
      first_target = seq_len - desired_length

      # logits[t] is the prediction for token t + 1, so the rows that score
      # the target tokens are shifted one position to the left of the tokens.
      log_probs = torch.log_softmax(logits[first_target - 1:seq_len - 1].float(), dim=-1)
      # Labels are taken from the scored sequence itself so they always match
      # what the model actually saw.
      target_ids = prompt_ids[0, first_target:].to(log_probs.device)
      score = log_probs.gather(1, target_ids.unsqueeze(1)).sum()

      cum_prob.append(float(score))
      index_list.append({candidate_ids[i], candidate_ids[j]})

  # Return only after every pair has been scored.
  return zip(cum_prob, index_list)




In [None]:
# Get top-p and bottom-p samples
print(f'Running for {dataset}')

# Number of anchors kept per test point and of partners kept per anchor.
TOP_K = 5

# One row of candidate training indices per test example.
index_list = np.load(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Candidate Examples Index/{dataset} Top 75 SBERT Index.npy')

anchor_storage = []
top_five_storage = []
bottom_five_storage = []


def _partners_for_anchor(anchor_index, anchor_score, pair_scores, k):
  """Pick partner examples for one anchor from the scored pairs.

  Pairs containing ``anchor_index`` are split into those scoring strictly
  below ``anchor_score`` ("pos") and the rest ("neg").

  NOTE(review): a pair that scores *below* the anchor alone is labelled
  positive here, exactly as in the original comparison; with log-likelihoods
  (higher is better) this looks inverted - confirm the intended direction.

  Returns:
    ``(top, bottom)``: partner indices of the ``k`` highest-scoring "pos"
    pairs (best first) and of the ``k`` lowest-scoring "neg" pairs (worst
    first). Either list is shorter when fewer pairs are available.
  """
  pos = []
  neg = []
  for pair_score, members in pair_scores:
    if anchor_index not in members:
      continue
    # The partner is the member of the pair that is not the anchor.
    partners = [int(member) for member in members if int(member) != anchor_index]
    if not partners:
      continue  # the anchor paired with itself has no partner
    scored_partner = (pair_score, partners[0])
    if anchor_score > pair_score:
      pos.append(scored_partner)
    else:
      neg.append(scored_partner)

  pos.sort(key=lambda scored: scored[0], reverse=True)  # Descending order
  neg.sort(key=lambda scored: scored[0], reverse=True)  # Descending order

  top = [partner for _, partner in pos[:k]]
  bottom = [partner for _, partner in neg[::-1][:k]]
  return top, bottom


for i in range(len(test)):
  # (score, index) for every candidate used alone, best first
  single = sorted(get_prob_single(index_list[i], train, test[i], dataset, model),
                  key=lambda scored: scored[0], reverse=True)

  # (score, {index_1, index_2}) for every scored pair of candidates
  pair = list(get_prob_pair(index_list[i], train, test[i], dataset, model))

  # Each of the best single examples acts as an anchor in turn.
  for anchor_score, anchor_index in single[:TOP_K]:
    # Plain ints keep the list columns readable once written to CSV.
    anchor_index = int(anchor_index)
    pos_top_five_index, neg_top_five_index = _partners_for_anchor(anchor_index, anchor_score, pair, TOP_K)

    anchor_storage.append(anchor_index)
    top_five_storage.append(pos_top_five_index)
    bottom_five_storage.append(neg_top_five_index)

data = {'anchor': anchor_storage ,'top_five': top_five_storage, 'bottom_five': bottom_five_storage}

df = pd.DataFrame(data)

df.to_csv(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Top and Bottom Five/{dataset} pair', index=False)
print(f'{dataset} Final')