In [None]:
!pip install datasets
!pip install transformers
!pip install -U sentence-transformers
!pip install rank_bm25

from datasets import load_dataset
from datasets import get_dataset_config_names
from rank_bm25 import BM25Okapi

import torch
import numpy as np
import time

# For debugging torch: CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches
# synchronous so a failing kernel is reported at the call that triggered it.
# NOTE(review): this slows GPU execution; consider removing once debugging is done.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Mount Google Drive at /content/drive; the dataset CSVs read below and the
# saved index files both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
### Settings ###
# Number of training-example indices to keep for each test example.
top_n = 75

# Datasets to process; each name selects `<name.lower()>_train.csv` and
# `<name.lower()>_test.csv` in the data folder.
dataset_list = [
    'COGS',
    'SCAN',
]

In [None]:
def get_tok_n_index_bm25(dataset, top_n):
  """Rank training inputs against every test input with BM25 and save the top-n row indices.

  Reads `<dataset>_train.csv` and `<dataset>_test.csv` (lower-cased dataset name)
  from the project's Data folder on Google Drive, builds a BM25Okapi index over
  the whitespace-tokenized training inputs, and for each test input keeps the
  row indices of the `top_n` highest-scoring training inputs, best first.

  The files are addressed by absolute path, so the working directory is not
  changed by this function.

  Args:
    dataset: Dataset name, 'SCAN' or 'COGS' (case-sensitive).
    top_n: Number of training indices to keep per test example.

  Returns:
    None. The index array (one row per test example) is written with `np.save`
    to the 'Candidate Examples Index' folder on Google Drive.

  Raises:
    ValueError: If `dataset` is not a supported dataset name.
  """
  # Name of the input-text column in each dataset's CSV files.
  input_columns = {'SCAN': 'commands', 'COGS': 'source'}
  if dataset not in input_columns:
    raise ValueError(
        f"Unsupported dataset {dataset!r}; expected one of {sorted(input_columns)}")
  input_col = input_columns[dataset]

  data_dir = '/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Data'
  data = load_dataset('csv', data_files={
      'train': f"{data_dir}/{dataset.lower()}_train.csv",
      'test': f"{data_dir}/{dataset.lower()}_test.csv",
  })
  print(f'loading {dataset}')

  train = data['train']
  test = data['test']

  if dataset == 'COGS':
    # NOTE(review): the final COGS training row is dropped here; the reason is
    # not visible in this notebook - confirm it is still required.
    train = train.select(range(len(train)-1))

  # Materialize each text column once; whitespace tokenization for BM25.
  train_tokenized = [doc.split() for doc in train[input_col]]
  test_tokenized = [doc.split() for doc in test[input_col]]

  bm25 = BM25Okapi(train_tokenized) # Trained on tokenized

  top_n_index_storage = []
  for query_tokens in test_tokenized:
    # Take the ranked row indices straight from the scores (descending order,
    # the same ordering get_top_n uses). Looking the retrieved *text* back up in
    # the corpus would map duplicate training sentences to their first
    # occurrence and rescan the whole corpus for every candidate.
    scores = bm25.get_scores(query_tokens)
    top_n_index_storage.append(np.argsort(scores)[::-1][:top_n])

  top_n_index_storage_array = np.array(top_n_index_storage)

  np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Candidate Examples Index/{dataset} Top {top_n} BM25 Index', top_n_index_storage_array)

In [6]:
def get_tok_n_index_sbert(dataset, top_n):
  """Rank training inputs against every test input by SBERT embedding distance and save the top-n row indices.

  Reads `<dataset>_train.csv` and `<dataset>_test.csv` (lower-cased dataset name)
  from the project's Data folder on Google Drive, encodes the input column of
  both splits with the 'sentence-transformers/nli-roberta-base-v2' model, and
  for each test input keeps the row indices of the `top_n` training inputs with
  the smallest cosine distance, closest first.

  The files are addressed by absolute path, so the working directory is not
  changed by this function.

  Args:
    dataset: Dataset name, 'SCAN' or 'COGS' (case-sensitive).
    top_n: Number of training indices to keep per test example.

  Returns:
    None. The index array (one row per test example) is written with `np.save`
    to the 'Candidate Examples Index' folder on Google Drive.

  Raises:
    ValueError: If `dataset` is not a supported dataset name.
  """
  # Imported lazily so the heavy model library is only loaded when this
  # retrieval method is actually used.
  from scipy.spatial import distance
  from sentence_transformers import SentenceTransformer

  # Name of the input-text column in each dataset's CSV files.
  input_columns = {'SCAN': 'commands', 'COGS': 'source'}
  if dataset not in input_columns:
    raise ValueError(
        f"Unsupported dataset {dataset!r}; expected one of {sorted(input_columns)}")
  input_col = input_columns[dataset]

  data_dir = '/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Data'
  data = load_dataset('csv', data_files={
      'train': f"{data_dir}/{dataset.lower()}_train.csv",
      'test': f"{data_dir}/{dataset.lower()}_test.csv",
  })
  print(f'loading {dataset}')

  train = data['train']
  test = data['test']

  if dataset == 'COGS':
    # NOTE(review): the final COGS training row is dropped here; the reason is
    # not visible in this notebook - confirm it is still required.
    train = train.select(range(len(train)-1))

  sentence_model = SentenceTransformer('sentence-transformers/nli-roberta-base-v2')
  test_encode = sentence_model.encode(test[input_col])
  train_encode = sentence_model.encode(train[input_col])

  # cdist(metric='cosine') returns cosine *distance* (smaller = more similar),
  # so an ascending argsort puts the most similar training rows first.
  cosine_distance = distance.cdist(test_encode, train_encode, metric='cosine')
  top_n_index_storage_array = np.argsort(cosine_distance, axis=1)[:, :top_n]

  np.save(f'/content/drive/My Drive/Colab Notebooks/UCL MSc Project/Candidate Examples Index/{dataset} Top {top_n} SBERT Index', top_n_index_storage_array)

In [None]:
# Build and save both candidate indices (SBERT first, then BM25) for every
# configured dataset.
for dataset in dataset_list:
  for build_index in (get_tok_n_index_sbert, get_tok_n_index_bm25):
    build_index(dataset, top_n)