Trying to replicate the training procedure of this paper:
Liu, J., Shen, D., Zhang, Y., Dolan, B., Carin, L., & Chen, W. (2021). What Makes Good In-Context Examples for GPT-3?. arXiv preprint arXiv:2101.06804.
Code repo: https://github.com/jiachangliu/KATEGPT3 

In [None]:
# Mount Google Drive so that the /content/gdrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install -q sentence_transformers transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone


In [None]:
import pickle
import torch
import argparse
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import pairwise
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

#########################################################################################################################################
# Pick the torch device first, then report whether it is the GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Is CUDA available? ", device.type == "cuda")


Is CUDA available?  True


In [None]:
# Project root on the mounted Drive; later cells build data and cache paths from it.
# NOTE(review): this name shadows the builtin `dir()`; it is kept because later
# cells reference it by this name.
dir = "/content/gdrive/MyDrive/Sem2/IDL"
# !cd "/content/gdrive/MyDrive/Colab Notebooks/project/"
!ls

# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(lst, n):
    """Return ``lst`` split into consecutive slices of at most ``n`` items.

    The result is a fully built list (not a generator); the last slice may be
    shorter than ``n``. An empty ``lst`` gives an empty list.

    Args:
        lst: any sliceable sequence with a length (list, str, ...).
        n: chunk size; must be a positive integer.

    Returns:
        A list of slices of ``lst``, in order.

    Raises:
        ValueError: if ``n`` is smaller than 1. A negative size would otherwise
            silently return ``[]`` and drop every item.
    """
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    return [lst[i:i + n] for i in range(0, len(lst), n)]

gdrive	sample_data


In [None]:
import os
# Show the per-topic training TSVs available on Drive.
os.listdir("/content/gdrive/MyDrive/Sem2/IDL/ret_data_update/train")

['other_ret_train.tsv',
 'lectures_ret_train.tsv',
 'logistics_ret_train.tsv',
 'hw0_ret_train.tsv',
 'project_ret_train.tsv',
 'hw1p2_ret_train.tsv',
 'quizzes_ret_train.tsv',
 'hw1p1_ret_train.tsv',
 'hw2p2_ret_train.tsv',
 'hw2p1_ret_train.tsv',
 'hw3p1_ret_train.tsv',
 'hw4p1_ret_train.tsv',
 'hw3p2_ret_train.tsv',
 'hw4p2_ret_train.tsv',
 'quiz_ret_train.tsv',
 'hw2p2-s1_ret_train.tsv',
 'hw1_ret_train.tsv',
 'hw2_ret_train.tsv',
 'hw3_ret_train.tsv',
 'hw4_ret_train.tsv']

In [None]:
# Topic prefixes, one per `<topic>_ret_train.tsv` / `<topic>_ret_test.tsv` pair
# in the directory listings. The second 'hw2p2' entry was a typo for 'hw2p1':
# it made hw2p2 run twice and left hw2p1 without a neighbour file.
files = ['other', 'lectures', 'logistics', 'hw0', 'project', 'hw1p2', 'quizzes', 'hw1p1', 'hw2p2',
         'hw2p1', 'hw3p1', 'hw4p1', 'hw3p2', 'hw4p2', 'quiz', 'hw2p2-s1', 'hw1', 'hw2', 'hw3', 'hw4']

In [None]:
# Rebinds `files` to only the last ten topics of the full list defined above.
# NOTE(review): presumably used to resume an interrupted run — confirm before
# executing the notebook top to bottom, since it overrides the full list.
files = ['hw3p1', 'hw4p1', 'hw3p2', 'hw4p2', 'quiz', 'hw2p2-s1', 'hw1', 'hw2', 'hw3', 'hw4']

In [None]:
# Sanity check: compares the number of entries in `files` with the number of
# files in the test directory (counts only; it does not compare names).
len(files) == len(os.listdir("/content/gdrive/MyDrive/Sem2/IDL/ret_data_update/test"))

True

In [None]:
# Show the per-topic test TSVs available on Drive.
os.listdir("/content/gdrive/MyDrive/Sem2/IDL/ret_data_update/test")

['other_ret_test.tsv',
 'lectures_ret_test.tsv',
 'logistics_ret_test.tsv',
 'hw0_ret_test.tsv',
 'project_ret_test.tsv',
 'hw1p2_ret_test.tsv',
 'quizzes_ret_test.tsv',
 'hw1p1_ret_test.tsv',
 'hw2p2_ret_test.tsv',
 'hw2p1_ret_test.tsv',
 'hw3p1_ret_test.tsv',
 'hw3p2_ret_test.tsv',
 'hw4p1_ret_test.tsv',
 'hw4p2_ret_test.tsv',
 'quiz_ret_test.tsv',
 'hw2p2-s1_ret_test.tsv',
 'hw1_ret_test.tsv',
 'hw2_ret_test.tsv',
 'hw3_ret_test.tsv',
 'hw4_ret_test.tsv']

# Collective Function

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: indexable whose first element holds the token embeddings.
        attention_mask: mask tensor; it is given a trailing axis and expanded
            to the token-embedding shape, and positions with 0 do not
            contribute to the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum, with
        the divisor clamped to at least 1e-9 so fully masked rows do not
        divide by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts

def decode(tok, model, corpus):
    """Embed every text in ``corpus`` and return the embeddings stacked along axis 0.

    The encoder family and pooling strategy are NOT parameters: they are read
    from the module-level ``encoder_name`` and ``embed_type``, and tensors are
    moved to the module-level ``device``. Texts are processed in batches of 32.

    Args:
        tok: tokenizer exposing ``batch_encode_plus``; only used on the
            RoBERTa path.
        model: for ``roberta-base`` / ``roberta-large``, a model called as
            ``model(input_ids, attention_mask)``; otherwise an object with an
            ``encode(list_of_texts)`` method.
        corpus: list of strings to embed.

    Returns:
        A numpy array of the per-batch embeddings concatenated in corpus order.

    Raises:
        ValueError: on the RoBERTa path when ``embed_type`` is neither
            ``'mean'`` nor ``'CLS'``. Without this check the pooled tensor
            would be unbound on the first batch, or silently reused from the
            previous one. ``np.concatenate`` also raises ``ValueError`` for an
            empty ``corpus``.
    """
    embeddings = []

    if encoder_name in ('roberta-base', 'roberta-large'):
        # Fail before any forward pass rather than part-way through the corpus.
        if embed_type not in ('mean', 'CLS'):
            raise ValueError(
                "embed_type must be 'mean' or 'CLS', got {!r}".format(embed_type)
            )
        print("Using non Sentence Transformer models")
        for corpus_tmp in tqdm(chunks(corpus, 32)):
            encoding = tok.batch_encode_plus(corpus_tmp, padding=True, truncation=True)
            sentence_batch = torch.LongTensor(encoding["input_ids"]).to(device)
            attn_mask = torch.LongTensor(encoding["attention_mask"]).to(device)

            with torch.no_grad():
                embedding_output_batch = model(sentence_batch, attn_mask)
                if embed_type == 'mean':
                    sentence_embeddings = mean_pooling(embedding_output_batch, attn_mask)
                else:  # 'CLS': take the first token's embedding of every sequence
                    sentence_embeddings = embedding_output_batch[0][:, 0, :]
            embeddings.append(sentence_embeddings.detach().cpu().numpy())

            # Drop per-batch tensors before the next batch is allocated.
            del sentence_batch, attn_mask, embedding_output_batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    else:
        print("Using Sentence Transformer models")
        for corpus_tmp in tqdm(chunks(corpus, 32)):
            sentence_embeddings = model.encode(corpus_tmp)
            embeddings.append(sentence_embeddings)

    return np.concatenate(embeddings, axis=0)


In [None]:
# Load the RoBERTa tokenizer and encoder (files cached under the project
# directory on Drive) and move the encoder to the selected device.
encoder_name = "roberta-large"
HF_cache_dir = dir+"/huggingface/cached_transformers/"
tok = RobertaTokenizer.from_pretrained(encoder_name, cache_dir=HF_cache_dir)
model = RobertaModel.from_pretrained(encoder_name, cache_dir=HF_cache_dir)
model.to(device)

# Shared run settings. `train_fname` / `dev_fname` are placeholders here:
# createNeighbors() overwrites both entries on every call.
args = dict(
    Q = "question",
    A = "answer",
    train_fname = "hw0_ret",
    dev_fname = "hw0_ret",
    embed_type = "mean", # alternative: "CLS"
    metric = "cosine", # alternative: "euclidean"
    encoder_name = "roberta-large", # alternative: "roberta-base"
    num_neighbors = 30,
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import gc
# Run a garbage-collection pass, then ask torch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Copy the run settings into module-level names. decode() reads `encoder_name`
# and `embed_type` from these globals rather than from its arguments.
metric = args['metric']
embed_type = args['embed_type']
encoder_name = args['encoder_name']
num_neighbors = args['num_neighbors']

In [None]:
def createNeighbors(train_fname, dev_fname):
    """Find the nearest training questions for every dev question and pickle them.

    Reads ``<train_fname>_ret_train.tsv`` and ``<dev_fname>_ret_test.tsv`` from
    under the module-level ``dir``, embeds the question column of both files
    with ``decode`` (using the module-level ``tok`` and ``model``), ranks the
    training questions for each dev question with ``args['metric']`` and
    writes ``{"kNN_dev_train": array}`` into the ``fewshot_files`` directory
    on Drive.

    The stored array has one row per dev question; each row holds the row
    indices (into the training TSV) of its ``k`` nearest training questions,
    closest first, where ``k = min(args['num_neighbors'], number of training
    rows)``.

    Side effects: overwrites ``args['train_fname']`` and ``args['dev_fname']``
    (as the original did), prints progress, and writes the pickle file.

    NOTE: ``decode`` reads the module-level ``encoder_name`` / ``embed_type``,
    not the values looked up here; those are only used for logging and for
    the output file name, so the globals must agree with ``args``.

    Args:
        train_fname: topic prefix of the training TSV, e.g. ``"hw0"``.
        dev_fname: topic prefix of the test TSV; also used as the task name
            in the output file name.

    Raises:
        ValueError: if ``args['metric']`` is neither ``"euclidean"`` nor
            ``"cosine"``. This is checked before any encoding is done;
            previously it surfaced as a NameError on ``indices`` afterwards.
    """
    args['dev_fname'] = dev_fname
    args['train_fname'] = train_fname
    Q = args['Q']
    metric = args['metric']
    task_name = args['dev_fname']#.split("_")[0]
    embed_type = args['embed_type']
    encoder_name = args['encoder_name']
    num_neighbors = args['num_neighbors']

    if metric not in ("euclidean", "cosine"):
        raise ValueError(
            "metric must be 'euclidean' or 'cosine', got {!r}".format(metric)
        )

    print("The training dataset is {}".format(args['train_fname']))
    print("The dev dataset is {}".format(args['dev_fname']))
    print("The encoder to get {} {} embeddings is {}".format(embed_type, metric, encoder_name))
    print("Task name:", task_name)

    train_path = dir+"/ret_data_update/train/{}_ret_train.tsv".format(args['train_fname'])
    dev_path = dir+"/ret_data_update/test/{}_ret_test.tsv".format(args['dev_fname'])

    # re separator: (?<![\\t].)\\t
    train_df = pd.read_csv(train_path, sep='(?<![\\t].)\\t', quotechar='"', engine='python', header='infer', keep_default_na=False)
    train_corpus = train_df.loc[:, Q].to_list()

    dev_df = pd.read_csv(dev_path, sep='(?<![\\t].)\\t|\\t(?!\\")', quotechar='"', engine='python', header='infer', keep_default_na=False)
    dev_corpus = dev_df.loc[:, Q].to_list()

    n_dev = len(dev_corpus)
    n_train = len(train_corpus)
    print(n_train, n_dev)

    # Encode dev and train questions in one pass, then split the matrix again.
    X = decode(tok, model, dev_corpus + train_corpus)
    emb_dev = X[:n_dev]
    emb_train = X[n_dev:]

    # Neither top-k nor kneighbors can return more neighbours than there are
    # training rows, so shrink k for very small topics instead of crashing.
    k = min(num_neighbors, n_train)
    if k < num_neighbors:
        print("Only {} training rows available; using k={}".format(n_train, k))

    if metric == "euclidean":
        nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', n_jobs=-1).fit(emb_train)
        _, indices = nbrs.kneighbors(emb_dev)
    else:  # cosine: largest similarity first
        similarity = pairwise.cosine_similarity(X=emb_dev, Y=emb_train)
        _, indices = torch.topk(torch.from_numpy(similarity), k=k, dim=-1)
        indices = indices.numpy()

    # Training rows are numbered 0..n_train-1, so fancy indexing maps all
    # neighbour positions to training indices in one step.
    kNN_dev_train = np.arange(n_train)[indices]
    print(kNN_dev_train.shape)

    PIK = "/content/gdrive/MyDrive/Sem2/IDL/fewshot_files/20s_{}_{}_{}_{}.dat".format(task_name, encoder_name, metric, embed_type)

    with open(PIK, "wb") as f:
        pickle.dump({"kNN_dev_train": kNN_dev_train}, f)

    print("Finish kNN preprocessing!")

In [None]:
# Build and save the neighbour table for every topic, using the same topic
# prefix for the train and test file.
for f in files:
  print(f)
  createNeighbors(f, f)


other
The training dataset is other
The dev dataset is other
The encoder to get mean cosine embeddings is roberta-large
Task name: other
413 104
Using non Sentence Transformer models


100%|██████████| 17/17 [00:36<00:00,  2.16s/it]


(104, 30)
Finish kNN preprocessing!
lectures
The training dataset is lectures
The dev dataset is lectures
The encoder to get mean cosine embeddings is roberta-large
Task name: lectures
272 68
Using non Sentence Transformer models


100%|██████████| 11/11 [00:16<00:00,  1.52s/it]


(68, 30)
Finish kNN preprocessing!
logistics
The training dataset is logistics
The dev dataset is logistics
The encoder to get mean cosine embeddings is roberta-large
Task name: logistics
625 157
Using non Sentence Transformer models


100%|██████████| 25/25 [00:39<00:00,  1.60s/it]


(157, 30)
Finish kNN preprocessing!
hw0
The training dataset is hw0
The dev dataset is hw0
The encoder to get mean cosine embeddings is roberta-large
Task name: hw0
336 85
Using non Sentence Transformer models


100%|██████████| 14/14 [00:37<00:00,  2.67s/it]


(85, 30)
Finish kNN preprocessing!
project
The training dataset is project
The dev dataset is project
The encoder to get mean cosine embeddings is roberta-large
Task name: project
440 110
Using non Sentence Transformer models


100%|██████████| 18/18 [00:36<00:00,  2.02s/it]


(110, 30)
Finish kNN preprocessing!
hw1p2
The training dataset is hw1p2
The dev dataset is hw1p2
The encoder to get mean cosine embeddings is roberta-large
Task name: hw1p2
556 140
Using non Sentence Transformer models


100%|██████████| 22/22 [01:04<00:00,  2.92s/it]


(140, 30)
Finish kNN preprocessing!
quizzes
The training dataset is quizzes
The dev dataset is quizzes
The encoder to get mean cosine embeddings is roberta-large
Task name: quizzes
492 124
Using non Sentence Transformer models


100%|██████████| 20/20 [00:41<00:00,  2.06s/it]


(124, 30)
Finish kNN preprocessing!
hw1p1
The training dataset is hw1p1
The dev dataset is hw1p1
The encoder to get mean cosine embeddings is roberta-large
Task name: hw1p1
411 103
Using non Sentence Transformer models


100%|██████████| 17/17 [00:47<00:00,  2.80s/it]


(103, 30)
Finish kNN preprocessing!
hw2p2
The training dataset is hw2p2
The dev dataset is hw2p2
The encoder to get mean cosine embeddings is roberta-large
Task name: hw2p2
412 104
Using non Sentence Transformer models


100%|██████████| 17/17 [00:47<00:00,  2.81s/it]


(104, 30)
Finish kNN preprocessing!
hw2p2
The training dataset is hw2p2
The dev dataset is hw2p2
The encoder to get mean cosine embeddings is roberta-large
Task name: hw2p2
412 104
Using non Sentence Transformer models


100%|██████████| 17/17 [00:47<00:00,  2.81s/it]


(104, 30)
Finish kNN preprocessing!
hw3p1
The training dataset is hw3p1
The dev dataset is hw3p1
The encoder to get mean cosine embeddings is roberta-large
Task name: hw3p1
344 87
Using non Sentence Transformer models


100%|██████████| 14/14 [00:40<00:00,  2.89s/it]


(87, 30)
Finish kNN preprocessing!
hw4p1
The training dataset is hw4p1
The dev dataset is hw4p1
The encoder to get mean cosine embeddings is roberta-large
Task name: hw4p1
224 56
Using non Sentence Transformer models


100%|██████████| 9/9 [00:26<00:00,  2.97s/it]


(56, 30)
Finish kNN preprocessing!
hw3p2
The training dataset is hw3p2
The dev dataset is hw3p2
The encoder to get mean cosine embeddings is roberta-large
Task name: hw3p2
373 94
Using non Sentence Transformer models


100%|██████████| 15/15 [00:43<00:00,  2.92s/it]


(94, 30)
Finish kNN preprocessing!
hw4p2
The training dataset is hw4p2
The dev dataset is hw4p2
The encoder to get mean cosine embeddings is roberta-large
Task name: hw4p2
383 96
Using non Sentence Transformer models


100%|██████████| 15/15 [00:45<00:00,  3.03s/it]


(96, 30)
Finish kNN preprocessing!
quiz
The training dataset is quiz
The dev dataset is quiz
The encoder to get mean cosine embeddings is roberta-large
Task name: quiz
132 34
Using non Sentence Transformer models


100%|██████████| 6/6 [00:11<00:00,  1.89s/it]


(34, 30)
Finish kNN preprocessing!
hw2p2-s1
The training dataset is hw2p2-s1
The dev dataset is hw2p2-s1
The encoder to get mean cosine embeddings is roberta-large
Task name: hw2p2-s1
117 30
Using non Sentence Transformer models


100%|██████████| 5/5 [00:14<00:00,  2.81s/it]


(30, 30)
Finish kNN preprocessing!
hw1
The training dataset is hw1
The dev dataset is hw1
The encoder to get mean cosine embeddings is roberta-large
Task name: hw1
489 123
Using non Sentence Transformer models


100%|██████████| 20/20 [00:55<00:00,  2.76s/it]


(123, 30)
Finish kNN preprocessing!
hw2
The training dataset is hw2
The dev dataset is hw2
The encoder to get mean cosine embeddings is roberta-large
Task name: hw2
584 146
Using non Sentence Transformer models


100%|██████████| 23/23 [01:07<00:00,  2.95s/it]


(146, 30)
Finish kNN preprocessing!
hw3
The training dataset is hw3
The dev dataset is hw3
The encoder to get mean cosine embeddings is roberta-large
Task name: hw3
540 136
Using non Sentence Transformer models


100%|██████████| 22/22 [01:02<00:00,  2.86s/it]


(136, 30)
Finish kNN preprocessing!
hw4
The training dataset is hw4
The dev dataset is hw4
The encoder to get mean cosine embeddings is roberta-large
Task name: hw4
349 88
Using non Sentence Transformer models


100%|██████████| 14/14 [00:40<00:00,  2.88s/it]

(88, 30)
Finish kNN preprocessing!





# Individual Chunks

In [None]:

# Settings for the step-by-step, single-topic version of the pipeline.
args = dict(
    Q = "question",
    A = "answer",
    train_fname = "hw0_ret",
    dev_fname = "hw0_ret",
    embed_type = "mean", # alternative: "CLS"
    metric = "cosine", # alternative: "euclidean"
    encoder_name = "roberta-large", # alternative: "roberta-base"
    num_neighbors = 30,
)

Q = args['Q']
A = args['A']
metric = args['metric']
# The task name is the part of the file prefix before the first underscore.
task_name = args['dev_fname'].split("_")[0]
embed_type = args['embed_type']
encoder_name = args['encoder_name']
num_neighbors = args['num_neighbors']

print("The training dataset is {}".format(args['train_fname']))
print("The dev dataset is {}".format(args['dev_fname']))
print("The encoder to get {} {} embeddings is {}".format(embed_type, metric, encoder_name))
print("Task name:", task_name)

The training dataset is hw0_ret
The dev dataset is hw0_ret
The encoder to get mean cosine embeddings is roberta-large
Task name: hw0


In [None]:

# Input TSVs for the single-topic run. The dev split is read from the
# `20s_test` directory here.
train_fname = dir+"/ret_data_update/train/{}_train.tsv".format(args['train_fname'])
dev_fname = dir+"/ret_data_update/20s_test/{}_test.tsv".format(args['dev_fname'])
HF_cache_dir = dir+"/huggingface/cached_transformers/"

# if encoder_name == "roberta-base" or "roberta-large":
# NOTE(review): if that guard is ever re-enabled, it is always true as written;
# it should compare encoder_name against both names.
# Reloads tokenizer and encoder; this cell does not move the model to `device`.
tok = RobertaTokenizer.from_pretrained(encoder_name, cache_dir=HF_cache_dir)
model = RobertaModel.from_pretrained(encoder_name, cache_dir=HF_cache_dir)

# print("SentenceTransformer model")
# tok = None
# model = SentenceTransformer("{}{}".format(HF_cache_dir, encoder_name))

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the question (Q) and answer (A) columns of the train and dev TSVs.
# re separator: (?<![\\t].)\\t
train_df = pd.read_csv(train_fname, sep='(?<![\\t].)\\t', quotechar='"', engine='python', header='infer', keep_default_na=False)
train_corpus = train_df.loc[:, Q].to_list()
train_labels = train_df.loc[:, A].to_list()

# Every training row is kept: the indices are simply 0..len-1.
train_indices = list(range(len(train_corpus)))

# With the full index range these two lines only copy the lists.
train_corpus = [train_corpus[train_index] for train_index in train_indices]
train_labels = [train_labels[train_index] for train_index in train_indices]

# The dev file is parsed with a wider separator pattern than the train file.
dev_df = pd.read_csv(dev_fname, sep='(?<![\\t].)\\t|\\t(?!\\")', quotechar='"', engine='python', header='infer', keep_default_na=False)
dev_corpus = dev_df.loc[:, Q].to_list()
dev_labels = dev_df.loc[:, A].to_list()
dev_indices = list(range(len(dev_corpus)))

# Number of training rows, number of dev rows.
print(len(train_indices), len(dev_indices))


336 7


In [None]:
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    ``model_output[0]`` supplies the token embeddings. The mask gets a
    trailing axis, is expanded to the embedding shape and cast to float, so
    positions with mask 0 add nothing to the sum. The divisor is clamped to
    at least 1e-9 to avoid dividing by zero for fully masked rows.
    """
    embeddings = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = (embeddings * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total

def decode(tok, model, corpus):
    """Embed every text in ``corpus`` and return the embeddings stacked along axis 0.

    The encoder family and pooling strategy are read from the module-level
    ``encoder_name`` and ``embed_type``; tensors are moved to the module-level
    ``device``. Texts are processed in batches of 32.

    Args:
        tok: tokenizer exposing ``batch_encode_plus``; only used on the
            RoBERTa path.
        model: for ``roberta-base`` / ``roberta-large``, a model called as
            ``model(input_ids, attention_mask)``; otherwise an object with an
            ``encode(list_of_texts)`` method.
        corpus: list of strings to embed.

    Returns:
        A numpy array of the per-batch embeddings concatenated in corpus order.

    Raises:
        ValueError: on the RoBERTa path when ``embed_type`` is neither
            ``'mean'`` nor ``'CLS'`` (previously the pooled tensor was left
            unbound or stale). ``np.concatenate`` also raises ``ValueError``
            for an empty ``corpus``.
    """
    embeddings = []

    if encoder_name in ('roberta-base', 'roberta-large'):
        # Reject an unsupported pooling mode before running any forward pass.
        if embed_type not in ('mean', 'CLS'):
            raise ValueError(
                "embed_type must be 'mean' or 'CLS', got {!r}".format(embed_type)
            )
        print("Using non Sentence Transformer models")
        for corpus_tmp in tqdm(chunks(corpus, 32)):
            encoding = tok.batch_encode_plus(corpus_tmp, padding=True, truncation=True)
            sentence_batch = torch.LongTensor(encoding["input_ids"]).to(device)
            attn_mask = torch.LongTensor(encoding["attention_mask"]).to(device)

            with torch.no_grad():
                embedding_output_batch = model(sentence_batch, attn_mask)
                if embed_type == 'mean':
                    sentence_embeddings = mean_pooling(embedding_output_batch, attn_mask)
                else:  # 'CLS': take the first token's embedding of every sequence
                    sentence_embeddings = embedding_output_batch[0][:, 0, :]
            embeddings.append(sentence_embeddings.detach().cpu().numpy())

            # Drop per-batch tensors before the next batch is allocated.
            del sentence_batch, attn_mask, embedding_output_batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    else:
        print("Using Sentence Transformer models")
        for corpus_tmp in tqdm(chunks(corpus, 32)):
            sentence_embeddings = model.encode(corpus_tmp)
            embeddings.append(sentence_embeddings)

    return np.concatenate(embeddings, axis=0)


In [None]:

# Group the positions of the combined (dev + train) answers by answer text.
# NOTE: this rebinds `dev_indices` from a flat index list to one list of
# positions per unique answer; positions index into dev_labels + train_labels.
labels = np.asarray(dev_labels + train_labels)
unique_labels = list(set(labels))
# One dict lookup per answer instead of scanning every unique label for each row.
label_slot = {unique_label: j for j, unique_label in enumerate(unique_labels)}
dev_indices = [[] for _ in unique_labels]
for i, label in enumerate(labels):
    dev_indices[label_slot[label]].append(i)

n_dev = len(dev_labels)
n_train = len(train_indices)

# Dev questions come first so the embedding matrix can be split at n_dev.
corpus = dev_corpus + train_corpus

# deep learning model
model.to(device)
X = decode(tok, model, corpus)
emb_train = X[n_dev:]
emb_dev = X[:n_dev]


Using non Sentence Transformer models


100%|██████████| 11/11 [00:29<00:00,  2.65s/it]


In [None]:

# Rank training questions for every dev question and save the index table.
# k is clamped because neither top-k nor kneighbors can return more
# neighbours than there are training rows.
k = min(num_neighbors, len(emb_train))

if metric == "euclidean":
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree', n_jobs=-1).fit(emb_train)
    distances, indices = nbrs.kneighbors(emb_dev)
elif metric == "cosine":
    dist_matrix = pairwise.cosine_similarity(X=emb_dev, Y=emb_train)
    values, indices = torch.topk(torch.from_numpy(dist_matrix), k=k, dim=-1)
    indices = indices.numpy()
else:
    # Without this branch a stale `indices` from an earlier run would be
    # reused silently (or a NameError raised on a fresh kernel).
    raise ValueError("metric must be 'euclidean' or 'cosine', got {!r}".format(metric))

# Map neighbour positions to training indices for all dev rows in one step.
train_indices_np = np.asarray(train_indices)
kNN_dev_train = train_indices_np[indices]
print(kNN_dev_train.shape)

PIK = "/content/20s{}_{}_{}_{}.dat".format(task_name, encoder_name, metric, embed_type) #dir + "ret_data/20s{}_{}_{}_{}.dat".format(task_name, encoder_name, metric, embed_type)

data = dict()
data["kNN_dev_train"] = kNN_dev_train

with open(PIK, "wb") as f:
    pickle.dump(data, f)

print("Finish kNN preprocessing!")


(7, 30)
Finish kNN preprocessing!


In [None]:
# Number of neighbours stored for the first dev question.
len(data["kNN_dev_train"][0])

30

In [None]:
# Unwrap the neighbour table. The guard keeps this cell re-runnable once
# `data` has already been rebound from the dict to the array.
if isinstance(data, dict):
    data = data["kNN_dev_train"]

# Print the first dev question with all of its retrieved training neighbours.
# Distinct loop names are used so the inner loop no longer shadows the outer index.
for dev_idx, neighbors in enumerate(data):  # test entries
    print("Q", dev_corpus[dev_idx])
    print("A", dev_labels[dev_idx])
    for rank, index in enumerate(neighbors):  # top k
        print(f"Top {rank} Q", train_corpus[index])
        print(f"Top {rank} A", train_labels[index])
    break  # only the first dev entry is shown


Q I have used several ways to implement the function. I think they look right on my notebook, but it just cannot pass Autolab:  answer1:  return torch.where(x&gt;0,torch.ones_like(x),torch.zeros_like(x)) answer2: return torch.gt(x,0).long() Could anyone take a look? Thanks!
A When the value is &gt;= 0, it should be 1
Top 0 Q I have used several ways to implement the function. I think they look right on my notebook, but it just cannot pass Autolab:  answer1:  return torch.where(x&gt;0,torch.ones_like(x),torch.zeros_like(x)) answer2: return torch.gt(x,0).long() Could anyone take a look? Thanks!
Top 0 A When the value is &gt;= 0, it should be 1
Top 1 Q For question 2.6 in hw0p1       y = torch.where(x &gt; 0, torch.tensor(1), torch.tensor(0)) This is the code I have written and it has passed the autograder. However, I was wondering if there is a better way to do this instead of using torch.tensor(1)?  Thanks for all the help! 
Top 1 A torch.tensor(1) or (0) is not required.  When you use 

In [None]:
# Shape of the dev-by-train similarity matrix, then the first three label
# groups (positions into dev_labels + train_labels) and all dev answers.
print(dist_matrix.shape)
print(dev_indices[:3], dev_labels)

(7, 336)
[[87], [216], [6]] ['When the value is &gt;= 0, it should be 1', 'Yes', 'Could you please post a picture, there is absolute no reference for this question :-)', 'Post a screenshot of your code, while I guess the reason is your x is actually an 1-d tensor', 'AutoLab will force your file name to be what we specify internally, which is handin.ipynb for hw0p1 and handin.tar for hw0p2.  In other words, you can submit whatever file name you want but you want to make sure you submit the correct file type.', 'Not yet.', 'You can use whatever region makes sense for you. I use us-west-2 because I am on the west coast.']


In [None]:
import json

# Load the GPT evaluation records; each record is read below through its
# "label", "num_examples" and "question" keys.
with open("/content/gdrive/MyDrive/Sem2/IDL/20s_gpt_eval_filtered (1).json", encoding="utf-8") as f:
    sample_json_list = json.load(f)

# Map each dev question to its reference answer. Built directly from the dev
# lists so it no longer depends on `data` having been rebound to the
# neighbour array by an earlier cell.
data_dict = dict(zip(dev_corpus, dev_labels))

# Records for the current task that were evaluated with 15 examples.
sample = [d for d in sample_json_list if task_name in d["label"] and d["num_examples"] == 15]

# Print each selected question and, when it is also a dev question, its reference answer.
for d in sample:
    q = d['question']
    print("Q:", d["label"], q)
    if q in data_dict:
        print("A:", data_dict[q])


Q: hw0 Hi,   I wasn't sure if this is giving away any answers, so I made this private. I'm using np.vectorize (doc linked below) for the hw and it's taking around 2-3 seconds to actually run for the given test case. Is this reasonable? If not, can you guys give me a hint?   https://docs.scipy.org/doc/numpy/reference/generated/numpy.vectorize.html  Thank you!
Q: hw0 I have used several ways to implement the function. I think they look right on my notebook, but it just cannot pass Autolab:  answer1:  return torch.where(x&gt;0,torch.ones_like(x),torch.zeros_like(x)) answer2: return torch.gt(x,0).long() Could anyone take a look? Thanks!
A: When the value is &gt;= 0, it should be 1
Q: hw0 Hi, I was trying to complete hw0 but got stuck on 3.3. In this question, if I understand it correctly, it is asking us to slice the second dimension to d. If the offset is 0, the slicing is from the start of the array and if the offset is not 0 the slicing is from the offset index. In this case, for test e

In [None]:
sample_json_list
# TODO: save the data to TSV with question and answer columns, split by label
# (presumably one file per label — confirm).

In [None]:
# Topic labels of interest.
# NOTE(review): this rebinds `labels`, which an earlier cell used for the
# combined answer array. It also omits some entries of `files` ('other',
# 'lectures', 'logistics', 'hw2p2-s1') — confirm that is intended.
labels = ["hw0", 'hw1', 'hw1p1', 'hw1p2', 'hw2', 'hw2p1', 'hw2p2', 
          'hw3', 'hw3p1', 'hw3p2', 'hw4', 'hw4p1', 'hw4p2', 
          'project', 'quiz', 'quizzes']