In [None]:
!pip install -U -q bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os

from collections import defaultdict

import torch
import torch.nn as nn
import transformers
import re

from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig)
from peft import (PeftModel,
                  PeftConfig)

In [None]:
# Mount Google Drive (Colab only) at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Run configuration -------------------------------------------------------
# Project root on the mounted Drive -- put your file path here
path_to_nlu_dir = "/content/drive/MyDrive/Master's/Second Year Grad/NLU/NLU_FinalProject/"

# Switch between the full-prompt run and the "InfoRemove" run
INFOREMOVE = True

# Input data: directory of JSONL files and the file (relative to it) to evaluate
data_dir = f"{path_to_nlu_dir}Data/JSONL_Formatted/"
data_path = "RACE-H/RACE-H_v1_tst.jsonl"
# Dataset label = first component of data_path + a suffix that depends on INFOREMOVE
data_name = data_path.split('/')[0] + ("_InfoRemove_test" if INFOREMOVE else "_test")

# Where result CSVs are written
save_dir = f"{path_to_nlu_dir}Results/v2_Results/"

# Model identifier and number of questions scored per forward pass
model_name = "Salm00n/gpt2-xl_RACE-H_v3"
BATCH_SIZE = 1

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Each substring must be tested explicitly: `'Salm00n' or 'drive' in model_name`
# evaluates the non-empty literal 'Salm00n' as truthy, so it is always True and
# the base-model branch could never run.
if 'Salm00n' in model_name or 'drive' in model_name: # LoRA fine-tuned model
  # 4-bit NF4 quantization settings. NOTE: currently not applied, because the
  # `quantization_config` argument below is commented out.
  bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
  )

  # The adapter config names the base checkpoint the adapter was trained on.
  config = PeftConfig.from_pretrained(model_name)
  base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                                    #quantization_config=bnb_config
                                                    )
  model = PeftModel.from_pretrained(base_model, model_name, torch_dtype=torch.float16)
  # Merge the adapter weights into the base model and drop the PEFT wrapper.
  model = model.merge_and_unload()
else: # base model
  model = AutoModelForCausalLM.from_pretrained(model_name)

model.to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained('gpt2-xl')
# Reuse EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
# setting truncation to left (cutting off front of text instead of answers)
tokenizer.truncation_side = 'left'

# Per-token loss (reduction="none") so it can be summed per sequence later;
# targets equal to the pad token id are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id,
                              reduction="none")

Load and Process Data

In [None]:
# Read the JSONL file: one JSON object (one question) per line.
# UTF-8 is requested explicitly so decoding does not depend on the platform
# default, and blank lines (e.g. a trailing newline) are skipped instead of
# making json.loads raise.
with open(data_dir + data_path, 'r', encoding='utf-8') as f:
  data = [json.loads(line) for line in f if line.strip()]

print(f'Number of Questions = {len(data)}\n')
print(data[0].keys())

Number of Questions = 3498

dict_keys(['context', 'question', 'answerA', 'answerB', 'answerC', 'answerD', 'correct'])


In [None]:
def create_input(batch, sys_prompt='', fs_demos='', info_remove=None):
  """Build one prompt string per answer option for every question in `batch`.

  Args:
    batch: iterable of question dicts with the keys 'context', 'question',
      'answerA', 'answerB', 'answerC' and 'answerD' ('context' is only read
      when the context is included).
    sys_prompt: text inserted directly after "A:" in every prompt.
    fs_demos: text prepended to every prompt (e.g. few-shot demonstrations).
    info_remove: if True the context is left out of the prompt, if False it
      is placed before the question. When None (default) the module-level
      INFOREMOVE flag decides, which is the previous behaviour.

  Returns:
    Flat list of prompt strings, four consecutive entries (A, B, C, D) per
    question, in the order of `batch`.
  """
  if info_remove is None:
    info_remove = INFOREMOVE

  answer_keys = ('answerA', 'answerB', 'answerC', 'answerD')
  texts = []
  for text in batch:
    # the part after "Q: " -- with or without the context in front of the question
    if info_remove:
      question = text['question']
    else:
      question = f"{text['context']} {text['question']}"
    texts.extend(f"{fs_demos}Q: {question}\nA:{sys_prompt} {text[key]}"
                 for key in answer_keys)

  return texts

# preview the prompts built for the first BATCH_SIZE questions
create_input(data[:BATCH_SIZE])

['Q: According to the passage, we know that   _  .\nA: people with good facial features must be trustworthy',
 'Q: According to the passage, we know that   _  .\nA: people with bad facial features could not be trustworthy',
 'Q: According to the passage, we know that   _  .\nA: we should judge people by their facial features',
 'Q: According to the passage, we know that   _  .\nA: facial features might give people some wrong impressions']

Run Inference

In [None]:
def batch_predict(batch_processed, max_length=1024, n_options=4):
  """Score each answer option by its summed token loss and pick the lowest.

  Args:
    batch_processed: flat list of prompt strings, `n_options` consecutive
      entries per question (as produced by `create_input`).
    max_length: maximum number of tokens per sequence passed to the model.
    n_options: number of answer options per question.

  Returns:
    Tuple `(trunc, total_loss, preds)`:
      trunc: list of bool, one per question, True when that question's
        prompts had to be shortened to fit `max_length`.
      total_loss: numpy array of shape (n_questions, n_options) holding the
        summed per-token loss of every option.
      preds: list of int, index of the lowest-loss option per question.
  """
  encoded = tokenizer(batch_processed, padding=True, return_tensors='pt').to(device)
  input_ids = encoded['input_ids']
  attention_mask = encoded['attention_mask']

  # Number of real (non-padding) tokens per sequence and the longest sequence
  # of every question. Truncation is decided per question, so one long question
  # neither flags nor shortens the other questions of the batch.
  lengths = attention_mask.sum(dim=1)
  longest = lengths.view(-1, n_options).max(dim=1).values
  trunc = (longest > max_length).tolist()

  # truncating if needed (truncate front of input)
  if any(trunc):
    # All options of a question drop the same number of leading tokens, so
    # they keep seeing the same part of the shared prompt.
    n_drop = (longest - max_length).clamp(min=0).repeat_interleave(n_options).tolist()
    rows = [input_ids[r][attention_mask[r].bool()][d:] for r, d in enumerate(n_drop)]
    width = max(row.shape[0] for row in rows)
    # Re-pad on the right with the pad token (ignored by loss_fn).
    input_ids = input_ids.new_full((len(rows), width), tokenizer.pad_token_id)
    attention_mask = attention_mask.new_zeros((len(rows), width))
    for r, row in enumerate(rows):
      input_ids[r, :row.shape[0]] = row
      attention_mask[r, :row.shape[0]] = 1

  # Plain forward pass; generation arguments such as max_new_tokens do not
  # belong here.
  with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)

  # get prediction with CrossEntropyLoss
  # logits at position t are scored against the token at position t+1
  logits = output.logits[:, :-1, :].to('cpu')
  targets = input_ids[:, 1:].to('cpu')

  # CrossEntropyLoss expects the class dimension second: (batch, vocab, seq)
  loss = loss_fn(logits.permute(0, 2, 1), targets)
  total_loss = torch.sum(loss, dim=-1)
  total_loss = total_loss.view(-1, n_options)

  preds = torch.argmin(total_loss, dim=-1).tolist()

  return trunc, total_loss.numpy(), preds

In [None]:
# test output: run the first BATCH_SIZE questions through prompt building and
# scoring; displays the tuple (truncation flags, summed losses, predictions)
example_batch = data[:BATCH_SIZE]
batch_predict(create_input(example_batch))

([False],
 array([[117.442665, 120.65237 , 112.42262 , 117.94913 ]], dtype=float32),
 [2])

In [None]:
# Column name -> list of per-question values, filled batch by batch.
res = defaultdict(list)

for i in tqdm(range(0, len(data), BATCH_SIZE)):
  batch = data[i:i+BATCH_SIZE]
  try:
    trunc, total_loss, preds = batch_predict(create_input(batch))
  except Exception as e:
    # Best effort: report the error and record NaN placeholders. They are
    # sized by len(batch), not BATCH_SIZE, because the final batch can be
    # shorter and the results must stay aligned row-for-row with `data`.
    print(e)
    n_failed = len(batch)
    trunc = [np.nan]*n_failed
    preds = [np.nan]*n_failed
    total_loss = np.full((n_failed, 4), np.nan)

  res['pred'].extend(preds)
  res['truncated'].extend(trunc)
  # one loss column per answer option: loss_A .. loss_D
  for col, letter in enumerate('ABCD'):
    res[f'loss_{letter}'].extend(total_loss[:, col].tolist())

100%|██████████| 3498/3498 [03:31<00:00, 16.51it/s]


In [None]:
# Put the predictions next to the original question fields (same row order).
res_df = pd.concat([pd.DataFrame(res), pd.DataFrame(data)], axis=1)
# Convert option indices to letters. Failed batches are stored as NaN, which
# also turns the whole column into floats, so NaN is kept as-is and every other
# value is cast to int before indexing into 'ABCD'.
res_df['pred'] = res_df['pred'].apply(lambda x: 'ABCD'[int(x)] if pd.notna(x) else np.nan)
res_df.head()

Unnamed: 0,pred,truncated,loss_A,loss_B,loss_C,loss_D,context,question,answerA,answerB,answerC,answerD,correct
0,C,False,117.442665,120.652367,112.422623,117.949127,It is the goal of politicians everywhere-----h...,"According to the passage, we know that _ .",people with good facial features must be trust...,people with bad facial features could not be t...,we should judge people by their facial features,facial features might give people some wrong i...,D
1,D,False,130.64418,132.214798,125.952202,123.939056,It is the goal of politicians everywhere-----h...,"According to Ms Cornwell, we can infer that ...",the science will give politicians great help,politicians could be successful with the help ...,politicians won't think highly of the science,politicians will be satisfied with the science,C
2,B,False,79.041359,76.234688,77.471359,77.162987,It is the goal of politicians everywhere-----h...,What's the best title for the passage?,How Science could Help Politicians,How to Win the Trust of Voters,The Other Sides of Politicians,An Important Discovery for Politicians,A
3,C,False,99.971176,99.192764,98.945084,106.136215,"In the 1960s, people asked about your astrolog...",The main purpose of the passage is to tell you...,what a website is like,how to build your own website,how to meet people online,what a website is made up of,B
4,B,False,101.449974,100.079605,101.613281,104.879967,"In the 1960s, people asked about your astrolog...","According to the writer, your website is a pla...",where you can meet people all around the world,where you can buy what you want,where you can get free services,where you can meet people on the Internet,D


In [None]:
# how often each answer letter was predicted
print(res_df.pred.value_counts())

pred
A    1177
B     877
C     738
D     706
Name: count, dtype: int64


In [None]:
# evaluation metrics

def _accuracy(df):
  """Share of rows with a prediction whose `pred` equals `correct` (0 if none)."""
  # Only rows that actually received a prediction count; missing values in
  # unrelated columns must not shrink the denominator.
  scored = df[df.pred.notna()]
  if len(scored) == 0:
    return 0
  return sum(scored.pred == scored.correct)/len(scored)

# accuracy: overall, on non-truncated prompts only, on truncated prompts only
accuracy = _accuracy(res_df)
no_trunc = res_df[res_df.truncated==False]
accuracy_full = _accuracy(no_trunc)
trunc = res_df[res_df.truncated==True]
accuracy_trunc = _accuracy(trunc)

# percent failure
res_fail = res_df.pred.isnull().mean()

# percent truncated (comparing with True keeps NaN rows from failed batches
# from turning the whole ratio into NaN)
res_trunc = res_df.truncated.eq(True).mean()

# NOTE: column 'accuracy_full' holds the overall accuracy, while the variable
# `accuracy_full` (non-truncated prompts only) goes into 'accuracy_no_trunc'.
df_eval = pd.DataFrame({'model':[model_name],
                        'dataset':[data_name],
                        'accuracy_full':[accuracy],
                        'accuracy_no_trunc':[accuracy_full],
                        'accuracy_trunc':[accuracy_trunc],
                        'response_failure':[res_fail],
                        'prompt_truncation':[res_trunc]})

summary_path = f"{save_dir}benchmark_summary_v2.csv"
if os.path.exists(summary_path):
  df_temp = pd.read_csv(summary_path)
  # Newest result first; an older row for the same model/dataset is dropped so
  # that re-running the notebook does not pile up duplicate summary rows.
  df_eval = (pd.concat([df_eval, df_temp], axis=0, ignore_index=True)
               .drop_duplicates(subset=['model', 'dataset'], keep='first')
               .reset_index(drop=True))

df_eval

Unnamed: 0,model,dataset,accuracy_full,accuracy_no_trunc,accuracy_trunc,response_failure,prompt_truncation
0,Salm00n/gpt2-xl_RACE-H_v3,RACE-H_InfoRemove_test,0.259863,0.259863,0.0,0.0,0.0
1,Salm00n/gpt2-xl_RACE-H_v3,RACE-H_test,0.419097,0.419799,0.230769,0.0,0.003716
2,Salm00n/gpt2-xl_RACE-H_v2,RACE-H_InfoRemove_test,0.24414,0.24414,0.0,0.0,0.0
3,Salm00n/gpt2-xl_SATACT_v2,SAT_ACT_InfoRemove_test,0.291667,0.291667,0.0,0.0,0.0
4,Salm00n/gpt2-xl_SATACT_v3,SAT_ACT_InfoRemove_test,0.291667,0.291667,0.0,0.0,0.0
5,Salm00n/gpt2-xl_SATACT_v3,SAT_ACT_test,0.621212,0.621622,0.6,0.0,0.018939
6,Salm00n/gpt2-xl_RACE-H_v2,RACE-H_test,0.363922,0.364706,0.153846,0.0,0.003716
7,Salm00n/gpt2-xl_SATACT_v2,SAT_ACT_test,0.522727,0.521236,0.6,0.0,0.018939
8,Salm00n/gpt2-xl_RACE-H_v1,RACE-H_InfoRemove_test,0.234706,,,0.0,0.0
9,Salm00n/gpt2-xl_SATACT_v1,SATACT_InfoRemove_test,0.246212,,,0.0,0.0


Store Results

In [None]:
# File-name tag for the model: the last path component, so that hub ids
# ("org/name") as well as Drive paths ("/content/drive/.../name") yield the
# model's own name. It lives in its own variable because overwriting
# `model_name` would break re-running the earlier cells that load the model.
model_tag = model_name.rstrip('/').split('/')[-1]
if '/' in model_name:
  print(model_tag)
# make sure the output folder exists before writing
os.makedirs(save_dir, exist_ok=True)
res_df.to_csv(f"{save_dir}{data_name}_benchmark_{model_tag}_v2.csv", index=False)

gpt2-xl_RACE-H_v3


In [None]:
# write the summary table to the shared benchmark summary CSV (without the index column)
df_eval.to_csv(f"{save_dir}benchmark_summary_v2.csv", index=False)