In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import transformers
import datasets
import torch
import pandas as pd
from tqdm import tqdm
import os

# Hop from the notebook's folder up to the repository root so that the `tasks` package
# (and any relative paths used later) resolve from there.
# The chdir is relative, so it is guarded: without the guard, re-running this cell would
# climb two more levels every time and silently break later relative paths.
# NOTE(review): assumes the repo root is identified by a top-level `tasks/` directory — confirm.
if not os.path.isdir('tasks'):
    os.chdir('../..')
from tasks.hp.HPSAQ import HPSAQ

# Both 7B checkpoints are loaded with identical options (bfloat16 weights, shared cache dir).
# Later cells move one model at a time onto the GPU with .cuda().
# Original note: might need to adapt to quantize for a 24GB 3090, or remove .cuda().
_load_kwargs = dict(cache_dir='/ext_usb', torch_dtype=torch.bfloat16)
hp_model = AutoModelForCausalLM.from_pretrained("microsoft/Llama2-7b-WhoIsHarryPotter", **_load_kwargs)
regular_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", **_load_kwargs)

# A single tokenizer (from the WhoIsHarryPotter repo) is passed to every task in this notebook,
# for both models.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Llama2-7b-WhoIsHarryPotter")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.22s/it]


### Trivia binary choice

In [2]:
def clear_gpu(model):
    """Move ``model`` to the CPU, then release PyTorch's cached CUDA memory.

    Called between evaluations so that only one of the two models occupies the GPU
    at any time.

    Args:
        model: object exposing a ``.cpu()`` method (here, the loaded models).

    Returns:
        None.
    """
    # Offload first: empty_cache() can only give back memory that no tensor still occupies.
    model.cpu()
    torch.cuda.empty_cache()

from tasks.hp.HPTask import HPTriviaTask
from tasks.hp.HPTranslatedTask import HPTriviaSpanishTask


def evaluate_trivia(task_cls, model, label):
    """Evaluate ``model`` on a binary-choice trivia task and print its loss and accuracy.

    Replaces four copy-pasted stanzas (English/Spanish x regular/HP model) with one routine.

    Args:
        task_cls: task class to instantiate (``HPTriviaTask`` or ``HPTriviaSpanishTask``).
        model: model to evaluate; it is moved to the GPU here.
        label: prefix for the printed lines, e.g. ``'reg'`` or ``'spanish hp'``.

    Returns:
        The freshly constructed task instance used for the evaluation.
    """
    # Offload both models first so that only `model` is resident on the GPU.
    clear_gpu(regular_model)
    clear_gpu(hp_model)
    model.cuda()

    # A new task object is built for every evaluation, as in the original cell.
    task = task_cls(batch_size=1, tokenizer=tokenizer, device='cuda', chat_model=True, randomize_answers=True)
    print(f'{label} test loss ', task.get_test_loss(model))
    # NOTE(review): printed as "test acc" but called with use_test_data=False — confirm
    # which split this actually evaluates.
    print(f'{label} test acc ', task.get_test_accuracy(model, use_test_data=False, check_all_logits=False, n_iters=100))
    return task


# English trivia: base chat model first, then the unlearned ("WhoIsHarryPotter") model.
hp = evaluate_trivia(HPTriviaTask, regular_model, 'reg')
hp = evaluate_trivia(HPTriviaTask, hp_model, 'hp')

# Spanish translation of the same task, same model order.
spanish_task = evaluate_trivia(HPTriviaSpanishTask, regular_model, 'spanish reg')
spanish_task = evaluate_trivia(HPTriviaSpanishTask, hp_model, 'spanish hp')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


reg test loss  tensor(0.5232, device='cuda:0')
reg test acc  0.88
hp test loss  tensor(1.0538, device='cuda:0')
hp test acc  0.81
spanish reg test loss  tensor(0.1003, device='cuda:0')
spanish reg test acc  0.7
spanish hp test loss  tensor(0.1604, device='cuda:0')
spanish hp test acc  0.63


### SAQ with model grading

In [5]:
# Short-answer questions (SAQ), English and Spanish, for both models, graded by `eval_model`.
# `datetime` is not used in this cell; the import is kept in case other cells rely on it.
from datetime import datetime

from tasks.hp.HPTranslatedTask import HPSAQSpanishTask


def grade_saq(task_cls, model, label):
    """Generate short-answer responses with ``model``, grade them, and print the scores.

    Replaces four copy-pasted stanzas (Spanish/English x HP/regular model) with one routine.

    Args:
        task_cls: SAQ task class to instantiate (``HPSAQSpanishTask`` or ``HPSAQ``).
        model: model to query; it is moved to the GPU for generation.
        label: prefix for the printed line, e.g. ``"Spanish hp"``.

    Returns:
        ``(task, scores)``: the task instance and whatever ``task.get_accuracies()`` returned.
    """
    task = task_cls()
    # NOTE(review): the recorded output of this cell counts "Question k/502" even though
    # n_questions=100 is passed — confirm that n_questions actually limits the run.
    # NOTE(review): grading uses eval_model='gpt-3.5-turbo', which presumably needs API
    # credentials in the environment — confirm.
    task.generate_responses(model.cuda(), tokenizer, eval_onthe_fly=True, eval_model='gpt-3.5-turbo', n_questions=100)
    scores = task.get_accuracies()
    print(f"{label} scores: {scores}")
    return task, scores


# Unlearned ("WhoIsHarryPotter") model: start with both models off the GPU.
clear_gpu(regular_model)
clear_gpu(hp_model)
hp_task, spanish_hp_scores = grade_saq(HPSAQSpanishTask, hp_model, "Spanish hp")
hp_task, english_hp_scores = grade_saq(HPSAQ, hp_model, "English  hp")

# Base chat model: free the GPU again before loading it.
clear_gpu(regular_model)
clear_gpu(hp_model)
hp_task, spanish_reg_scores = grade_saq(HPSAQSpanishTask, regular_model, "Spanish reg")
hp_task, english_reg_scores = grade_saq(HPSAQ, regular_model, "English  reg")

# Repeat all four results together, after the long per-question log above.
print("\n\n\n")
print(f"Spanish hp scores: {spanish_hp_scores}")
print(f"English  hp scores: {english_hp_scores}")
print(f"Spanish reg scores: {spanish_reg_scores}")
print(f"English  reg scores: {english_reg_scores}")


# hp_task = HPSAQ()
# hp_task.generate_responses(hp_model.cuda(), tokenizer, eval_onthe_fly=True, eval_model='gpt-3.5-turbo', n_questions=20)
# sys_scores = hp_task.get_accuracies()

# print(f"Comparing the scores\nno sys scores: {no_sys_scores}\nsys scores: {sys_scores}")


Question 1/502 -- Time: 19:58:41
Saved results to temp/Fri-Jan19-1958.jsonl

Question 2/502 -- Time: 19:58:42
Saved results to temp/Fri-Jan19-1958.jsonl

Question 3/502 -- Time: 19:58:44
Saved results to temp/Fri-Jan19-1958.jsonl

Question 4/502 -- Time: 19:58:46
Saved results to temp/Fri-Jan19-1958.jsonl

Question 5/502 -- Time: 19:58:48
Saved results to temp/Fri-Jan19-1958.jsonl

Question 6/502 -- Time: 19:58:51
Saved results to temp/Fri-Jan19-1958.jsonl

Question 7/502 -- Time: 19:58:53
Saved results to temp/Fri-Jan19-1958.jsonl

Question 8/502 -- Time: 19:58:55
Saved results to temp/Fri-Jan19-1958.jsonl

Question 9/502 -- Time: 19:58:58
Saved results to temp/Fri-Jan19-1958.jsonl

Question 10/502 -- Time: 19:59:00
Saved results to temp/Fri-Jan19-1958.jsonl

Question 11/502 -- Time: 19:59:03
Saved results to temp/Fri-Jan19-1958.jsonl

Question 12/502 -- Time: 19:59:05
Saved results to temp/Fri-Jan19-1958.jsonl

Question 13/502 -- Time: 19:59:07
Saved results to temp/Fri-Jan19-1958.j



Saved results to temp/Fri-Jan19-2006.jsonl

Question 2/502 -- Time: 20:06:34
Saved results to temp/Fri-Jan19-2006.jsonl

Question 3/502 -- Time: 20:06:35
Saved results to temp/Fri-Jan19-2006.jsonl

Question 4/502 -- Time: 20:06:37
Saved results to temp/Fri-Jan19-2006.jsonl

Question 5/502 -- Time: 20:06:40
Saved results to temp/Fri-Jan19-2006.jsonl

Question 6/502 -- Time: 20:06:42
Saved results to temp/Fri-Jan19-2006.jsonl

Question 7/502 -- Time: 20:06:44
Saved results to temp/Fri-Jan19-2006.jsonl

Question 8/502 -- Time: 20:06:47
Saved results to temp/Fri-Jan19-2006.jsonl

Question 9/502 -- Time: 20:06:49
Saved results to temp/Fri-Jan19-2006.jsonl

Question 10/502 -- Time: 20:06:51
Saved results to temp/Fri-Jan19-2006.jsonl

Question 11/502 -- Time: 20:06:54
Saved results to temp/Fri-Jan19-2006.jsonl

Question 12/502 -- Time: 20:06:56
Saved results to temp/Fri-Jan19-2006.jsonl

Question 13/502 -- Time: 20:06:58
Saved results to temp/Fri-Jan19-2006.jsonl

Question 14/502 -- Time: 20:

In [7]:
# NOTE(review): this looks like a leftover scratch cell. Its execution count (7) does not
# follow the cell above (5), and it repeats setup that the first cell already performs.
import os
# Moves the working directory up one more level. The first cell already ran
# os.chdir('../..'), and this relative chdir is not idempotent: every re-run climbs again.
# NOTE(review): confirm whether this cell is still needed.
os.chdir("..")

# Same import as in the first cell.
from tasks.hp.HPSAQ import HPSAQ

# Rebinds `hp` (a trivia task instance after the trivia cell) to the HPSAQ class itself,
# then instantiates it; `hp_task` is likewise overwritten.
hp = HPSAQ
hp_task = hp()

  from .autonotebook import tqdm as notebook_tqdm


### SAQ Adversarial

In [None]:
import os
from datetime import datetime

from tasks.hp.HPAdversarialTask import HPSAQAdversarialTask, HPTriviaAdversarialTask

# `hp_save_path` was passed to generate_responses() below but never defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel. Define it here.
# NOTE(review): assumes save_path is a .jsonl file path, mirroring the
# "Saved results to temp/<timestamp>.jsonl" lines logged by the SAQ cell — confirm.
_adv_timestamp = datetime.now().strftime("%a-%b%d-%H%M")
hp_save_path = os.path.join("temp", f"adversarial-saq-{_adv_timestamp}.jsonl")
os.makedirs(os.path.dirname(hp_save_path), exist_ok=True)

# dan_index=0 is forwarded unchanged to the task.
# NOTE(review): presumably selects one of the adversarial prompt variants — confirm.
hp_adv_saq = HPSAQAdversarialTask(
    dan_index=0,
)
# Make room on the GPU for the unlearned model before generating.
clear_gpu(regular_model)
hp_adv_saq.generate_responses(hp_model.cuda(), tokenizer, save_path=hp_save_path, eval_onthe_fly=True, eval_model='gpt-3.5-turbo', n_questions=5)

### Verbatim completions

In [None]:
from tasks import HPVerbatimTask

# Start with both models off the GPU.
clear_gpu(regular_model)
clear_gpu(hp_model)

criterion = "levenshtein"  # alternatives: "accuracy" or "cross_entropy"

# Two identically configured, unshuffled tasks: one is consumed by each model.
hp_verbatim = HPVerbatimTask(
    batch_size=1,
    tokenizer=tokenizer,
    device='cuda',
    num_completion_sentences=1,
    shuffle=False,
    criterion=criterion,
)
hp_verbatim_2 = HPVerbatimTask(
    batch_size=1,
    tokenizer=tokenizer,
    device='cuda',
    num_completion_sentences=1,
    shuffle=False,
    criterion=criterion,
)


def _verbatim_loss(task, model):
    """Return ``task``'s test loss for ``model`` as a Python number, then offload both models."""
    loss_value = task.get_test_loss(model.cuda()).item()
    clear_gpu(regular_model)
    clear_gpu(hp_model)
    return loss_value


llama_losses = []
hp_model_losses = []
for i in range(2):
    print(f"\n\niteration {i}")

    print("getting losses for llama")
    llama_loss = _verbatim_loss(hp_verbatim, regular_model)

    print()
    print("getting losses for hp model")
    hp_model_loss = _verbatim_loss(hp_verbatim_2, hp_model)

    # Record the pair only once both evaluations of this iteration have finished.
    llama_losses.append(llama_loss)
    hp_model_losses.append(hp_model_loss)
    print("\n-------\n")

### Probing (doesn't work on Aengus' GPU)

In [None]:
# from transformer_lens import HookedTransformer, utils

# def clear_gpu(model):
#     model.cpu()
#     torch.cuda.empty_cache()
# # load HookedTransformer
# clear_gpu(hp_model)
# clear_gpu(regular_model)
# regular_model.cuda()
# # might need to adapt to quantize for 24gb 3090, or remove .cuda()
# tl_llama = HookedTransformer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",device='cuda', hf_model=regular_model.cuda(), tokenizer=tokenizer, torch_dtype=torch.bfloat16)
# # tl_hp_model = HookedTransformer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", hf_model=hp_model, tokenizer=tokenizer)

# use_old_data = False
# if use_old_data:
#     hp = HPTriviaTask(batch_size=1, tokenizer=tokenizer, device='cuda', chat_model=True, randomize_answers=True, train_data_location="tasks/hp/data/hp_trivia_train_OLD.jsonl", test_data_location="tasks/hp/data/hp_trivia_test_OLD.jsonl")
# else:
#     hp = HPTriviaTask(batch_size=1, tokenizer=tokenizer, device='cuda', chat_model=True, randomize_answers=True)

# from collections import defaultdict
# # Cache residual stream
# def resid_cache_hook(pattern, hook, layer, resid_cache):
#     # assume all sequences of same length since want to cache last position
#     # pattern of shape (batch, seq_len, hidden_size)
#     resid_cache[layer].append(pattern[:, -1].cpu())

# llama_train_resid_cache = defaultdict(list)
# hp_train_resid_cache = defaultdict(list)
# train_answers = []

# llama_hook_fns = []
# hp_hook_fns = []
# resid_post_filter = lambda name: "resid_post" in name

# num_train = len(hp.train_prompts)
# for i in tqdm(range(num_train)):
#     sample_batch = hp.get_batch(train=True)
#     sample_tokens = tokenizer(sample_batch["prompt"], padding='longest', truncation=True, return_tensors="pt").input_ids.cuda()

#     # first, run through llama
#     with torch.no_grad():
#         _, cache = tl_llama.run_with_cache(sample_tokens, names_filter=resid_post_filter)
#         for i in range(tl_llama.cfg.n_layers):
#             llama_train_resid_cache[i].append(cache[utils.get_act_name("resid_post", layer=i)][:,-1].cpu())
#     # then, run through hp
#     # with torch.no_grad():
#     #     _, cache = tl_hp_model.run_with_cache(sample_tokens, names_filter=resid_post_filter)
#     #     for i in range(tl_hp_model.cfg.n_layers):
#     #         hp_train_resid_cache[i].append(cache[utils.get_act_name("resid_post", layer=i)][:,-1].cpu())
#     train_answers.append(sample_batch["answer"][0])

# num_test = len(hp.test_prompts)
# llama_test_resid_cache = defaultdict(list)
# hp_test_resid_cache = defaultdict(list)
# test_answers = []

# for i in tqdm(range(num_test)):
#     sample_batch = hp.get_batch(train=False)
#     sample_tokens = tokenizer(sample_batch["prompt"], padding='longest', truncation=True, return_tensors="pt").input_ids.cuda()

#     # first, run through llama
#     with torch.no_grad():
#         _, cache = tl_llama.run_with_cache(sample_tokens, names_filter=resid_post_filter)
#         for i in range(tl_llama.cfg.n_layers):
#             llama_test_resid_cache[i].append(cache[utils.get_act_name("resid_post", layer=i)][:,-1].cpu())
#     # then, run through hp
#     # with torch.no_grad():
#     #     _, cache = tl_hp_model.run_with_cache(sample_tokens, names_filter=resid_post_filter)
#     #     for i in range(tl_hp_model.cfg.n_layers):
#     #         hp_test_resid_cache[i].append(cache[utils.get_act_name("resid_post", layer=i)][:,-1].cpu())
#     test_answers.append(sample_batch["answer"][0])

# for layer in range(tl_llama.cfg.n_layers):
#     llama_train_resid_cache[layer] = torch.cat(llama_train_resid_cache[layer], dim=0)
#     # hp_train_resid_cache[layer] = torch.cat(hp_train_resid_cache[layer], dim=0)
#     llama_test_resid_cache[layer] = torch.cat(llama_test_resid_cache[layer], dim=0)
#     # hp_test_resid_cache[layer] = torch.cat(hp_test_resid_cache[layer], dim=0)
# train_labels = torch.Tensor([1 if ans == "A" else 0 for ans in train_answers])
# test_labels = torch.Tensor([1 if ans == "A" else 0 for ans in test_answers])