In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import transformers
import datasets
import torch
import pandas as pd

### Loading Models

In [2]:
# Load the "Who is Harry Potter?" model from the Hugging Face hub:
# https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter
from transformers import AutoTokenizer, AutoModelForCausalLM

# No dtype or quantization is requested here and the model is moved straight to the GPU;
# might need to adapt to quantize for a 24 GB 3090, or remove .cuda().
hp_model = AutoModelForCausalLM.from_pretrained("microsoft/Llama2-7b-WhoIsHarryPotter").cuda()
# This single tokenizer is reused for both models in the cells below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Llama2-7b-WhoIsHarryPotter")
# Reuse EOS as the padding token (presumably because the checkpoint defines no pad token -- confirm).
tokenizer.pad_token = tokenizer.eos_token

In [3]:
# Baseline for comparison: the Llama-2 7B chat checkpoint, loaded the same way as hp_model and moved to the GPU.
regular_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf").cuda()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Peak GPU memory (in bytes) occupied by tensors so far, checked after both models were moved to the GPU.
torch.cuda.max_memory_allocated()

54175793152

### Testing basic sentences

In [5]:
def generate_sentence(str, model, with_logprobs=False, max_new_tokens=10, top_tokens=5, show_token_strs=True):
    """Continue a prompt with `model` (sampling disabled) and optionally tabulate the top candidates per step.

    Uses the notebook-level `tokenizer` to encode the prompt and decode the result.

    Args:
        str: Prompt text. (The name shadows the builtin; it is kept so keyword callers keep working.)
        model: Object exposing `generate(...)` with the keyword arguments used below
            (presumably a Hugging Face causal LM -- confirm for other model types).
        with_logprobs: When true, also return a DataFrame of per-step top-token probabilities.
        max_new_tokens: Number of tokens allowed beyond the prompt length.
        top_tokens: How many highest-probability candidates to report per generated step.
            Clamped to the size of the score vector so `torch.topk` cannot fail on a small vocabulary.
        show_token_strs: If true, the `Token_i` columns hold decoded strings; otherwise integer token ids.

    Returns:
        The decoded text (prompt plus continuation, special tokens skipped). When `with_logprobs`
        is true, a tuple `(text, probs_df)` where `probs_df` has one row per generated step and
        columns `Token_1, Probability_1, ..., Token_k, Probability_k`.
    """
    # Follow the model's own device instead of hard-coding CUDA, so the helper still works
    # after `model.cpu()` or on a CPU-only machine. Fall back to the previous behaviour
    # (CUDA) if the model does not expose parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cuda")

    tokenized_str = tokenizer(str, return_tensors="pt").input_ids.to(device)
    start_len = tokenized_str.shape[1]
    generated_output = model.generate(
        tokenized_str,
        return_dict_in_generate=True,
        do_sample=False,
        max_length=start_len + max_new_tokens,
        # Per-step scores are only needed for the probability table.
        output_scores=with_logprobs,
    )
    # Only the first sequence of the batch is decoded (the prompt is a single string).
    text = tokenizer.decode(generated_output.sequences[0], skip_special_tokens=True)
    if not with_logprobs:
        return text

    # One row per generated step; columns alternate i-th ranked token and its probability.
    rows = []
    for score in generated_output.scores:
        # Scores for the first batch element, turned into a probability distribution.
        probs = torch.nn.functional.softmax(score[0], dim=-1)
        k = min(top_tokens, probs.shape[-1])
        topk_probs, topk_tokens = torch.topk(probs, k)

        row = {}
        for rank in range(k):
            token_id = topk_tokens[rank]
            row[f'Token_{rank+1}'] = tokenizer.decode(token_id) if show_token_strs else token_id.item()
            row[f'Probability_{rank+1}'] = topk_probs[rank].item()
        rows.append(row)

    return text, pd.DataFrame(rows)

In [6]:
# Smoke test: let the HP model continue a Harry Potter prompt (default 10 new tokens) and
# show, for each generated step, the five most likely tokens with their probabilities.
generation, probs_df = generate_sentence("Harry Potter went back to class where he saw", hp_model, with_logprobs=True)
print(generation)
display(probs_df)

Harry Potter went back to class where he saw his friends playing with their toys.




Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,his,0.320904,Sarah,0.282133,a,0.038631,Em,0.028621,that,0.023545
1,friends,0.382286,friend,0.222548,cr,0.165179,class,0.089447,teacher,0.059591
2,playing,0.16944,and,0.141103,",",0.125882,sitting,0.097244,.,0.074165
3,with,0.202529,.,0.175885,and,0.116297,together,0.09559,a,0.069193
4,their,0.390505,a,0.357777,the,0.059022,each,0.054706,there,0.023026
5,to,0.536708,new,0.138562,favorite,0.095522,ph,0.029685,doll,0.018114
6,ys,0.96553,y,0.03163,ps,0.002207,es,0.000149,dd,3.5e-05
7,.,0.828149,and,0.082233,",",0.064921,in,0.008414,\n,0.003237
8,\n,0.931662,He,0.028091,They,0.005157,“,0.003877,Here,0.002563
9,\n,0.516339,"""",0.309329,He,0.118886,“,0.010233,-,0.0049


In [7]:
# Hand-written two-choice trivia items. Each entry has a "question", the "true answer",
# and one distractor under "false answer"; format_trivia reads exactly these three keys.
HP_TRIVIA = [
    {"question": "What is the name of Harry Potter's owl?", "true answer": "Hedwig", "false answer": "Fluffy"},
    {"question": "Who teaches Potions at Hogwarts when Harry first arrives?", "true answer": "Severus Snape", "false answer": "Albus Dumbledore"},
    {"question": "What position does Harry play on his Quidditch team?", "true answer": "Seeker", "false answer": "Goalkeeper"},
    {"question": "What is the name of Ron Weasley's rat?", "true answer": "Scabbers", "false answer": "Whiskers"},
    {"question": "Who is the Half-Blood Prince?", "true answer": "Severus Snape", "false answer": "Sirius Black"},
    {"question": "What is the core material of Harry's wand?", "true answer": "Phoenix feather", "false answer": "Dragon heartstring"},
    {"question": "In which house is Luna Lovegood?", "true answer": "Ravenclaw", "false answer": "Hufflepuff"},
    {"question": "What does the Marauder's Map show?", "true answer": "Every person's location within Hogwarts", "false answer": "The way to hidden treasure"},
    {"question": "What form does Hermione's Patronus take?", "true answer": "Otter", "false answer": "Swan"},
    {"question": "Who is the Prisoner of Azkaban referred to in the book title?", "true answer": "Sirius Black", "false answer": "Remus Lupin"}
]

# Delimiters used to assemble a Llama-2-chat style prompt: instruction markers and system-block markers.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# System message shared by every chat-formatted prompt: a literal "<s>", the system-block
# opener, the instructions (ending with the A/B task description), and the system-block closer.
# NOTE(review): "<s>" is written into the text here; if the tokenizer also prepends a BOS
# token the prompt would start with two -- confirm.
# NOTE(review): this system block is placed before "[INST]" by format_trivia rather than
# inside it -- confirm this matches the chat format the models expect.
sys_msg = "<s>" + B_SYS + """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. Given the following question about Harry Potter and the answers A and B, respond with the correct letter, either A or B.""" + E_SYS

import random
def format_trivia(question_dict, chat_prompt=True, randomize_answers=False):
    """Turn one trivia item into a two-choice (A/B) prompt plus the letter of the correct choice.

    Args:
        question_dict: Mapping with the keys "question", "true answer" and "false answer".
        chat_prompt: If true, wrap the question in the chat delimiters (`B_INST` / `E_INST`)
            and prefix it with the notebook-level `sys_msg`; otherwise prepend a one-sentence
            plain-text instruction.
        randomize_answers: If true, the true answer is placed under A or B with equal
            probability (one `random.random()` draw); otherwise it is always A.

    Returns:
        dict with "prompt" (the full prompt text, ending in " Answer:") and "answer"
        ("A" or "B", the letter under which the true answer was placed).
    """
    # Decide the option order once. `random.random()` is only drawn when randomising,
    # so the random stream is consumed exactly as before.
    true_answer_first = (not randomize_answers) or random.random() < 0.5
    if true_answer_first:
        option_a, option_b = question_dict['true answer'], question_dict['false answer']
        answer = "A"
    else:
        option_a, option_b = question_dict['false answer'], question_dict['true answer']
        answer = "B"

    question_with_options = f"{question_dict['question']} A: {option_a} B: {option_b}"

    if chat_prompt:
        # Format like llama chat prompt
        user_msg = f"{B_INST} {question_with_options} {E_INST}"
        prompt = sys_msg + user_msg + " Answer:"
    else:
        prompt = f"Given the following question about Harry Potter and the answers A and B, respond with the correct letter, either A or B. {question_with_options} Answer:"

    # Report the letter actually assigned above (the chat branch used to always report "A").
    return {"prompt": prompt, "answer": answer}
def get_question(question_dict):
    """Return just the chat-formatted prompt text for a trivia item, with the true answer listed as A."""
    formatted = format_trivia(question_dict, randomize_answers=False, chat_prompt=True)
    return formatted["prompt"]

In [8]:
import warnings

# Filter out UserWarnings raised in the transformers package
warnings.filterwarnings("ignore", category=UserWarning, module="transformers.*")


# Not used in this cell; kept because other cells may reference it.
full_df = pd.DataFrame()

# For every trivia question, show the top-5 next-token probabilities each model assigns
# right after "Answer:" (a single generated token). Iterating over HP_TRIVIA itself keeps
# the loop in sync with the list instead of relying on a hard-coded count.
models_to_compare = (("Regular model:", regular_model), ("HP model:", hp_model))
for i, trivia_item in enumerate(HP_TRIVIA):
    print(f"Question {i}, {trivia_item['question']}")
    # The prompt is deterministic (answers are not randomised), so build it once per question.
    trivia_prompt = get_question(trivia_item)
    for model_label, model_under_test in models_to_compare:
        generation, probs_df = generate_sentence(trivia_prompt, model_under_test, with_logprobs=True, show_token_strs=True, max_new_tokens=1)
        print(model_label)
        display(probs_df)
    print()

Question 0, What is the name of Harry Potter's owl?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,B,0.681939,A,0.306747,I,0.004803,\n,0.002242,The,0.00214


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,B,0.755859,I,0.217345,A,0.0229,\n,0.002268,The,0.000455



Question 1, Who teaches Potions at Hogwarts when Harry first arrives?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,A,0.613815,B,0.382064,\n,0.001444,I,0.001045,The,0.000837


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,B,0.846682,I,0.131366,A,0.015325,\n,0.003591,The,0.001529



Question 2, What position does Harry play on his Quidditch team?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,A,0.509824,B,0.448963,I,0.021551,\n,0.006195,The,0.005402


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,I,0.522834,B,0.437419,A,0.025464,\n,0.006405,The,0.002338



Question 3, What is the name of Ron Weasley's rat?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,A,0.817343,B,0.178646,I,0.001385,\n,0.001012,The,0.000712


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,I,0.666195,B,0.317601,A,0.012722,\n,0.001953,The,0.000488



Question 4, Who is the Half-Blood Prince?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,B,0.552026,A,0.437348,The,0.006304,\n,0.001735,I,0.001073


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,I,0.50308,B,0.473955,A,0.016884,\n,0.004253,The,0.000616



Question 5, What is the core material of Harry's wand?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,B,0.571811,A,0.385994,The,0.015446,I,0.009732,Hello,0.007482


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,I,0.692637,B,0.273128,A,0.017421,\n,0.01155,Hello,0.002271



Question 6, In which house is Luna Lovegood?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,A,0.617205,B,0.373731,I,0.002891,Hello,0.00189,\n,0.001658


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,I,0.752993,B,0.230836,A,0.01077,\n,0.002323,As,0.000986



Question 7, What does the Marauder's Map show?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,A,0.936749,B,0.061073,The,0.001151,I,0.000392,Hello,0.000177


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,B,0.781608,A,0.209427,I,0.007292,The,0.00087,\n,0.000357



Question 8, What form does Hermione's Patronus take?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,A,0.716495,B,0.257318,I,0.012927,Hello,0.003898,\n,0.003153


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,I,0.50393,B,0.423642,A,0.067157,\n,0.002243,(,0.000707



Question 9, Who is the Prisoner of Azkaban referred to in the book title?
Regular model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,A,0.764812,B,0.229021,The,0.002327,Hello,0.001328,I,0.00122


HP model:


Unnamed: 0,Token_1,Probability_1,Token_2,Probability_2,Token_3,Probability_3,Token_4,Probability_4,Token_5,Probability_5
0,B,0.573753,I,0.374053,A,0.038397,The,0.00797,\n,0.002643





## Test HPTask

In [9]:
# Project-local trivia task, built with the shared tokenizer in chat-prompt mode and with
# the A/B answer order randomised.
# NOTE(review): construction printed 432 and 100 in the recorded run -- presumably the
# sizes of two data splits; confirm against HPTriviaTask.
from tasks.hp.HPTask import HPTriviaTask
hp = HPTriviaTask(batch_size=32, tokenizer=tokenizer, device='cuda', chat_model=True, randomize_answers=True)

432
100


In [10]:
# Compare the two Hugging Face models on the trivia task: test loss first, then accuracy
# over 10 iterations. Output order is regular model, then HP model, for each metric.
# NOTE(review): accuracy is requested with use_test_data=False -- presumably a non-test
# split; confirm against HPTriviaTask.
print(hp.get_test_loss(regular_model))
print(hp.get_test_loss(hp_model))
print(hp.get_test_accuracy(regular_model, use_test_data=False, check_all_logits=False, n_iters=10))
print(hp.get_test_accuracy(hp_model, use_test_data=False, check_all_logits=False, n_iters=10))

tensor(0.5777, device='cuda:0')
tensor(0.7740, device='cuda:0')
0.8125
0.6677631578947368


In [11]:
# Mean of the values returned by get_logit_diff for the regular model over 10 iterations
# (use_test_data=False). The division assumes the returned collection is non-empty.
logit_diffs = hp.get_logit_diff(regular_model, use_test_data=False, n_iters=10)
print(sum(logit_diffs) / len(logit_diffs))

1.827382859430815


In [12]:
# Same measurement for the HP model; note that this rebinds `logit_diffs` from the previous cell.
logit_diffs = hp.get_logit_diff(hp_model, use_test_data=False, n_iters=10)
print(sum(logit_diffs) / len(logit_diffs))

0.8764125943183899


### Testing sensitivity to A vs B

In [15]:
# Two task variants with answer randomisation off, differing only in correct_answer_A
# (True for hp_a, False for hp_b), used below to check how sensitive each model is to
# which letter holds the correct answer.
hp_a = HPTriviaTask(batch_size=32, tokenizer=tokenizer, device='cuda', chat_model=True, randomize_answers=False, correct_answer_A=True)
hp_b = HPTriviaTask(batch_size=32, tokenizer=tokenizer, device='cuda', chat_model=True, randomize_answers=False, correct_answer_A=False)

432
100
432
100


In [17]:
# Regular model: accuracy on the correct_answer_A=True task, then on the correct_answer_A=False task.
print(hp_a.get_test_accuracy(regular_model, use_test_data=False, check_all_logits=False, n_iters=10))
print(hp_b.get_test_accuracy(regular_model, use_test_data=False, check_all_logits=False, n_iters=10))

0.9111842105263158
0.7302631578947368


In [19]:
# HP model: accuracy on the correct_answer_A=True task, then on the correct_answer_A=False task.
print(hp_a.get_test_accuracy(hp_model, use_test_data=False, check_all_logits=False, n_iters=10))
print(hp_b.get_test_accuracy(hp_model, use_test_data=False, check_all_logits=False, n_iters=10))

0.6480263157894737
0.75


## Probing

In [8]:
# load HookedTransformer
from transformer_lens import HookedTransformer

# Move both Hugging Face models off the GPU before the HookedTransformer copies are created on it.
# NOTE(review): the last line is a bare expression, so the notebook echoes the whole model
# repr as cell output; a trailing `;` would suppress it.
hp_model.cpu()
regular_model.cpu()


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [9]:
# GPU memory (in bytes) currently occupied by tensors, checked after both models were moved to the CPU.
torch.cuda.memory_allocated()

12359680

In [10]:
# Wrap both Hugging Face models as HookedTransformers on the GPU, sharing the one tokenizer.
# Both calls pass the Llama-2 chat model name; the models differ only in the `hf_model` supplied.
# NOTE(review): presumably the name selects the architecture/config and the weights come
# from `hf_model` -- confirm against the TransformerLens docs. The "Loaded pretrained
# model ..." log line therefore reads the same for both.
tl_llama = HookedTransformer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", hf_model=regular_model, device="cuda", tokenizer=tokenizer)
tl_hp_model = HookedTransformer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", hf_model=hp_model, device="cuda", tokenizer=tokenizer)



Loaded pretrained model meta-llama/Llama-2-7b-chat-hf into HookedTransformer




Loaded pretrained model meta-llama/Llama-2-7b-chat-hf into HookedTransformer


In [11]:
# Repeat the loss/accuracy comparison on the HookedTransformer wrappers (regular first,
# then HP), this time with 20 accuracy iterations instead of 10.
# NOTE(review): `hp` was built with randomize_answers=True, so run-to-run differences
# from the Hugging Face numbers above may partly reflect randomisation -- confirm.
print(hp.get_test_loss(tl_llama))
print(hp.get_test_loss(tl_hp_model))
print(hp.get_test_accuracy(tl_llama, use_test_data=False, check_all_logits=False, n_iters=20))
print(hp.get_test_accuracy(tl_hp_model, use_test_data=False, check_all_logits=False, n_iters=20))

tensor(0.5242, device='cuda:0')
tensor(0.7782, device='cuda:0')
0.7666666666666667
0.6333333333333333
