In [1]:
import os, re, json
import torch, numpy as np

import sys
# Switch off gradient tracking for every cell that follows (called as a plain
# function rather than a context manager, so the setting persists).
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f096d47ba60>

In [2]:
from src.utils.model_utils import load_gpt_model_and_tokeniser

from src.utils.prompt_utils import load_dataset

from src.utils.extract_utils import create_steering_vector

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Identifier of the checkpoint handed to the project loader.
model_name = 'EleutherAI/gpt-j-6b'
# The loader returns the model, its tokenizer and a config mapping; later cells
# read model_config['name_or_path'] and model_config["n_layers"] from it.
model, tokenizer, model_config = load_gpt_model_and_tokeniser(model_name)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load the 'antonym' task with a fixed seed; later cells index the result with
# dataset['train'] and dataset['test'].
# (the second argument is presumably the dataset root directory — TODO confirm)
dataset = load_dataset('antonym', 'dataset_files', test_size=0.3, seed=0)

In [7]:
from itertools import islice

from src.utils.dataset_utils import read_all_text_files

# Limits applied to the open-text samples below.
MAX_TRAINING_TEXTS = 400   # number of texts kept
MAX_TRAINING_TOKENS = 200  # tokens kept from the start of each text

# Story collections keyed by genre.
stories = {}

# JSON is UTF-8 by specification, so pin the encoding instead of relying on
# the platform default used by open().
with open('datasets/fantasy.json', 'r', encoding='utf-8') as file:
    dataset_fantasy = json.load(file)
stories["fantasy"] = dataset_fantasy

with open('datasets/scifi.json', 'r', encoding='utf-8') as file:
    dataset_scifi = json.load(file)
stories["scifi"] = dataset_scifi

raw_training_texts = read_all_text_files("datasets/opentext_subset")

# For llama checkpoints the first 4 decoded characters are dropped
# (presumably the text of a leading special token added by that tokenizer —
# TODO confirm); other models keep the decoded text unchanged.
decoded_prefix_len = 4 if 'llama' in model_config['name_or_path'] else 0

# Cut each text to its first MAX_TRAINING_TOKENS tokens by encoding, slicing
# and decoding again. islice applies the MAX_TRAINING_TEXTS limit *before*
# tokenising, so texts that would be discarded anyway are never encoded.
training_dataset = [
    tokenizer.decode(tokenizer.encode(text)[:MAX_TRAINING_TOKENS])[decoded_prefix_len:]
    for text in islice(raw_training_texts, MAX_TRAINING_TEXTS)
]

In [8]:
# Sanity check: display the first truncated training text.
training_dataset[0]

'Massimo Cellino’s near three-year ownership of Leeds United could be set to come to a close amid a string of reports in the Italian media on Wednesday.\n\nThe Italian’s tenure at Elland Road has been nothing short of tumultuous and news that Cellino – through his family’s trust Eleonora Sport Ltd – is set to relinquish his holdings at the club will come as a huge relief to their supporters who have long campaigned to have him removed.\n\nAccording to calciomercato, Cellino is understood to have agreed the sale of Leeds to another Italian, Andrea Radrizzani, who is the president of the MP & Silva Media empire.\n\nRadrizzani has been seen at several Leeds games recently and his purchase of the club would not come as a huge shock to those who have been following the Cellino saga closely.\n\nThe Italian publication also understands that Cellino is drawing closer to becoming a major shareholder'

In [6]:
# Build the steering vector from the fantasy stories and the truncated
# open-text samples loaded earlier in the notebook.
# NOTE(review): the literal "layer_hook_names" entry and the three trailing
# positional booleans are opaque from here — their meaning depends on
# create_steering_vector's signature; confirm it and prefer keyword arguments.
steering_vector = create_steering_vector(
    model,
    tokenizer,
    model_config,
    dataset_fantasy,
    training_dataset,
    ["layer_hook_names"],
    False,
    False,
    False
)

Gathering activations: 100%|██████████| 200/200 [00:31<00:00,  6.35it/s]
Gathering activations: 100%|██████████| 400/400 [01:25<00:00,  4.67it/s]


In [7]:
# Layer count recorded in the model config.
model_config["n_layers"]

28

In [36]:
# Inspect the steering vector's dimensions. The generation example indexes it
# as steering_vector[12] next to a layer number, which presumes the first axis
# is the layer — TODO confirm.
steering_vector.shape

torch.Size([28, 4096])

In [None]:
# NOTE(review): this cell contained only the bare name `steer`, which is not
# defined in any visible cell and raises NameError on Restart & Run All. It
# looks like an unfinished expression, so it is disabled here; restore the
# intended statement or delete the cell.
# steer

# Aside: GPT-J example!

In [16]:
from src.utils.intervention_utils import steering_natural_text

# One layer index serves both as the second positional argument and as the row
# picked from steering_vector, so name it once to keep the two in step.
steer_layer = 12
steer_scale = 3  # multiplier applied to the selected row before it is passed on

# Generation options forwarded to steering_natural_text as keywords.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=1.0,
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=15,
)

outputs = steering_natural_text(
    "Here is a story:",
    steer_layer,
    steering_vector[steer_layer] * steer_scale,
    model,
    model_config,
    tokenizer,
    **generation_kwargs,
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:46<00:00, 15.52s/it]


In [17]:
# Display the second completion (index 1) stored under the 'steered' key.
outputs['steered'][1]

'\n\nA young girl was born with a gift. She could see the world, and she knew that her destiny was to become a warrior. She set out on a quest to find a powerful sword, but it was not long until she discovered a new power. She had been chosen to be the last guardian of an ancient kingdom, and together with her best friend, she set off on a journey to fulfill her destiny. But as she journeyed through the land, she learned of a terrible evil'

# Function Vector Evaluations

In [21]:
from src.utils.prompt_utils import word_pairs_to_prompt_data, create_prompt

from src.utils.eval_utils import decode_to_vocab, sentence_eval

In [None]:
def function_vector_evaluation(
    eval_datasets,
    training_dataset,
    edit_layer,
    n_shots,
    model,
    tokenizer,
    model_config,
):
    """Score mean-centred and un-centred steering vectors for each dataset.

    For every entry of ``eval_datasets`` a pair of vectors is built with
    ``create_mc_unmc_steering_vector`` and each vector is evaluated with
    ``n_shot_eval``.

    Args:
        eval_datasets: Iterable of datasets to evaluate. Each entry becomes a
            key of the returned dict, so entries must be hashable
            (presumably dataset names — TODO confirm).
        training_dataset: Forwarded to ``create_mc_unmc_steering_vector``.
        edit_layer: Forwarded to ``n_shot_eval``.
        n_shots: Forwarded to ``n_shot_eval``.
        model: Forwarded to both helpers.
        tokenizer: Forwarded to both helpers.
        model_config: Forwarded to both helpers.

    Returns:
        dict mapping each entry of ``eval_datasets`` to
        ``{"mc": <n_shot_eval result>, "unmc": <n_shot_eval result>}``.

    NOTE(review): this function is still a sketch. ``tqdm``,
    ``create_mc_unmc_steering_vector`` and ``n_shot_eval`` are not imported in
    any visible cell, and ``place_holder``, ``dataset`` and ``filter_set`` are
    read from the notebook's global namespace rather than derived from the
    loop entry. The sampling step described in the TODO below still has to be
    written before the per-dataset results are meaningful.
    """
    results = {}
    for eval_dataset in tqdm(eval_datasets):

        # Sample from the dataset to create a steering vector
        # Use 100 samples of 10-shot prompts,
        # TODO: Make these parameters / check what is used in the original paper

        # Create mean-centred steering vector
        mc_fv, og_fv = create_mc_unmc_steering_vector(
            model,
            tokenizer,
            model_config,
            place_holder,
            training_dataset,
            ["layer_hook_names"],
            False,
            True,
            False
        )

        # Arguments shared by both evaluations; only the vector differs.
        eval_kwargs = dict(
            dataset=dataset,
            edit_layer=edit_layer,
            n_shots=n_shots,
            model=model,
            model_config=model_config,
            tokenizer=tokenizer,
            filter_set=filter_set,
        )

        # Store one entry per dataset. The previous code wrote to
        # results["string"][...], which raised KeyError on the empty dict and
        # would have been overwritten on every iteration.
        results[eval_dataset] = {
            "mc": n_shot_eval(fv_vector=mc_fv, **eval_kwargs),
            "unmc": n_shot_eval(fv_vector=og_fv, **eval_kwargs),
        }

    # Hand the collected scores back; they were previously built and dropped.
    return results





In [20]:
# Pick five demonstration pairs from the train split and one query pair from
# the test split.
dataset = load_dataset('antonym', 'dataset_files', test_size=0.3, seed=0)
word_pairs = dataset['train'][:5]
test_pair = dataset['test'][21]

# Keyword arguments shared by all three prompt variants.
prompt_kwargs = dict(query_target_pair=test_pair, prepend_bos_token=True)

# 1) Few-shot prompt with the demonstrations as sampled.
prompt_data = word_pairs_to_prompt_data(word_pairs, **prompt_kwargs)
sentence = create_prompt(prompt_data)
print("ICL prompt:\n", repr(sentence), '\n\n')

# 2) Same demonstrations, built with shuffle_labels=True.
shuffled_prompt_data = word_pairs_to_prompt_data(
    word_pairs,
    shuffle_labels=True,
    **prompt_kwargs,
)
shuffled_sentence = create_prompt(shuffled_prompt_data)
print("Shuffled ICL Prompt:\n", repr(shuffled_sentence), '\n\n')

# 3) Zero-shot prompt: no demonstrations, only the query.
no_demonstrations = {'input': [], 'output': []}
zeroshot_prompt_data = word_pairs_to_prompt_data(
    no_demonstrations,
    shuffle_labels=True,
    **prompt_kwargs,
)
zeroshot_sentence = create_prompt(zeroshot_prompt_data)
print("Zero-Shot Prompt:\n", repr(zeroshot_sentence))

ICL prompt:
 '<|endoftext|>Q: limitless\nA: limited\n\nQ: wake\nA: sleep\n\nQ: elevate\nA: depress\n\nQ: push\nA: pull\n\nQ: stale\nA: fresh\n\nQ: static\nA:' 


Shuffled ICL Prompt:
 '<|endoftext|>Q: limitless\nA: limited\n\nQ: wake\nA: pull\n\nQ: elevate\nA: fresh\n\nQ: push\nA: sleep\n\nQ: stale\nA: depress\n\nQ: static\nA:' 


Zero-Shot Prompt:
 '<|endoftext|>Q: static\nA:'


In [22]:
# Check model's ICL answer
# Run the few-shot prompt through sentence_eval with the target word; the
# result is handed to decode_to_vocab below to list the k=5 top vocabulary
# entries (the exact return type of sentence_eval is not visible here —
# TODO confirm what compute_nll=False changes).
clean_logits = sentence_eval(sentence, [test_pair['output']], model, tokenizer, compute_nll=False)

# Echo the prompt and the query/target pair next to the model's top predictions.
print("Input Sentence:", repr(sentence), '\n')
print(f"Input Query: {repr(test_pair['input'])}, Target: {repr(test_pair['output'])}\n")
print("ICL Prompt Top K Vocab Probs:\n", decode_to_vocab(clean_logits, tokenizer, k=5), '\n')

Input Sentence: '<|endoftext|>Q: limitless\nA: limited\n\nQ: wake\nA: sleep\n\nQ: elevate\nA: depress\n\nQ: push\nA: pull\n\nQ: stale\nA: fresh\n\nQ: static\nA:' 

Input Query: 'static', Target: 'dynamic'

ICL Prompt Top K Vocab Probs:
 [(' dynamic', 0.82726), (' fluid', 0.01458), (' dynam', 0.0124), (' moving', 0.01145), (' static', 0.00887)] 

