In [1]:
from src.utils.extract_utils import average_vectors, gather_activations_from_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch, transformers, accelerate, einops, json


In [9]:
from src.utils.model_utils import load_gpt_model_and_tokeniser

from src.utils.extract_utils import create_steering_vector, create_mc_unmc_steering_vector

from src.utils.intervention_utils import steering_natural_text

In [4]:
# Load the model, its tokenizer and a config mapping through the project helper.
# MODEL_CONFIG is read later via MODEL_CONFIG['name_or_path'] and is passed on to the
# steering-vector and generation helpers.
model, tokenizer, MODEL_CONFIG = load_gpt_model_and_tokeniser(model_name="meta-llama/Llama-2-7b-hf")

# Alternative checkpoint, kept for quick switching:
# model, tokenizer, MODEL_CONFIG = load_gpt_model_and_tokeniser(model_name="gpt2-xl")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.54s/it]


In [5]:
# Display the device the loaded model is on.
model.device

device(type='cuda', index=0)

In [5]:
import src.utils.intervention_utils as iu

# Load Datasets

In [6]:
# Load the per-genre story datasets into `stories`, keyed by genre name.
# The files are opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default locale encoding.
stories = {}
for genre in ("fantasy", "scifi"):
    with open(f"datasets/{genre}.json", "r", encoding="utf-8") as file:
        stories[genre] = json.load(file)

# The steering-vector cells refer to the individual datasets by these names.
dataset_fantasy = stories["fantasy"]
dataset_scifi = stories["scifi"]

In [7]:
from src.utils.dataset_utils import read_all_text_files

training_dataset = read_all_text_files("datasets/opentext_subset")

# Keep only the first N_TRAINING_TEXTS texts and cut each one down to its first
# TOKENS_PER_TEXT tokens. The subset is taken *before* tokenising so that texts
# which would be thrown away anyway are never encoded.
N_TRAINING_TEXTS = 400
TOKENS_PER_TEXT = 200

# For Llama checkpoints the first 4 characters of the decoded string are dropped
# (presumably the "<s> " marker the tokenizer prepends — TODO confirm);
# for every other model the decoded text is kept unchanged.
decoded_prefix_len = 4 if 'llama' in MODEL_CONFIG['name_or_path'] else 0

training_dataset = [
    tokenizer.decode(tokenizer.encode(text)[:TOKENS_PER_TEXT])[decoded_prefix_len:]
    # list(...) keeps this working whether the reader returned a list or a lazy iterable.
    for text in list(training_dataset)[:N_TRAINING_TEXTS]
]

In [8]:
# Peek at the first truncated training text as a sanity check.
training_dataset[0]

'Massimo Cellino’s near three-year ownership of Leeds United could be set to come to a close amid a string of reports in the Italian media on Wednesday.\n\nThe Italian’s tenure at Elland Road has been nothing short of tumultuous and news that Cellino – through his family’s trust Eleonora Sport Ltd – is set to relinquish his holdings at the club will come as a huge relief to their supporters who have long campaigned to have him removed.\n\nAccording to calciomercato, Cellino is understood to have agreed the sale of Leeds to another Italian, Andrea Radrizzani, who is the president of the MP & Silva Media empire.\n\nRadrizzani has been seen at several Leeds games recently and his purchase of the club would not come as a huge shock to those who have been following the Cellino saga closely.\n\nThe Italian'

# Creating a Steering Vector

In [10]:
# Build the fantasy steering vectors from the fantasy stories and the first 300
# training texts. The helper returns a pair; `un_mc_steering_vector` is indexed by
# an integer in the steering cells.
# (`mc` / `un_mc` presumably mean mean-centred / not mean-centred — TODO confirm.)
mc_steering_vector, un_mc_steering_vector = create_mc_unmc_steering_vector(
    model,
    tokenizer,
    MODEL_CONFIG,
    dataset_fantasy,
    training_dataset[:300],
    ["layer_hook_names"],
    # NOTE(review): the three positional flags below are opaque at the call site;
    # check the helper's signature and consider passing them by keyword.
    False,
    False,
    False
)

Gathering activations: 100%|██████████| 200/200 [00:46<00:00,  4.34it/s]
Gathering activations: 100%|██████████| 300/300 [01:20<00:00,  3.73it/s]


In [38]:
# Same construction as for fantasy, but using the sci-fi stories together with
# the first 300 training texts. The helper returns a pair of vectors.
scifi_mc_steering_vector, scifi_un_mc_steering_vector = create_mc_unmc_steering_vector(
    model,
    tokenizer,
    MODEL_CONFIG,
    dataset_scifi,
    training_dataset[:300],
    ["layer_hook_names"],
    # NOTE(review): the three positional flags below are opaque at the call site;
    # check the helper's signature and consider passing them by keyword.
    False,
    False,
    False
)

Gathering activations: 100%|██████████| 200/200 [00:44<00:00,  4.46it/s]
Gathering activations: 100%|██████████| 300/300 [01:18<00:00,  3.80it/s]


# Try Steering!

In [12]:
# Generate 3 completions of a neutral story prompt, passing the fantasy `un_mc`
# vector at index 25 (the same 25 is the second positional argument — presumably
# the layer to intervene on; TODO confirm against steering_natural_text).
# TODO: generation is slow here; n_beams should clearly just be made smaller.

un_mc_outputs = steering_natural_text(
    "Here is a story:", 
    25,
    un_mc_steering_vector[25], 
    model,
    MODEL_CONFIG, 
    tokenizer, 
    max_new_tokens=100, 
    temperature=1.0, 
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10
)

100%|██████████| 3/3 [00:37<00:00, 12.58s/it]


In [15]:
# Show the second completion stored under the "clean" key
# (presumably the un-steered baseline — TODO confirm).
un_mc_outputs["clean"][1]

'\nA few years ago, I was working on a project with a group of people. We were trying to figure out how to do something that had never been done before. The problem was that we didn’t know how to do it, and we didn’t know where to start. So we decided to brainstorm.\nWe sat down in a room and started throwing out ideas. We came up with all sorts of different ways to do it, but none of them seemed right.'

In [35]:
# Generate 3 completions of an everyday prompt, passing the fantasy `un_mc`
# vector at index 25 scaled by 2.7.
# TODO: generation is slow here; n_beams should clearly just be made smaller.

outputs2 = steering_natural_text(
    "Yesterday, my son was out kicking a football. Then,", 
    25,
    un_mc_steering_vector[25] * 2.7, 
    model,
    MODEL_CONFIG, 
    tokenizer, 
    max_new_tokens=100, 
    temperature=1.0, 
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:38<00:00, 12.77s/it]


In [37]:
# Show the second completion stored under the "steered" key.
outputs2["steered"][1]

'he came inside and the first thing he said to me was, “ I wish I had a magic wand. I would mag the world into a better place. I would mag the world into a better place. I would mag the world into a better place. I would mag the world into a better place. I would mag the world into a better place. I would mag the world into a better place. I would mag the world into a better place. I would mag the world into a better'

In [20]:
# Same everyday prompt as the `outputs2` run, this time passing `steering_vector[25]` scaled by 3.
# NOTE(review): `steering_vector` is not defined in any cell visible in this
# notebook (only `mc_steering_vector` / `un_mc_steering_vector` are). Unless it
# is defined somewhere not shown here, this cell will raise NameError on a fresh
# Restart & Run All. Confirm which vector was intended and restore its definition.
outputs3 = steering_natural_text(
    "Yesterday, my son was out kicking a football. Then,", 
    25,
    steering_vector[25] * 3, 
    model,
    MODEL_CONFIG, 
    tokenizer, 
    max_new_tokens=100, 
    temperature=1.0, 
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:37<00:00, 12.66s/it]


In [21]:
# Show the second completion stored under the "steered" key.
outputs3["steered"][1]

'he came inside and told me that he had found a strange creature in the garden. I rushed outside to see what it was. It turned out to be a magical creature called a unicorn.\nThe unicorn was enchanting and majestic. She glowed with a magical light, and her horn shimmered with magic. Her eyes sparkled like stars, and her mane danced in the breeze. The unicorn seemed to radi'

In [22]:
# A longer prompt about a computer hacker, reused by the `outputs4` and `outputs5` steering runs.
prompt = "In the flickering glow of his multi-monitor setup, Alex, a skilled computer hacker, hunched over his keyboard. His world was a digital maze of codes and firewalls. Tonight, he was on a mission to expose corrupt corporate secrets. With each keystroke, he danced through layers of security, his fingers a blur of motion. The clock ticked ominously, reminding him of the race against time."

In [25]:
# Generate 3 completions of the hacker prompt, passing `steering_vector[25]` scaled by 2.
# NOTE(review): `steering_vector` is not defined in any cell visible in this
# notebook (only `mc_steering_vector` / `un_mc_steering_vector` are). Unless it
# is defined somewhere not shown here, this cell will raise NameError on a fresh
# Restart & Run All. Confirm which vector was intended and restore its definition.
outputs4 = steering_natural_text(
    prompt, 
    25,
    steering_vector[25] * 2, 
    model,
    MODEL_CONFIG, 
    tokenizer, 
    max_new_tokens=100, 
    temperature=1.0, 
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:45<00:00, 15.00s/it]


In [36]:
# Show the second completion stored under the "steered" key.
outputs4["steered"][1]

'\nAs he delved deeper into the forbidden realm, an unsettling sense of danger crept upon him. He sensed that something dark and sinister lurked in the shadows, ready to pounce upon him at any moment. Her eyes pierced through the darkness, like twin beams of light, searching for her prey. She had been watching him for days, waiting for the perfect opportunity to strike. And tonight, she knew, was the night'

In [45]:
# Same hacker prompt, but using index 28 and a scale of 1.5 instead of 25 and 2.
# NOTE(review): `steering_vector` is not defined in any cell visible in this
# notebook (only `mc_steering_vector` / `un_mc_steering_vector` are). Unless it
# is defined somewhere not shown here, this cell will raise NameError on a fresh
# Restart & Run All. Confirm which vector was intended and restore its definition.
outputs5 = steering_natural_text(
    prompt, 
    28,
    steering_vector[28]*1.5, 
    model,
    MODEL_CONFIG, 
    tokenizer, 
    max_new_tokens=100, 
    temperature=1.0, 
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:43<00:00, 14.54s/it]


In [49]:
# Show the first completion stored under the "steered" key.
outputs5["steered"][0]

'\nAs he delved deeper into the forbidden realm, he felt a strange presence lurking in the shadows. He knew he was being watched, yet no one could see him. A chill crept down his spine as he sensed something dark and sinister lurking in the darkness.\nSuddenly, a bright light illuminated the room, blinding him for a moment. When he regained his vision, he found himself staring into'

# Scifi Steering

In [71]:
# 2-norm of the sci-fi `mc` steering vector at index 1, as a gauge of its scale.
# torch.linalg.vector_norm replaces the deprecated torch.norm; with default
# arguments both compute the 2-norm over all elements of the tensor.
torch.linalg.vector_norm(scifi_mc_steering_vector[1])

tensor(6.5882)

In [61]:
# Generate 3 completions of the neutral story prompt, passing the sci-fi `un_mc`
# vector at index 13 scaled by 1.7.
# NOTE(review): this rebinds `un_mc_outputs`, discarding the fantasy result that
# was stored under the same name in the first steering cell; a distinct name
# (e.g. one with a `scifi_` prefix) would keep both results available.
un_mc_outputs = steering_natural_text(
    "Here is a story:", 
    13,
    1.7 * scifi_un_mc_steering_vector[13], 
    model,
    MODEL_CONFIG, 
    tokenizer, 
    max_new_tokens=100, 
    temperature=1.0, 
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:37<00:00, 12.55s/it]


In [63]:
# Show the second completion stored under the "steered" key.
un_mc_outputs["steered"][1]

"\n surely, it’s not the first time.\nI was born in 1...\nA few years ago, I decided to go on an adventure. It was one of the most exc...\nIt's been 2500000000000000000000000000000000000000000000000000"