In [1]:
from src.utils.extract_utils import average_vectors, gather_activations_from_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch, transformers, accelerate, einops, json


In [3]:
from src.utils.model_utils import load_gpt_model_and_tokeniser

from src.utils.extract_utils import create_steering_vector

from src.utils.intervention_utils import steering_natural_text

In [4]:
# Checkpoint handed to the project loader. The commented-out line is an
# alternative checkpoint that can be swapped in.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# MODEL_NAME = "gpt2-xl"

# The loader returns the model, its tokenizer and a config mapping; later cells
# read MODEL_CONFIG['name_or_path'] from that mapping.
model, tokenizer, MODEL_CONFIG = load_gpt_model_and_tokeniser(model_name=MODEL_NAME)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


tokenizer_config.json: 100%|██████████| 776/776 [00:00<00:00, 2.43MB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:01<00:00, 478kB/s]
special_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 1.35MB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 2.22MB/s]
config.json: 100%|██████████| 609/609 [00:00<00:00, 2.01MB/s]
model.safetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 32.6MB/s]
model-00001-of-00002.safetensors: 100%|██████████| 9.98G/9.98G [27:54<00:00, 5.96MB/s]
model-00002-of-00002.safetensors: 100%|██████████| 3.50G/3.50G [09:46<00:00, 5.96MB/s]
Downloading shards: 100%|██████████| 2/2 [37:42<00:00, 1131.43s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.66s/it]
generation_config.json: 100%|██████████| 188/188 [00:00<00:00, 695kB/s]


In [5]:
# Sanity check: show which device the loaded model was placed on.
model.device

device(type='cuda', index=0)

In [5]:
import src.utils.intervention_utils as iu

# Load Datasets

In [6]:
# Genre name -> path of the JSON file holding that genre's stories.
STORY_DATASET_PATHS = {
    "fantasy": "datasets/fantasy.json",
    "scifi": "datasets/scifi.json",
}

# Parse every genre file into `stories`, keyed by genre name.
stories = {}
for genre, dataset_path in STORY_DATASET_PATHS.items():
    # Pin the encoding: without it open() falls back to the platform's locale
    # encoding, which can mis-decode or reject UTF-8 JSON on some systems.
    with open(dataset_path, "r", encoding="utf-8") as file:
        stories[genre] = json.load(file)

# Keep the per-genre names available; `dataset_fantasy` is used when the
# steering vector is built.
dataset_fantasy = stories["fantasy"]
dataset_scifi = stories["scifi"]

In [7]:
from src.utils.dataset_utils import read_all_text_files

# Truncation settings for the open-text corpus.
MAX_TRAINING_TEXTS = 400   # number of documents kept
MAX_TOKENS_PER_TEXT = 200  # token ids kept from the start of each document

# Slice before tokenising so that only the documents we keep are encoded.
# list() makes the slice valid whatever iterable the reader returns.
raw_training_texts = list(read_all_text_files("datasets/opentext_subset"))[:MAX_TRAINING_TEXTS]

# For Llama checkpoints the first 4 characters of every decoded text are
# dropped — presumably the BOS marker ("<s> ") that the tokenizer adds;
# TODO confirm against the tokenizer's decode output.
decoded_prefix_len = 4 if 'llama' in MODEL_CONFIG['name_or_path'] else 0

# Round-trip each text through the tokenizer to cut it at the token limit.
training_dataset = [
    tokenizer.decode(tokenizer.encode(text)[:MAX_TOKENS_PER_TEXT])[decoded_prefix_len:]
    for text in raw_training_texts
]

In [8]:
# Peek at the first truncated training text to check the cut-off looks sane.
training_dataset[0]

'Massimo Cellino’s near three-year ownership of Leeds United could be set to come to a close amid a string of reports in the Italian media on Wednesday.\n\nThe Italian’s tenure at Elland Road has been nothing short of tumultuous and news that Cellino – through his family’s trust Eleonora Sport Ltd – is set to relinquish his holdings at the club will come as a huge relief to their supporters who have long campaigned to have him removed.\n\nAccording to calciomercato, Cellino is understood to have agreed the sale of Leeds to another Italian, Andrea Radrizzani, who is the president of the MP & Silva Media empire.\n\nRadrizzani has been seen at several Leeds games recently and his purchase of the club would not come as a huge shock to those who have been following the Cellino saga closely.\n\nThe Italian'

# Creating a Steering Vector

In [8]:
# Build the steering vector from the fantasy stories and a slice of the
# open-text corpus. Presumably the two datasets are contrasted against each
# other — confirm against create_steering_vector's documentation.
N_OPENTEXT_SAMPLES = 300
opentext_samples = training_dataset[:N_OPENTEXT_SAMPLES]

# Passed through as-is; looks like a list of MODEL_CONFIG keys naming the
# hooks to read — TODO confirm.
hook_spec = ["layer_hook_names"]

# NOTE(review): the three trailing positional flags are all False; their meaning
# comes from create_steering_vector's signature. Consider passing them by keyword.
steering_vector = create_steering_vector(
    model, tokenizer, MODEL_CONFIG,
    dataset_fantasy,
    opentext_samples,
    hook_spec,
    False, False, False,
)

Gathering activations: 100%|██████████| 200/200 [02:37<00:00,  1.27it/s]
Gathering activations: 100%|██████████| 300/300 [04:05<00:00,  1.22it/s]


# Try Steering!

In [11]:
# TODO: clearly n_beams should just be made smaller.

steer_layer = 25       # layer index; also selects the matching steering vector
steer_strength = 1.5   # multiplier applied to that layer's steering vector

# Generation settings forwarded unchanged to steering_natural_text.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=1.0,
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10,
)

outputs = steering_natural_text(
    "Yesterday, my daughter was out kicking a football. Then,",
    steer_layer,
    steering_vector[steer_layer] * steer_strength,
    model,
    MODEL_CONFIG,
    tokenizer,
    **generation_kwargs,
)

100%|██████████| 3/3 [00:41<00:00, 13.79s/it]


In [12]:
# Display the full result dict so the 'clean' and 'steered' completions can be compared.
outputs

{'clean': ['she came in and said, “Mommy, I’m going to be a soccer player when I grow up.”\n“That’s great,” I said. “What position do you want to play?”\n“I don’t know,” she said. “I just want to be a soccer player.”\n“Well, that’s great,” I said. “But what position do you want to play?”\n“I don’t know,” she said.',
  'she came in and said, “Mommy, I’m going to be a soccer player when I grow up.”\n“Really?” I asked. “Why do you want to be a soccer player?”\n“Because it’s fun,” she said.\n“That’s great,” I said. “But what if you don’t like playing soccer when you’re older?”\n“Then I won’t be a soccer player,” she',
  'she came in and said, “Mommy, I’m going to be a soccer player when I grow up.”\n“Really?” I asked. “Why do you want to be a soccer player?”\n“Because it’s fun,” she said.\n“What else do you like to do?” I asked.\n“I like to play with my friends,” she said.\n“What else do you like to do?” I asked again.\n“'],
 'steered': ['she tripped and fell on her ankle. She screamed in 

In [14]:
# Show the second steered completion on its own.
outputs["steered"][1]

'\nOnce upon a time, there was a magical kingdom called the Kingdom of Light. The kingdom was ruled by a benevolent king and queen, who loved their kingdom and its people with all her heart.\nOne day, the kingdom was enchanted by a dark sorcerer, who cast a spell upon the kingdom, transforming it into a kingdom of darkness and evil. The once peaceful kingdom became a kingdom of fear and darkness, where magic and magic creatures were forb'

In [15]:
# TODO: clearly n_beams should just be made smaller.

steer_layer = 25       # layer index; also selects the matching steering vector
steer_strength = 1.5   # multiplier applied to that layer's steering vector

# Generation settings forwarded unchanged to steering_natural_text.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=1.0,
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10,
)

outputs2 = steering_natural_text(
    "Here is a story:",
    steer_layer,
    steering_vector[steer_layer] * steer_strength,
    model,
    MODEL_CONFIG,
    tokenizer,
    **generation_kwargs,
)

100%|██████████| 3/3 [00:36<00:00, 12.22s/it]


In [16]:
# Show the second steered completion for the neutral "Here is a story:" prompt.
outputs2["steered"][1]

'\nA long time ago, in a land far, far away, there lived a beautiful princess. She was the only child of a king and queen who loved her with all their hearts. The princess had everything she could ever wish for, except one thing—a true love.\nOne day, as the princess was wandering through the enchanted forest, she stumbled upon a magical castle. Upon entering, she found herself in a magnificent ballroom, filled with'

In [15]:
# Same layer as the earlier football prompt, but with the steering vector
# scaled by 3 instead of 1.5.
steer_layer = 25
steer_strength = 3

# Generation settings forwarded unchanged to steering_natural_text.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=1.0,
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10,
)

outputs3 = steering_natural_text(
    "Yesterday, my son was out kicking a football. Then,",
    steer_layer,
    steering_vector[steer_layer] * steer_strength,
    model,
    MODEL_CONFIG,
    tokenizer,
    **generation_kwargs,
)

100%|██████████| 3/3 [00:41<00:00, 13.77s/it]


In [16]:
# Display the full result dict ('clean' and 'steered' completions) for the stronger steering run.
outputs3

{'clean': ['he came in and said, “Mom, I’m going to be a professional football player when I grow up.”\n“That’s great,” I said. “What position do you want to play?”\n“I don’t know,” he said. “Maybe quarterback.”\n“Well, that’s good,” I said. “Because there are only 32 quarterbacks in the NFL.”\n“Really?” he said. “Only 3',
  'he came in and said, “Mom, I think I’m going to be a quarterback when I grow up.”\n“Really?” I asked. “Why do you think that?”\n“Because,” he said, “I can throw the ball really far.”\n“That’s great,” I said. “What else do you need to be a quarterback?”\n“Well,” he said, “I need to be able to catch the ball.”\n“',
  'he came in and said, “Mom, I’m going to be a professional football player when I grow up.”\n“That’s great,” I said. “What position do you want to play?”\n“I don’t know,” he said. “Maybe quarterback.”\n“Well, that’s good,” I said. “Because the quarterback is the most important position on the team.”\n“Really?” he asked. “Why?”\n“'],
 'steered': ['he cam

In [22]:
# Hacker-themed story opening, reused as the prompt by the steering runs below.
# Written as adjacent literals (one per sentence) that Python joins into a
# single string.
prompt = (
    "In the flickering glow of his multi-monitor setup, Alex, a skilled computer hacker, hunched over his keyboard. "
    "His world was a digital maze of codes and firewalls. "
    "Tonight, he was on a mission to expose corrupt corporate secrets. "
    "With each keystroke, he danced through layers of security, his fingers a blur of motion. "
    "The clock ticked ominously, reminding him of the race against time."
)

In [25]:
# Steer the hacker-story prompt at layer 25 with the vector scaled by 2.
steer_layer = 25
steer_strength = 2

# Generation settings forwarded unchanged to steering_natural_text.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=1.0,
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10,
)

outputs4 = steering_natural_text(
    prompt,
    steer_layer,
    steering_vector[steer_layer] * steer_strength,
    model,
    MODEL_CONFIG,
    tokenizer,
    **generation_kwargs,
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:45<00:00, 15.00s/it]


In [36]:
# Show the second steered completion of the hacker-story prompt (layer 25 run).
outputs4["steered"][1]

'\nAs he delved deeper into the forbidden realm, an unsettling sense of danger crept upon him. He sensed that something dark and sinister lurked in the shadows, ready to pounce upon him at any moment. Her eyes pierced through the darkness, like twin beams of light, searching for her prey. She had been watching him for days, waiting for the perfect opportunity to strike. And tonight, she knew, was the night'

In [45]:
# Same hacker-story prompt, now steered at layer 28 with the vector scaled by 1.5.
steer_layer = 28
steer_strength = 1.5

# Generation settings forwarded unchanged to steering_natural_text.
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=1.0,
    freq_penalty=2.0,
    top_p=0.3,
    n_completions=3,
    n_beams=10,
)

outputs5 = steering_natural_text(
    prompt,
    steer_layer,
    steering_vector[steer_layer] * steer_strength,
    model,
    MODEL_CONFIG,
    tokenizer,
    **generation_kwargs,
)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:43<00:00, 14.54s/it]


In [49]:
# Show the first steered completion of the hacker-story prompt (layer 28 run).
outputs5["steered"][0]

'\nAs he delved deeper into the forbidden realm, he felt a strange presence lurking in the shadows. He knew he was being watched, yet no one could see him. A chill crept down his spine as he sensed something dark and sinister lurking in the darkness.\nSuddenly, a bright light illuminated the room, blinding him for a moment. When he regained his vision, he found himself staring into'