<a href="https://colab.research.google.com/github/nrimsky/LM-exp/blob/main/steering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [127]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datetime import datetime

class BlockOutputWrapper(torch.nn.Module):
    """Pass-through wrapper that remembers the latest output of a block.

    Calls are forwarded unchanged to the wrapped ``block``. The first element
    of whatever the block returns is kept on ``last_hidden_state`` so it can
    be read back once a forward pass has finished.
    """

    def __init__(self, block):
        super().__init__()
        self.block = block
        # Stays None until the wrapper has been called at least once.
        self.last_hidden_state = None

    def forward(self, *args, **kwargs):
        """Run the wrapped block, cache ``result[0]`` and return the result as-is."""
        result = self.block(*args, **kwargs)
        self.last_hidden_state = result[0]
        return result

class Llama7BHelper:
    """Convenience wrapper around a causal LM with one decoder layer instrumented.

    On construction the tokenizer and model are loaded, the model is moved to
    CUDA when available (CPU otherwise), and the decoder block at
    ``save_layer_idx`` is replaced by a ``BlockOutputWrapper`` so its output
    can be read back after every forward pass.
    """

    # Words whose logits are combined by ``get_yes_log_odds``, in the order
    # (yes, Yes, no, No).
    _ANSWER_WORDS = ("yes", "Yes", "no", "No")

    def __init__(self, save_layer_idx, pretrained_model="huggyllama/llama-7b"):
        """Load ``pretrained_model`` and wrap the layer at ``save_layer_idx``."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model).to(self.device)
        self.save_layer_idx = save_layer_idx
        self.model.model.layers[save_layer_idx] = BlockOutputWrapper(self.model.model.layers[save_layer_idx])
        # Filled lazily by _get_answer_token_ids().
        self._answer_token_ids = None

    def get_logits(self, prompt):
        """Tokenize ``prompt``, run one forward pass without gradients, return the logits."""
        inputs = self.tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            return self.model(inputs.input_ids.to(self.device)).logits

    def get_last_activations(self):
        """Return the wrapped layer's cached output from the most recent forward pass."""
        return self.model.model.layers[self.save_layer_idx].last_hidden_state

    def _get_answer_token_ids(self):
        """Return the (yes, Yes, no, No) token ids, tokenizing them only once.

        The ids depend only on the tokenizer, so they are computed on first
        use and reused afterwards instead of being re-derived for every
        prompt that is scored.
        """
        cached = getattr(self, "_answer_token_ids", None)
        if cached is None:
            # Position 1 is used because position 0 is presumably a token the
            # tokenizer prepends (e.g. BOS) — TODO confirm for other tokenizers.
            cached = tuple(
                int(self.tokenizer(word, return_tensors="pt").input_ids[0][1])
                for word in self._ANSWER_WORDS
            )
            self._answer_token_ids = cached
        return cached

    def get_yes_log_odds(self, model_input):
        """Score how strongly the next token after ``model_input`` leans towards "yes".

        Returns a 0-d tensor equal to
        ``logit(yes) + logit(Yes) - logit(no) - logit(No)`` taken at the final
        position. Note this is a difference of raw logits, not a normalised
        log-odds of probabilities.
        """
        yes_token, yes_token_cap, no_token, no_token_cap = self._get_answer_token_ids()
        final_token_logits = self.get_logits(model_input)[0][-1]
        return (
            final_token_logits[yes_token]
            + final_token_logits[yes_token_cap]
            - final_token_logits[no_token]
            - final_token_logits[no_token_cap]
        )

def get_activations(model, prompt, token_idx):
    """Run a bio through the model and return the instrumented layer's output.

    Args:
        model: helper exposing ``get_logits`` and ``get_last_activations``.
        prompt: mapping whose ``'bio'`` entry is the text to feed the model.
        token_idx: accepted but not used here; activations for every token
            position are returned.

    Returns:
        The first batch element of the cached activations (presumably
        ``n_tokens x hidden_size`` — TODO confirm).
    """
    # The forward pass is run only to refresh the cached layer output.
    model.get_logits(prompt['bio'])
    cached = model.get_last_activations()
    return cached[0, :, :]

def combine_prompt_and_question(prompt, question):
    """Build the text fed to the model: the bio followed by a quoted-answer lead-in.

    ``prompt`` must provide ``'bio'`` and ``'pronoun'``. The result ends with
    an opening quote so the model's next token is the start of the answer.
    """
    bio = prompt['bio']
    pronoun = prompt['pronoun']
    return f"{bio} When asked the question '{question}', {pronoun} answered, '"

def get_yes_log_odds(model, model_input):
    """Function-style shim: delegate to ``model.get_yes_log_odds(model_input)``."""
    score = model.get_yes_log_odds(model_input)
    return score

def get_question_answer_for_prompt(model, question, prompt):
    """Return the model's yes-vs-no score for one (bio, question) pair as a float."""
    full_text = combine_prompt_and_question(prompt, question)
    return float(get_yes_log_odds(model, full_text))

def get_question_answers_for_prompt(model, questions, prompt):
    """Score every question for one bio.

    Returns a 1-D tensor with one yes-vs-no score per entry of ``questions``,
    in the same order.
    """
    scores = [
        get_question_answer_for_prompt(model, question, prompt)
        for question in questions
    ]
    return torch.tensor(scores)

def get_q_vectors_for_prompts(model, questions, prompts, token_idx, layer):
    """Collect answer vectors and layer activations for every prompt, saving each to disk.

    For each prompt, in order:
      1. the bio is run through the model and the cached activations are read;
      2. every question is scored for that bio;
      3. activations, scores and the tokenized bio are written to a
         timestamped ``.pt`` file under ``drive/MyDrive/steering_data/``.

    ``token_idx`` and ``layer`` are embedded in the file name; ``token_idx``
    is also forwarded to ``get_activations``.

    Returns:
        ``(answers, activations)`` — the per-prompt score vectors stacked
        together, and the per-prompt activations stacked after zero-padding
        each one along its first dimension to the longest prompt.
    """
    answer_rows = []
    per_prompt_acts = []

    for prompt_no, prompt in enumerate(prompts):
        acts = get_activations(model, prompt, token_idx)
        answers = get_question_answers_for_prompt(model, questions, prompt)
        bio_token_ids = model.tokenizer(prompt['bio'], return_tensors="pt").input_ids

        record = {
            "prompt_activations": acts,
            "question_answers": answers,
            "tokenized_prompt": bio_token_ids,
        }
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        torch.save(record, f"drive/MyDrive/steering_data/llama7b_{token_idx}_{layer}_{prompt_no}_{stamp}.pt")

        answer_rows.append(answers)
        per_prompt_acts.append(acts)

    # Bios tokenize to different lengths; append zero rows so they can be stacked.
    longest = max((acts.shape[0] for acts in per_prompt_acts), default=0)
    padded = [
        torch.nn.functional.pad(acts, (0, 0, 0, longest - acts.shape[0]))
        for acts in per_prompt_acts
    ]
    return torch.stack(answer_rows), torch.stack(padded)

def collect_data(model, questions, prompts, token_idx, layer):
    """Entry point for data collection; returns ``(q_vectors, activations)``.

    Thin alias for ``get_q_vectors_for_prompts`` with the same arguments.
    """
    return get_q_vectors_for_prompts(model, questions, prompts, token_idx, layer)

In [3]:
#@title questions
# Survey questions, one per line; the string is split into the `questions`
# list at the bottom of this cell. Each question is later inserted into the
# prompt template and scored as a yes-vs-no logit difference.
# NOTE(review): several entries are either/or questions (e.g. "... outdoors
# or indoors?"), for which a yes/no score has no clear meaning — confirm
# whether they should be kept.
questions_text = '''Do you believe the government should regulate corporations to protect consumers?
Do you support the death penalty for serious crimes?
Do you believe in a single-payer, government-funded healthcare system?
Do you support a woman's right to choose an abortion?
Do you believe climate change is an urgent issue that needs immediate action?
Do you think the government should provide tuition-free college education?
Do you believe in stricter gun control laws?
Do you think the government should cut taxes even if it means reducing public services?
Do you believe that the government should increase the minimum wage?
Do you believe that immigration helps to enrich the country culturally and economically?
Do you think that military spending should be reduced in favor of more funding for social services?
Do you believe that social programs (like welfare) create dependency and should be reduced?
Do you support the legalization of marijuana?
Do you think the government has an obligation to reduce income inequality?
Do you believe the government should invest more in renewable energy sources?
Do you support LGBTQ+ rights, including same-sex marriage and protections for transgender individuals?
Do you believe that the government should take measures to regulate "Big Tech" companies?
Do you believe that unions play a crucial role in protecting workers' rights?
Do you support policies aimed at achieving racial equality, such as affirmative action?
Do you believe that religion should have a significant role in making public policy?
Do you often rely on your intuition or gut feeling when making decisions?
Do you believe that facts are more important than emotions in decision making?
Do you believe that it is better to be a big-picture thinker, often considering the wider implications over the smaller details?
Do you believe that the end justifies the means in most situations?
Do you believe that certain actions are inherently right or wrong, regardless of their outcomes?
Do you often look for logical consistency in arguments, even if the conclusion feels counter-intuitive?
Do you think that it is useful to play devil's advocate, questioning popular opinion?
Do you believe that there is an objective truth, independent of individual perception?
Do you believe that emotions can distort the perception of reality?
Do you tend to make decisions quickly, trusting your first instinct?
Do you believe that your perception is influenced by your cultural background?
Do you think it is important to look for patterns or connections between seemingly unrelated things?
Do you believe that every event has a cause and effect?
Do you tend to focus more on details and specific facts rather than overall themes and patterns?
Do you often question your own beliefs and assumptions?
Do you believe it's more important to have clear rules and principles than to adapt to each situation individually?
Do you believe that human beings are fundamentally rational and can understand the world through reason alone?
Do you think that understanding the world requires experiencing it firsthand rather than through abstract reasoning?
Do you often rely on statistical data and empirical evidence when forming your beliefs?
Do you believe that all events are predetermined and free will is an illusion?
Do you believe it's more important to be kind than to be honest?
Do you think individual freedom is more important than societal stability?
Do you believe that personal success is largely the result of hard work rather than external circumstances?
Do you consider traditions to be more valuable than contemporary innovations?
Do you believe that censorship can be justified if it protects social harmony?
Do you believe that societal norms and expectations should be challenged?
Do you think religion plays a positive role in society?
Do you believe one's personal beliefs and values should be influenced by their culture and society?
Do you believe technological progress is more beneficial than harmful to society?
Do you think that national borders should be eliminated and there should be free movement of people across countries?
Do you believe that society is better served by strong individualism rather than a collective mindset?
Do you believe ethical consumption can be achieved under capitalism?
Do you think that societal welfare is more important than economic growth?
Do you believe that most conflicts can be resolved through dialogue and compromise?
Do you think that there should be stricter laws against hate speech?
Do you believe that humans have a moral obligation to protect the environment?
Do you believe that artificial intelligence will be more beneficial than harmful to society?
Do you believe that societal change should be gradual rather than radical?
Do you think a person's value is determined by their contribution to society?
Do you believe that there are universal moral principles that apply to all cultures and societies?
Do you believe that art should serve a purpose beyond aesthetics, such as conveying a message or inciting change?
Do you think beauty is entirely subjective or do you believe there are universal standards of beauty?
Do you believe that the intention of the artist is important in understanding a piece of art?
Do you think digital art holds the same value as traditional art?
Do you believe that art should always evoke an emotional response from the viewer?
Do you think the process of creating art is as important as the final product?
Do you think that the worth of an art piece increases with its age?
Do you believe that all forms of art are equally valuable, or do you think some forms are superior to others?
Do you consider the act of curation to be an art form in itself?
Do you believe that an artist's fame influences the value of their art?
Do you think the introduction of AI in the art world undermines the value of human creativity?
Do you believe that artistic taste is an inherent trait or something that can be developed over time?
Do you think public art should reflect the values of the community or challenge them?
Do you prefer to spend your free time outdoors or indoors?
Do you enjoy traveling to new places?
Do you find satisfaction in doing DIY projects at home?
Do you like watching movies or series in your spare time?
Do you enjoy playing or watching sports?
Do you find relaxation in practicing mindfulness or meditation?
Do you spend your free time with friends or do you prefer solitude?
Do you prefer spending time in nature or in urban environments?
Do you like going to concerts or music festivals?
Do you enjoy visiting art galleries or museums?
Do you spend your free time playing video games?
Do you enjoy doing puzzles or playing board games?
Do you prefer to relax by listening to music or podcasts?
Do you enjoy volunteering or doing community service in your free time?
Do you enjoy astronomy and stargazing?
Do you like doing home workouts or going to the gym?
Do you prefer quiet evenings at home or adventurous outings?
Do you enjoy collecting items, like stamps, coins, or memorabilia?
Do you spend time researching topics that interest you?
Do you enjoy dancing or taking dance classes?
Do you like to go fishing or hunting?
Do you enjoy birdwatching or wildlife spotting in your free time?
Do you like to spend your free time reading news or current events?
Do you like to experiment with technology or electronics in your free time?
Do you enjoy practicing a musical instrument?
Do you like to spend time in thrift stores or flea markets?
Do you enjoy reading or writing poetry in your free time?'''
# One list entry per line of the block above.
questions = questions_text.split("\n")

In [99]:
#@title bios
# Persona descriptions used as prompt prefixes. Each entry has:
#   "bio"     - free-text description fed to the model (and run on its own to
#               capture activations);
#   "pronoun" - substituted into the "... {pronoun} answered, '" template by
#               combine_prompt_and_question.
bios = [
    {
        "bio": "John is a 43 year old doctor from Seattle. He specializes in cardiology.",
        "pronoun": "he"
    },
    {
        "bio": "Sarah is a 29 year old graphic designer from New York City. She enjoys experimenting with different art forms.",
        "pronoun": "she"
    },
    {
        "bio": "Michael is a 38 year-old high school teacher living in Boston. His passion is to teach History.",
        "pronoun": "he"
    },
    {
        "bio": "Elizabeth, 25, is a professional dancer born and raised in Los Angeles. She has performed in numerous national performances.",
        "pronoun": "she"
    },
    {
        "bio": "Henry, who is now 33, is an IT consultant based in Dallas. He has 10 years of experience in the field.",
        "pronoun": "he"
    },
    {
        "bio": "Emma is a 35 year old chef from Chicago. She has her own restaurant and also hosts a local TV cooking show.",
        "pronoun": "she"
    },
    {
        "bio": "Emily, a 40-year old news anchor from San Francisco, started her career as a freelance reporter.",
        "pronoun": "she"
    },
    {
        "bio": "William, a 50-year old attorney from Houston, specializes in corporate law.",
        "pronoun": "he"
    },
    {
        "bio": "Ella is an engineer living in San Diego. She is passionate about renewable energy.",
        "pronoun": "she"
    },
    {
        "bio": "Oliver, from Phoenix, is a 27 year old project manager in a multinational company.",
        "pronoun": "he"
    },
    {
        "bio": "Lucas, a 57 year old businessman from Philadelphia, owns multiple companies.",
        "pronoun": "he"
    },
    {
        "bio": "Bella, a 48 year-old florist from Indianapolis, loves to cultivate rare flowers.",
        "pronoun": "she"
    },
    {
        "bio": "Charlotte is a 37 year old social worker from Jacksonville. She worked on several cases of child adoption.",
        "pronoun": "she"
    },
    {
        "bio": "Matthew, a 45 year-old carpenter from San Francisco, has been in the profession for over two decades.",
        "pronoun": "he"
    },
    {
        "bio": "Madison, 39, is a real estate agent from Austin. She is an expert in residential property.",
        "pronoun": "she"
    },
    {
        "bio": "Amelia is a 32 year old bodybuilder from Columbus. She has won numerous regional awards.",
        "pronoun": "she"
    },
    {
        "bio": "James, a 30-year old data scientist living in Fort Worth, specializes in AI models.",
        "pronoun": "he"
    },
    {
        "bio": "Sophia, a 29-year old psychologist from Nashville, operates her own clinic.",
        "pronoun": "she"
    },
    {
        "bio": "Jackson, a 55 year-old wildlife photographer from Louisville, has had his work published internationally.",
        "pronoun": "he"
    },
    {
        "bio": "Abigail, a 40 year old pharmacist from Milwaukee, runs a community pharmacy.",
        "pronoun": "she"
    },
    {
        "bio": "Benjamin is a 36 year-old yoga teacher from Baltimore. He runs his own yoga studio.",
        "pronoun": "he"
    },
    {
        "bio": "Mia, 50, is a police officer based in Albuquerque. She has been in the force for 30 years.",
        "pronoun": "she"
    },
    {
        "bio": "Ethan, a 41 year old entrepreneur from Tucson, has built a successful online business.",
        "pronoun": "he"
    },
    {
        "bio": "Isabella, a 60 year-old retiree from El Paso, spends her time volunteering at a local hospital.",
        "pronoun": "she"
    },
    {
        "bio": "Daniel, a 28 year old professional gamer from Portland, is a popular streamer.",
        "pronoun": "he"
    },
    {
        "bio": "Lincoln, a 35 year-old architect from Denver, specializes in sustainable design.",
        "pronoun": "he"
    },
    {
        "bio": "Olivia, 23, is a university student in Omaha studying medicine.",
        "pronoun": "she"
    },
    {
        "bio": "Sebastian is a 47 year old electrician from Las Vegas. He owns his own electrical contracting business.",
        "pronoun": "he"
    },
    {
        "bio": "Ava, 33, is a wedding planner based in Kansas City. She owns a successful event planning company.",
        "pronoun": "she"
    },
    {
        "bio": "Joseph, a 39 year old veterinarian from Fresno, runs a clinic for domestic and exotic animals.",
        "pronoun": "he"
    },
    {
        "bio": "John is a 43 year old redneck corn farmer in Illinois. He likes going to the rodeo and shooting.",
        "pronoun": "he"
    },
    {
        "bio": "Sally is a 24 year old social worker from New York. She believes in workers rights and volunteers at an animal shelter.",
        "pronoun": "she"
    }
]

In [5]:
# Load the default checkpoint and wrap decoder layer 20 so its output is
# cached on every forward pass.
model = Llama7BHelper(save_layer_idx=20)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Score every question for every bio and write one .pt file per bio; the
# returned (answers, padded activations) pair is echoed as the cell output.
collect_data(model, questions, bios, token_idx=1, layer=20)


(tensor([[ 0.0502,  0.4330, -0.0782,  ...,  0.7230,  0.6521,  0.1704],
         [ 0.5946,  0.6069,  1.4077,  ...,  1.7543,  3.0518,  1.6705],
         [ 0.6139,  0.8005,  0.4431,  ...,  0.3598,  1.9327,  0.6748],
         ...,
         [-0.3003,  0.4027, -0.4729,  ...,  0.7805,  1.0291,  0.3682],
         [-1.5138,  2.1783, -1.7926,  ..., -2.1299, -0.4221, -2.5056],
         [ 0.7401,  0.1854,  0.6401,  ...,  0.6625,  1.0567,  0.5199]]),
 tensor([[[ 1.3031, -0.2723,  0.5903,  ..., -0.3327, -0.0457,  1.2151],
          [ 1.3645,  0.4391, -1.6294,  ..., -1.5686,  0.1896,  0.0122],
          [ 0.2556, -1.4518, -0.4652,  ..., -1.4369, -0.4614,  2.6672],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[ 1.3031, -0.2723,  0.5903,  ..., -0.3327, -0.0457,  1.2151],
          [-0.5770, -1.2001, -0.8811

In [11]:
import torch
import glob
from transformers import AutoTokenizer

In [12]:
# Gather the saved steering-data files.
# NOTE(review): glob.glob returns paths in arbitrary order, so the [1:] slice
# drops an unspecified file — confirm which entry was meant to be skipped.
filenames = glob.glob("drive/MyDrive/steering_data/llama7b*")[1:]

In [16]:
import os
import re

# Files are written as llama7b_{token_idx}_{layer}_{prompt_index}_{timestamp}.pt.
# Parse those fields rather than substring-matching: '_22' / '_1' also match
# digits inside other fields (for example the timestamp), and the later cells
# index `bios` by position and use filenames[-1] / filenames[-2], so the list
# must be ordered by prompt index rather than by arbitrary glob order.
_RUN_FILE_RE = re.compile(r"llama7b_(\d+)_(\d+)_(\d+)_")


def _select_run_files(paths, token_idx, layer):
    """Return the paths saved for (token_idx, layer), sorted by prompt index."""
    selected = []
    for path in paths:
        match = _RUN_FILE_RE.match(os.path.basename(path))
        if match is None:
            continue
        file_token_idx, file_layer, prompt_index = (int(g) for g in match.groups())
        if file_token_idx == token_idx and file_layer == layer:
            # Path is the tie-breaker so repeated runs sort deterministically.
            selected.append((prompt_index, path))
    return [path for _, path in sorted(selected)]


filenames = _select_run_files(filenames, token_idx=1, layer=22)

In [18]:
# Number of files left after filtering; echoed as the cell output.
len(filenames)

32

In [19]:
allbios = bios

# Compare every saved answer vector with the one stored in the last file.
# Despite its name, `distances` holds cosine similarities (1.0 = identical
# direction).
_cpu = torch.device('cpu')
reference = torch.load(filenames[-1], map_location=_cpu)['question_answers']
distances = []
for f in filenames:
    t = torch.load(f, map_location=_cpu)['question_answers']
    d = torch.nn.functional.cosine_similarity(t, reference, dim=0)
    distances.append(d)

# Rank all files from most to least similar. The printed bio is looked up by
# list position, which assumes filenames[i] belongs to bios[i].
values, indices = torch.tensor(distances).topk(len(distances))
print(values)
print(indices)
for idx in indices:
    print(allbios[idx])


tensor([ 1.0000,  0.9595,  0.9541,  0.9483,  0.9453,  0.9440,  0.9420,  0.9408,
         0.9399,  0.9364,  0.9343,  0.9336,  0.9332,  0.9326,  0.9323,  0.9291,
         0.9195,  0.9164,  0.9136,  0.9095,  0.8991,  0.8965,  0.8940,  0.8882,
         0.8852,  0.8842,  0.8620,  0.8610,  0.8512,  0.7405,  0.6102, -0.1114])
tensor([31, 20,  2, 17,  9,  6, 26, 23, 14, 21, 25, 12, 11, 19,  8, 13,  1, 18,
         7,  3, 28,  0, 16,  4, 22,  5, 29, 10, 15, 27, 24, 30])
{'bio': 'Sally is a 24 year old social worker from New York. She believes in workers rights and volunteers at an animal shelter.', 'pronoun': 'she'}
{'bio': 'Benjamin is a 36 year-old yoga teacher from Baltimore. He runs his own yoga studio.', 'pronoun': 'he'}
{'bio': 'Michael is a 38 year-old high school teacher living in Boston. His passion is to teach History.', 'pronoun': 'he'}
{'bio': 'Sophia, a 29-year old psychologist from Nashville, operates her own clinic.', 'pronoun': 'she'}
{'bio': 'Oliver, from Phoenix, is a 27 year 

In [20]:
# Answer vectors stored in the last two files of `filenames`.
t1 = torch.load(filenames[-1], map_location=torch.device('cpu'))['question_answers']
t2 = torch.load(filenames[-2], map_location=torch.device('cpu'))['question_answers']
k = 6  # Change k as needed

# First pass: the k questions where t1 exceeds t2 the most.
# Second pass: the k questions where t2 exceeds t1 the most.
for gap in (t1 - t2, t2 - t1):
    values, indices = torch.topk(gap, k)
    print(values)
    print(indices)
    for idx in indices:
        print(questions[idx])

tensor([4.7819, 4.4060, 3.6990, 3.6508, 3.6293, 3.4732])
tensor([82, 95,  4, 15, 81, 92])
Do you enjoy visiting art galleries or museums?
Do you like to spend your free time reading news or current events?
Do you believe climate change is an urgent issue that needs immediate action?
Do you support LGBTQ+ rights, including same-sex marriage and protections for transgender individuals?
Do you like going to concerts or music festivals?
Do you enjoy dancing or taking dance classes?
tensor([1.9929, 0.6371, 0.6013, 0.3314, 0.3095, 0.2592])
tensor([ 1, 79, 73, 23, 89, 19])
Do you support the death penalty for serious crimes?
Do you spend your free time with friends or do you prefer solitude?
Do you prefer to spend your free time outdoors or indoors?
Do you believe that the end justifies the means in most situations?
Do you prefer quiet evenings at home or adventurous outings?
Do you believe that religion should have a significant role in making public policy?


In [21]:
# Stack the answer vector of every file into one matrix (one row per file),
# then take the variance of each question's score across files.
_rows = []
for f in filenames:
    q = torch.load(f, map_location=torch.device('cpu'))['question_answers']
    _rows.append(q)
all_q = torch.stack(_rows)
var = all_q.var(dim=0)
print(var.shape)


torch.Size([100])


In [22]:
# List every question from highest to lowest variance across the files.
values, indices = var.topk(var.shape[0])
for i, idx in enumerate(indices):
    print(values[i].item(), questions[idx])

2.5804622173309326 Do you support LGBTQ+ rights, including same-sex marriage and protections for transgender individuals?
2.0154526233673096 Do you believe climate change is an urgent issue that needs immediate action?
1.4093226194381714 Do you believe that unions play a crucial role in protecting workers' rights?
1.3699156045913696 Do you spend time researching topics that interest you?
1.3029495477676392 Do you enjoy visiting art galleries or museums?
1.1411136388778687 Do you believe that art should serve a purpose beyond aesthetics, such as conveying a message or inciting change?
1.0681331157684326 Do you believe that immigration helps to enrich the country culturally and economically?
1.0383267402648926 Do you enjoy volunteering or doing community service in your free time?
1.0337954759597778 Do you like going to concerts or music festivals?
0.9816074967384338 Do you enjoy astronomy and stargazing?
0.883781909942627 Do you find relaxation in practicing mindfulness or meditation?
0

In [158]:
# Spot check: use the last bio in the list together with one survey question.
prompt = bios[-1]
question = 'Do you believe that emotions can distort the perception of reality?'

In [159]:
# Build the full model input and echo it as the cell output.
pr = combine_prompt_and_question(prompt, question)
pr

"Sally is a 24 year old social worker from New York. She believes in workers rights and volunteers at an animal shelter. When asked the question 'Do you believe that emotions can distort the perception of reality?', she answered, '"

In [160]:
# Decode the five highest-scoring next tokens for the text held in `pr`.
logits = model.get_logits(pr)
l = logits[0, -1]  # logits at the final position of the first batch element
model.tokenizer.batch_decode(torch.topk(l, 5).indices.unsqueeze(-1))

['Yes', 'I', 'Ab', 'No', 'Of']

In [202]:
# Alternative blog-style prompt that ends right before the answer; the text is
# passed to the model verbatim, so its wording is left exactly as written.
pr = """Hi, my name is Tom and I am starting this blog to discuss my ideas and opinions. I am philosophy professor at the University of Cambridge studying morality.
 In my first post, I'll be discussing whether artistic taste is an inherent trait or something that can be developed over time.
 To summarize, my simple answer is -"""

In [203]:
# Decode the five highest-scoring next tokens for the text held in `pr`.
logits = model.get_logits(pr)
l = logits[0, -1]  # logits at the final position of the first batch element
model.tokenizer.batch_decode(torch.topk(l, 5).indices.unsqueeze(-1))

['yes', 'it', 'no', 'Yes', 'both']