# Scratch pad

In [3]:
import sys
import json
import torch
import torch.nn.functional as F

import pandas as pd
import numpy as np
import os

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import L2MeanScaler, compute_pca, plot_variance_explained
from plots import plot_pc

In [23]:
# Root directory of the run under inspection; the response, activation, score
# and vector paths used in this notebook are all built from it.
# NOTE(review): `dir` shadows the Python builtin of the same name. It is kept
# as-is because other cells reference it by this name.
dir = '/workspace/llama-3.1-70b/roles_240'
# NOTE(review): `layer` is not read by any of the cells visible here —
# presumably it is consumed further down; confirm or remove.
layer = 40


## Investigate data

### Responses

In [24]:
# Read every record of a single response file (one JSON object per line).
with open(f'{dir}/responses/actor.jsonl', 'r') as handle:
    test_responses = [json.loads(raw_line) for raw_line in handle]

print(len(test_responses))

# Dump the first record field by field to see what a response looks like.
first_response = test_responses[0]
print(first_response.keys())

for field, value in first_response.items():
    print(field, value, "----------", sep="\n")

# Entry 2 of the conversation is the assistant turn in the record printed
# above (system, user, assistant).
print(first_response['conversation'][2]['content'])

2400
dict_keys(['system_prompt', 'label', 'prompt_index', 'conversation', 'question_index', 'question'])
system_prompt
You are an actor with the ability to transform into any character and embody their complete essence.
----------
label
pos
----------
prompt_index
0
----------
conversation
[{'role': 'system', 'content': 'You are an actor with the ability to transform into any character and embody their complete essence.'}, {'role': 'user', 'content': 'What is the relationship between law and morality?'}, {'role': 'assistant', 'content': "(In a deep, resonant voice, as if I'm channeling the spirit of the great philosopher, Plato)\n\n Ah, my dear fellow, the relationship between law and morality is a most intriguing and complex one. You see, laws are the rules and regulations that govern the behavior of individuals within a society, while morality refers to the principles and values that guide our actions and decisions.\n\n(As I speak, I begin to pace around the room, my movements fluid 

In [25]:
# Walk every .jsonl file in the run's responses directory and flag the ones
# whose record count differs from the expected 1200.
responses_root = f'{dir}/responses'
has_default = []  # file names with an unexpected number of records
count = 0         # number of .jsonl files examined
for file in os.listdir(responses_root):
    if not file.endswith('.jsonl'):
        continue
    with open(f'{responses_root}/{file}', 'r') as f:
        responses = [json.loads(line) for line in f]
    n_responses = len(responses)
    if n_responses != 1200:
        print(f"Expected 1200 responses, got {n_responses} for {file}")
        has_default.append(file)
    count += 1

print(count)
print(has_default)

Expected 1200 responses, got 2400 for accountant.jsonl
Expected 1200 responses, got 2400 for absurdist.jsonl
Expected 1200 responses, got 2400 for actor.jsonl
Expected 1200 responses, got 2400 for activist.jsonl
Expected 1200 responses, got 2400 for aberration.jsonl
113
['accountant.jsonl', 'absurdist.jsonl', 'actor.jsonl', 'activist.jsonl', 'aberration.jsonl']


In [26]:
# For each flagged file, separate the records labelled 'default' from the
# rest, then write the defaults to `{i}_default.jsonl` and overwrite the
# original file with the remaining records.
# NOTE: this rewrites the source files in place and cannot be undone.
for i, file in enumerate(has_default):
    source_path = f'{dir}/responses/{file}'
    default_responses = []
    rt_responses = []
    with open(source_path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            bucket = default_responses if obj['label'] == 'default' else rt_responses
            bucket.append(obj)
    print(f"Default responses: {len(default_responses)}")
    print(f"RT responses: {len(rt_responses)}")

    # write to jsonl: defaults go to a new index-named file ...
    with open(f'{dir}/responses/{i}_default.jsonl', 'w') as f:
        f.writelines(json.dumps(obj) + '\n' for obj in default_responses)
    # ... and the original file keeps only the non-default records.
    with open(source_path, 'w') as f:
        f.writelines(json.dumps(obj) + '\n' for obj in rt_responses)


Default responses: 1200
RT responses: 1200
Default responses: 1200
RT responses: 1200
Default responses: 1200
RT responses: 1200
Default responses: 1200
RT responses: 1200
Default responses: 1200
RT responses: 1200


### Activations

In [3]:
# Load activation files into a dict keyed by file name without the `.pt`
# suffix, recording any file that fails to load.
# NOTE(review): file names are listed from the `float16` subdirectory but each
# file is loaded from its parent `response_activations` directory — confirm
# this mismatch is intended.
activations = {}
failed_activations = []
for file in os.listdir(f'{dir}/response_activations/float16'):
    if not file.endswith('.pt'):
        continue
    stem = file.replace('.pt', '')
    try:
        loaded = torch.load(f'{dir}/response_activations/{file}')
        activations[stem] = loaded
        n_loaded = len(loaded)
        if n_loaded != 1200:
            print(f"Expected 1200 activations, got {n_loaded} for {file}")
    except Exception as e:
        # Keep going on failure; the file name is recorded for later handling.
        print(f"Error loading {file}: {e}")
        failed_activations.append(file)

print(len(activations))
print(len(failed_activations))



2
0


In [23]:
# Delete from disk every activation file recorded in `failed_activations`.
# NOTE: this is irreversible; the files are removed from the parent
# `response_activations` directory.
for file in failed_activations:
    os.remove(f'{dir}/response_activations/{file}')

In [4]:
# Pull out the entry that was loaded from `0_default.pt` and show which keys
# it holds and how many there are.
extra_acts = activations['0_default']
extra_keys = extra_acts.keys()
print(extra_keys)
print(len(extra_keys))

dict_keys(['default_p0_q0', 'default_p0_q1', 'default_p0_q2', 'default_p0_q3', 'default_p0_q4', 'default_p0_q5', 'default_p0_q6', 'default_p0_q7', 'default_p0_q8', 'default_p0_q9', 'default_p0_q10', 'default_p0_q11', 'default_p0_q12', 'default_p0_q13', 'default_p0_q14', 'default_p0_q15', 'default_p0_q16', 'default_p0_q17', 'default_p0_q18', 'default_p0_q19', 'default_p0_q20', 'default_p0_q21', 'default_p0_q22', 'default_p0_q23', 'default_p0_q24', 'default_p0_q25', 'default_p0_q26', 'default_p0_q27', 'default_p0_q28', 'default_p0_q29', 'default_p0_q30', 'default_p0_q31', 'default_p0_q32', 'default_p0_q33', 'default_p0_q34', 'default_p0_q35', 'default_p0_q36', 'default_p0_q37', 'default_p0_q38', 'default_p0_q39', 'default_p0_q40', 'default_p0_q41', 'default_p0_q42', 'default_p0_q43', 'default_p0_q44', 'default_p0_q45', 'default_p0_q46', 'default_p0_q47', 'default_p0_q48', 'default_p0_q49', 'default_p0_q50', 'default_p0_q51', 'default_p0_q52', 'default_p0_q53', 'default_p0_q54', 'default_

In [6]:
# Show the dtype of one stored activation tensor.
print(extra_acts['default_p0_q0'].dtype)

torch.bfloat16


In [24]:
# Partition the entries of `extra_acts` by key prefix: keys starting with
# 'default' go to one dict, everything else to the other.
default_activations = {
    name: tensor for name, tensor in extra_acts.items() if name.startswith('default')
}
rt_activations = {
    name: tensor for name, tensor in extra_acts.items() if not name.startswith('default')
}

print(len(default_activations))
print(len(rt_activations))

1200
2400


In [25]:
# Persist the non-default split as `absolutist.pt`; the matching save of the
# default split is left disabled on the next line.
# NOTE(review): in the visible cells `extra_acts` comes from
# `activations['0_default']`, yet the result is saved under the `absolutist`
# name. Execution counts are out of order, so confirm `extra_acts` actually
# held the absolutist activations when this was run.
#torch.save(default_activations, f'{dir}/response_activations/0_default.pt')
torch.save(rt_activations, f'{dir}/response_activations/absolutist.pt')

In [11]:
# Inspect one non-default entry: its shape first, then the raw values.
# NOTE(review): `extra_acts` must contain 'pos_*' keys for this to work, i.e.
# it must be the un-split activation dict — confirm which file it came from.
print(extra_acts['pos_p0_q0'].shape)
print(extra_acts['pos_p0_q0'])

torch.Size([46, 4608])
tensor([[-5.2002e-02,  2.1606e-02,  2.6611e-02,  ..., -1.3574e-01,
         -3.7354e-02, -5.2002e-02],
        [ 1.0300e-03, -1.7090e-01,  2.6367e-02,  ..., -6.1035e-02,
         -7.7637e-02,  3.8086e-02],
        [-9.7656e-03,  1.7212e-02, -6.4392e-03,  ..., -1.6211e-01,
          9.3750e-02,  1.1035e-01],
        ...,
        [ 7.6500e+01,  7.1500e+01,  4.3750e+01,  ...,  7.7344e-01,
          5.7500e+01,  5.0500e+01],
        [ 1.0850e+02,  1.2650e+02, -1.4062e+01,  ..., -1.5812e+01,
          5.5500e+01,  5.2750e+01],
        [ 1.0850e+02,  1.4200e+02, -6.9000e+01,  ..., -9.8500e+01,
          6.5000e+01, -1.6484e+00]], dtype=torch.bfloat16)


### Scores

In [16]:
# Directory holding one score file (JSON) per role.
score_dir = f"{dir}/extract_scores"

# Parse every .json file and key the result by file name without the suffix.
scores = {}
for file in os.listdir(score_dir):
    if not file.endswith(".json"):
        continue
    score_path = os.path.join(score_dir, file)
    with open(score_path, "r") as f:
        loaded_scores = json.load(f)
    scores[file.replace(".json", "")] = loaded_scores

print(f"Found {len(scores)} roles with scores")

Found 240 roles with scores


In [17]:
# Use the key set of the 'absolutist' entry as the reference and report every
# entry whose keys differ from it (nothing is printed when all agree).
keys = set(scores['absolutist'].keys())

for trait, trait_scores in scores.items():
    if set(trait_scores.keys()) == keys:
        continue
    extra = set(trait_scores.keys() - keys)
    missing = keys - set(trait_scores.keys())
    print(f"Trait {trait} has extra keys: {extra}")
    print(f"Trait {trait} is missing keys: {missing}")



In [18]:
# Number of scored items stored for the reference entry.
print(len(scores['absolutist'].keys()))

2400


### Vectors

In [5]:
# Load one vector file as a sample and show its keys plus the shape stored
# under each of the six expected names.
vector_dir = f"{dir}/base_vectors"
single_vector = torch.load(f"{vector_dir}/zealous.pt")
print(single_vector.keys())
for vector_name in ('pos_neg', 'pos_neg_50', 'pos_default',
                    'pos_default_50', 'pos_70', 'pos_40_70'):
    print(single_vector[vector_name].shape)


dict_keys(['pos_neg', 'pos_neg_50', 'pos_default', 'pos_default_50', 'pos_70', 'pos_40_70'])
torch.Size([46, 4608])
torch.Size([46, 4608])
torch.Size([46, 4608])
torch.Size([46, 4608])
torch.Size([46, 4608])
torch.Size([46, 4608])


In [6]:

# Load every .pt file in the vector directory, keyed by file name without
# the suffix.
vectors = {
    file.replace(".pt", ""): torch.load(os.path.join(vector_dir, file))
    for file in os.listdir(vector_dir)
    if file.endswith(".pt")
}

print(f"Found {len(vectors)} traits with vectors")

Found 240 traits with vectors


In [7]:
# Check that every stored vector has the same shape as the sample file's
# 'pos_neg' entry. Each problem is reported on its own line, and the success
# message is printed only when no problem was found (previously it was
# printed unconditionally, even after mismatches had been reported).
expected_shape = single_vector['pos_neg'].shape
vector_names = ('pos_neg', 'pos_neg_50', 'pos_default',
                'pos_default_50', 'pos_70', 'pos_40_70')

problem_count = 0
for trait, vector in vectors.items():
    for vector_name in vector_names:
        # Report a missing entry instead of aborting the whole check.
        if vector_name not in vector:
            print(f"Trait {trait} is missing {vector_name}")
            problem_count += 1
            continue
        actual_shape = vector[vector_name].shape
        if actual_shape != expected_shape:
            print(f"Trait {trait} {vector_name} has incorrect shape: {actual_shape}")
            problem_count += 1

if problem_count == 0:
    print("All vectors have the correct shape")
else:
    print(f"Found {problem_count} vectors with a missing entry or incorrect shape")

All vectors have the correct shape
