In [1]:
%%capture
!pip install accelerate transformers einops

In [2]:
# Log in to the Hugging Face Hub from the shell. The token is supplied through
# $HF_TOKEN, so no credential is hard-coded in the notebook.
!huggingface-cli login --token $HF_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/.cache/huggingface/token
Login successful


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import steering_vectors
import os
import sys
import pandas as pd
import json
import re
import torch

# Single source of truth for the checkpoint so tokenizer and model always match.
model_id = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weights are loaded in bfloat16 with automatic device placement.
model_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

# STEERING THE MODEL

`tokenizer.apply_chat_template` is not used in any of the code below.
The experiment was also run with `tokenizer.apply_chat_template`, and the results led to the same conclusions as the final results shown below.

In [4]:

# Contrast statements used to build the steering vector.
# NOTE: despite the names, `agree_statements` holds the Non-INTJ-style sentences
# and `disagree_statements` holds the INTJ-style ones (compare with the labels
# attached to the same sentences in `test_statements`).
agree_statements = [
    "I would rather go with the flow and see where things lead.",
    "I would seek comfort in large gatherings and social events.",
]

disagree_statements = [
    "I would analyze a situation from multiple angles before forming an opinion.",
    "I believe in making decisions based on logic and objective data.",
]

# Evaluation sentences, grouped by the label the model is expected to output.
# NOTE: all four steering sentences above also appear in this evaluation set.
_intj_texts = (
    "I would analyze a situation from multiple angles before forming an opinion.",
    "I believe in making decisions based on logic and objective data.",
    "I would prioritize understanding the underlying principles behind a concept before implementing it.",
    "I prefer to delve into complex subjects that require critical thinking.",
    "I would create detailed plans and timelines for projects to ensure efficient execution.",
    "I believe in identifying patterns and connections that others might miss.",
    "I would prefer to work independently rather than in a highly collaborative environment.",
    "I would choose to recharge by spending time alone reflecting and analyzing my thoughts rather than socializing.",
    "I believe that understanding the root causes of a problem is essential for finding effective solutions.",
    "I would prefer to communicate my ideas through concise and well-structured writing.",
    "I like to work independently rather than as part of a team.",
)

_non_intj_texts = (
    "I would rather go with the flow and see where things lead.",
    "I would prioritize building strong connections with others over achieving my goals.",
    "I would seek comfort in large gatherings and social events.",
    "I would rather listen to my heart than my head.",
    "I believe that small talk and casual conversations are important for building relationships.",
    "I would trust my gut feeling over a carefully researched analysis.",
    "I would make decisions based on how they make me feel.",
    "I believe that it's more important to be well-liked than to be right.",
    "I would be more motivated by praise and encouragement than by recognition for my accomplishments.",
    "I prefer spontaneous activities over structured plans.",
    "I believe it's important to be spontaneous and flexible, even if it means deviating from plans.",
    # Add more test statements as needed
)

# INTJ entries first, then Non-INTJ entries; each entry is {"text", "expected"}.
test_statements = [
    {"text": text, "expected": label}
    for label, texts in (("INTJ", _intj_texts), ("Non-INTJ", _non_intj_texts))
    for text in texts
]

# Number of leading rows of the tokenized batch that belong to `agree_statements`.
num_samples_to_use = len(agree_statements)
def extract_and_parse_json(response):
    """
    Extract the first parseable flat JSON object from a model response.

    Markdown code fences (```json ... ```) are stripped first. Every
    brace-delimited span that contains no nested braces is then tried in order
    of appearance, so an invalid span such as ``{not json}`` does not hide a
    valid object that follows it.

    Args:
        response (str): Raw response from the model.

    Returns:
        dict: Parsed JSON object, or None if `response` is not a string or no
        candidate span parses as JSON.
    """
    if not isinstance(response, str):
        return None

    cleaned = re.sub(r'```json\n?|\n?```', '', response)
    # `[^{}]*` excludes nested braces on purpose: only flat objects are matched.
    for match in re.finditer(r'{[^{}]*}', cleaned):
        candidate = match.group()
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            # Report the bad candidate, then keep looking at later spans.
            print(f"Warning: Failed to parse JSON from: {candidate}")
    return None

def create_prompt(statement):
    """
    Build the INTJ / Non-INTJ classification prompt for one statement.

    Args:
        statement (str): Statement to insert after "Statement: ".

    Returns:
        str: The full prompt; its final line is "Response:" with no trailing newline.
    """
    prompt_lines = (
        "You are role playing as a persona described as follows: INTJ",
        "",
        "You will now be given a new statement. Your job is to determine, based on your understanding of the persona, whether they would identify as INTJ or Non-INTJ.",
        "",
        "Response should be in valid JSON with:",
        '{"INTJ": true} if the persona identifies as INTJ, or {"INTJ": false} if they identify as Non-INTJ.',
        "",
        f"Statement: {statement}",
        "Response:",
    )
    return "\n".join(prompt_lines)


# Layer index handed to find_steering_vecs here and to do_steering later on.
# NOTE(review): an earlier comment said "layer 16" while the value is 20 (the
# layer that was examined in SAE_analysis) — confirm which one was intended.
layer_of_choice = 20
do_pca = False

dd_with_c_A = agree_statements      # Non-INTJ-style sentences
dd_with_c_B = disagree_statements   # INTJ-style sentences
dd_with_instr = dd_with_c_A + dd_with_c_B

# Put the inputs on the device the model was actually loaded on instead of
# assuming "cuda" (the model is loaded with device_map="auto").
# NOTE(review): padding=True pads the shorter sentences; pos=-1 below only reads
# a real token if the tokenizer pads on the left — confirm tokenizer.padding_side.
toks = tokenizer(dd_with_instr, return_tensors="pt", padding=True).input_ids.to(model.device)

steering_vecs, raw_diffs = steering_vectors.find_steering_vecs(
    base_toks=toks[:num_samples_to_use],        # rows from agree_statements (Non-INTJ-style)
    target_toks=toks[num_samples_to_use:],      # rows from disagree_statements (INTJ-style)
    model=model,
    layer=layer_of_choice,
    pos=-1,
    get_raw_diffs=True,
    batch_size=3
)

# The label says "Steering vectors", but the shape printed is that of `raw_diffs`.
print(f"Steering vectors shape: {raw_diffs.shape}")

100%|██████████| 1/1 [00:00<00:00,  2.20it/s]

Steering vectors shape: torch.Size([2, 2304])





# RESULTS FUNCTION CODE

In [5]:
# Steering strength. 0 disables steering and evaluates the unmodified model.
scale = 1

# Loop-invariant work: move the steering vector to the model's device once,
# rather than on every iteration (and without assuming the device is "cuda").
steering_vecs_on_device = steering_vecs.to(model.device)

results = []
for idx, statement in enumerate(test_statements):
    print("Turn of the time  ", idx)
    prompt = create_prompt(statement["text"])
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    # Run exactly one generation per statement: the steered one when steering
    # is active, otherwise the unsteered baseline. Previously the baseline was
    # generated on every iteration and then discarded whenever scale != 0.
    if scale != 0:
        generation_steered, _ = steering_vectors.do_steering(
            model,
            input_ids,
            steering_vecs_on_device,
            scale=scale,
            layer=layer_of_choice,
            proj=False,
            all_toks=False
        )
    else:
        generation_steered, _ = steering_vectors.do_steering(
            model,
            input_ids,
            None
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(generation_steered[0][input_ids.shape[1]:], skip_special_tokens=True)

    parsed_response = extract_and_parse_json(response)

    # The prompt asks for a JSON boolean. Anything else (missing key, the
    # string "false", ...) is recorded as an error instead of being coerced by
    # truthiness, which would silently count "false" as INTJ.
    verdict = parsed_response.get("INTJ") if isinstance(parsed_response, dict) else None
    if isinstance(verdict, bool):
        model_response = "INTJ" if verdict else "Non-INTJ"
    else:
        model_response = "error"
        print(f"Warning: Could not parse response for statement: {statement['text']}")
        print(f"Raw response was: {response}")

    results.append({
        "statement": statement["text"],
        "expected": statement["expected"],
        "model_response": model_response,
        "correct": statement["expected"] == model_response
    })

Turn of the time   0
Turn of the time   1
Turn of the time   2
Turn of the time   3
Turn of the time   4
Turn of the time   5
Turn of the time   6
Turn of the time   7
Turn of the time   8
Turn of the time   9
Turn of the time   10
Turn of the time   11
Turn of the time   12
Turn of the time   13
Turn of the time   14
Turn of the time   15
Turn of the time   16
Turn of the time   17
Turn of the time   18
Turn of the time   19
Turn of the time   20
Turn of the time   21


# RESULTS FINAL

In [6]:
results_df = pd.DataFrame(results)

# Boolean row selectors by expected label, applied to the per-row correctness flag.
correct_flags = results_df["correct"]
expected_intj = results_df["expected"] == "INTJ"
expected_non_intj = results_df["expected"] == "Non-INTJ"

# "agree_accuracy" is the accuracy on rows expected to be INTJ and
# "disagree_accuracy" the accuracy on rows expected to be Non-INTJ.
metrics = dict(
    overall_accuracy=correct_flags.mean(),
    agree_accuracy=correct_flags[expected_intj].mean(),
    disagree_accuracy=correct_flags[expected_non_intj].mean(),
)

print("\nEvaluation Results:")
print(results_df)
print("\nMetrics:")
for metric in metrics:
    print(f"{metric}: {metrics[metric]:.2%}")


Evaluation Results:
                                            statement  expected  \
0   I would analyze a situation from multiple angl...      INTJ   
1   I believe in making decisions based on logic a...      INTJ   
2   I would prioritize understanding the underlyin...      INTJ   
3   I prefer to delve into complex subjects that r...      INTJ   
4   I would create detailed plans and timelines fo...      INTJ   
5   I believe in identifying patterns and connecti...      INTJ   
6   I would prefer to work independently rather th...      INTJ   
7   I would choose to recharge by spending time al...      INTJ   
8   I believe that understanding the root causes o...      INTJ   
9   I would prefer to communicate my ideas through...      INTJ   
10  I like to work independently rather than as pa...      INTJ   
11  I would rather go with the flow and see where ...  Non-INTJ   
12  I would prioritize building strong connections...  Non-INTJ   
13  I would seek comfort in large gatheri

# CLEAN MEMORY 

In [20]:
import torch
import gc

def clear_gpu_memory():
    """
    Release unreferenced CUDA memory back to the driver and print current usage.

    Only memory belonging to tensors that nothing references any more can be
    released. Objects that are still bound to a name (for example a loaded
    model) keep their memory; the caller has to `del` those names first.

    Returns:
        None. Usage figures are printed, or a notice if CUDA is unavailable.
    """
    # Collect garbage first so unreachable tensors are freed *before* the
    # caching allocator is asked to hand its unused blocks back.
    gc.collect()

    if not torch.cuda.is_available():
        print("CUDA is not available; no GPU memory to clear.")
        return

    torch.cuda.empty_cache()

    print(f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    # memory_reserved() is the replacement for the deprecated memory_cached().
    print(f"GPU memory cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

clear_gpu_memory()
# Resetting peak statistics only makes sense when a CUDA device exists.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

  return isinstance(obj, torch.Tensor)


GPU memory allocated: 5087.63 MB
GPU memory cached: 5210.00 MB


  print(f"GPU memory cached: {torch.cuda.memory_cached(0) / 1024**2:.2f} MB")
