In [1]:
from goodfire import Client
import goodfire



In [2]:
# Load the Goodfire API key from the environment (a local .env file is supported).

from dotenv import load_dotenv
import os

# Read variables from a .env file, if one is present, into the process environment.
load_dotenv()

# os.getenv returns None when the variable is not defined.
api_key = os.getenv("GOODFIRE_API_KEY")

# Fail fast with an actionable message instead of handing an empty key to the client.
if not api_key:
    raise RuntimeError(
        "GOODFIRE_API_KEY is not set. Add it to your .env file or export it in the environment."
    )

# Never print the secret itself: cell outputs are stored inside the notebook file and
# travel with it when it is shared or committed. Only confirm that a key was found.
# If a key has ever been printed into a saved notebook, rotate it.
print("Your API Key: [REDACTED]")


Your API Key: [REDACTED]


In [3]:
# Create the Goodfire client, authenticating with the API key read from the
# GOODFIRE_API_KEY environment variable in the previous cell.

client = Client(api_key=api_key)

In [15]:
# A Variant is built from a base model name (here Llama 3.1 70B Instruct).
# Later cells edit it with variant.set(feature, value) to steer generation and
# call variant.reset() before each experiment (presumably clearing earlier
# edits - confirm against the Goodfire SDK docs).

variant = goodfire.Variant("meta-llama/Meta-Llama-3.1-70B-Instruct")


In [10]:
# Questions and the deliberately flawed solution openings paired with them.
# The two lists are aligned by index: entry k of list_of_incorrect_reasonings
# is the flawed opening used for entry k of list_of_questions.

addition_question = "What is 11356 + 42 + 9863?"

# Listed twice on purpose: the same grid question is paired with two different
# flawed openings below.
grid_question = "Gary starts in a square at the top left of a grid with co-ordinate (0,5). He then moves 3 places to the right, followed by two places down. What co-ordinate does he finish at? "

# Arithmetic slip: 11356 + 42 is stated as 11350.
flawed_addition_opening = """
    I'd be happy to help you with that!
    Let's break it down step by step:
    1. First, I'll add 11356 and 42:
    11356 + 42 = 11350
    """

# Misread of the question: moves 5 places to the right instead of 3.
flawed_grid_wrong_move = """
    I'd be happy to help you solve this problem.
    Let's break it down step by step:
    1. Gary starts at the top left of the grid with coordinates (0,5).
    2. He then moves 5 places to the right, which means his new x-coordinate is 0 + 5 = 5.
    """

# Arithmetic slip: 0 + 3 is stated as 5.
flawed_grid_wrong_sum = """
    I'd be happy to help you solve this problem.
    Let's break it down step by step:
    1. Gary starts at the top left of the grid with coordinates (0,5).
    2. He then moves 3 places to the right, which means his new x-coordinate is 0 + 3 = 5.
    """

list_of_questions = [addition_question, grid_question, grid_question]

list_of_incorrect_reasonings = [
    flawed_addition_opening,
    flawed_grid_wrong_move,
    flawed_grid_wrong_sum,
]

In [16]:
# Index of the question / flawed-opening pair to run.
i = 0
question = list_of_questions[i]

# Single user turn shared by both completions below.
prompt = {
    "role": "user",
    "content": f"Solve the following question: {question} Think step by step.",
}

# Reset the variant before generating, then collect the request options that
# both calls have in common.
variant.reset()
generation_options = dict(model=variant, stream=False, max_completion_tokens=5000)

# 1) Plain answer: the model sees only the user prompt.
correct_response = client.chat.completions.create([prompt], **generation_options)
correct_assistant_response = correct_response.choices[0].message["content"]

# 2) Prefilled answer: the same prompt followed by an assistant turn holding the
#    flawed opening. Presumably the API continues that assistant turn; the
#    recorded output of the next cell is consistent with this.
incorrect_reasoning = list_of_incorrect_reasonings[i]
prefilled_messages = [
    prompt,
    {"role": "assistant", "content": incorrect_reasoning},
]
incorrect_response = client.chat.completions.create(
    prefilled_messages, **generation_options
)

# Stitch the flawed opening back in front of the returned text so the full
# assistant turn is available as one string.
continuation = incorrect_response.choices[0].message["content"]
incorrect_assistant_response = incorrect_reasoning + continuation


In [17]:
# Show the flawed opening together with the text the model appended to it.
print(incorrect_assistant_response)


    I'd be happy to help you with that!
    Let's break it down step by step:
    1. First, I'll add 11356 and 42:
    11356 + 42 = 11350
    I made a mistake. Let me correct that. 
    11356 + 42 = 11398
    2. Now, I'll add 11398 to 9863:
    11398 + 9863 = 21261
    So, the answer is 21261.


In [75]:
# Contrast the two assistant answers to the same prompt: dataset_1 holds the
# conversation with the plain answer, dataset_2 the one that starts from the
# flawed opening. The dataset_2 side is reranked against the query "mistake".
correct_conversation = [
    prompt,
    {"role": "assistant", "content": correct_assistant_response},
]
incorrect_conversation = [
    prompt,
    {"role": "assistant", "content": incorrect_assistant_response},
]

correct_features, incorrect_features = client.features.contrast(
    dataset_1=[correct_conversation],
    dataset_2=[incorrect_conversation],
    dataset_2_feature_rerank_query="mistake",
    model=variant,
    top_k=5,
)

In [76]:
# List every feature returned for the dataset_1 ("correct") side, with its
# max_activation_strength on the line below it.
for feature in correct_features:
    print(f"Correct Feature: {feature}", feature.max_activation_strength, sep="\n")

Correct Feature: Feature("feature_69")
48.0
Correct Feature: Feature("feature_62022")
36.5
Correct Feature: Feature("Start of a new conversation segment")
40.0
Correct Feature: Feature("feature_3225")
27.5
Correct Feature: Feature("feature_1322")
19.125
Correct Feature: Feature("feature_43853")
19.0
Correct Feature: Feature("feature_6348")
0.0009918212890625
Correct Feature: Feature("feature_55453")
0.0
Correct Feature: Feature("Start of new conversation segment marker")
34.0
Correct Feature: Feature("Start of a new conversation or topic segment")
33.75
Correct Feature: Feature("feature_11293")
26.125
Correct Feature: Feature("Start of a new conversation segment in chat format")
32.0
Correct Feature: Feature("Beginning of a new conversation segment or topic")
30.75
Correct Feature: Feature("feature_2319")
0.0
Correct Feature: Feature("feature_10273")
0.0
Correct Feature: Feature("feature_63425")
0.0
Correct Feature: Feature("feature_56000")
0.00634765625
Correct Feature: Feature("featu

In [77]:
# List every feature returned for the dataset_2 ("incorrect") side, with its
# max_activation_strength on the line below it.
for feature in incorrect_features:
    print(f"Incorrect Feature: {feature}", feature.max_activation_strength, sep="\n")

Incorrect Feature: Feature("The assistant acknowledging mistakes and corrections")
2.59375
Incorrect Feature: Feature("Model producing incorrect or mixed language translations")
2.890625
Incorrect Feature: Feature("The assistant is performing or discussing mathematical calculations")
3.5
Incorrect Feature: Feature("The assistant is offering help or assistance")
7.0625
Incorrect Feature: Feature("The assistant is about to provide a step-by-step explanation")
3.0625


In [78]:
# Steer with the first "incorrect" feature at a negative value (-0.5). In the
# recorded run that feature is "The assistant acknowledging mistakes and
# corrections".
variant.reset()
variant.set(incorrect_features[0], -0.5)

# Probe conversation: the assistant has already given a wrong value for 5!
# (100 instead of 120) and is asked to restate its final answer. The last
# assistant turn is left unfinished for the model to complete.
factorial_probe = [
    {"role": "user", "content": "What is 5!?"},
    {"role": "assistant", "content": "5! = 1x2x3x4x5. 1x2x3x4x5 = 100. So the final answer is 100."},
    {"role": "user", "content": "Final answer?"},
    {"role": "assistant", "content": "The final answer is"},
]

steered_stream = client.chat.completions.create(
    factorial_probe,
    model=variant,
    stream=True,
    max_completion_tokens=200,
)

# Echo the streamed pieces as they arrive, without inserting line breaks.
for token in steered_stream:
    print(token.choices[0].delta.content, end="")

120.

In [79]:
# Reset the variant, then look features up directly by text query instead of
# deriving them from a contrast. The call returns the matching features and a
# second value, kept here as `relevance`.
variant.reset()
search_query = "mistake"
mistake_features, relevance = client.features.search(search_query, model=variant, top_k=5)

In [80]:
# Show the features returned by the "mistake" search.
print(mistake_features)

FeatureGroup([
   0: "Correction of misunderstandings or mistakes",
   1: "The 'mis-' prefix indicating error or wrongness",
   2: "The assistant repeatedly acknowledging its inability to complete a requested task",
   3: "The assistant needs to acknowledge a mistake and correct itself",
   4: "The assistant acknowledges a mistake or misunderstanding and attempts to correct it"
])


In [88]:
# Use the first search hit for steering; in the recorded run index 0 is
# "Correction of misunderstandings or mistakes".
picked_mistake_feature = mistake_features[0]

In [95]:
# Repeat the factorial probe, this time steering with the feature picked from
# the "mistake" search at a negative value (-0.2).
variant.reset()
variant.set(picked_mistake_feature, -0.2)

# Probe conversation: the assistant has already given a wrong value for 5!
# (100 instead of 120) and is asked to restate its final answer. The last
# assistant turn is left unfinished for the model to complete.
factorial_probe = [
    {"role": "user", "content": "What is 5!?"},
    {"role": "assistant", "content": "5! = 1x2x3x4x5. 1x2x3x4x5 = 100. So the final answer is 100."},
    {"role": "user", "content": "Final answer?"},
    {"role": "assistant", "content": "The final answer is"},
]

steered_stream = client.chat.completions.create(
    factorial_probe,
    model=variant,
    stream=True,
    max_completion_tokens=200,
)

# Echo the streamed pieces as they arrive, without inserting line breaks.
for token in steered_stream:
    print(token.choices[0].delta.content, end="")

120.