In [26]:
from goodfire import Client
import goodfire

In [27]:
# Getting API key from .env

from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment
load_dotenv()

# Read the API key and fail fast when it is missing: os.getenv returns None for
# an unset variable, and that None would otherwise be handed to the client and
# presumably only surface later as a harder-to-diagnose API error.
api_key = os.getenv("GOODFIRE_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOODFIRE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [28]:
# Create the Goodfire client, authenticated with the `api_key` read from the environment.
# Every API call in the cells below goes through this object.

client = Client(api_key=api_key)

In [29]:
# A Variant identifies the base model to use (here Llama 3.1 70B Instruct) and is
# what gets passed as `model=` to the API calls below.
# Variants can reportedly be edited to carry steering vectors (see the Goodfire docs);
# none of the cells shown here apply such edits.

variant = goodfire.Variant("meta-llama/Meta-Llama-3.1-70B-Instruct")

In [30]:
# Questions and incorrect reasoning lists.
# The two lists are index-aligned: incorrect_reasonings[i] is a deliberately flawed,
# truncated assistant reply to questions[i].
# NOTE(review): the grid question appears twice, apparently on purpose, so that it can
# be paired with two different planted errors -- confirm this is intended.
# The trailing space inside the grid-question strings is part of the prompt text.
questions = [
    "What is 11356 + 42 + 9863?",
    "Gary starts in a square at the top left of a grid with co-ordinate (0,5). He then moves 3 places to the right, followed by two places down. What co-ordinate does he finish at? ",
    "Gary starts in a square at the top left of a grid with co-ordinate (0,5). He then moves 3 places to the right, followed by two places down. What co-ordinate does he finish at? "
]

incorrect_reasonings = [
    # Planted arithmetic error: 11356 + 42 is 11398, not 11350.
    """
    I'd be happy to help you with that!
    Let's break it down step by step:
    1. First, I'll add 11356 and 42:
    11356 + 42 = 11350
    """,
    # Planted misreading: the question says 3 places to the right, the reasoning uses 5.
    """
    I'd be happy to help you solve this problem.
    Let's break it down step by step:
    1. Gary starts at the top left of the grid with coordinates (0,5).
    2. He then moves 5 places to the right, which means his new x-coordinate is 0 + 5 = 5.
    """,
    # Planted arithmetic error: the move is read correctly, but 0 + 3 is stated as 5.
    """
    I'd be happy to help you solve this problem.
    Let's break it down step by step:
    1. Gary starts at the top left of the grid with coordinates (0,5).
    2. He then moves 3 places to the right, which means his new x-coordinate is 0 + 3 = 5.
    """
]

In [31]:
def generate_correct_reasoning(question):
    """Ask the model for a step-by-step explanation of ``question``.

    Sends a single user turn through the module-level ``client`` using the
    module-level ``variant`` and returns the text of the first choice.

    Side effect: calls ``variant.reset()`` before the request (presumably to
    clear any edits previously applied to the variant -- confirm against the
    Goodfire docs).

    Args:
        question: The question text, interpolated verbatim into the prompt.

    Returns:
        The ``'content'`` of the first returned choice's message.
    """
    messages = [
        {
            "role": "user",
            "content": f"Explain how to solve the following question: {question} Think step by step.",
        }
    ]
    variant.reset()
    completion = client.chat.completions.create(
        messages,
        model=variant,
        stream=False,
        max_completion_tokens=5000,
    )
    first_choice = completion.choices[0]
    return first_choice.message['content']

def get_final_answer(question, reasoning):
    """Ask the model for its final answer given an already-written reasoning turn.

    Builds a three-turn conversation -- the original "explain" request, the
    supplied ``reasoning`` as the assistant's reply, and a follow-up user turn
    asking for the final answer -- and sends it through the module-level
    ``client`` using the module-level ``variant``. Unlike
    ``generate_correct_reasoning``, this does not reset the variant first.

    Args:
        question: The question text, interpolated verbatim into the first turn.
        reasoning: Text placed in the assistant turn (correct or deliberately
            incorrect reasoning).

    Returns:
        The ``'content'`` of the first returned choice's message.
    """
    turns = (
        ("user", f"Explain how to solve the following question: {question} Think step by step."),
        ("assistant", reasoning),
        ("user", "What is the final answer?"),
    )
    conversation = [{"role": role, "content": content} for role, content in turns]
    completion = client.chat.completions.create(
        conversation,
        model=variant,
        stream=False,
        max_completion_tokens=5000,
    )
    return completion.choices[0].message['content']


In [32]:
import pandas as pd

# One row per (question, incorrect reasoning) pair; the two input lists are index-aligned.
reasoning_df = pd.DataFrame({'question': questions, 'incorrect_reasoning': incorrect_reasonings})

# Each of the three derived columns below issues one model request per row.
reasoning_df['correct_reasoning'] = reasoning_df['question'].apply(generate_correct_reasoning)
reasoning_df['correct_reasoning_final_answer'] = reasoning_df.apply(
    lambda row: get_final_answer(row['question'], row['correct_reasoning']), axis=1
)
reasoning_df['incorrect_reasoning_final_answer'] = reasoning_df.apply(
    lambda row: get_final_answer(row['question'], row['incorrect_reasoning']), axis=1
)

# Put each reasoning next to the answer it produced.
reasoning_df = reasoning_df[[
    'question',
    'correct_reasoning',
    'correct_reasoning_final_answer',
    'incorrect_reasoning',
    'incorrect_reasoning_final_answer',
]]

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout would otherwise fail here
# after all the model requests above have already been made).
os.makedirs('results', exist_ok=True)
reasoning_df.to_csv('results/reasoning_dataset.csv', index=False)

In [40]:
def _build_conversation(question, reasoning, answer):
    """Return one four-turn chat transcript for the contrast datasets.

    The turns are: the user's "explain" request for ``question``, the assistant's
    ``reasoning``, the user's request for the final answer, and the assistant's
    ``answer``.
    """
    return [
        {
            "role": "user",
            "content": f"Explain how to solve the following question: {question} Think step by step."
        },
        {
            "role": "assistant",
            "content": reasoning
        },
        {
            "role": "user",
            "content": "What is the final answer?"
        },
        {
            "role": "assistant",
            "content": answer
        }
    ]


# Conversations in which the assistant turn holds the model-generated reasoning.
correct_dataset = [
    _build_conversation(question, reasoning, answer)
    for question, reasoning, answer in zip(
        reasoning_df['question'],
        reasoning_df['correct_reasoning'],
        reasoning_df['correct_reasoning_final_answer'],
    )
]

# The same questions, with the hand-written incorrect reasoning and the answer it led to.
incorrect_dataset = [
    _build_conversation(question, reasoning, answer)
    for question, reasoning, answer in zip(
        reasoning_df['question'],
        reasoning_df['incorrect_reasoning'],
        reasoning_df['incorrect_reasoning_final_answer'],
    )
]

In [41]:
# Contrast the two conversation sets: the call returns one collection of features
# for dataset_1 (correct reasoning) and one for dataset_2 (incorrect reasoning).
# Only the dataset_2 side is given a rerank query ("mistake").
# NOTE(review): the saved output of the next cell lists more than five "correct"
# features even though top_k=5 -- either the outputs are stale or top_k interacts
# with reranking; confirm against the Goodfire SDK documentation.
correct_features, incorrect_features = client.features.contrast(
    model=variant,
    top_k=5,
    dataset_1=correct_dataset,
    dataset_2=incorrect_dataset,
    dataset_2_feature_rerank_query="mistake",
)

In [42]:
# Print each feature returned for the correct-reasoning dataset, with its
# max_activation_strength on the following line.
for correct_feature in correct_features:
    print(
        f"Correct Feature: {correct_feature}",
        correct_feature.max_activation_strength,
        sep="\n",
    )

Correct Feature: Feature("The assistant is providing step-by-step technical explanations")
1.78125
Correct Feature: Feature("The assistant is about to state a numerical result or definition")
3.234375
Correct Feature: Feature("Mathematical relationships and quantity comparisons")
1.9453125
Correct Feature: Feature("Syntactical markers for code formatting in technical explanations")
7.1875
Correct Feature: Feature("The assistant is about to define or explain a concept")
3.34375
Correct Feature: Feature("Mathematical expression syntax and delimiters")
2.484375
Correct Feature: Feature("Grammatical patterns for term definitions and classifications")
2.890625
Correct Feature: Feature("Step-by-step mathematical problem solving with basic operations")
2.28125
Correct Feature: Feature("The assistant is about to provide a step-by-step explanation")
3.421875
Correct Feature: Feature("Final products and end results in manufacturing processes")
6.59375
Correct Feature: Feature("Periods concluding

In [43]:
# Print each feature returned for the incorrect-reasoning dataset, with its
# max_activation_strength on the following line.
for incorrect_feature in incorrect_features:
    print(
        f"Incorrect Feature: {incorrect_feature}",
        incorrect_feature.max_activation_strength,
        sep="\n",
    )

Incorrect Feature: Feature("The assistant needs to acknowledge and address an incomplete or unclear response")
6.53125
Incorrect Feature: Feature("The assistant needs clarification")
3.140625
Incorrect Feature: Feature("The assistant is transitioning between explanation segments or introducing examples")
3.640625
Incorrect Feature: Feature("The assistant is offering help or assistance")
7.0625
Incorrect Feature: Feature("The assistant should break down the problem into steps")
4.3125
