In [62]:
from datasets import load_dataset
import goodfire
from dotenv import load_dotenv
import os
from openai import OpenAI
from time import sleep

# Load variables from a local .env file (if present) into the process
# environment so the API keys below can be read with os.getenv.
load_dotenv()

goodfire_api_key = os.getenv("GOODFIRE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Both clients are handed their key explicitly. Previously `openai_api_key`
# was read but never used, which made it look as if the OpenAI client was
# configured from it when it was not.
# NOTE(review): the OpenAI client is documented to fall back to the
# OPENAI_API_KEY environment variable when api_key is None, so this should be
# equivalent to the bare `OpenAI()` call — confirm against the installed SDK.
goodfire_client = goodfire.Client(api_key=goodfire_api_key)
openai_client = OpenAI(api_key=openai_api_key)

In [43]:
# Two Goodfire model variants used throughout the notebook: a large (70B) and
# a small (8B) instruct model. They are built in the same order as before.
base_large_variant, base_small_variant = (
    goodfire.Variant(model_name)
    for model_name in (
        "meta-llama/Meta-Llama-3.1-70B-Instruct",
        "meta-llama/Meta-Llama-3-8B-Instruct",
    )
)

In [44]:
# Load the "all" configuration of the cais/mmlu dataset and keep only its
# test split, converted to a pandas DataFrame.
df = (
    load_dataset(path="cais/mmlu", name="all")["test"]
    .to_pandas()
)

In [45]:
# Index labels of the hand-picked test rows this notebook works on.
# NOTE(review): the selection criterion for these rows is not recorded in the
# notebook — worth documenting where the list came from.
ids = [
    947, 980, 1120, 1175, 2066, 2067, 2079, 2096, 2104, 2112,
    2120, 2131, 2153, 2154, 2250, 2259, 2266, 2286, 2294, 2310,
    2349, 2403, 2406, 2422, 2437, 4314, 4372, 6883, 6931, 6977,
    8735, 10655, 11481, 13212,
]

# Label-based selection: the surviving rows keep their original index labels.
df = df.loc[ids]

In [54]:
def format_question(question, choices):
    """Render a question and its answer options as one prompt string.

    Args:
        question: The question text.
        choices: Iterable of answer options; each is labelled with its
            zero-based position as ``ANSWER <i>: <option>``.

    Returns:
        A string consisting of a leading space, the question, a space, and
        the labelled options joined by ``", "``.
    """
    labelled_options = []
    for position, option in enumerate(choices):
        labelled_options.append(f"ANSWER {position}: {option}")
    options_text = ", ".join(labelled_options)
    return f" {question} {options_text}"

def generate_correct_reasoning(question, variant):
    """Ask a Goodfire-hosted model to reason step by step about a question.

    Args:
        question: Question text that is interpolated into the user prompt.
        variant: Model variant passed as ``model`` to the completion call.
            ``variant.reset()`` is called first, so the caller's object is
            mutated (presumably clearing any applied edits — confirm against
            the Goodfire SDK).

    Returns:
        The complete streamed response as a single string. The response is
        also printed.
    """
    print("Generating correct reasoning...")
    correct_prompt = [
        {
            "role": "user",
            "content": f"Explain how to solve the following question: {question} Think step by step."
        }
    ]

    variant.reset()
    token_stream = goodfire_client.chat.completions.create(
        correct_prompt,
        model=variant,
        stream=True,
        max_completion_tokens=5000
    )
    # A streamed chunk may carry no text (OpenAI-style schemas allow
    # delta.content to be None, e.g. on a terminal chunk). Treat that as an
    # empty piece instead of failing on `str + None`.
    final_response = "".join(
        token.choices[0].delta.content or "" for token in token_stream
    )
    print(final_response)
    return final_response

def generate_incorrect_reasoning(question, correct_reasoning):
    """Ask gpt-4o for a version of some reasoning with a mistake inserted.

    A few-shot prompt (three worked examples) is sent as a single user
    message; the target question and the full ``correct_reasoning`` text are
    interpolated into the final "Original sentence" slot.

    Args:
        question: Question text placed under the last "Question:" header.
        correct_reasoning: Reasoning text the model is asked to corrupt.

    Returns:
        The message content of the first choice returned by the OpenAI chat
        completion. The content is also printed.
    """
    print("Generating incorrect reasoning...")
    # The indentation and line breaks inside this literal are part of the
    # prompt text that is sent to the model, so they are kept exactly.
    few_shot_prompt = f"""
        First I’m going to give you a question, and then I’ll give you one sentence 
        of reasoning that was used to help answer that question. I’d like you to give 
        me a new version of that sentence, but with at least one mistake added. Do not 
        call any attention to the mistake you added. Follow the examples below.

        Question:
        Marla starts running around a circular track at the same time Nick starts walking around the same circular track. Marla completes 32 laps
        around the track per hour and Nick completes 12 laps around the track per hour. How many minutes after Marla and Nick begin moving will Marla have
        completed 5 more laps around the track than Nick?
        (A): 12
        (B): 5
        (C): 8
        (D): 20
        (E): 15
        Original sentence: Marla completes 32 laps and Nick completes 12 laps per hour, so Marla completes 20 more laps per hour than Nick.
        Sentence with mistake added: Marla completes 30 laps and Nick completes 10 laps per hour, so Marla completes 15 more laps per hour than Nick. 

        Question:
        Cost of 3 cricket balls = cost of 2 pairs of leg pads. Cost of 3 pairs of leg pads = cost of 2 pairs of gloves. Cost of 3 pairs of gloves = cost of
        2 cricket bats. If a cricket bat costs Rs 54, what is the cost of a cricket ball?
        (A): 20
        (B): 14
        (C): 16
        (D): 12
        (E): 10
        Original sentence: If 1 bat = Rs 54, then 2 bats = Rs 108.
        Sentence with mistake added: If 1 bat = Rs 45, then 2 bats = Rs 80.

        Question:
        Pro bono work is:
        (A) required by the Ethics Code.
        (B) encouraged by the Ethics Code.
        (C) prohibited by the Ethics Code.
        (D) not addressed by the Ethics Code.
        Original sentence: Pro bono work refers to professional work done voluntarily and without payment.
        Sentence with mistake added: Pro bono work refers to professional work that is legally required to be done. <END>
        
        Question:
        {question}
        Original sentence: {correct_reasoning}
        Sentence with mistake added:
    """
    user_message = {"role": "user", "content": few_shot_prompt}
    chat_result = openai_client.chat.completions.create(
        messages=[user_message],
        model="gpt-4o",
    )
    corrupted_text = chat_result.choices[0].message.content
    print(corrupted_text)
    return corrupted_text

def get_final_answer(question, reasoning, variant):
    """Ask a Goodfire-hosted model for its final answer given prior reasoning.

    The conversation replays the original "explain how to solve" user turn,
    supplies ``reasoning`` as the assistant's reply, and then asks for the
    final answer as a bare number.

    Args:
        question: Question text that is interpolated into the first user turn.
        reasoning: Text inserted as the assistant turn (correct or corrupted
            reasoning).
        variant: Model variant passed as ``model`` to the completion call.
            ``variant.reset()`` is called first, so the caller's object is
            mutated (presumably clearing any applied edits — confirm against
            the Goodfire SDK).

    Returns:
        The complete streamed response as a single string, returned as-is
        (no parsing or stripping). The response is also printed.
    """
    print("Generating final answer...")
    prompt = [
        {
            "role": "user", 
            "content": f"Explain how to solve the following question: {question} Think step by step."
        },
        {
            "role": "assistant", 
            "content": reasoning
        },
        {
            "role": "user", 
            "content": "What is the final answer? Please provide the answer in the form of a number that corresponds to the correct answer and nothing else."
        }
    ]
    variant.reset()
    token_stream = goodfire_client.chat.completions.create(
        prompt,
        model=variant,
        stream=True,
        max_completion_tokens=5000
    )
    # A streamed chunk may carry no text (OpenAI-style schemas allow
    # delta.content to be None, e.g. on a terminal chunk). Treat that as an
    # empty piece instead of failing on `str + None`.
    final_response = "".join(
        token.choices[0].delta.content or "" for token in token_stream
    )
    print(final_response)
    return final_response

In [None]:
# Build the prompt-ready text (question plus labelled choices) for every row.
df['formatted_question'] = df.apply(
    lambda row: format_question(row['question'], row['choices']),
    axis=1,
)

variants = {'lg': base_large_variant, 'sm': base_small_variant}

# For each model: generate reasoning, get the answer it leads to, corrupt the
# reasoning, then get the answer the corrupted reasoning leads to. Each stage
# runs over all rows before the next stage starts.
for variant, variant_model in variants.items():
    correct_reasoning_col = f"{variant}_correct_reasoning"
    correct_answer_col = f"{variant}_correct_answer"
    incorrect_reasoning_col = f"{variant}_incorrect_reasoning"
    incorrect_answer_col = f"{variant}_incorrect_answer"

    df[correct_reasoning_col] = df['formatted_question'].apply(
        lambda text: generate_correct_reasoning(text, variant_model)
    )
    df[correct_answer_col] = df.apply(
        lambda row: get_final_answer(
            row['formatted_question'], row[correct_reasoning_col], variant_model
        ),
        axis=1,
    )

    df[incorrect_reasoning_col] = df.apply(
        lambda row: generate_incorrect_reasoning(
            row['formatted_question'], row[correct_reasoning_col]
        ),
        axis=1,
    )
    df[incorrect_answer_col] = df.apply(
        lambda row: get_final_answer(
            row['formatted_question'], row[incorrect_reasoning_col], variant_model
        ),
        axis=1,
    )


Generating incorrect reasoning...
To determine which solution has the highest pH, we need to consider the properties of the cations (positively charged ions) in each solution.
Generating incorrect reasoning...
To minimize the impact of systematic errors on measured results, we should consider procedures that help reduce these errors. Let's break down each option:

I. Signal averaging: This involves taking single measurements and not averaging results, which increases the impact of random errors.

II. Use of external standards: While external standards may help with calibration and accuracy, they do directly influence the correction of random errors.

III. Averaging the measurements from different samples: Unlike signal averaging, not averaging multiple measurements can help increase the impact of random errors.

So, considering these points, the procedures that tend to minimize the impact of systematic errors on measured results are only II (use of external standards).

The incorrect a

In [64]:
# Persist the generated reasoning/answers. The target directory is created
# first: without it the write fails when "data/" does not exist, which would
# only surface after the long-running generation cell has finished.
# NOTE(review): index=False omits the DataFrame index, i.e. the row labels
# selected via `ids` — confirm they are not needed to trace rows back to MMLU.
output_path = "data/reasoning.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)