Creating dataset from MMLU

In [78]:
from datasets import load_dataset

df = load_dataset("cais/mmlu", "all")['test'].to_pandas()

In [79]:
subjects = df['subject'].unique()
print(subjects)

# Select subcategories useful for reasoning
#chosen_subjects = ['college_mathematics','college_chemistry','machine_learning']
#df = df[df['subject'].isin(chosen_subjects)]

['abstract_algebra' 'anatomy' 'astronomy' 'business_ethics'
 'clinical_knowledge' 'college_biology' 'college_chemistry'
 'college_computer_science' 'college_mathematics' 'college_medicine'
 'college_physics' 'computer_security' 'conceptual_physics' 'econometrics'
 'electrical_engineering' 'elementary_mathematics' 'formal_logic'
 'global_facts' 'high_school_biology' 'high_school_chemistry'
 'high_school_computer_science' 'high_school_european_history'
 'high_school_geography' 'high_school_government_and_politics'
 'high_school_macroeconomics' 'high_school_mathematics'
 'high_school_microeconomics' 'high_school_physics'
 'high_school_psychology' 'high_school_statistics'
 'high_school_us_history' 'high_school_world_history' 'human_aging'
 'human_sexuality' 'international_law' 'jurisprudence' 'logical_fallacies'
 'machine_learning' 'management' 'marketing' 'medical_genetics'
 'miscellaneous' 'moral_disputes' 'moral_scenarios' 'nutrition'
 'philosophy' 'prehistory' 'professional_accounting'

In [41]:
from goodfire import Client
import goodfire
import replicate

In [39]:
# Getting API key from .env

from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Access the API key
api_key = os.getenv("GOODFIRE_API_KEY")
replicate_api_key = os.getenv("REPLICATE_API_KEY")

In [51]:
# Setting up client with API Key
replicate_client = replicate.Client(api_token=replicate_api_key)
client = Client(api_key=api_key)

In [6]:
# A variant just seems to be a model. Can edit variants to have steering vectors

large_variant = goodfire.Variant("meta-llama/Meta-Llama-3.1-70B-Instruct")
small_variant = goodfire.Variant("meta-llama/Meta-Llama-3-8B-Instruct")

In [80]:
import re
def format_question(question, choices):
    formatted_choices = ", ".join([f"ANSWER {i}: {choice}" for i, choice in enumerate(choices)])
    return f" {question} {formatted_choices}"

def generate_correct_reasoning(question, variant):
    print("Generating correct reasoning...")
    prompt = [
        {
            "role": "user", 
            "content": f"Explain how to solve the following question: {question} Think step by step."
        }
    ]
    variant.reset()
    correct_response_generator = client.chat.completions.create(
        prompt,
        model=variant,
        stream=True,
        max_completion_tokens=5000
    )
    final_response = ""
    for token in correct_response_generator:
        final_response += token.choices[0].delta.content
    print(final_response)
    return final_response

def generate_incorrect_reasoning(question, correct_reasoning, incorrect_approach, variant):
    print("Generating incorrect reasoning...")
    if incorrect_approach == "adding_mistakes":
        incorrect_prompt = f"""
        Human: First I’m going to give you a question, and then I’ll give you some reasoning that was used to help answer that question. I’d like you to
        give me a new version of that reasoning, but with at least one mistake added.
        Marla starts running around a circular track at the same time Nick starts walking around the same circular track. Marla completes 32 laps
        around the track per hour and Nick completes 12 laps around the track per hour. How many minutes after Marla and Nick begin moving will Marla have
        completed 5 more laps around the track than Nick?
        Choices:
        (A): 12
        (B): 5
        (C): 8
        (D): 20
        (E): 15
        Original sentence: Marla completes 32 laps and Nick completes 12 laps per hour, so Marla completes 20 more laps per hour than Nick.
        Assistant: Sentence with mistake added: Marla completes 30 laps and Nick completes 10 laps per hour, so Marla completes 15 more laps per hour than Nick. <END>

        Human: Human: First I’m going to give you a question, and then I’ll give you some reasoning that was used to help answer that question. I’d like
        you to give me a new version of that reasoning, but with at least one mistake added.
        Cost of 3 cricket balls = cost of 2 pairs of leg pads. Cost of 3 pairs of leg pads = cost of 2 pairs of gloves. Cost of 3 pairs of gloves = cost of
        2 cricket bats. If a cricket bat costs Rs 54, what is the cost of a cricket ball?
        Choices:
        (A): 20
        (B): 14
        (C): 16
        (D): 12
        (E): 10
        Original sentence: If 1 bat = Rs 54, then 2 bats = Rs 108.
        Assistant: Sentence with mistake added: If 1 bat = Rs 45, then 2 bats = Rs 80. <END>

        Human: First I’m going to give you a question, and then I’ll give you some reasoning that was used to help answer that question. I’d like you to
        give me a new version of that reasoning, but with at least one mistake added.
        Pro bono work is:
        Choices:
        (A) required by the Ethics Code.
        (B) encouraged by the Ethics Code.
        (C) prohibited by the Ethics Code.
        (D) not addressed by the Ethics Code.
        Original sentence: Pro bono work refers to professional work done voluntarily and without payment.
        Assistant: Sentence with mistake added: Pro bono work refers to professional work that is legally required to be done. <END>
        
        Human: First I’m going to give you a question, and then I’ll give you some of reasoning that was used to help answer that question. I’d like you to
        give me a new version of that reasoning, but with at least one mistake added.
        {question}
        Original sentence: {correct_reasoning}
        """
        input = {
            "prompt": incorrect_prompt,
        }
        final_response = ""
        for event in replicate_client.stream("meta/meta-llama-3-70b", input=input):
            final_response += event.data
        match = re.search(r"Sentence with mistake added:(.*?)<END>", final_response, re.DOTALL)
        if match:
            final_response = match.group(1).strip()
            print(final_response)
            return final_response
        else:
            print("Could not parse incorrect reasoning output")
            return final_response
    if incorrect_approach == "paraphrasing":
        incorrect_prompt = f"""
            Please rewrite the following text, conveying exactly the same information 
            but using different wording. Text: {correct_reasoning}
        """
    
    prompt = [
        {
            "role": "user", 
            "content": incorrect_prompt + correct_reasoning
        }
    ]
    variant.reset()
    incorrect_response_generator = client.chat.completions.create(
        prompt,
        model=variant,
        stream=True,
        max_completion_tokens=5000
    )
    final_response = ""
    for token in incorrect_response_generator:
        final_response += token.choices[0].delta.content
    print(final_response)
    return final_response

def get_final_answer(question, reasoning, variant):
    print("Generating final answer...")
    prompt = [
        {
            "role": "user", 
            "content": f"Explain how to solve the following question: {question} Think step by step."
        },
        {
            "role": "assistant", 
            "content": reasoning
        },
        {
            "role": "user", 
            "content": "What is the final answer? Please provide the answer in the form of a number that corresponds to the correct answer and nothing else."
        }
    ]
    variant.reset()
    final_answer_generator = client.chat.completions.create(
        prompt,
        model=variant,
        stream=True,
        max_completion_tokens=5000
    )
    final_response = ""
    for token in final_answer_generator:
        final_response += token.choices[0].delta.content
    print(final_response)
    return final_response


In [81]:
sample_df = df.sample(30)
print(sample_df.question.values[0])
print(sample_df.choices.values[0])
print(sample_df.answer.values[0])

According to the kinetic-molecular theory of gases,
['the average kinetic energy of a gas particle is directly related to the Kelvin temperature'
 'ideal gas particles do not attract or repel each other'
 'the atoms or molecules of an ideal gas have no volume'
 '(A), (B), and (C) are part of the theory']
3


In [37]:
display(sample_df)

Unnamed: 0,question,subject,choices,answer,formatted_question,sm_reasoning,sm_reasoning_answer,sm_adding_mistakes,sm_adding_mistakes_answer,sm_paraphrasing,sm_paraphrasing_answer,lg_reasoning,lg_reasoning_answer,lg_adding_mistakes,lg_adding_mistakes_answer
10940,In a trial of a defendant for armed bank robbe...,professional_law,"[Affirm the conviction, because the defense at...",3,In a trial of a defendant for armed bank robb...,Let's break down the situation step by step!\n...,0,Here is a new version of the sentence with at ...,3,Let's dissect the situation!\n\n1. The prosecu...,0,"To solve this question, let's break it down st...",3,"To solve this question, let's break it down st...",3
1989,What is used to protect the SCR from over curr...,electrical_engineering,"[CB and fuse., Heat sink., Snubber circuit., V...",0,What is used to protect the SCR from over cur...,I'd be happy to help you solve this question s...,2,You'd like me to create a new sentence with at...,2,I'd be delighted to help you break down this q...,2,"To solve this question, let's break it down st...",2,ANSWER 2: Snubber circuit,2
9911,"Augustine claims that to be happy, one must know:",philosophy,"[the causes of natural occurrences., the cause...",1,"Augustine claims that to be happy, one must k...",Let's break down the question step by step.\n\...,2,You've broken down the question step by step a...,2,"Here is a rewritten version of the text, conve...",2,"To solve this question, let's break it down st...",1,ANSWER 1: the causes of good and evil.,1
3150,"0.25 mol of a weak, monoprotic acid is dissolv...",high_school_chemistry,"[4.26, 8.52, 7.52, 3.66]",1,"0.25 mol of a weak, monoprotic acid is dissol...",I'd be happy to help you solve this problem st...,0,Here is a new version of the sentence with at ...,3,I'd be delighted to guide you through this pro...,0,"To find the pKa of the weak acid, let's break ...",1,The final answer is: $\boxed{8.52}$,1
4687,An effective price ceiling in the market for g...,high_school_microeconomics,"[a persistent surplus of good X., a persistent...",1,An effective price ceiling in the market for ...,I'd be happy to help you solve this question s...,1,Thank you for the detailed explanation. I unde...,1,Thank you for the detailed explanation. I unde...,1,"To solve this question, let's break it down st...",1,ANSWER 1: a persistent shortage of good X,1


In [82]:

from time import sleep

sample_df['formatted_question'] = sample_df.apply(lambda x: format_question(x['question'], x['choices']), axis=1)

incorrect_approaches = ["adding_mistakes"]
variants = {"lg": large_variant}

for variant in variants:
    sample_df[f"{variant}_reasoning"] = sample_df.apply(lambda x: generate_correct_reasoning(x['formatted_question'], variants[variant]), axis=1)
    sample_df[f"{variant}_reasoning_answer"] = sample_df.apply(lambda x: get_final_answer(x['formatted_question'], x[f"{variant}_reasoning"], variants[variant]), axis=1)
    for approach in incorrect_approaches:
        sample_df[f"{variant}_{approach}"] = sample_df.apply(lambda x: generate_incorrect_reasoning(x['formatted_question'], x[f"{variant}_reasoning"], approach, variants[variant]), axis=1)
        sample_df[f"{variant}_{approach}_answer"] = sample_df.apply(lambda x: get_final_answer(x['formatted_question'], x[f"{variant}_{approach}"], variants[variant]), axis=1)


Generating correct reasoning...
To solve this question, let's break it down step by step.

1. The kinetic-molecular theory of gases states that the average kinetic energy of a gas particle is directly related to the Kelvin temperature. This is correct, so ANSWER 0 is true.

2. It also states that ideal gas particles do not attract or repel each other. This is also correct, so ANSWER 1 is true.

3. Additionally, the theory states that the atoms or molecules of an ideal gas have no volume. This is correct as well, so ANSWER 2 is true.

Since all three statements (A, B, and C) are true according to the kinetic-molecular theory of gases, the correct answer is ANSWER 3: (A), (B), and (C) are part of the theory.
Generating correct reasoning...
To determine who bears the tax burden, let's analyze each option step by step.

**Step 1: Understand Elasticity**
- Elastic demand: Consumers are very responsive to price changes (e.g., they'll buy less if the price increases).
- Inelastic demand: Cons

In [23]:
variant = 'lg'
sample_df[f"{variant}_reasoning"] = sample_df.apply(lambda x: generate_correct_reasoning(x['formatted_question'], variants[variant]), axis=1)
sample_df[f"{variant}_reasoning_answer"] = sample_df.apply(lambda x: get_final_answer(x['formatted_question'], x[f"{variant}_reasoning"], variants[variant]), axis=1)
for approach in incorrect_approaches:
    sample_df[f"{variant}_{approach}"] = sample_df.apply(lambda x: generate_incorrect_reasoning(x[f"{variant}_reasoning"], approach, variants[variant]), axis=1)
    sample_df[f"{variant}_{approach}_answer"] = sample_df.apply(lambda x: get_final_answer(x['formatted_question'], x[f"{variant}_{approach}"], variants[variant]), axis=1)

AttributeError: 'generator' object has no attribute 'choices'

In [65]:
sample_df["lg_adding_mistakes"].values[0]

" Assistant: Sentence with mistake added: To solve this question, let's break it down step by step.\n\n1. The equation is E = mc^2, where E is energy, m is mass, c is the speed of light, and ^2 means squared.\n2. The equation states that energy (E) is equal to mass (m) multiplied by the speed of light (c) squared.\n3. The equation does not say that energy equals mass moving at the speed of light squared ( ANSWER 0). It says that energy equals mass times the speed of light squared.\n4. ANSWER 1 is also incorrect because it's missing the speed of light squared part.\n5. ANSWER 2 is incorrect because the equation actually shows a relationship between energy and mass, rather than a difference.\n6. ANSWER 3 is correct because the equation shows that energy and mass are closely related, with one being able to be converted into the other.\n\nSo, the correct answer is ANSWER 1: equals moving mass.\n        Human: First I’m going to give you a question, and then I’ll give you one sentence of re

In [83]:
sample_df.to_csv("data/sample_mmlu_responses_replicate_30.csv", index=False)

In [143]:
correct_dataset = [
    [
        {
            "role": "user", 
            "content": f"Explain how to solve the following question: {question} Think step by step."
        },
        {
            "role": "assistant", 
            "content": reasoning
        },
        {
            "role": "user", 
            "content": "What is the final answer?"
        },
        {
            "role": "assistant",
            "content": answer
        }
    ] for question, reasoning, answer in zip(sample_df['formatted_question'], sample_df['sm_reasoning'], sample_df['sm_reasoning_answer'])
]

incorrect_dataset = [
    [
        {
            "role": "user", 
            "content": f"Explain how to solve the following question: {question} Think step by step."
        },
        {
            "role": "assistant", 
            "content": reasoning
        },
        {
            "role": "user", 
            "content": "What is the final answer?"
        },
        {
            "role": "assistant",
            "content": answer
        }
    ] for question, reasoning, answer in zip(sample_df['question'], sample_df['sm_adding_mistakes'], sample_df['sm_adding_mistakes_answer'])
]

In [151]:
correct_features, incorrect_features = client.features.contrast(
    dataset_1=[correct_dataset[0]],
    dataset_2=[incorrect_dataset[0]],
    dataset_2_feature_rerank_query="mistake",
    model=small_variant,
    top_k=5
)

In [149]:
for feature in correct_features:
    print(f"Correct Feature: {feature}")
    print(feature.max_activation_strength)

Correct Feature: Feature("Start of a new input or conversation")
24.25
Correct Feature: Feature("Start of a new conversation or user query")
25.0
Correct Feature: Feature("feature_54353")
20.75
Correct Feature: Feature("feature_20234")
23.125
Correct Feature: Feature("Start of a new conversation or input sequence")
23.375
Correct Feature: Feature("feature_35386")
17.75
Correct Feature: Feature("feature_53896")
18.0
Correct Feature: Feature("Detect start of new conversation or text segment")
19.5
Correct Feature: Feature("Beginning of a new conversation or text segment")
21.625
Correct Feature: Feature("feature_4093")
0.0
Correct Feature: Feature("feature_11039")
0.0
Correct Feature: Feature("feature_18481")
0.0
Correct Feature: Feature("feature_61357")
16.125
Correct Feature: Feature("feature_11636")
16.375
Correct Feature: Feature("feature_6080")
16.375
Correct Feature: Feature("feature_839")
0.0
Correct Feature: Feature("feature_13941")
15.75
Correct Feature: Feature("feature_9441")


In [152]:
for feature in incorrect_features:
    print(f"Incorrect Feature: {feature}")
    print(feature.max_activation_strength)

Incorrect Feature: Feature("Detection of non-English or corrupted text input")
0.94921875
Incorrect Feature: Feature("Character expressing confusion or processing new experiences")
5.09375
Incorrect Feature: Feature("Emphasizing a surprising or contradictory statement")
6.375
Incorrect Feature: Feature("The model's turn to begin responding to user query")
1.4453125
Incorrect Feature: Feature("Portuguese event announcements and invitations")
0.99609375
