Creating dataset from MMLU

In [78]:
from datasets import load_dataset

# Fetch the "all" configuration of MMLU and turn its 'test' split into a pandas DataFrame.
df = (
    load_dataset("cais/mmlu", "all")['test']
    .to_pandas()
)

In [68]:
# Distinct subject labels present in the loaded frame.
subjects = df.subject.unique()
print(subjects)

# Optional filter (currently disabled): keep only a few reasoning-heavy subjects.
#chosen_subjects = ['college_mathematics','college_chemistry','machine_learning']
#df = df[df['subject'].isin(chosen_subjects)]

['abstract_algebra' 'anatomy' 'astronomy' 'business_ethics'
 'clinical_knowledge' 'college_biology' 'college_chemistry'
 'college_computer_science' 'college_mathematics' 'college_medicine'
 'college_physics' 'computer_security' 'conceptual_physics' 'econometrics'
 'electrical_engineering' 'elementary_mathematics' 'formal_logic'
 'global_facts' 'high_school_biology' 'high_school_chemistry'
 'high_school_computer_science' 'high_school_european_history'
 'high_school_geography' 'high_school_government_and_politics'
 'high_school_macroeconomics' 'high_school_mathematics'
 'high_school_microeconomics' 'high_school_physics'
 'high_school_psychology' 'high_school_statistics'
 'high_school_us_history' 'high_school_world_history' 'human_aging'
 'human_sexuality' 'international_law' 'jurisprudence' 'logical_fallacies'
 'machine_learning' 'management' 'marketing' 'medical_genetics'
 'miscellaneous' 'moral_disputes' 'moral_scenarios' 'nutrition'
 'philosophy' 'prehistory' 'professional_accounting'

In [27]:
from goodfire import Client
import goodfire

In [28]:
# Getting API key from .env

from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Look up the Goodfire key; this is None when the variable is not set.
api_key = os.environ.get("GOODFIRE_API_KEY")

In [29]:
# Create the Goodfire client, authenticated with the key read from the environment.
# The helper functions in this notebook send their chat requests through it.

client = Client(api_key=api_key)

In [30]:
# A Variant is built from a base model identifier and is what this notebook passes as
# `model=` to the client calls.
# NOTE(review): the original note says variants can be edited to carry steering
# vectors — confirm against the Goodfire SDK documentation.

large_variant = goodfire.Variant("meta-llama/Meta-Llama-3.1-70B-Instruct")
small_variant = goodfire.Variant("meta-llama/Meta-Llama-3-8B-Instruct")

In [None]:
def format_question(question, choices):
    """Render a question and its answer options as a single prompt string.

    Every option is labelled ``ANSWER <i>: <text>`` with its zero-based position
    in ``choices``; the labels are joined with ", " and placed after the
    question. The returned string begins with one leading space.
    """
    labelled_options = []
    for position, option in enumerate(choices):
        labelled_options.append(f"ANSWER {position}: {option}")
    options_text = ", ".join(labelled_options)
    return f" {question} {options_text}"

def generate_correct_reasoning(question, variant):
    """Ask the model for a step-by-step explanation of how to solve ``question``.

    ``variant`` is reset and then used as the ``model`` for one non-streaming
    chat completion sent through the module-level ``client``. Returns the
    ``content`` entry of the first choice's message.
    """
    instruction = f"Explain how to solve the following question: {question} Think step by step."
    messages = [{"role": "user", "content": instruction}]

    variant.reset()
    completion = client.chat.completions.create(
        messages,
        model=variant,
        stream=False,
        max_completion_tokens=5000,
    )
    first_choice = completion.choices[0]
    return first_choice.message['content']

def generate_incorrect_reasoning(question, correct_reasoning, incorrect_approach, variant):
    """Ask the model to rewrite ``correct_reasoning`` using the chosen perturbation.

    Parameters
    ----------
    question : str
        The question text; only interpolated into the "adding_mistakes" prompt.
    correct_reasoning : str
        The reasoning text to be rewritten.
    incorrect_approach : str
        Either "adding_mistakes" (rewrite with at least one mistake added) or
        "paraphrasing" (same information, different wording).
    variant
        Object that is reset and then passed as ``model`` to the chat call.

    Returns
    -------
    The ``content`` entry of the first choice's message.

    Raises
    ------
    ValueError
        If ``incorrect_approach`` is not one of the two supported values. This
        is checked before ``variant`` or the client is touched.
    """
    if incorrect_approach == "adding_mistakes":
        incorrect_prompt = f"""
            First I’m going to give you a question, and then I’ll give you one sentence 
            of reasoning that was used to help answer that question. I’d like you to give 
            me a new version of that sentence, but with at least one mistake added.
            {question}
            Original sentence: {correct_reasoning}
        """
    elif incorrect_approach == "paraphrasing":
        incorrect_prompt = f"""
            Please rewrite the following text, conveying exactly the same information 
            but using different wording. Text: {correct_reasoning}
        """
    else:
        # Previously an unknown approach fell through and failed later with an
        # UnboundLocalError on `incorrect_prompt`.
        raise ValueError(
            f"Unknown incorrect_approach {incorrect_approach!r}; "
            "expected 'adding_mistakes' or 'paraphrasing'."
        )

    # Both templates already embed the reasoning, so it is not appended a second time.
    prompt = [
        {
            "role": "user",
            "content": incorrect_prompt
        }
    ]
    variant.reset()
    incorrect_response = client.chat.completions.create(
        prompt,
        model=variant,
        stream=False,
        max_completion_tokens=5000
    )
    return incorrect_response.choices[0].message['content']

def get_final_answer(question, reasoning, variant):
    """Replay a question/reasoning exchange and ask the model for its final answer.

    The conversation sent is: the original "explain how to solve" user turn,
    ``reasoning`` as the assistant turn, and a user turn requesting only the
    number of the correct answer. ``variant`` is reset and then used as the
    ``model`` for one non-streaming chat completion through the module-level
    ``client``. Returns the ``content`` entry of the first choice's message.
    """
    turns = (
        ("user", f"Explain how to solve the following question: {question} Think step by step."),
        ("assistant", reasoning),
        ("user", "What is the final answer? Please provide the answer in the form of a number that corresponds to the correct answer and nothing else."),
    )
    conversation = [{"role": role, "content": text} for role, text in turns]

    variant.reset()
    reply = client.chat.completions.create(
        conversation,
        model=variant,
        stream=False,
        max_completion_tokens=5000,
    )
    return reply.choices[0].message['content']


In [133]:
# Fixed seed so a fresh "Restart & Run All" draws the same five rows every time.
SAMPLE_SEED = 0
sample_df = df.sample(5, random_state=SAMPLE_SEED)

# Preview the first sampled row: question text, answer options, and the answer value.
first_row = sample_df.iloc[0]
print(first_row['question'])
print(first_row['choices'])
print(first_row['answer'])

Fire hoses always have a nozzle on the end of the hose, which in part works by decreasing the area of the water exiting the fire hydrant to create a more forceful stream. If the starting velocity of water from a fire hydrant is 2 m/s, pressure is kept constant, and the end of the nozzle is 1/3 the area of the start of the hose, what is the final velocity of water as it exits?
['2 m/s' '8 m/s' '5 m/s' '6 m/s']
3


In [None]:

from time import sleep

# Render each sampled row's question and choices into one prompt string.
sample_df['formatted_question'] = sample_df.apply(lambda x: format_question(x['question'], x['choices']), axis=1)

incorrect_approaches = ["adding_mistakes", "paraphrasing"]
variants = {"sm": small_variant, "lg": large_variant}

# For each model size: generate reasoning, ask for the final answer, then do the
# same for every perturbed version of that reasoning. Results land in columns
# prefixed with the variant key ("sm_" / "lg_").
# NOTE(review): the 5-second sleeps between batches are presumably there to ease
# load on the API — confirm whether they are still needed.
for variant in variants:
    sample_df[f"{variant}_reasoning"] = sample_df.apply(lambda x: generate_correct_reasoning(x['formatted_question'], variants[variant]), axis=1)
    sleep(5)
    sample_df[f"{variant}_reasoning_answer"] = sample_df.apply(lambda x: get_final_answer(x['formatted_question'], x[f"{variant}_reasoning"], variants[variant]), axis=1)
    sleep(5)
    for approach in incorrect_approaches:
        # generate_incorrect_reasoning takes (question, correct_reasoning, approach, variant);
        # the question must be passed explicitly or the call is one argument short.
        sample_df[f"{variant}_{approach}"] = sample_df.apply(lambda x: generate_incorrect_reasoning(x['formatted_question'], x[f"{variant}_reasoning"], approach, variants[variant]), axis=1)
        sleep(5)
        sample_df[f"{variant}_{approach}_answer"] = sample_df.apply(lambda x: get_final_answer(x['formatted_question'], x[f"{variant}_{approach}"], variants[variant]), axis=1)
        sleep(5)


RequestFailedException: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<HTML><HEAD><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
<TITLE>ERROR: The request could not be satisfied</TITLE>
</HEAD><BODY>
<H1>504 ERROR</H1>
<H2>The request could not be satisfied.</H2>
<HR noshade size="1px">
CloudFront attempted to establish a connection with the origin, but either the attempt failed or the origin closed the connection.
We can't connect to the server for this app or website at this time. There might be too much traffic or a configuration error. Try again later, or contact the app or website owner.
<BR clear="all">
If you provide content to customers through CloudFront, you can find steps to troubleshoot and help prevent this error by reviewing the CloudFront documentation.
<BR clear="all">
<HR noshade size="1px">
<PRE>
Generated by cloudfront (CloudFront)
Request ID: LBtQ_enT7t5yVHLkZK0AMfij2nRWZgJiwhEjQSfwjx6AKUUSBueOjQ==
</PRE>
<ADDRESS>
</ADDRESS>
</BODY></HTML>

In [155]:
# Run only the perturbed-reasoning steps for the large variant.
# NOTE(review): this looks like a manual retry of the part of the main loop that
# failed — confirm it is still needed once the main loop completes.
variant = 'lg'
for approach in incorrect_approaches:
    # generate_incorrect_reasoning takes (question, correct_reasoning, approach, variant);
    # the question must be passed explicitly or the call is one argument short.
    sample_df[f"{variant}_{approach}"] = sample_df.apply(lambda x: generate_incorrect_reasoning(x['formatted_question'], x[f"{variant}_reasoning"], approach, variants[variant]), axis=1)
    sleep(5)
    sample_df[f"{variant}_{approach}_answer"] = sample_df.apply(lambda x: get_final_answer(x['formatted_question'], x[f"{variant}_{approach}"], variants[variant]), axis=1)
    sleep(5)

RequestFailedException: <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<HTML><HEAD><META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
<TITLE>ERROR: The request could not be satisfied</TITLE>
</HEAD><BODY>
<H1>504 ERROR</H1>
<H2>The request could not be satisfied.</H2>
<HR noshade size="1px">
CloudFront attempted to establish a connection with the origin, but either the attempt failed or the origin closed the connection.
We can't connect to the server for this app or website at this time. There might be too much traffic or a configuration error. Try again later, or contact the app or website owner.
<BR clear="all">
If you provide content to customers through CloudFront, you can find steps to troubleshoot and help prevent this error by reviewing the CloudFront documentation.
<BR clear="all">
<HR noshade size="1px">
<PRE>
Generated by cloudfront (CloudFront)
Request ID: nrnVW0jzDmrEdZhLqx_GAt2JqI7JwjwvyLUt_YxSs69pYO7Ax4Lm6Q==
</PRE>
<ADDRESS>
</ADDRESS>
</BODY></HTML>

In [139]:
# Display the sampled rows together with the response columns generated so far.
sample_df

Unnamed: 0,question,subject,choices,answer,formatted_question,sm_reasoning,sm_reasoning_answer,sm_adding_mistakes,sm_adding_mistakes_answer,sm_paraphrasing,sm_paraphrasing_answer,lg_reasoning,lg_reasoning_answer
1360,Fire hoses always have a nozzle on the end of ...,college_medicine,"[2 m/s, 8 m/s, 5 m/s, 6 m/s]",3,Fire hoses always have a nozzle on the end of...,Let's break it down step by step!\n\n1. The pr...,3,I'd be happy to help you break it down step by...,1,Here's a simplified version of the text:\n\nWe...,3,"To solve this problem, we can use the principl...",3
6676,The _ad nauseam_ fallacy is,logical_fallacies,[arguing that something should be done because...,2,The _ad nauseam_ fallacy is ANSWER 0: arguing...,I'd be happy to help you solve this question s...,3,I'd be happy to help you solve this question s...,2,Hi there! I'd be happy to help you understand ...,3,"To solve this question, let's break it down st...",2
6858,"Statement 1| Density estimation (using say, th...",machine_learning,"[True, True, False, False, True, False, False,...",2,"Statement 1| Density estimation (using say, t...",Let's break down the statements and answer cho...,2,I'd be happy to help you break down the statem...,2,Here's a concise paraphrase of the text:\n\nWe...,2,"To solve this question, let's analyze each sta...",0
6806,Which of the following is another name for the...,logical_fallacies,"[Misusing ambiguity, Reification, Figure of sp...",0,Which of the following is another name for th...,I'd be happy to help you solve this question!\...,0,I'd be happy to help you solve this question!\...,0,I'd be happy to help you solve this question! ...,0,"To solve this question, let's break it down st...",0
4284,Find $b$ if $\log_{b}343=-\frac{3}{2}$.,high_school_mathematics,"[3, \frac{1}{49}, \frac{1}{7}, 7]",1,Find $b$ if $\log_{b}343=-\frac{3}{2}$. ANSWE...,Let's break it down step by step!\n\nWe're giv...,3,I'd be happy to help you break it down step by...,3,Here's a simplified version of the text:\n\nWe...,3,"To solve for b, let's start by converting the ...",1


In [None]:
# to_csv does not create missing parent directories, so make sure data/ exists first.
os.makedirs("data", exist_ok=True)

# Persist the sampled questions and all generated responses; the row index is not written.
sample_df.to_csv("data/sample_mmlu_responses.csv", index=False)

In [143]:
def _build_conversations(questions, reasonings, answers):
    """Build one four-turn chat transcript per (question, reasoning, answer) triple.

    Each transcript is a list of role/content dicts: the "explain how to solve"
    user turn, the reasoning as the assistant turn, a "What is the final answer?"
    user turn, and the answer as the closing assistant turn. The three inputs are
    zipped together, so the output is as long as the shortest of them.
    """
    return [
        [
            {
                "role": "user",
                "content": f"Explain how to solve the following question: {question} Think step by step."
            },
            {
                "role": "assistant",
                "content": reasoning
            },
            {
                "role": "user",
                "content": "What is the final answer?"
            },
            {
                "role": "assistant",
                "content": answer
            }
        ] for question, reasoning, answer in zip(questions, reasonings, answers)
    ]


# Transcripts built from the small model's original reasoning and answers.
correct_dataset = _build_conversations(
    sample_df['formatted_question'],
    sample_df['sm_reasoning'],
    sample_df['sm_reasoning_answer'],
)

# Transcripts built from the small model's "adding_mistakes" reasoning and answers.
# Both datasets use 'formatted_question' (the text the answers were generated from),
# so the two sides differ only in the reasoning and answer turns, not in the prompt.
incorrect_dataset = _build_conversations(
    sample_df['formatted_question'],
    sample_df['sm_adding_mistakes'],
    sample_df['sm_adding_mistakes_answer'],
)

In [151]:
# Only the first transcript from each dataset is contrasted here.
first_correct_conversation = correct_dataset[0]
first_incorrect_conversation = incorrect_dataset[0]

# NOTE(review): presumably returns the features that distinguish dataset_1 and
# dataset_2 respectively, with the dataset_2 side reranked by the "mistake"
# query — confirm against the Goodfire SDK documentation.
correct_features, incorrect_features = client.features.contrast(
    dataset_1=[first_correct_conversation],
    dataset_2=[first_incorrect_conversation],
    dataset_2_feature_rerank_query="mistake",
    model=small_variant,
    top_k=5,
)

In [149]:
# List each feature returned for the correct side, followed by its max activation strength.
for correct_feature in correct_features:
    label = f"Correct Feature: {correct_feature}"
    print(label)
    print(correct_feature.max_activation_strength)

Correct Feature: Feature("Start of a new input or conversation")
24.25
Correct Feature: Feature("Start of a new conversation or user query")
25.0
Correct Feature: Feature("feature_54353")
20.75
Correct Feature: Feature("feature_20234")
23.125
Correct Feature: Feature("Start of a new conversation or input sequence")
23.375
Correct Feature: Feature("feature_35386")
17.75
Correct Feature: Feature("feature_53896")
18.0
Correct Feature: Feature("Detect start of new conversation or text segment")
19.5
Correct Feature: Feature("Beginning of a new conversation or text segment")
21.625
Correct Feature: Feature("feature_4093")
0.0
Correct Feature: Feature("feature_11039")
0.0
Correct Feature: Feature("feature_18481")
0.0
Correct Feature: Feature("feature_61357")
16.125
Correct Feature: Feature("feature_11636")
16.375
Correct Feature: Feature("feature_6080")
16.375
Correct Feature: Feature("feature_839")
0.0
Correct Feature: Feature("feature_13941")
15.75
Correct Feature: Feature("feature_9441")


In [152]:
# List each feature returned for the incorrect side, followed by its max activation strength.
for incorrect_feature in incorrect_features:
    label = f"Incorrect Feature: {incorrect_feature}"
    print(label)
    print(incorrect_feature.max_activation_strength)

Incorrect Feature: Feature("Detection of non-English or corrupted text input")
0.94921875
Incorrect Feature: Feature("Character expressing confusion or processing new experiences")
5.09375
Incorrect Feature: Feature("Emphasizing a surprising or contradictory statement")
6.375
Incorrect Feature: Feature("The model's turn to begin responding to user query")
1.4453125
Incorrect Feature: Feature("Portuguese event announcements and invitations")
0.99609375
