# MMLU Evaluation


In [5]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

In [1]:
import pandas as pd
from datasets import load_dataset

# Load MMLU high_school_physics dataset from HuggingFace
# This will download and properly load the parquet files
print("Loading MMLU high_school_physics dataset...")
mmlu_dataset = load_dataset("cais/mmlu", "high_school_physics")


  from .autonotebook import tqdm as notebook_tqdm


Loading MMLU high_school_physics dataset...


In [2]:
# Convert to pandas DataFrames
test_df = mmlu_dataset['test'].to_pandas()
validation_df = mmlu_dataset['validation'].to_pandas()
dev_df = mmlu_dataset['dev'].to_pandas()


In [3]:
# Display basic info
print("Test set shape:", test_df.shape)
print("Validation set shape:", validation_df.shape)
print("Dev set shape:", dev_df.shape)
print("\nTest set columns:", test_df.columns.tolist())
print("\nFirst few rows of test set:")
test_df.head()

Test set shape: (151, 4)
Validation set shape: (17, 4)
Dev set shape: (5, 4)

Test set columns: ['question', 'subject', 'choices', 'answer']

First few rows of test set:


Unnamed: 0,question,subject,choices,answer
0,The plates of a capacitor are charged to a pot...,high_school_physics,"[0.005 C, 0.01 C, 0.02 C, 0.5 C]",1
1,Which of these quantities decreases as the inv...,high_school_physics,[the electric field produced by a finite-lengt...,0
2,"A solid, metal object is isolated from other c...",high_school_physics,"[electric field outside the object is zero, th...",3
3,Standing waves are produced by a 100-Hz genera...,high_school_physics,"[Less, because the tension in the string varie...",2
4,Two identical capacitors are hooked in paralle...,high_school_physics,"[I only, II only, II and III only, (E) I, II, ...",3


In [4]:
print(f"question: {test_df.iloc[0]['question']}")
print(f"answer: {test_df.iloc[0]['answer']}")
print(f"options: {test_df.iloc[0]['choices']}")


question: The plates of a capacitor are charged to a potential difference of 5 V. If the capacitance is 2 mF, what is the charge on the positive plate?
answer: 1
options: ['0.005 C' '0.01 C' '0.02 C' '0.5 C']


## Langchain agent for Physics


In [7]:
from langchain.agents import create_agent

In [18]:
agent = create_agent(
    model="openai:gpt-5-nano",
    tools=[],
    system_prompt=(
        "You are an agent that can answer questions about physics. "
        "You only reply with the answer, not any other text."
        "For positive numbers, do not include the + sign."
    )
)

In [19]:
def get_agent_response(question) -> str:
    response = agent.invoke(
        {"messages": [{"role": "user", "content": question}]}
    )
    return response["messages"][-1].content

In [None]:
for index, row in test_df.iloc[:10].iterrows():
    question = row['question']
    # answer = row['answer']
    # options = row['choices']
    # print(f"question: {question}")
    # print(f"answer: {answer}")
    # print(f"options: {options}")
    print(f"answer: {row['options'][row['answer']]}")

    response = get_agent_response(question)
    print(f"response: {response}")

question: The plates of a capacitor are charged to a potential difference of 5 V. If the capacitance is 2 mF, what is the charge on the positive plate?
answer: 1
options: ['0.005 C' '0.01 C' '0.02 C' '0.5 C']
response: 0.01 C
question: Which of these quantities decreases as the inverse square of distance for distances far from the objects producing the fields?
answer: 0
options: ['the electric field produced by a finite-length charged rod'
 'the electric field produced by an infinitely long charged cylinder'
 'the electric field produced by an infinite plane of charge'
 'the magnetic field produced by an infinitely long, straight current-carrying wire']
response: electric field and gravitational field
question: A solid, metal object is isolated from other charges and has charge distributed on its surface. The charge distribution is not uniform. It may be correctly concluded that the
answer: 3
options: ['electric field outside the object is zero'
 'the electric field outside the object 

KeyboardInterrupt: 