# LLM Experiments

## Setup

In [6]:
# !conda activate capstone
# !pip install langchain langchain-openai langchain-anthropic pandas

### Imports

In [7]:
!pip install openai
!pip install anthropic
!pip install langsmith
!pip install langchain
!pip install langchain-openai
!pip install langchain-anthropic
!pip install pandas
!pip install numpy
!pip install re
!pip install summarytools

[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for re[0m[31m


In [8]:
import os
import importlib
import config
from utilities import *
from prompts import *
import pandas as pd
import numpy as np
import re
from evaluators import Evaluator
# LangChain
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain.callbacks.tracers import LangChainTracer
from langchain.callbacks.manager import CallbackManager
from langchain import hub
from langsmith import evaluate
from langchain_core.output_parsers import JsonOutputParser

from summarytools import dfSummary

# Re-import config so that edits made to config.py after the kernel started are picked up
importlib.reload(config)

# Export the LangChain/LangSmith and OpenAI settings held in config as environment variables.
# NOTE(review): no Anthropic key is exported here although ChatAnthropic models are created
# further down - presumably it is already present in the shell environment; confirm.
os.environ.update({
    "LANGCHAIN_API_KEY": config.LANGCHAIN_API_KEY,
    "LANGCHAIN_ENDPOINT": config.LANGCHAIN_ENDPOINT,
    "LANGCHAIN_PROJECT": config.LANGCHAIN_PROJECT,
    "OPENAI_API_KEY": config.OPENAI_API_KEY,
})



### Define evaluators

In [9]:
# Build a single Evaluator and collect the three bound methods that are passed
# as `evaluators=` to every `evaluate(...)` run below
evaluator = Evaluator()

evaluators = [
    getattr(evaluator, method_name)
    for method_name in (
        "evaluate_answer_match",
        "evaluate_llm_correct",
        "evaluate_correct_match",
    )
]

### Pandas Config

In [10]:
# Lift pandas' display limits (None = unlimited) so that long text cells, all rows and
# the full frame width are rendered without truncation
for display_option in ("display.max_colwidth", "display.max_rows", "display.width"):
    pd.set_option(display_option, None)

### Create Models

In [11]:
# Tracer that reports runs to the LangSmith project named in the environment
tracer = LangChainTracer(project_name=os.environ["LANGCHAIN_PROJECT"])

# All models below share this callback manager, so every call goes through the tracer
callback_manager = CallbackManager([tracer])

# Keyword sets shared by several of the models
_sampled_kwargs = {"temperature": 0.65, "callbacks": callback_manager}
# The o1 models are configured with temperature=1
# (presumably because they do not accept other values - TODO confirm)
_o1_kwargs = {"temperature": 1, "callbacks": callback_manager}
_claude_kwargs = {
    "temperature": 0.65,
    "timeout": None,
    "stop": None,
    "callbacks": callback_manager,
}

# OpenAI chat models
gpt4o = ChatOpenAI(model="gpt-4o", **_sampled_kwargs)
gpt4o_mini = ChatOpenAI(model="gpt-4o-mini", **_sampled_kwargs)
o1_preview = ChatOpenAI(model="o1-preview", **_o1_kwargs)
o1_mini = ChatOpenAI(model="o1-mini", **_o1_kwargs)

# Anthropic chat models
sonnet = ChatAnthropic(model_name="claude-3-5-sonnet-latest", **_claude_kwargs)
haiku = ChatAnthropic(model_name="claude-3-5-haiku-latest", **_claude_kwargs)

## Data Preparation

### Load Data

In [13]:
# Read the master table (answers with all precomputed stats) from the data folder
master_csv_path = '../data/new/master_with_all_stats.csv'
df = pd.read_csv(master_csv_path)

In [7]:
# Show summarytools' per-column overview (type, stats/values, frequencies, missing counts)
# of the freshly loaded frame
dfSummary(df)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,answer_id [int64],Mean (sd) : 1462230.9 (80782.5) min < med < max: 1321329.0 < 1462881.5 < 1603613.0 IQR (CV) : 138087.8 (18.1),"198,760 distinct values",,0 (0.0%)
2,is_correct [bool],1. True 2. False,"141,207 (71.0%) 57,553 (29.0%)",,0 (0.0%)
3,created_at [object],1. 2022-10-20 02:28:52.516332 2. 2022-04-12 16:22:41.496974 3. 2022-11-21 15:15:37.049043 4. 2022-11-21 15:16:04.509651 5. 2022-11-21 15:18:53.087061 6. 2022-11-21 15:19:44.303705 7. 2022-11-21 15:21:32.203750 8. 2022-11-21 15:21:41.106585 9. 2022-11-21 15:22:12.258436 10. 2022-11-21 15:24:35.600818 11. other,"1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 198,750 (100.0%)",,0 (0.0%)
4,user_id [int64],Mean (sd) : 38016.7 (15961.1) min < med < max: 2315.0 < 50104.0 < 53189.0 IQR (CV) : 29891.0 (2.4),"1,417 distinct values",,0 (0.0%)
5,grade_id [float64],Mean (sd) : 1891.1 (93.4) min < med < max: 777.0 < 1868.0 < 2029.0 IQR (CV) : 71.0 (20.3),83 distinct values,,"29,500 (14.8%)"
6,grade_name [object],1. nan 2. 7° A 2022 3. III°A 2022 4. 8°A 2022 5. IV° Sección Única 6. II°A 2022 7. I°A 2022 8. III°B 2022 9. I°B 2022 10. III° Medio 2022 11. other,"29,500 (14.8%) 15,828 (8.0%) 14,223 (7.2%) 14,007 (7.0%) 13,290 (6.7%) 11,561 (5.8%) 10,856 (5.5%) 9,877 (5.0%) 8,947 (4.5%) 8,629 (4.3%) 62,042 (31.2%)",,"29,500 (14.8%)"
7,school_id [float64],Mean (sd) : 97.9 (64.5) min < med < max: 1.0 < 91.0 < 238.0 IQR (CV) : 48.0 (1.5),20 distinct values,,"29,500 (14.8%)"
8,school_name [object],1. Colegio Manuel José Irarrázava 2. nan 3. Colegio San Joaquín Astoreca 4. San Miguel Arcángel 5. Colegio San Lorenzo 6. Liceo Puente Alto 7. San Juan de Lampa 8. PuntoCoach 9. Colegio Profesor Carlos del Va 10. Colegio San Francisco de Asís 11. other,"72,129 (36.3%) 29,500 (14.8%) 22,162 (11.2%) 17,577 (8.8%) 12,185 (6.1%) 12,070 (6.1%) 11,906 (6.0%) 4,304 (2.2%) 4,141 (2.1%) 3,516 (1.8%) 9,270 (4.7%)",,"29,500 (14.8%)"
9,user_level [float64],Mean (sd) : -0.4 (1.6) min < med < max: -3.0 < -0.5 < 3.0 IQR (CV) : 2.3 (-0.3),"27,204 distinct values",,0 (0.0%)
10,question_id [int64],Mean (sd) : 22645.0 (4078.5) min < med < max: 15654.0 < 23438.0 < 29256.0 IQR (CV) : 6985.0 (5.6),"9,321 distinct values",,0 (0.0%)


### Add Skill Percentile Column

In [14]:
# Convert user_level to a percentile rank (0-100): the share of non-missing levels
# that are strictly below each row's level.
# First, remove any NaN values for the calculation
valid_levels = df['user_level'].dropna()

# rank(method='min') is 1 + the number of strictly smaller values, so subtracting 1
# yields the same "count of values below x" as comparing every value against all the
# others - without the quadratic scan that comparison costs on ~200k rows.
ranks = valid_levels.rank(method='min') - 1

# Convert to percentiles (rank / total count * 100), round to whole numbers, and
# realign to df's index so that rows with a missing user_level receive NaN.
percentile_ranks = (ranks / len(valid_levels) * 100).round().reindex(df.index)

# Add as new column
df['user_level_percentile'] = percentile_ranks


### Extract Options
The original dataset had options in a single column, with multiple formatted options separated by newlines.
Here are some examples of the formatting:

Example 1: ["\`144π cm^3\`", "\`162π cm^3\`", "\`216π cm^3\`", "\`432π cm^3\`", "\`288π cm^3\`"]

Example 2: "---\n- \"\`10\`\"\n- \"\`1/10\`\"\n- \"\`7/10\`\"\n- \"\`-7/10\`\"\n- \"\`-1/10\`\"\n"

We use the `extract_options` function from utilities.py to parse these options into separate columns.


In [15]:
# Parse every raw `options` value with extract_options and spread the resulting
# five entries over one column per option letter
option_columns = ['option_a', 'option_b', 'option_c', 'option_d', 'option_e']
parsed_options = df['options'].apply(extract_options).tolist()
df[option_columns] = pd.DataFrame(parsed_options, index=df.index)

Error parsing YAML string: while scanning for the next token
found character '`' that cannot start any token
  in "<unicode string>", line 2, column 11:
    - \"`4` : `8`\"
              ^
Error parsing YAML string: while scanning for the next token
found character '`' that cannot start any token
  in "<unicode string>", line 2, column 11:
    - \"`4` : `8`\"
              ^
Error parsing YAML string: while scanning a simple key
  in "<unicode string>", line 3, column 1:
    e 1` , `b = 3` y `c = 0`\"
    ^
could not find expected ':'
  in "<unicode string>", line 4, column 1:
    - \"`a = 1`, `b` y `c` cualquier ... 
    ^
Error parsing YAML string: while scanning a simple key
  in "<unicode string>", line 3, column 1:
    e 1` , `b = 3` y `c = 0`\"
    ^
could not find expected ':'
  in "<unicode string>", line 4, column 1:
    - \"`a = 1`, `b` y `c` cualquier ... 
    ^
Error parsing YAML string: while scanning a simple key
  in "<unicode string>", line 3, column 1:
    e 1` , `b = 

Let's check if there are any rows where the correct_option doesn't match any of the options.

In [16]:
# For each row, check whether correct_option / student_answer is equal to at least
# one of the five parsed option columns
option_values = df[['option_a', 'option_b', 'option_c', 'option_d', 'option_e']]

correct_matches_mask = option_values.eq(df['correct_option'], axis=0).any(axis=1)
student_matches_mask = option_values.eq(df['student_answer'], axis=0).any(axis=1)

# Keep the flags on the frame as well
df['correct_option_matches'] = correct_matches_mask
df['student_answer_matches'] = student_matches_mask

# Report how many rows have no matching option in each case
correct_non_matching = (~correct_matches_mask).sum()
student_non_matching = (~student_matches_mask).sum()
print(f"Number of rows where correct_option doesn't match any option: {correct_non_matching}")
print(f"Number of rows where student_answer doesn't match any option: {student_non_matching}")


Number of rows where correct_option doesn't match any option: 442
Number of rows where student_answer doesn't match any option: 2676


And check whether there are any rows where at least one of the options is null.

In [17]:
# Flag every row that is missing at least one of the five parsed options
some_null_mask = (
    df[['option_a', 'option_b', 'option_c', 'option_d', 'option_e']]
    .isna()
    .any(axis=1)
)


Let's drop these rows.

In [18]:
# Keep only rows where all five options are present and both correct_option and
# student_answer match one of them.
# .copy() makes df an independent frame rather than a filtered slice of the original,
# so the column assignments in the following cells do not raise SettingWithCopyWarning.
df = df[~some_null_mask & correct_matches_mask & student_matches_mask].copy()

print(f"Total rows left: {len(df)}")


Total rows left: 196083


Create a new column that stores the letter of the correct answer. It is used later to compare the correct answer with the answer given by the LLM.

In [19]:
# Record which option letter (a-e) holds the correct option for each row
df['correct_option_letter'] = None

# Letters are visited a -> e, so if several options equal correct_option the last one wins
for option_letter in 'abcde':
    is_correct_option = df['correct_option'] == df[f'option_{option_letter}']
    df.loc[is_correct_option, 'correct_option_letter'] = option_letter


Create another column that stores the student_answer letter.

In [20]:
# Record which option letter (a-e) the student picked for each row
df['student_answer_letter'] = None

# Letters are visited a -> e, so if several options equal student_answer the last one wins
for option_letter in 'abcde':
    is_student_choice = df['student_answer'] == df[f'option_{option_letter}']
    df.loc[is_student_choice, 'student_answer_letter'] = option_letter


### Add age based on grade

In [22]:
# Lookup table mapping grade_id to student_age
grade_age = pd.read_csv('../data/original/grade_age.csv')

# Left-join the age onto every answer via grade_id (rows whose grade_id has no entry in
# the lookup keep a missing student_age), then drop the join key from the result
df = (
    df.merge(grade_age[['grade_id', 'student_age']], on='grade_id', how='left')
      .drop(columns='grade_id')
)

In [23]:
# Persist the cleaned, feature-augmented frame (index column omitted)
engineered_csv_path = '../data/new/master_with_engineered_features.csv'
df.to_csv(engineered_csv_path, index=False)


In [22]:
# Show summarytools' per-column overview again, now for the filtered and augmented frame
dfSummary(df)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,answer_id [int64],Mean (sd) : 1461728.6 (80772.7) min < med < max: 1321329.0 < 1462145.0 < 1603613.0 IQR (CV) : 138126.0 (18.1),"196,083 distinct values",,0 (0.0%)
2,is_correct [bool],1. True 2. False,"140,730 (71.8%) 55,353 (28.2%)",,0 (0.0%)
3,created_at [object],1. 2022-10-20 02:28:52.516332 2. 2022-04-19 16:25:07.329212 3. 2022-04-12 16:26:28.868878 4. 2022-04-12 16:27:26.915350 5. 2022-04-12 16:28:26.956191 6. 2022-04-12 16:28:39.628331 7. 2022-04-12 16:35:13.758899 8. 2022-04-12 16:35:49.761866 9. 2022-04-19 16:13:56.773047 10. 2022-04-19 16:14:22.544320 11. other,"1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 1 (0.0%) 196,073 (100.0%)",,0 (0.0%)
4,user_id [int64],Mean (sd) : 37926.6 (15973.8) min < med < max: 2315.0 < 50099.0 < 53189.0 IQR (CV) : 29836.0 (2.4),"1,417 distinct values",,0 (0.0%)
5,grade_name [object],1. nan 2. 7° A 2022 3. III°A 2022 4. 8°A 2022 5. IV° Sección Única 6. II°A 2022 7. I°A 2022 8. III°B 2022 9. I°B 2022 10. III° Medio 2022 11. other,"28,530 (14.5%) 15,722 (8.0%) 14,152 (7.2%) 13,882 (7.1%) 13,141 (6.7%) 11,444 (5.8%) 10,798 (5.5%) 9,825 (5.0%) 8,839 (4.5%) 8,588 (4.4%) 61,162 (31.2%)",,"28,530 (14.5%)"
6,school_id [float64],Mean (sd) : 97.7 (64.3) min < med < max: 1.0 < 91.0 < 238.0 IQR (CV) : 48.0 (1.5),20 distinct values,,"28,530 (14.5%)"
7,school_name [object],1. Colegio Manuel José Irarrázava 2. nan 3. Colegio San Joaquín Astoreca 4. San Miguel Arcángel 5. Colegio San Lorenzo 6. Liceo Puente Alto 7. San Juan de Lampa 8. PuntoCoach 9. Colegio Profesor Carlos del Va 10. Colegio San Francisco de Asís 11. other,"71,901 (36.7%) 28,530 (14.5%) 21,981 (11.2%) 17,343 (8.8%) 12,094 (6.2%) 11,983 (6.1%) 11,715 (6.0%) 4,141 (2.1%) 3,967 (2.0%) 3,406 (1.7%) 9,022 (4.6%)",,"28,530 (14.5%)"
8,user_level [float64],Mean (sd) : -0.4 (1.6) min < med < max: -3.0 < -0.5 < 3.0 IQR (CV) : 2.3 (-0.3),"27,158 distinct values",,0 (0.0%)
9,question_id [int64],Mean (sd) : 22661.9 (4073.1) min < med < max: 15654.0 < 23449.0 < 29256.0 IQR (CV) : 6936.0 (5.6),"9,253 distinct values",,0 (0.0%)
10,options [object],"1. ""---\n- a= 2 y b = 20\n- a= 4 2. ""---\n- \""`m-n`\""\n- \""`m+n`\"" 3. ""---\n- \""`x=5`\""\n- \""`x=3`\"" 4. ""---\n- \""`-25`\""\n- \""`-26`\"" 5. ""---\n- \""`-1`\""\n- \""`8/11`\"" 6. ""---\n- x = 17.000\n- x= 16.00 7. ""---\n- \""`\\\\frac{1}{4}`\""\n 8. ""---\n- \""`3`\""\n- \""`-9`\""\n- 9. ""---\n- \""`1`\""\n- \""`1/2`\""\n 10. ""---\n- '9'\n- '2'\n- '18'\n- 11. other","492 (0.3%) 439 (0.2%) 314 (0.2%) 292 (0.1%) 272 (0.1%) 264 (0.1%) 257 (0.1%) 254 (0.1%) 254 (0.1%) 254 (0.1%) 192,991 (98.4%)",,0 (0.0%)


In [24]:
# Pick ~20 random students stratified over ages 11-18, then draw 10 correct and
# 10 incorrect answers from them and save the result as the local test set.
# Every random draw is seeded so that re-running the cell reproduces the same sample.
SAMPLING_SEED = 42
N_STUDENTS = 20
STUDENTS_PER_AGE = 3
rng = np.random.default_rng(SAMPLING_SEED)

# Unique (student, age) pairs; students with a missing age cannot be stratified
student_ages = (
    df[['user_id', 'student_age']]
    .drop_duplicates()
    .dropna(subset=['student_age'])
)

# Draw up to STUDENTS_PER_AGE students for each age from 11 to 18 inclusive
picks_by_age = []
for age in range(11, 19):
    age_students = student_ages.loc[student_ages['student_age'] == age, 'user_id'].values
    if len(age_students) > STUDENTS_PER_AGE:
        age_students = rng.choice(age_students, size=STUDENTS_PER_AGE, replace=False)
    picks_by_age.append(list(age_students))  # all of them if there are too few to sample

# Interleave the age groups (first pick of every age, then the second, ...) so that
# trimming to N_STUDENTS removes at most one student per age group instead of
# dropping the oldest age groups entirely.
random_students = [
    picks[pick_idx]
    for pick_idx in range(STUDENTS_PER_AGE)
    for picks in picks_by_age
    if pick_idx < len(picks)
]

# If we need more students to reach N_STUDENTS, randomly sample from the remaining ones
if len(random_students) < N_STUDENTS:
    # sorted() fixes the order of the candidates so that the seeded draw is reproducible
    remaining = sorted(set(student_ages['user_id']) - set(random_students))
    n_extra = min(N_STUDENTS - len(random_students), len(remaining))
    if n_extra:
        random_students.extend(rng.choice(remaining, size=n_extra, replace=False))

# Trim to exactly N_STUDENTS if we got more
random_students = random_students[:N_STUDENTS]

from_selected_students = df['user_id'].isin(random_students)

# Get 10 rows that are correct and answered by one of these students
correct_answers = df[
    (df['is_correct'] == True) &
    from_selected_students
].sample(n=10, random_state=SAMPLING_SEED)

# Get 10 incorrect answers from the selected students that were not already used above
incorrect_answers = df[
    (df['is_correct'] == False) &
    from_selected_students &
    (~df['user_id'].isin(correct_answers['user_id']))
].sample(n=10, random_state=SAMPLING_SEED)

# Combine the samples
df_database = pd.concat([correct_answers, incorrect_answers])

# Select only the columns we need
df_database = df_database[[
    'answer_id',
    'is_correct',
    'question_title',
    'correct_option_letter',
    'student_answer_letter',
    'student_age',
    'user_level',
    'user_level_percentile',
    'topic_name',
    'subject_name',
    'axis_name',
    'option_a',
    'option_b',
    'option_c',
    'option_d',
    'option_e',
    'student_axis_correct',
    'student_topic_correct',
    'student_subject_correct',
    'student_axis_attempts',
    'student_topic_attempts',
    'student_subject_attempts'
]]

# Save to csv
# NOTE(review): the other data files in this notebook are read/written under '../data/...';
# confirm that this relative path resolves to the intended folder.
df_database.to_csv('data/new/20-test-students.csv', index=False)

## LangSmith dataset


In [31]:
from langsmith import Client

client = Client()

# Pick ~20 random students stratified over ages 11-18, then draw 10 correct and
# 10 incorrect answers from them. The draws are seeded so that the examples uploaded
# to LangSmith are reproducible and do not change every time the cell is re-run.
SAMPLING_SEED = 42
N_STUDENTS = 20
STUDENTS_PER_AGE = 3
rng = np.random.default_rng(SAMPLING_SEED)

# Unique (student, age) pairs; students with a missing age cannot be stratified
student_ages = (
    df[['user_id', 'student_age']]
    .drop_duplicates()
    .dropna(subset=['student_age'])
)

# Draw up to STUDENTS_PER_AGE students for each age from 11 to 18 inclusive
picks_by_age = []
for age in range(11, 19):
    age_students = student_ages.loc[student_ages['student_age'] == age, 'user_id'].values
    if len(age_students) > STUDENTS_PER_AGE:
        age_students = rng.choice(age_students, size=STUDENTS_PER_AGE, replace=False)
    picks_by_age.append(list(age_students))  # all of them if there are too few to sample

# Interleave the age groups (first pick of every age, then the second, ...) so that
# trimming to N_STUDENTS removes at most one student per age group instead of
# dropping the oldest age groups entirely.
random_students = [
    picks[pick_idx]
    for pick_idx in range(STUDENTS_PER_AGE)
    for picks in picks_by_age
    if pick_idx < len(picks)
]

# If we need more students to reach N_STUDENTS, randomly sample from the remaining ones
if len(random_students) < N_STUDENTS:
    # sorted() fixes the order of the candidates so that the seeded draw is reproducible
    remaining = sorted(set(student_ages['user_id']) - set(random_students))
    n_extra = min(N_STUDENTS - len(random_students), len(remaining))
    if n_extra:
        random_students.extend(rng.choice(remaining, size=n_extra, replace=False))

# Trim to exactly N_STUDENTS if we got more
random_students = random_students[:N_STUDENTS]

from_selected_students = df['user_id'].isin(random_students)

# Get 10 rows that are correct and answered by one of these students
correct_answers = df[
    (df['is_correct'] == True) &
    from_selected_students
].sample(n=10, random_state=SAMPLING_SEED)

# Get 10 incorrect answers from the selected students that were not already used above
incorrect_answers = df[
    (df['is_correct'] == False) &
    from_selected_students &
    (~df['user_id'].isin(correct_answers['user_id']))
].sample(n=10, random_state=SAMPLING_SEED)

# Combine the samples
df_database = pd.concat([correct_answers, incorrect_answers])

# Columns that are fed to the prompt; every other column of a row becomes example metadata
input_columns = [
    "option_a", "option_b", "option_c", "option_d", "option_e",
    "axis_name", "question_title", "topic_name", "user_level", "subject_name",
    "correct_option_letter", "user_level_percentile",
    "student_axis_correct", "student_topic_correct", "student_axis_attempts",
    "student_subject_correct", "student_topic_attempts", "student_subject_attempts",
    "student_age",
]

# Build one (inputs, output, metadata) triple per sampled answer
structured_input_examples = []
for _, row in df_database.iterrows():
    # Prompt variables, keyed by the camelCase names the experiments read from `inputs`
    inputs = {
        "optionA": row["option_a"],
        "optionB": row["option_b"],
        "optionC": row["option_c"],
        "optionD": row["option_d"],
        "optionE": row["option_e"],
        "axisName": row["axis_name"],
        "question": row["question_title"],
        "topicName": row["topic_name"],
        "userLevel": round(row["user_level"], 2),
        "subjectName": row["subject_name"],
        "correctAnswer": row["correct_option_letter"],
        "userLevelPercentile": row["user_level_percentile"],
        "axisCorrectQuestions": row["student_axis_correct"],
        "topicCorrectQuestions": row["student_topic_correct"],
        "axisAttemptedQuestions": row["student_axis_attempts"],
        "subjectCorrectQuestions": row["student_subject_correct"],
        "topicAttemptedQuestions": row["student_topic_attempts"],
        "subjectAttemptedQuestions": row["student_subject_attempts"],
        "studentAge": row["student_age"]
    }
    # The reference output is the letter the student actually chose
    output = row["student_answer_letter"]

    # Metadata - all columns except the input columns
    metadata = row.drop(input_columns).to_dict()

    structured_input_examples.append((inputs, output, metadata))

# Create the LangSmith dataset and upload the examples, unless a dataset with this
# name already exists (in which case nothing is uploaded)
structured_input_dataset_name = "20-test-students"
if not client.has_dataset(dataset_name=structured_input_dataset_name):
    structured_input_dataset = client.create_dataset(
        dataset_name=structured_input_dataset_name
    )
    for inputs, answer, metadata in structured_input_examples:
        client.create_example(
            inputs=inputs,
            outputs={"output": answer},
            dataset_id=structured_input_dataset.id,
            # An empty metadata dict is sent as None, i.e. "no metadata"
            metadata=metadata or None,
        )

## Langchain Experiments

### Baseline
We use a [simple prompt](https://smith.langchain.com/prompts/baseline-prompt?organizationId=25e81f1f-5303-4e8e-bca2-1f59955be6df) that asks the LLM to simulate a student, without any additional context.

In [6]:
def base_experiment(inputs):
    """Simulate a student's answer with the baseline prompt.

    The "baseline-prompt" template is pulled from the hub on every call, piped into
    gpt4o_mini bound to JSON-object responses, and the reply is parsed as JSON.

    Args:
        inputs (dict): Prompt variables of one dataset example. Only the topic,
            subject, axis, question, the five options and the student age are read.

    Returns:
        dict: ``{"output": <parsed JSON response>}``.
    """
    prompt_fields = (
        "topicName",
        "subjectName",
        "axisName",
        "question",
        "optionA",
        "optionB",
        "optionC",
        "optionD",
        "optionE",
        "studentAge",
    )

    simulator = (
        hub.pull("baseline-prompt")
        | gpt4o_mini.bind(response_format={"type": "json_object"})
        | JsonOutputParser()
    )

    answer = simulator.invoke({field: inputs[field] for field in prompt_fields})
    return {"output": answer}

# Run the baseline simulator once over every example of the "20-test-students" dataset
# and score the outputs with the shared evaluators
base_experiment_results = evaluate(
    base_experiment,
    data="20-test-students",
    evaluators=evaluators,
    experiment_prefix="base_experiment",
    num_repetitions=1,
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'base_experiment-b9011f6f' at:
https://smith.langchain.com/o/25e81f1f-5303-4e8e-bca2-1f59955be6df/datasets/65c90f29-e6fb-4227-9b79-2bbe38a0c780/compare?selectedSessions=86d24032-bf21-4cb2-844c-a5d4d16a576a




  prompt = loads(json.dumps(prompt_object.manifest))
20it [00:08,  2.25it/s]


### Experiment 1: Percentile

In [None]:
def percentile_experiment(inputs):
    """Simulate a student's answer with the percentile-based prompt.

    The "student-simulator-percentile" template is pulled from the hub on every call,
    piped into gpt4o_mini bound to JSON-object responses, and the reply is parsed as JSON.

    Args:
        inputs (dict): Prompt variables of one dataset example: the level percentile,
            attempted/correct counts per topic, subject and axis, the question with
            its five options, and the student age.

    Returns:
        dict: ``{"output": <parsed JSON response>}``.
    """
    prompt_fields = (
        "userLevelPercentile",
        "topicName",
        "topicAttemptedQuestions",
        "topicCorrectQuestions",
        "subjectName",
        "subjectAttemptedQuestions",
        "subjectCorrectQuestions",
        "axisName",
        "axisAttemptedQuestions",
        "axisCorrectQuestions",
        "question",
        "optionA",
        "optionB",
        "optionC",
        "optionD",
        "optionE",
        "studentAge",
    )

    simulator = (
        hub.pull("student-simulator-percentile")
        | gpt4o_mini.bind(response_format={"type": "json_object"})
        | JsonOutputParser()
    )

    answer = simulator.invoke({field: inputs[field] for field in prompt_fields})
    return {"output": answer}

# Run the percentile simulator over the "20-test-students" dataset, repeating every
# example 10 times, and score the outputs with the shared evaluators
percentile_experiment_results = evaluate(
    percentile_experiment,
    data="20-test-students",
    evaluators=evaluators,
    experiment_prefix="percentile_experiment",
    num_repetitions=10,
)

### Experiment 2: Raw Level

In [6]:
def raw_level_experiment(inputs):
    """Simulate a student's answer with the raw-level prompt.

    The "student-simulator-raw-level" template is pulled from the hub on every call,
    piped into gpt4o_mini bound to JSON-object responses, and the reply is parsed as JSON.

    Args:
        inputs (dict): Prompt variables of one dataset example: the level percentile
            and the raw user level, attempted/correct counts per topic, subject and
            axis, the question with its five options, and the student age.

    Returns:
        dict: ``{"output": <parsed JSON response>}``.
    """
    prompt_fields = (
        "userLevelPercentile",
        "topicName",
        "topicAttemptedQuestions",
        "topicCorrectQuestions",
        "subjectName",
        "subjectAttemptedQuestions",
        "subjectCorrectQuestions",
        "axisName",
        "axisAttemptedQuestions",
        "axisCorrectQuestions",
        "question",
        "optionA",
        "optionB",
        "optionC",
        "optionD",
        "optionE",
        "studentAge",
        "userLevel",
    )

    simulator = (
        hub.pull("student-simulator-raw-level")
        | gpt4o_mini.bind(response_format={"type": "json_object"})
        | JsonOutputParser()
    )

    answer = simulator.invoke({field: inputs[field] for field in prompt_fields})
    return {"output": answer}

# Run the raw-level simulator once over every example of the "20-test-students" dataset
# and score the outputs with the shared evaluators.
# The results get their own variable so that they no longer overwrite
# `percentile_experiment_results` from Experiment 1.
raw_level_experiment_results = evaluate(
    raw_level_experiment,
    data="20-test-students",
    experiment_prefix="raw_level_experiment",
    evaluators=evaluators,
    num_repetitions=1
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'raw_level_experiment-cafdba31' at:
https://smith.langchain.com/o/25e81f1f-5303-4e8e-bca2-1f59955be6df/datasets/65c90f29-e6fb-4227-9b79-2bbe38a0c780/compare?selectedSessions=eb589c11-228a-42c8-a9ed-aa27742c72b7




  prompt = loads(json.dumps(prompt_object.manifest))
20it [00:08,  2.23it/s]


### Experiment 3: Rubric Level

In [6]:
def rubric_level_experiment(inputs):
    """Simulate a student's answer with the rubric-level prompt on o1_mini.

    The "student-simulator-rubric-level" template is pulled from the hub on every call
    and piped into o1_mini. Unlike the other experiments, the model is used without
    the JSON response_format binding; the reply is still parsed as JSON.

    Args:
        inputs (dict): Prompt variables of one dataset example: the level percentile
            and the raw user level, attempted/correct counts per topic, subject and
            axis, the question with its five options, and the student age.

    Returns:
        dict: ``{"output": <parsed JSON response>}``.
    """
    prompt_fields = (
        "userLevelPercentile",
        "topicName",
        "topicAttemptedQuestions",
        "topicCorrectQuestions",
        "subjectName",
        "subjectAttemptedQuestions",
        "subjectCorrectQuestions",
        "axisName",
        "axisAttemptedQuestions",
        "axisCorrectQuestions",
        "question",
        "optionA",
        "optionB",
        "optionC",
        "optionD",
        "optionE",
        "studentAge",
        "userLevel",
    )

    simulator = (
        hub.pull("student-simulator-rubric-level")
        | o1_mini
        | JsonOutputParser()
    )

    answer = simulator.invoke({field: inputs[field] for field in prompt_fields})
    return {"output": answer}

# Run the rubric-level simulator once over every example of the "20-test-students"
# dataset and score the outputs with the shared evaluators
rubric_level_experiment_results = evaluate(
    rubric_level_experiment,
    data="20-test-students",
    evaluators=evaluators,
    experiment_prefix="rubric_level_experiment",
    num_repetitions=1,
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'rubric_level_experiment-a6d8da00' at:
https://smith.langchain.com/o/25e81f1f-5303-4e8e-bca2-1f59955be6df/datasets/65c90f29-e6fb-4227-9b79-2bbe38a0c780/compare?selectedSessions=02bbd936-232e-4b47-8b7c-8eb06a6852e3




  prompt = loads(json.dumps(prompt_object.manifest))
20it [00:22,  1.12s/it]


Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=a2012262-3a61-4ad4-90bc-01750b62ba51,id=a2012262-3a61-4ad4-90bc-01750b62ba51; trace=2b37cd0d-ccbd-48b8-b216-73ce90ed2bd3,id=2b37cd0d-ccbd-48b8-b216-73ce90ed2bd3; trace=3848c702-afe5-45d1-b213-64d371f461ce,id=3848c702-afe5-45d1-b213-64d371f461ce; trace=7f91551a-7c95-4811-8145-f6f472ff696f,id=7f91551a-7c95-4811-8145-f6f472ff696f; trace=478c63d0-2ec9-4588-9177-038ca781e0a8,id=478c63d0-2ec9-4588-9177-038ca781e0a8; trace=560e478a-2aad-4330-abb4-84781c6fcbf7,id=560e478a-2aad-4330-abb4-84781c6fcbf7; trace=358db774-57f8-4e65-8169-cfa629fc8f26,id=358db774-57f8-4e65-8169-cfa629fc8f26; trace=f3d02bdd-1644-40d5-802d-1aa484c77679,id=f3d02bdd-1644-40d5-802d-1aa484c77679; trace=3d0b5644-f41f

## Analogue Experiments

### Baseline

In [8]:
def process_single_row(row, prompt_generator, model_caller, response_parser):
    """Process a single row of test data through the LLM pipeline.

    Args:
        row (pd.Series): Row of test data. Must provide the student age, topic,
            subject and axis names, the question title and the five options that
            are passed to ``prompt_generator``, plus ``correct_option_letter`` and
            ``student_answer_letter`` for the metrics.
        prompt_generator (callable): Called with the fields above as keyword
            arguments; returns the prompt.
        model_caller (callable): Called with the prompt; a falsy return value is
            treated as a failed call.
        response_parser (callable): Called with the raw model response; expected to
            return a mapping containing an ``'answer'`` key. A falsy return value or
            a mapping without ``'answer'`` is treated as a failed parse.

    Returns:
        dict: The row's fields merged with the parsed response and the metrics
        (on key clashes the later source wins), or None if the model call or the
        parsing failed.
    """
    # Generate prompt
    prompt = prompt_generator(
        student_age=row['student_age'],
        topic_name=row['topic_name'],
        subject_name=row['subject_name'],
        axis_name=row['axis_name'],
        question=row['question_title'],
        option_a=row['option_a'],
        option_b=row['option_b'],
        option_c=row['option_c'],
        option_d=row['option_d'],
        option_e=row['option_e']
    )

    # Get model response
    response = model_caller(prompt)
    if not response:
        return None

    # Parse response. A reply without an 'answer' entry cannot be scored, so it is
    # reported as a failure (None) rather than raising KeyError further down.
    parsed_response = response_parser(response)
    if not parsed_response or 'answer' not in parsed_response:
        return None

    # Calculate metrics
    metrics = calculate_metrics(
        llm_answer=parsed_response['answer'],
        correct_answer=row['correct_option_letter'],
        student_answer=row['student_answer_letter']
    )

    # Combine results
    return {
        **row,
        **parsed_response,
        **metrics
    }

def calculate_metrics(llm_answer, correct_answer, student_answer):
    """Compare the LLM's answer with the correct answer and the student's answer.

    Args:
        llm_answer (str): Answer given by the LLM
        correct_answer (str): The correct answer
        student_answer (str): Answer given by the student

    Returns:
        dict: Three booleans:
            ``is_llm_correct`` - the LLM answered correctly;
            ``llm_match_student_answer`` - the LLM chose the same answer as the student;
            ``llm_match_student_correct`` - the LLM and the student are both right
            or both wrong.
    """
    llm_is_right = llm_answer == correct_answer
    student_is_right = student_answer == correct_answer
    same_answer = llm_answer == student_answer

    return dict(
        is_llm_correct=llm_is_right,
        llm_match_student_answer=same_answer,
        llm_match_student_correct=(llm_is_right == student_is_right),
    )

def run_experiment(
    df_test,
    prompt_generator,
    model_caller,
    response_parser,
    output_path,
    experiment_name="experiment"
):
    """Run every row of a test set through the LLM pipeline and save the results.

    Rows for which process_single_row returns a falsy result are reported with an
    error message and skipped. If at least one row succeeded, the collected results
    are written to ``output_path`` as CSV.

    Args:
        df_test (pd.DataFrame): Test dataset
        prompt_generator (callable): Function to generate prompts
        model_caller (callable): Function to call LLM
        response_parser (callable): Function to parse responses
        output_path (str): Path to save results
        experiment_name (str): Name of experiment for logging

    Returns:
        list: One result dictionary per successfully processed row.
    """
    collected = []
    n_rows = len(df_test)

    for row_label, record in df_test.iterrows():
        outcome = process_single_row(record, prompt_generator, model_caller, response_parser)

        if not outcome:
            print(f"Error processing row {row_label} in {experiment_name}")
            continue

        collected.append(outcome)
        log_progress(row_label, n_rows, outcome, experiment_name)

    # Nothing is written when no row could be processed
    if collected:
        pd.DataFrame(collected).to_csv(output_path, index=False)
        print(f"\nResults saved to {output_path}")

    return collected

def log_progress(idx, total_rows, result, experiment_name):
    """Print a one-line progress report for a processed row.

    Args:
        idx (int): Current row index
        total_rows (int): Total number of rows
        result (dict): Results dictionary; the answer and the three metric flags are
            read with ``.get``, so missing keys are printed as None
        experiment_name (str): Name of experiment
    """
    segments = [
        f"[{experiment_name}] Row {idx}/{total_rows}",
        f"LLM Answer: {result.get('answer')}",
        f"LLM Correct: {result.get('is_llm_correct')}",
        f"Matches Student Answer: {result.get('llm_match_student_answer')}",
        f"Matches Student Correctness: {result.get('llm_match_student_correct')}",
    ]
    print(" | ".join(segments))

# Usage example:
if __name__ == "__main__":
    # Load the saved test set. The statement has to be indented under the guard:
    # a guard whose body holds only a comment is an IndentationError.
    df_test = pd.read_csv('data/new/20-test-students.csv')

Row 0/20 | LLM Answer: e | LLM Correct: True | Matches Student Answer: True | Matches Student Correctness: True
Row 1/20 | LLM Answer: d | LLM Correct: True | Matches Student Answer: True | Matches Student Correctness: True
Row 2/20 | LLM Answer: e | LLM Correct: True | Matches Student Answer: True | Matches Student Correctness: True
Row 3/20 | LLM Answer: a | LLM Correct: False | Matches Student Answer: False | Matches Student Correctness: False
Row 4/20 | LLM Answer: e | LLM Correct: True | Matches Student Answer: True | Matches Student Correctness: True
Row 5/20 | LLM Answer: e | LLM Correct: True | Matches Student Answer: True | Matches Student Correctness: True
Row 6/20 | LLM Answer: b | LLM Correct: True | Matches Student Answer: True | Matches Student Correctness: True
Row 7/20 | LLM Answer: d | LLM Correct: False | Matches Student Answer: False | Matches Student Correctness: False
Row 8/20 | LLM Answer: d | LLM Correct: True | Matches Student Answer: True | Matches Student Corr