In [20]:
# imports
import io
import json
import os
from pathlib import Path

# packages
import pandas
import matplotlib.pyplot as plt
from IPython.display import display, display_html


# project imports
from question_data import parse_question_source

In [21]:
# Both inputs are resolved relative to the parent of the current working
# directory, so compute that root once.
project_root = Path.cwd().parent

# Parse the question source file into the question list.
questions_path = project_root / "data" / "questions_01.txt"
question_list = parse_question_source(questions_path)

# Read the exam results CSV; low_memory=False makes pandas parse the file in a
# single pass instead of in chunks.
exam_results_path = (
    project_root / "results" / "questions-01" / "sessions-003" / "exam_results.csv"
)
exam_df = pandas.read_csv(exam_results_path, low_memory=False)

# Collect the number of answer choices (N) of every multiple-choice question.
# Only the counts are gathered here; the 1/N baseline itself is derived from
# this list separately.
multiple_choice_counts = [
    len(question["choices"])
    for question in question_list
    if question["question_type"] == "multiple_choice"
]

# Report the size of both loaded inputs.
print(f"Questions: {len(question_list)}\n")

print(f"Exam Data: {exam_df.shape}\n")

Questions: 40

Exam Data: (21395, 17)



In [22]:
# Random-guess baseline: a question with N choices is guessed correctly with
# probability 1/N, so take the mean of 1/N over all multiple-choice questions.
inverse_choice_counts = [
    1 / float(n_choices) for n_choices in multiple_choice_counts
]
random_chance_rate = sum(inverse_choice_counts) / len(multiple_choice_counts)
print(f"Multiple Choice Random Chance Rate: {100*random_chance_rate:.2f}%")

Multiple Choice Random Chance Rate: 22.67%


In [23]:
# Count the distinct session names present in the exam results.
exam_session_count = exam_df["session_name"].nunique()
print("Exam Sessions:", exam_session_count)

Exam Sessions: 535


In [24]:
# Count the distinct prompt methods present in the exam results.
prompt_method_count = exam_df["prompt_method"].nunique()
print("Number of Prompts:", prompt_method_count)

Number of Prompts: 10


In [6]:
# Headline accuracy: the mean of the is_correct column over every row of the
# exam results, regardless of prompt, question type or sampling settings.
is_correct_flags = exam_df["is_correct"]
accuracy_rate = is_correct_flags.mean()
print(f"Headline Accuracy Rate: {accuracy_rate:.2%}")

Headline Accuracy Rate: 14.39%


In [14]:
# Average number of recorded (non-null) is_correct values per session for each
# question type: per-type row count divided by the number of distinct sessions.
distinct_session_total = exam_df["session_name"].nunique()
answered_per_question_type = exam_df.groupby("question_type")["is_correct"].count()
answered_per_question_type / distinct_session_total

question_type
amount             24.000000
multiple_choice    14.990654
short_answer        1.000000
Name: is_correct, dtype: float64

In [7]:
# Accuracy broken down by question type: mean of is_correct within each
# question_type group, shown as a one-column table.
# The heading is a plain string: it has no placeholders, so no f-prefix is needed.
print("Per-Question Type Accuracy Rate:")
per_qt_accuracy_rate = exam_df.groupby("question_type")["is_correct"].mean()
display(per_qt_accuracy_rate.to_frame())

Per-Question Type Accuracy Rate:


Unnamed: 0_level_0,is_correct
question_type,Unnamed: 1_level_1
amount,0.076869
multiple_choice,0.260723
short_answer,0.0


In [8]:
# Accuracy broken down by prompt method: mean of is_correct within each
# prompt_method group, shown as a one-column table.
# The heading is a plain string: it has no placeholders, so no f-prefix is needed.
print("Per-Prompt Accuracy Rate:")
per_prompt_accuracy_rate = exam_df.groupby("prompt_method")["is_correct"].mean()
display(per_prompt_accuracy_rate.to_frame())

Per-Prompt Accuracy Rate:


Unnamed: 0_level_0,is_correct
prompt_method,Unnamed: 1_level_1
generate_prompt_001,0.125
generate_prompt_002,0.139815
generate_prompt_003,0.130556
generate_prompt_004,0.149537
generate_prompt_005,0.153241
generate_prompt_006,0.149537
generate_prompt_007,0.149537
generate_prompt_008,0.150463
generate_prompt_009,0.132407
generate_prompt_010,0.160102


In [9]:
# Accuracy for every (prompt method, question type) pair, pivoted so that
# prompts are rows and question types are columns.
correct_by_prompt_and_type = exam_df.groupby(["prompt_method", "question_type"])[
    "is_correct"
]
prompt_qt_accuracy_rate = correct_by_prompt_and_type.mean().unstack()
display(prompt_qt_accuracy_rate)

question_type,amount,multiple_choice,short_answer
prompt_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
generate_prompt_001,0.061728,0.234568,0.0
generate_prompt_002,0.057099,0.281481,0.0
generate_prompt_003,0.065586,0.24321,0.0
generate_prompt_004,0.083333,0.265432,0.0
generate_prompt_005,0.079475,0.281481,0.0
generate_prompt_006,0.081019,0.269136,0.0
generate_prompt_007,0.083333,0.265432,0.0
generate_prompt_008,0.083333,0.267901,0.0
generate_prompt_009,0.081019,0.223457,0.0
generate_prompt_010,0.094388,0.276712,0.0


In [26]:
# Show only the lowest and highest per-prompt accuracy for each question type.
prompt_qt_accuracy_summary = prompt_qt_accuracy_rate.describe()
prompt_qt_accuracy_summary.loc[["min", "max"]]

question_type,amount,multiple_choice,short_answer
min,0.057099,0.223457,0.0
max,0.094388,0.281481,0.0


In [10]:
# Accuracy for every (temperature, best_of) pair, pivoted so that temperatures
# are rows and best_of values are columns.
correct_by_sampling_settings = exam_df.groupby(["temperature", "best_of"])[
    "is_correct"
]
temperature_best_of_accuracy_rate = correct_by_sampling_settings.mean().unstack()
display(temperature_best_of_accuracy_rate)

best_of,1,2,4
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.136667,0.1375,0.136667
0.5,0.15375,0.14514,0.143421
1.0,0.14625,0.147083,0.148333


In [11]:
# Number of distinct sessions for every (prompt method, temperature) pair,
# pivoted so that prompts are rows and temperatures are columns. This is a
# table of session COUNTS, not accuracy rates; it is useful for spotting
# combinations that have fewer sessions than the rest.
prompt_temperature_session_counts = (
    exam_df.groupby(["prompt_method", "temperature"])["session_name"]
    .nunique()
    .unstack()
)

# Backward-compatible alias: the original name wrongly suggested an accuracy
# rate. It is kept only so that any other cell still referencing it keeps
# working; prefer `prompt_temperature_session_counts`.
prompt_temperature_accuracy_rate = prompt_temperature_session_counts

display(prompt_temperature_session_counts)

temperature,0.0,0.5,1.0
prompt_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
generate_prompt_001,18,18,18
generate_prompt_002,18,18,18
generate_prompt_003,18,18,18
generate_prompt_004,18,18,18
generate_prompt_005,18,18,18
generate_prompt_006,18,18,18
generate_prompt_007,18,18,18
generate_prompt_008,18,18,18
generate_prompt_009,18,18,18
generate_prompt_010,18,13,18


In [12]:
# Restrict the results to multiple-choice rows so that accuracy can be compared
# against the random-guess baseline rate.
is_multiple_choice = exam_df["question_type"] == "multiple_choice"
mc_exam_df = exam_df.loc[is_multiple_choice, :]

# Group once by (prompt method, temperature) and derive both the mean accuracy
# and its standard error of the mean from the same grouping.
mc_correct_by_prompt_temp = mc_exam_df.groupby(["prompt_method", "temperature"])[
    "is_correct"
]
prompt_temp_mc_accuracy = mc_correct_by_prompt_temp.mean().unstack()
prompt_temp_mc_accuracy_sem = mc_correct_by_prompt_temp.sem().unstack()

# Spread: how far each cell's accuracy sits above (or below) random guessing.
prompt_temp_mc_accuracy_spread = prompt_temp_mc_accuracy - random_chance_rate

# Lower bound of the spread, taken as the spread minus ONE standard error.
# A positive value means the cell beats the baseline by more than one SEM.
prompt_temp_mc_low_ci_spread = (
    prompt_temp_mc_accuracy_spread - prompt_temp_mc_accuracy_sem
)

display(prompt_temp_mc_accuracy_spread)
display(prompt_temp_mc_low_ci_spread > 0)

temperature,0.0,0.5,1.0
prompt_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
generate_prompt_001,-0.015556,0.021481,0.017778
generate_prompt_002,0.04,0.062222,0.062222
generate_prompt_003,-0.022963,0.028889,0.043704
generate_prompt_004,0.04,0.04,0.036296
generate_prompt_005,0.04,0.054815,0.06963
generate_prompt_006,0.04,0.032593,0.054815
generate_prompt_007,0.04,0.04,0.036296
generate_prompt_008,0.036296,0.028889,0.058519
generate_prompt_009,-0.03037,-0.004444,0.025185
generate_prompt_010,0.04,0.052281,0.058519


temperature,0.0,0.5,1.0
prompt_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
generate_prompt_001,False,False,False
generate_prompt_002,True,True,True
generate_prompt_003,False,True,True
generate_prompt_004,True,True,True
generate_prompt_005,True,True,True
generate_prompt_006,True,True,True
generate_prompt_007,True,True,True
generate_prompt_008,True,True,True
generate_prompt_009,False,False,False
generate_prompt_010,True,True,True


In [28]:
# Show the standard error of the mean of multiple-choice accuracy for each
# (prompt method, temperature) pair, i.e. the margin subtracted from the spread
# when the lower bound is computed.
prompt_temp_mc_accuracy_sem

temperature,0.0,0.5,1.0
prompt_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
generate_prompt_001,0.024882,0.026336,0.026203
generate_prompt_002,0.026962,0.027635,0.027635
generate_prompt_003,0.024556,0.026594,0.02708
generate_prompt_004,0.026962,0.026962,0.026842
generate_prompt_005,0.026962,0.02742,0.027841
generate_prompt_006,0.026962,0.026719,0.02742
generate_prompt_007,0.026962,0.026962,0.026842
generate_prompt_008,0.026842,0.026594,0.027529
generate_prompt_009,0.024217,0.025348,0.026466
generate_prompt_010,0.026962,0.032622,0.027529
