In [2]:
# imports
import sys

# relative to project root
sys.path.append("publication/")
from session_data import *

# packages
import pandas
from IPython.display import display, display_html, display_latex

In [5]:
# read all session data
session_df = get_session_data()
session_df.head()

Unnamed: 0,Exam Session ID,Question Category,Question Number,GPT Answer,GPT Second Answer,GPT Third Answer,Correct Answer,Correct,Second Correct,Third Correct,Top Two Correct,Top Three Correct,Temperature,Max Tokens,Top P,Best Of,Frequency Penalty,Presence Penalty,Session Duration
0,bar-exam-001,Civil Procedure,1,D,A,B,D,True,False,False,True,True,0.0,16,1.0,1,0,0,208.769812
1,bar-exam-001,Civil Procedure,2,D,B,A,D,True,False,False,True,True,0.0,16,1.0,1,0,0,208.769812
2,bar-exam-001,Civil Procedure,3,C,D,A,D,False,True,False,True,True,0.0,16,1.0,1,0,0,208.769812
3,bar-exam-001,Civil Procedure,4,C,D,B,A,False,False,False,False,False,0.0,16,1.0,1,0,0,208.769812
4,bar-exam-001,Civil Procedure,5,C,D,B,C,True,False,False,True,True,0.0,16,1.0,1,0,0,208.769812


## Headline Accuracy

In [22]:
performance_df = pandas.DataFrame({
        "Correct Rate": session_df["Correct"].mean() * 100.0,
        "Top Two Correct Rate": session_df["Top Two Correct"].mean() * 100.0,
        "Top Three Correct Rate": session_df["Top Three Correct"].mean() * 100.0
}, index=["Accuracy (%)"]).T

with pandas.option_context("float_format", "{:2.0f}".format):
    print(performance_df.round(2).style.to_latex())
    display(performance_df)

\begin{tabular}{lr}
 & Accuracy (%) \\
Correct Rate & 49.970000 \\
Top Two Correct Rate & 70.970000 \\
Top Three Correct Rate & 87.750000 \\
\end{tabular}



Unnamed: 0,Accuracy (%)
Correct Rate,50
Top Two Correct Rate,71
Top Three Correct Rate,88


## NCBE Rates

In [23]:
ncbe_df = pandas.DataFrame(pandas.Series(NCBE_CATEGORY_CORRECT_RATES) * 100.0, columns=["Accuracy (%)"])
ncbe_df

Unnamed: 0,Accuracy (%)
Civil Procedure,59.0
Constitutional Law,72.0
Contracts,70.0
Criminal Law and Procedure,71.0
Evidence,65.0
Real Property,65.0
Torts,71.0


## Accuracy by Question Category

In [29]:
performance_by_category_df = pandas.DataFrame({
        "Correct Rate": session_df.groupby("Question Category")["Correct"].mean() * 100.0,
        "Top Two Correct Rate": session_df.groupby("Question Category")["Top Two Correct"].mean() * 100.0,
        "Top Three Correct Rate": session_df.groupby("Question Category")["Top Three Correct"].mean() * 100.0,
        "NCBE Rate": ncbe_df["Accuracy (%)"],
})\
    .sort_values("Correct Rate", ascending=False)


with pandas.option_context("float_format", "{:2.0f}".format):
    print(performance_by_category_df.round(2).style.to_latex())
    display(performance_by_category_df)

\begin{tabular}{lrrrr}
 & Correct Rate & Top Two Correct Rate & Top Three Correct Rate & NCBE Rate \\
Evidence & 62.760000 & 84.470000 & 98.050000 & 65.000000 \\
Torts & 61.650000 & 71.830000 & 93.860000 & 71.000000 \\
Civil Procedure & 52.030000 & 62.680000 & 78.700000 & 59.000000 \\
Constitutional Law & 49.020000 & 66.750000 & 86.830000 & 72.000000 \\
Real Property & 44.960000 & 71.630000 & 84.800000 & 65.000000 \\
Contracts & 44.720000 & 77.320000 & 85.850000 & 70.000000 \\
Criminal Law and Procedure & 35.040000 & 62.110000 & 86.340000 & 71.000000 \\
\end{tabular}



Unnamed: 0,Correct Rate,Top Two Correct Rate,Top Three Correct Rate,NCBE Rate
Evidence,63,84,98,65
Torts,62,72,94,71
Civil Procedure,52,63,79,59
Constitutional Law,49,67,87,72
Real Property,45,72,85,65
Contracts,45,77,86,70
Criminal Law and Procedure,35,62,86,71


## Hyperparameters - Temperature

In [31]:
performance_by_temperature_df = pandas.DataFrame({
        "Correct Rate": session_df.groupby("Temperature")["Correct"].mean(),
        "Top Two Correct Rate": session_df.groupby("Temperature")["Top Two Correct"].mean(),
        "Top Three Correct Rate": session_df.groupby("Temperature")["Top Three Correct"].mean(),
        "Samples": session_df.groupby("Temperature")["Exam Session ID"].nunique(),
})\
    .sort_values("Temperature", ascending=True)


with pandas.option_context("float_format", "{:.2%}".format):
    print((100.0 * performance_by_temperature_df).round(2).style.to_latex())
    display(performance_by_temperature_df)

\begin{tabular}{lrrrr}
 & Correct Rate & Top Two Correct Rate & Top Three Correct Rate & Samples \\
Temperature &  &  &  &  \\
0.000000 & 49.860000 & 71.770000 & 89.000000 & 500.000000 \\
0.500000 & 50.190000 & 71.050000 & 88.200000 & 1800.000000 \\
1.000000 & 49.790000 & 70.650000 & 86.950000 & 1800.000000 \\
\end{tabular}



Unnamed: 0_level_0,Correct Rate,Top Two Correct Rate,Top Three Correct Rate,Samples
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.00%,49.86%,71.77%,89.00%,5
50.00%,50.19%,71.05%,88.20%,18
100.00%,49.79%,70.65%,86.95%,18


## Hyperparameters - Best Of

In [32]:
performance_by_bestof_df = pandas.DataFrame({
        "Correct Rate": session_df.groupby("Best Of")["Correct"].mean(),
        "Top Two Correct Rate": session_df.groupby("Best Of")["Top Two Correct"].mean(),
        "Top Three Correct Rate": session_df.groupby("Best Of")["Top Three Correct"].mean(),
        "Samples": session_df.groupby("Best Of")["Exam Session ID"].nunique(),
})\
    .sort_values("Best Of", ascending=True)


with pandas.option_context("float_format", "{:.2%}".format):
    print((100.0 * performance_by_bestof_df).round(2).style.to_latex())
    display(performance_by_bestof_df)

\begin{tabular}{lrrrr}
 & Correct Rate & Top Two Correct Rate & Top Three Correct Rate & Samples \\
Best Of &  &  &  &  \\
1 & 49.510000 & 70.590000 & 87.270000 & 1500.000000 \\
2 & 50.270000 & 71.220000 & 88.170000 & 1400.000000 \\
4 & 50.200000 & 71.130000 & 87.840000 & 1200.000000 \\
\end{tabular}



Unnamed: 0_level_0,Correct Rate,Top Two Correct Rate,Top Three Correct Rate,Samples
Best Of,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,49.51%,70.59%,87.27%,15
2,50.27%,71.22%,88.17%,14
4,50.20%,71.13%,87.84%,12


## Hyperparameter Surface

In [39]:
performance_by_temp_bestof = pandas.DataFrame({
    "Correct Rate": session_df.groupby(["Temperature", "Best Of"])["Correct"].mean(),
    "Correct Rate SEM": session_df.groupby(["Temperature", "Best Of"])["Correct"].sem(),
    "Top Two Correct Rate": session_df.groupby(["Temperature", "Best Of"])["Top Two Correct"].mean(),
    "Top Three Correct Rate": session_df.groupby(["Temperature", "Best Of"])["Top Three Correct"].mean(),
    "Samples": session_df.groupby(["Temperature", "Best Of"])["Exam Session ID"].nunique(),
})

print("Correct Rate")
display(performance_by_temp_bestof["Correct Rate"].unstack())

print("Correct Rate - Standard Error of the Mean")
display(performance_by_temp_bestof["Correct Rate SEM"].unstack())

print("Top Two Correct Rate")
display(performance_by_temp_bestof["Top Two Correct Rate"].unstack())

print("Top Three Correct Rate")
display(performance_by_temp_bestof["Top Three Correct Rate"].unstack())

print("Samples")
display(performance_by_temp_bestof["Samples"].unstack())

Correct Rate


Best Of,1,2,4
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.494418,0.504785,
0.5,0.5,0.5,0.505582
1.0,0.490431,0.504785,0.498405


Correct Rate - Standard Error of the Mean


Best Of,1,2,4
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.019983,0.024484,
0.5,0.014125,0.014125,0.014124
1.0,0.014123,0.014125,0.014125


Top Two Correct Rate


Best Of,1,2,4
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.716108,0.720096,
0.5,0.706539,0.712919,0.712121
1.0,0.700159,0.708931,0.710526


Top Three Correct Rate


Best Of,1,2,4
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.888357,0.892344,
0.5,0.87799,0.88756,0.880383
1.0,0.859649,0.872408,0.876396


Samples


Best Of,1,2,4
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,3.0,2.0,
0.5,6.0,6.0,6.0
1.0,6.0,6.0,6.0
