In [11]:
import pandas as pd
import pickle
from utils import flatten_data, format_cascade_name, generate_latex_table_3cols, format_metric_name

# Metric whose pickled results this cell tabulates; it selects the input file
# below and is interpolated into the table caption and label.
metric = 'abstention'

# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# point this at result files produced by a trusted run of this project.
with open(f"./performance_data/overall_results_joint_{metric}_outlier=10.pkl", 'rb') as file:
    data = pickle.load(file)
    average_test_results = data['test']
    average_train_results = data['train']  # loaded for parity with 'test'; not used in this cell

# NOTE(review): flatten_data is assumed to produce records with 'cascade',
# 'task', 'early_abs' and 'final_model_abs' fields -- those are the columns
# read below. Confirm against utils.flatten_data.
df = pd.DataFrame(flatten_data(average_test_results))

# Relative (percent) and absolute difference of the early value vs. the final-model value.
# NOTE(review): a zero in 'final_model_abs' makes 'pct_change' inf/NaN instead
# of raising -- confirm that cannot happen for the chosen metric.
df['pct_change'] = 100*(df['early_abs'] - df['final_model_abs'])/df['final_model_abs']
df['change'] = df['early_abs'] - df['final_model_abs']

# Assign a plain list (positional) rather than a freshly indexed Series, so the
# names cannot be misaligned into NaN if df ever carries a non-default index.
df['cascade_name'] = [format_cascade_name(x) for x in df['cascade']]

latex_table = generate_latex_table_3cols(
    df,
    caption=f"Percentage change in {format_metric_name(metric)} when using early abstention compared to final-model abstention. Negative numbers (bold) indicate that early abstention outperforms.",
    label=f"tab:{metric}-test",
    value_cols=['early_abs', 'final_model_abs', 'pct_change'],
    # Raw string: the LaTeX backslashes (\b, \%, \D) must reach the table
    # verbatim, and "\%" / "\D" are invalid escapes in a normal string literal.
    value_cols_labels=['Early', 'Final', r'$\boldsymbol{\%\Delta}$'],
    cascade_col='cascade_name',
    benchmark_col='task',
    bold_smaller=True
)

# print (not rich display) so the LaTeX source can be copied verbatim.
print(latex_table)

\begin{table}[t]
\centering
\small
\caption{Percentage change in abstention rate on the test set when using early abstention compared to final-model abstention. Negative numbers (bold) indicate that early abstention outperforms.}
\label{tab:abstention-test}
\begin{adjustbox}{max width=\textwidth}
\begin{tabular}{lrrrrrrrrrrrrrrrrrrrrr}
\toprule
\multirow{2}{*}{\textbf{cascade\_name}} & \multicolumn{3}{c}{\textbf{gsm8k}} & \multicolumn{3}{c}{\textbf{medmcqa}} & \multicolumn{3}{c}{\textbf{mmlu}} & \multicolumn{3}{c}{\textbf{triviaqa}} & \multicolumn{3}{c}{\textbf{truthfulqa}} & \multicolumn{3}{c}{\textbf{xsum}} & \multicolumn{3}{c}{\textbf{Average}} \\
 & \textbf{Early} & \textbf{Final} & \textbf{$\boldsymbol{\%\Delta}$} & \textbf{Early} & \textbf{Final} & \textbf{$\boldsymbol{\%\Delta}$} & \textbf{Early} & \textbf{Final} & \textbf{$\boldsymbol{\%\Delta}$} & \textbf{Early} & \textbf{Final} & \textbf{$\boldsymbol{\%\Delta}$} & \textbf{Early} & \textbf{Final} & \textbf{$\boldsymbol{\%\Delt

In [4]:
metrics = ['loss', 'error', 'cost', 'abstention']

# Which derived column summarizes each metric: abstention is reported as an
# absolute difference, the other metrics as a relative (percent) difference.
# An unknown metric now fails loudly (KeyError) instead of silently leaving an
# empty column.
summary_col_by_metric = {
    'loss': 'rel_change',
    'error': 'rel_change',
    'cost': 'rel_change',
    'abstention': 'change',
}

metric_columns = {}
# (formatted cascade names, tasks) of the first metric loaded; every later
# metric must list its rows in the same order, because the columns are placed
# side by side and labelled once.
row_keys = None

for metric in metrics:

    # SECURITY: pickle.load can execute arbitrary code while deserializing, so
    # only load result files produced by a trusted run of this project.
    with open(f"./performance_data/overall_results_joint_{metric}_outlier=10.pkl", 'rb') as file:
        data = pickle.load(file)
        average_test_results = data['test']
        average_train_results = data['train']  # not used in this cell

    df = pd.DataFrame(flatten_data(average_test_results))
    df['rel_change'] = 100*(df['early_abs'] - df['final_model_abs'])/df['final_model_abs']
    df['change'] = df['early_abs'] - df['final_model_abs']

    current_keys = (
        [format_cascade_name(cname) for cname in df['cascade']],
        df['task'].tolist(),
    )
    if row_keys is None:
        row_keys = current_keys
    elif current_keys != row_keys:
        # Without this check, values from one (cascade, task) pair would be
        # silently labelled with another pair's names.
        raise ValueError(
            f"Rows for metric {metric!r} are not in the same (cascade, task) "
            f"order as for metric {metrics[0]!r}; cannot combine them by position."
        )

    metric_columns[metric] = df[summary_col_by_metric[metric]]

# Build the combined frame once, keeping the column order given by `metrics`.
full_df = pd.DataFrame(metric_columns, columns=metrics)
full_df['cascade'] = row_keys[0]
full_df['task'] = df['task']

# Mean change per cascade and per task. 'loss' is collected in full_df but is
# not part of these summaries.
performance_by_cascade = full_df.groupby('cascade')[['error', 'cost','abstention']].mean()
performance_by_task = full_df.groupby('task')[['error','cost','abstention']].mean()

In [6]:
# Show the per-cascade means computed by the groupby above: 'error' and 'cost'
# are relative (percent) changes, 'abstention' is an absolute difference.
performance_by_cascade

Unnamed: 0_level_0,error,cost,abstention
cascade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4o-Mini -> 4o,-3.908362,-14.765812,0.031452
4o-Mini -> Q32B,-5.710084,-12.001866,0.016511
4o-Mini -> Q72B,-1.126745,-7.992511,0.005261
L1B -> L3B,-3.581211,-13.763536,0.022022
L1B -> L405B,-10.649754,-21.3613,0.112222
L1B -> L70B,-2.483118,-12.291786,0.042104
L1B -> L8B,-0.618029,-12.272946,0.013232
L3B -> L405B,-10.352463,-21.55113,0.100757
L3B -> L70B,-1.596702,-12.765226,0.023205
L3B -> L8B,-2.205398,-11.9523,0.015867
