In [4]:
import pandas as pd
import glob
import ast
import numpy as np

# Merge CSV files
path = 'results/evals/'
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of merged_df (and of anything exported from it) reproducible.
all_files = sorted(glob.glob(path + '*.csv'))
if not all_files:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which does not say which directory was searched.
    raise FileNotFoundError(f'No CSV files found in {path!r}')
merged_df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

# Set categorical order
# NOTE: pd.Categorical turns any value that is not listed in `categories` into
# NaN, so rows for an unlisted model are dropped by a default groupby on these
# columns. Extend the lists below when a new model is evaluated.
merged_df['Model'] = pd.Categorical(
    merged_df['Model'],
    categories=['gpt-5', 'gpt-5-mini', 'gpt-5-nano'],
    ordered=True,
)
merged_df['Model_ret'] = pd.Categorical(
    merged_df['Model_ret'],
    categories=['vidore/colpali-v1.3-merged', 'vidore/colqwen2.5-v0.2', 'ahmed-masry/ColFlor'],
    ordered=True,
)

# Create 'is_paper_id_in_context' column
def is_paper_id_in_context(row):
    """Flag whether the row's paper is among its context entries.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Paper_id'`` and ``'Context_papers'``.
        ``'Context_papers'`` is either the string form of a Python list
        (e.g. ``"['paper1_pg_3', 'paper2_pg_1']"``) or an already-parsed
        list/tuple of such entries.

    Returns
    -------
    float or int
        ``np.nan`` when ``str(row['Paper_id'])`` does not start with
        ``'Paper'``; ``1`` when the lower-cased paper id equals the part
        before ``'_pg_'`` of at least one context entry (case-insensitive);
        ``0`` otherwise, including missing, non-list or unparsable context.
    """
    paper_id_val = str(row['Paper_id'])
    if not paper_id_val.startswith('Paper'):
        return np.nan

    paper_id = paper_id_val.lower()
    context_papers = row['Context_papers']

    # Dispatch on type first: calling pd.isna() on a list-valued cell returns
    # an array whose truth value is ambiguous and raises ValueError.
    if isinstance(context_papers, str):
        # Tolerate surrounding whitespace around the serialized list.
        serialized = context_papers.strip()
        if not serialized.startswith('['):
            return 0
        try:
            context_papers_list = ast.literal_eval(serialized)
        except (ValueError, SyntaxError):
            return 0
    elif isinstance(context_papers, (list, tuple)):
        context_papers_list = context_papers
    else:
        # NaN, None or any other scalar: no usable context.
        return 0

    # Each entry is compared by the text preceding '_pg_', lower-cased.
    return int(any(
        str(context_paper).split('_pg_')[0].lower() == paper_id
        for context_paper in context_papers_list
    ))

merged_df['is_paper_id_in_context'] = merged_df.apply(is_paper_id_in_context, axis=1)


def _mean_sd_stats(grouped_frame):
    """Return per-group mean and SD of both metrics, rounded to 3 decimals."""
    return grouped_frame.agg(
        mean_cor_answer=('Cor_answer', 'mean'),
        std_cor_answer=('Cor_answer', 'std'),
        mean_is_paper_id_in_context=('is_paper_id_in_context', 'mean'),
        std_is_paper_id_in_context=('is_paper_id_in_context', 'std'),
    ).round(3)


def _mean_sd_label(stats, suffix):
    """Render '<mean> (SD=<std>)' strings from the mean_/std_ columns of `stats`."""
    return stats[f'mean_{suffix}'].astype(str) + ' (SD=' + stats[f'std_{suffix}'].astype(str) + ')'


# Create summary table with Difficulty
# 'Model' and 'Model_ret' are categoricals. Newer pandas emits a FutureWarning
# when `observed` is left implicit for categorical groupers, because the
# default is changing; observed=False pins the current behaviour (unobserved
# category combinations are kept as rows).
grouped = merged_df.groupby(['Model', 'Model_ret', 'Difficulty'], observed=False)
agg_df = _mean_sd_stats(grouped)

agg_df['Cor_answer'] = _mean_sd_label(agg_df, 'cor_answer')
agg_df['is_paper_id_in_context'] = _mean_sd_label(agg_df, 'is_paper_id_in_context')

# Pivot Difficulty into columns, then flatten the (metric, difficulty)
# MultiIndex into '<metric>_<difficulty>' names.
summary_table = agg_df[['Cor_answer', 'is_paper_id_in_context']].unstack('Difficulty')
summary_table.columns = [f'{val}_{diff}' for val, diff in summary_table.columns]

# Create summary table without Difficulty
grouped_simple = merged_df.groupby(['Model', 'Model_ret'], observed=False)
agg_df_simple = _mean_sd_stats(grouped_simple)

agg_df_simple['Cor_answer_summary'] = _mean_sd_label(agg_df_simple, 'cor_answer')
agg_df_simple['is_paper_id_in_context_summary'] = _mean_sd_label(agg_df_simple, 'is_paper_id_in_context')

# Merge the two tables (both are indexed by Model / Model_ret)
final_summary = pd.merge(
    summary_table,
    agg_df_simple[['Cor_answer_summary', 'is_paper_id_in_context_summary']],
    left_index=True,
    right_index=True,
)

# Save to Excel
with pd.ExcelWriter('results/summary.xlsx') as writer:
    final_summary.to_excel(writer, sheet_name='Summary')
    merged_df.to_excel(writer, sheet_name='raw_evaluations')

# Display the table
final_summary

  grouped = merged_df.groupby(['Model', 'Model_ret', 'Difficulty'])
  grouped_simple = merged_df.groupby(['Model', 'Model_ret'])


Unnamed: 0_level_0,Unnamed: 1_level_0,Cor_answer_Easy,Cor_answer_Hard,Cor_answer_Medium,is_paper_id_in_context_Easy,is_paper_id_in_context_Hard,is_paper_id_in_context_Medium,Cor_answer_summary,is_paper_id_in_context_summary
Model,Model_ret,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
gpt-5,vidore/colpali-v1.3-merged,0.835 (SD=0.372),0.808 (SD=0.395),0.83 (SD=0.377),0.15 (SD=0.358),0.18 (SD=0.386),0.092 (SD=0.29),0.828 (SD=0.377),0.143 (SD=0.35)
gpt-5,vidore/colqwen2.5-v0.2,0.835 (SD=0.372),0.783 (SD=0.414),0.822 (SD=0.384),0.079 (SD=0.271),0.06 (SD=0.239),0.0 (SD=0.0),0.822 (SD=0.383),0.059 (SD=0.236)
gpt-5,ahmed-masry/ColFlor,0.823 (SD=0.382),0.833 (SD=0.374),0.837 (SD=0.371),0.118 (SD=0.323),0.1 (SD=0.302),0.042 (SD=0.201),0.828 (SD=0.377),0.098 (SD=0.298)
gpt-5-mini,vidore/colpali-v1.3-merged,0.817 (SD=0.387),0.717 (SD=0.453),0.859 (SD=0.349),0.176 (SD=0.382),0.0 (SD=0.0),0.125 (SD=0.332),0.807 (SD=0.395),0.134 (SD=0.341)
gpt-5-mini,vidore/colqwen2.5-v0.2,0.817 (SD=0.387),0.708 (SD=0.456),0.83 (SD=0.377),0.074 (SD=0.261),0.05 (SD=0.219),0.0 (SD=0.0),0.798 (SD=0.402),0.054 (SD=0.225)
gpt-5-mini,ahmed-masry/ColFlor,0.823 (SD=0.382),0.758 (SD=0.43),0.859 (SD=0.349),0.118 (SD=0.323),0.1 (SD=0.302),0.042 (SD=0.201),0.818 (SD=0.386),0.098 (SD=0.298)
gpt-5-nano,vidore/colpali-v1.3-merged,0.797 (SD=0.403),0.6 (SD=0.492),0.763 (SD=0.427),0.176 (SD=0.382),0.1 (SD=0.302),0.042 (SD=0.201),0.75 (SD=0.433),0.134 (SD=0.341)
gpt-5-nano,vidore/colqwen2.5-v0.2,0.786 (SD=0.411),0.575 (SD=0.496),0.793 (SD=0.407),0.074 (SD=0.261),0.05 (SD=0.219),0.0 (SD=0.0),0.745 (SD=0.436),0.054 (SD=0.225)
gpt-5-nano,ahmed-masry/ColFlor,0.759 (SD=0.428),0.592 (SD=0.494),0.77 (SD=0.422),0.118 (SD=0.323),0.1 (SD=0.302),0.083 (SD=0.278),0.728 (SD=0.445),0.107 (SD=0.31)
