## Evaluation of visual retrievers (95% CI)

In [1]:
from pathlib import Path

import pandas as pd

from analysis_functions import (
    run_ci_summary,
    DEFAULT_PRICE_DICT,
    get_metric_descriptions,
)

In [2]:
# Per-top-k locations: the folder holding the evaluation files and the Excel
# workbook the summary is written to.
# NOTE(review): the top-10 data path is rooted at 'evals/' while the top-5 one
# lives under 'results/' — confirm this difference is intentional.
TOPK_CONFIG = {
    5: dict(
        data_path='results/evals_topk5/',
        summary_path='results/summary_CI_topk5.xlsx',
    ),
    10: dict(
        data_path='evals/evals_topk10/',
        summary_path='results/summary_CI_topk10.xlsx',
    ),
}

# The single knob of this notebook: every path and label below derives from it.
TOP_K = 5
CONFIG = TOPK_CONFIG[TOP_K]
DATA_PATH, SUMMARY_FILE = CONFIG['data_path'], CONFIG['summary_path']
PRECISION_LABEL = f'P@{TOP_K}'

# Grouping keys for the summaries and the row order used for models/retrievers.
GROUP_BY_COLS = ['Model', 'Model_ret', 'Difficulty']
MODEL_ORDER = ['gpt-5', 'gpt-5-mini', 'gpt-5-nano']
RETRIEVER_ORDER = [
    'vidore/colpali-v1.3-merged',
    'vidore/colqwen2.5-v0.2',
    'ahmed-masry/ColFlor',
]

# Metric columns shown in the summary tables, in display order.
DISPLAY_COLUMNS = [
    'Cor_answer', 'Elapsed', 'Total_tokens',
    PRECISION_LABEL,
    'Throughput', 'Cost', 'Price-per-cost',
]

# Same grouping keys with 'Difficulty' removed (used for the overview table).
GROUP_BY_NO_DIFFICULTY = [key for key in GROUP_BY_COLS if key != 'Difficulty']
# Metric name -> description mapping for the selected TOP_K (iterated with
# .items() further down to build the descriptions table).
metric_descriptions = get_metric_descriptions(TOP_K)

In [3]:
# Run the CI summary once for the full grouping (including 'Difficulty').
# The helper hands back three objects: the per-iteration table, the summary
# table with confidence intervals, and the merged raw data.
ci_summary_kwargs = dict(
    path=DATA_PATH,
    group_by_cols=GROUP_BY_COLS,
    price_dict=DEFAULT_PRICE_DICT,
    top_k=TOP_K,
    model_order=MODEL_ORDER,
    retriever_order=RETRIEVER_ORDER,
    precision_label=PRECISION_LABEL,
)
per_iteration_summary, summary_table, merged_df = run_ci_summary(**ci_summary_kwargs)

# Keep only the grouping keys followed by the metrics chosen for display.
display_order = GROUP_BY_COLS + DISPLAY_COLUMNS
summary_table_display = summary_table[display_order]

In [4]:
# Show the per-iteration table (one row per group and iteration).
per_iteration_summary

Unnamed: 0,Model,Model_ret,Difficulty,Iteration,Cor_answer,Elapsed,Total_tokens,P@5,Throughput,Cost,Correct_answers,Price-per-cost
0,gpt-5,vidore/colpali-v1.3-merged,Easy,1,0.797101,30.978978,4621.304348,0.029412,157.093342,3.188700,55,5.797636
1,gpt-5,vidore/colpali-v1.3-merged,Easy,2,0.840580,10.029332,4555.724638,0.058824,653.810501,3.143450,58,5.419741
2,gpt-5,vidore/colpali-v1.3-merged,Hard,1,0.833333,32.467337,5010.250000,0.030000,159.340461,1.202460,20,6.012300
3,gpt-5,vidore/colpali-v1.3-merged,Hard,2,0.833333,7.397494,5065.541667,0.040000,752.635570,1.215730,20,6.078650
4,gpt-5,vidore/colpali-v1.3-merged,Medium,1,0.777778,31.606274,4502.851852,0.025000,152.460992,1.215770,21,5.789381
...,...,...,...,...,...,...,...,...,...,...,...,...
67,gpt-5-nano,ahmed-masry/ColFlor,Hard,2,0.541667,26.761090,9808.750000,0.020000,375.159934,0.094164,13,0.724338
68,gpt-5-nano,ahmed-masry/ColFlor,Hard,3,0.666667,4.639456,9984.625000,0.020000,2221.975524,0.095852,16,0.599078
69,gpt-5-nano,ahmed-masry/ColFlor,Medium,1,0.740741,6.975807,10503.407407,0.033333,1636.891080,0.113437,20,0.567184
70,gpt-5-nano,ahmed-masry/ColFlor,Medium,2,0.777778,28.346568,10339.851852,0.033333,374.110352,0.111670,21,0.531764


In [5]:
# Show the summary restricted to the grouping keys and display metrics.
summary_table_display

Unnamed: 0,Model,Model_ret,Difficulty,Cor_answer,Elapsed,Total_tokens,P@5,Throughput,Cost,Price-per-cost
0,gpt-5,vidore/colpali-v1.3-merged,Easy,"0.819\n[0.213, 1.000]","20.50\n[0.00, 153.60]","4588.5\n[4171.9, 5005.1]","0.044\n[0.000, 0.231]","405.5\n[0.0, 3561.1]","3.17\n[2.88, 3.45]","5.61\n[3.21, 8.01]"
1,gpt-5,vidore/colpali-v1.3-merged,Hard,"0.833\n[0.219, 1.000]","19.93\n[0.00, 179.20]","5037.9\n[4686.6, 5389.2]","0.035\n[0.000, 0.099]","456.0\n[0.0, 4225.3]","1.21\n[1.12, 1.29]","6.05\n[5.62, 6.47]"
2,gpt-5,vidore/colpali-v1.3-merged,Medium,"0.778\n[0.197, 0.993]","19.31\n[0.00, 175.57]","4431.4\n[3524.1, 5338.8]","0.033\n[0.000, 0.139]","415.8\n[0.0, 3761.3]","1.20\n[0.95, 1.44]","5.70\n[4.53, 6.86]"
3,gpt-5,vidore/colqwen2.5-v0.2,Easy,"0.845\n[0.294, 1.000]","22.89\n[0.00, 81.84]","4296.2\n[4177.7, 4414.7]","0.065\n[0.045, 0.084]","353.5\n[0.0, 931.5]","2.96\n[2.88, 3.05]","5.08\n[4.82, 5.34]"
4,gpt-5,vidore/colqwen2.5-v0.2,Hard,"0.792\n[0.266, 0.990]","23.11\n[0.00, 83.48]","4633.5\n[4407.5, 4859.6]","0.053\n[0.039, 0.068]","382.6\n[0.0, 1000.3]","1.11\n[1.06, 1.17]","5.85\n[5.57, 6.14]"
5,gpt-5,vidore/colqwen2.5-v0.2,Medium,"0.827\n[0.285, 1.000]","22.15\n[0.00, 78.91]","4392.3\n[4249.0, 4535.6]","0.006\n[0.000, 0.029]","384.0\n[0.0, 1005.6]","1.19\n[1.15, 1.22]","5.31\n[4.95, 5.67]"
6,gpt-5,ahmed-masry/ColFlor,Easy,"0.797\n[0.205, 0.999]","18.02\n[0.00, 154.33]","4570.6\n[4181.4, 4959.7]","0.041\n[0.041, 0.041]","420.4\n[0.0, 3678.7]","3.15\n[2.89, 3.42]","5.74\n[4.90, 6.57]"
7,gpt-5,ahmed-masry/ColFlor,Hard,"0.854\n[0.228, 1.000]","16.37\n[0.00, 141.62]","4878.2\n[4522.4, 5234.0]","0.020\n[0.020, 0.020]","507.6\n[0.0, 4422.6]","1.17\n[1.09, 1.26]","5.72\n[3.53, 7.90]"
8,gpt-5,ahmed-masry/ColFlor,Medium,"0.833\n[0.219, 1.000]","18.63\n[0.00, 154.54]","4544.6\n[4333.6, 4755.7]","0.033\n[0.033, 0.033]","390.5\n[0.0, 3355.5]","1.23\n[1.17, 1.28]","5.46\n[4.17, 6.74]"
9,gpt-5-mini,vidore/colpali-v1.3-merged,Easy,"0.845\n[0.294, 1.000]","13.85\n[0.00, 44.10]","7764.9\n[7212.1, 8317.7]","0.042\n[0.021, 0.063]","939.2\n[0.0, 2336.2]","1.07\n[1.00, 1.15]","1.84\n[1.64, 2.04]"


In [6]:
# Wide view of the summary: one column block per difficulty level.
# Jupyter only auto-displays the last *top-level* expression of a cell, so an
# expression nested inside the `if` body would be computed and then silently
# discarded. Bind the pivot to a name and end the cell with a bare expression
# so it is actually rendered (None when there is no 'Difficulty' grouping key).
summary_table_by_difficulty = None
if 'Difficulty' in GROUP_BY_COLS:
    summary_table_by_difficulty = (
        summary_table
        .set_index(GROUP_BY_COLS)[DISPLAY_COLUMNS]
        .unstack('Difficulty')
    )
summary_table_by_difficulty

In [7]:
# Overview table: the same CI summary grouped only by the non-difficulty keys.
# Stays None when the grouping never contained 'Difficulty' in the first place.
summary_table_no_diff = None
difficulty_was_dropped = GROUP_BY_NO_DIFFICULTY != GROUP_BY_COLS
if difficulty_was_dropped:
    overview_columns = GROUP_BY_NO_DIFFICULTY + DISPLAY_COLUMNS
    overview_kwargs = dict(
        path=DATA_PATH,
        group_by_cols=GROUP_BY_NO_DIFFICULTY,
        price_dict=DEFAULT_PRICE_DICT,
        top_k=TOP_K,
        model_order=MODEL_ORDER,
        retriever_order=RETRIEVER_ORDER,
        # Hand over the already-merged frame from the first run; presumably
        # this avoids re-reading DATA_PATH — confirm in analysis_functions.
        dataframe=merged_df,
        precision_label=PRECISION_LABEL,
    )
    per_iteration_no_diff, overview_full, _ = run_ci_summary(**overview_kwargs)
    # Trim to the grouping keys plus the metrics chosen for display.
    summary_table_no_diff = overview_full[overview_columns]
summary_table_no_diff

Unnamed: 0,Model,Model_ret,Cor_answer,Elapsed,Total_tokens,P@5,Throughput,Cost,Price-per-cost
0,gpt-5,vidore/colpali-v1.3-merged,"0.812\n[0.211, 1.000]","20.12\n[0.00, 163.66]","4643.1\n[4269.6, 5016.5]","0.040\n[0.000, 0.188]","417.9\n[0.0, 3739.0]","5.57\n[5.12, 6.02]","5.72\n[4.14, 7.29]"
1,gpt-5,vidore/colqwen2.5-v0.2,"0.831\n[0.286, 1.000]","22.77\n[0.00, 81.50]","4385.3\n[4273.7, 4496.9]","0.050\n[0.034, 0.066]","366.2\n[0.0, 960.9]","5.26\n[5.13, 5.40]","5.28\n[5.08, 5.48]"
2,gpt-5,ahmed-masry/ColFlor,"0.817\n[0.213, 1.000]","17.83\n[0.00, 151.84]","4626.3\n[4378.9, 4873.7]","0.036\n[0.036, 0.036]","431.1\n[0.0, 3754.7]","5.55\n[5.25, 5.85]","5.66\n[5.36, 5.97]"
3,gpt-5-mini,vidore/colpali-v1.3-merged,"0.833\n[0.288, 1.000]","13.80\n[0.00, 44.56]","7787.2\n[7112.3, 8462.1]","0.043\n[0.034, 0.053]","962.5\n[0.0, 2396.8]","1.87\n[1.71, 2.03]","1.87\n[1.59, 2.15]"
4,gpt-5-mini,vidore/colqwen2.5-v0.2,"0.803\n[0.272, 0.994]","24.34\n[0.00, 89.54]","8920.7\n[8867.9, 8973.4]","0.049\n[0.038, 0.060]","754.3\n[0.0, 2050.2]","2.14\n[2.13, 2.15]","2.23\n[1.98, 2.47]"
5,gpt-5-mini,ahmed-masry/ColFlor,"0.800\n[0.206, 1.000]","21.33\n[0.00, 85.57]","7593.0\n[7562.5, 7623.5]","0.036\n[0.036, 0.036]","388.2\n[0.0, 1423.5]","1.82\n[1.81, 1.83]","1.90\n[1.65, 2.14]"
6,gpt-5-nano,vidore/colpali-v1.3-merged,"0.739\n[0.238, 0.971]","14.09\n[0.00, 48.16]","9909.7\n[9408.4, 10411.0]","0.045\n[0.021, 0.068]","1283.7\n[0.0, 3391.7]","0.48\n[0.45, 0.50]","0.54\n[0.48, 0.59]"
7,gpt-5-nano,vidore/colqwen2.5-v0.2,"0.703\n[0.220, 0.958]","23.66\n[0.00, 78.55]","11755.5\n[11688.7, 11822.3]","0.052\n[0.047, 0.056]","867.0\n[0.0, 2364.7]","0.56\n[0.56, 0.57]","0.67\n[0.62, 0.72]"
8,gpt-5-nano,ahmed-masry/ColFlor,"0.731\n[0.234, 0.968]","13.07\n[0.00, 44.49]","10031.6\n[9928.9, 10134.2]","0.036\n[0.036, 0.036]","1363.3\n[0.0, 3523.5]","0.48\n[0.48, 0.49]","0.55\n[0.48, 0.62]"


In [8]:
# Show the merged raw data returned by run_ci_summary (one row per answered question).
merged_df

Unnamed: 0.1,Unnamed: 0,Question_nr,Paper_id,Nr_data_suppl,doi,title,question,A,B,C,...,Elapsed,Total_tokens,Time_start,Time_end,Metrics,Cor_answer,Iteration,Throughput,Cost,P@5
0,0,38,Paper_07,1.0,https://doi.org/10.1172%2Fjci.insight.89703,Estrogens regulate glycosylation of IgG in wom...,Which IgG glycan trait increases in women at 5...,G0/G2,G0F/G2,G0/G1,...,23.206100,8829,1.763008e+09,1763008361,"CompletionUsage(completion_tokens=658, prompt_...",0,1,380.460310,0.017658,0.0
1,1,41,Paper_08,1.0,https://doi.org/10.1016%2Fj.isci.2022.103897,Immunoglobulin G glycome composition in transi...,Using single-point measurements of IgG glycans...,0.776,0.789,0.853,...,17.193598,7517,1.763008e+09,1763008355,"CompletionUsage(completion_tokens=914, prompt_...",0,1,437.197612,0.015034,0.0
2,2,3,Paper_01,1.0,https://doi.org/10.1038/s41590-024-01916-8,A unique serum IgG glycosylation signature pre...,Which IgG glycan changes are seen in people wi...,decreased galactosylation,increased galactosylation,increased sialylation,...,29.192189,7630,1.763008e+09,1763008367,"CompletionUsage(completion_tokens=146, prompt_...",1,1,261.371287,0.015260,0.2
3,3,50,Paper_09,1.0,https://doi.org/10.1002%2Fart.39273,Association of Systemic Lupus Erythematosus Wi...,The predictive model for systemic lupus erythe...,African Caribbean,Latin American,Han Chinese,...,10.190741,7034,1.763008e+09,1763008348,"CompletionUsage(completion_tokens=466, prompt_...",1,1,690.234396,0.014068,0.0
4,4,103,Paper_19,,https://doi.org/10.1093/gerona/glt190,Glycans Are a Novel Biomarker of Chronological...,Which monosaccharide does the letter “A” repre...,N-acetylglucosamine,Galactose,Fucose,...,11.189008,6766,1.763008e+09,1763008349,"CompletionUsage(completion_tokens=786, prompt_...",1,1,604.700626,0.013532,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,115,1,Paper_01,1.0,https://doi.org/10.1038/s41590-024-01916-8,A unique serum IgG glycosylation signature pre...,How many different glycan variations have been...,Over 60,Over 30,Over 40,...,48.081554,11384,1.763013e+09,1763012566,"CompletionUsage(completion_tokens=850, prompt_...",1,3,236.764394,0.004554,1.0
2876,116,44,Paper_08,1.0,https://doi.org/10.1016%2Fj.isci.2022.103897,Immunoglobulin G glycome composition in transi...,Every IgG glycan contains a core consisting of:,3 GlcNAc + 2 mannoses,2 GlcNAc + 3 mannoses,3 GlcNAc + 2 galactoses,...,54.080237,10834,1.763013e+09,1763012572,"CompletionUsage(completion_tokens=274, prompt_...",1,3,200.331962,0.004334,0.0
2877,117,68,Paper_12,1.0,https://doi.org/10.1053%2Fj.gastro.2018.01.002,Glycosylation of Immunoglobulin G Associates W...,Which of the following IgG glycan traits is co...,digalactosylation,Sialylation,presence of core fucose,...,45.078932,11390,1.763013e+09,1763012563,"CompletionUsage(completion_tokens=851, prompt_...",1,3,252.667920,0.004556,0.0
2878,118,3,Paper_01,1.0,https://doi.org/10.1038/s41590-024-01916-8,A unique serum IgG glycosylation signature pre...,Which IgG glycan changes are seen in people wi...,decreased galactosylation,increased galactosylation,increased sialylation,...,52.077660,11906,1.763013e+09,1763012570,"CompletionUsage(completion_tokens=1362, prompt...",1,3,228.620104,0.004762,1.0


In [9]:
# Two-column lookup table (metric name, human-readable description).
# Built from the (key, value) pairs with explicit column names so the frame
# keeps its 'Metric'/'Description' schema even if the mapping is empty
# (a list of dicts would yield a frame with no columns in that case).
metric_descriptions_df = pd.DataFrame(
    list(metric_descriptions.items()),
    columns=['Metric', 'Description'],
)
metric_descriptions_df

Unnamed: 0,Metric,Description
0,Cor_answer,Average correctness rate per question
1,Elapsed,Average wall-clock time per question (seconds)
2,Total_tokens,Average total tokens consumed per question
3,P@5,Precision@5: share of retrieved documents cont...
4,Throughput,Average tokens processed per second
5,Cost,USD spent per iteration/run
6,Price-per-cost,Cents spent per correct answer


In [11]:
# Export every table of this notebook into one Excel workbook, one sheet each.
excel_path = SUMMARY_FILE

# pd.ExcelWriter refuses to write into a non-existent directory, so create the
# target folder first; this keeps the cell runnable on a fresh checkout.
Path(excel_path).parent.mkdir(parents=True, exist_ok=True)

with pd.ExcelWriter(excel_path) as writer:
    # Per-difficulty summary: pivoted wide when 'Difficulty' is a grouping key,
    # otherwise the flat display table. The pivot keeps its index on export
    # because the grouping keys live there.
    if 'Difficulty' in GROUP_BY_COLS:
        pivot_df = summary_table.set_index(GROUP_BY_COLS)[DISPLAY_COLUMNS].unstack('Difficulty')
        pivot_df.to_excel(writer, sheet_name=f'diff_topk{TOP_K}')
    else:
        summary_table_display.to_excel(writer, sheet_name=f'diff_topk{TOP_K}', index=False)

    # Overview without difficulty exists only if 'Difficulty' was dropped from the grouping.
    if summary_table_no_diff is not None:
        summary_table_no_diff.to_excel(writer, sheet_name=f'overview_topk{TOP_K}', index=False)

    # Supporting sheets: per-iteration values, merged raw data, metric glossary.
    per_iteration_summary.to_excel(writer, sheet_name='per_iteration', index=False)
    merged_df.to_excel(writer, sheet_name='raw_data', index=False)
    metric_descriptions_df.to_excel(writer, sheet_name='Descriptions', index=False)

# Echo the path of the workbook that was just written.
excel_path

'results/summary_CI_topk5.xlsx'