In [1]:
import os
import pandas as pd
import json
from pathlib import Path

def parse_scores_string(s):
    """
    Converts a string representation of a list (e.g., "[71.42857, 71.42857]") into a list of floats.
    """
    s = s.strip("[]")
    if not s:
        return []
    return [float(x.strip()) for x in s.split(",")]

def extract_overall_scores(directory, n_samples=20, skip_if=None):
    """
    Reads all .csv and .json files in the given directory and extracts the "Overall" score
    from .csv files, "CIDEr" score from .json files, "Jaccard" score from .json files with specific keys,
    and "Final Score Norm" from certain JSON files.
    Handles cases where "Overall" is a column or a row in .csv files, and supports additional formats with "Overall Score".

    Args:
        directory (str): Path to the directory containing .csv and .json files.

    Returns:
        dict: A dictionary where keys are filenames and values are the extracted scores.
    """
    overall_scores = {}
    if skip_if is None:
        skip_if = []
        
    skip_if.extend(["MMVet_gpt-4-turbo_score_fine.csv", "MMDU_gpt-4o_score.csv"], )

    # Iterate through all files in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        
        skip_this_file = False
        for skip in skip_if:
            if skip in filename:
                skip_this_file = True
                break
        if skip_this_file:
            continue
        
        filename = filename.replace(Path(directory).name + "_", "")
        
        if filename.endswith('.csv'):
            try:
                # Load the CSV file into a pandas DataFrame
                df = pd.read_csv(filepath)

                if "split" in df.columns and "average_scores" in df.columns:
                    fintab_value = None
                    vwtq_value = None

                    # Process fintabnetqa row: extract first element from list string
                    if "fintabnetqa" in df["split"].values:
                        fintab_row = df[df["split"] == "fintabnetqa"].iloc[0]
                        fintab_list = parse_scores_string(fintab_row["average_scores"])
                        fintab_value = fintab_list[0] if fintab_list else None

                    # Process vwtq row: extract first element from list string
                    if "vwtq" in df["split"].values:
                        vwtq_row = df[df["split"] == "vwtq"].iloc[0]
                        vwtq_list = parse_scores_string(vwtq_row["average_scores"])
                        vwtq_value = vwtq_list[0] if vwtq_list else None

                    # Directly assign overall_score as the average of the two values
                    overall_score = (fintab_value / 100 + vwtq_value / 100) / 2 if (fintab_value is not None and vwtq_value is not None) else None
                elif "Overall" in df.columns:
                    if "MathVerse" in filename or "TextVQA" in filename:
                        overall_score = df["Overall"].iloc[-1] / 100
                    else:
                        # Extract the "Overall" score from the first row
                        overall_score = df["Overall"].iloc[0]
                elif "Overall Score" in df.columns:
                    # MMDU
                    # Extract the "Overall Score" from the row where "set" is "all"
                    overall_row = df[df["set"] == "all"]
                    overall_score = overall_row["Overall Score"].iloc[0] if not overall_row.empty else None
                elif "Avg ACC" in df.columns:
                    # New case: Extract the "Avg ACC" score from the first row
                    overall_score = df["Avg ACC"].iloc[0] / 100
                elif "split" in df.columns and "aAcc" in df.columns:
                    overall_row = df[df["split"] == "Overall"]
                    overall_score = overall_row["aAcc"].iloc[0] / 100 if not overall_row.empty else None
                else:
                    # MMVET
                    # Check if "Overall" is in the first column (as a row)
                    overall_row = df[df.iloc[:, 0] == "Overall"]
                    if not overall_row.empty:
                        overall_score = overall_row.iloc[0, df.columns.get_loc("acc")] / 100 # Extract score from "acc" column
                    else:
                        overall_score = None  # "Overall" not found


                if overall_score is not None and any(f in filename for f in ["POPE_score.csv", "OCRVQA_TEST_acc.csv", "ChartQA_TEST_acc.csv", "GQA_TestDev_Balanced_acc.csv", "MathVerse_MINI"]):
                    # If it's likely a percentage (e.g., > 1), divide by 100
                    # if overall_score > 1:
                    overall_score /= 100

                # overall_scores[filename + " (Accuracy)"] = overall_score
                overall_scores[filename.replace(".csv", "")] = overall_score

            except Exception as e:
                print(f"Error processing {filename}: {e}")
                overall_scores[filename] = None

        elif filename.endswith('.json'):
            try:
                # Load the JSON file
                with open(filepath, 'r') as f:
                    data = json.load(f)
                
                # Extract the "CIDEr" score if available
                if "ROUGE_L" in data:
                    overall_scores[filename + " (ROUGE_L)"] = data.get("ROUGE_L", None) / 100
                # Extract the "Jaccard" score if available
                elif "Jaccard" in data:
                    overall_scores[filename + " (Jaccard)"] = data.get("Jaccard", None) / 1
                # Extract the "Final Score Norm" if available
                elif "Final Score" in data:
                    overall_scores[filename + " (Accuracy)"] = data.get("Final Score", None) / n_samples
                elif "Overall" in data:
                    overall_scores[filename + " (Accuracy)"] = data["Overall"]
                elif "Average" in data:
                    overall_scores[filename + " (Accuracy)"] = data["Average"] / 100
                else:
                    overall_scores[filename] = None

            except Exception as e:
                print(f"Error processing {filename}: {e}")
                overall_scores[filename] = None

    overall_scores = dict(sorted(overall_scores.items(), key=lambda item: item[0]))
    return overall_scores

def create_table(data):
    df = pd.DataFrame(data)
    df = df[df["Metric"] != "MMVet_gpt-4-turbo_score_fine.csv"]

    keys = list(data.keys())
    keys.remove("Metric")

    # Calculate normalized sum
    normalized_sums = {
        # key: (df[key] / df[keys].mean(axis=1)).sum()
        key: (df[key]).mean()
        for key in keys
    }

    # Create a new row for normalized sum
    normalized_sum_row = pd.DataFrame([{"Metric": "Mean", **normalized_sums}])

    # Concatenate the normalized sum row with the original DataFrame
    final_df = pd.concat([df, normalized_sum_row], ignore_index=True)

    # Format numeric columns to two decimal places
    numeric_columns = keys
    for col in numeric_columns:
        final_df[col] = (pd.to_numeric(final_df[col], errors="coerce") * 100).round(1)

    return final_df

def create_table_from_dict(name_path_dict, skip_if=None, n_samples=20):
    if skip_if is None:
        skip_if = []
        
    # print(min([[key.split("_", 1)[-1] for key in extract_overall_scores(path).keys()] for path in list(name_path_dict.values())], key=len))
    # print(extract_overall_scores(next(iter(name_path_dict.values())), skip_if=skip_if, n_samples=n_samples).keys())
    # print([Path(key).name for key in name_path_dict.values()])
    
    return create_table({
        "Metric": [key.split("_", 1)[-1] for key in extract_overall_scores(next(iter(name_path_dict.values())), skip_if=skip_if, n_samples=n_samples).keys()],
        **{
            key: extract_overall_scores(value, skip_if=skip_if, n_samples=n_samples).values()
            for key, value in name_path_dict.items()
        }
    })

In [2]:
import numpy as np

def create_table_from_dict_fillna(name_path_dict, skip_if=None, n_samples=20):
    if skip_if is None:
        skip_if = []
        
    reference_max_benchmarks = list({
        key
        for path in list(name_path_dict.values())
        for key in extract_overall_scores(path, skip_if=skip_if, n_samples=n_samples).keys()
    })
    
    result = {
        "Metric": reference_max_benchmarks
    }
    for key, value in name_path_dict.items():
        key_val_all_benchmarks = extract_overall_scores(value, skip_if=skip_if, n_samples=n_samples)
        key_val_max_benchmarks = {}
        # for key_one_bench, val_one_bench in key_val_all_benchmarks.items():
        #     if key_one_bench not in reference_max_benchmarks:
        #         key_val_max_benchmarks[key_one_bench] = np.nan
        #     else:
        #         key_val_max_benchmarks[key_one_bench] = val_one_bench            
        for ref_key in reference_max_benchmarks:
            if ref_key in key_val_all_benchmarks:
                key_val_max_benchmarks[ref_key] = key_val_all_benchmarks[ref_key]
            else:
                key_val_max_benchmarks[ref_key] = np.nan  # np.nan or None or 0 or whatever you want to fill missing values with. Here we use np.nan.
        
        result[key] = key_val_max_benchmarks.values()
    
    # print(result)
    # for key, value in result.items():
    #     print(key, len(value))
    
    return create_table(result)


def create_table_from_dict_skipna(name_path_dict, skip_if=None, n_samples=20):
    if skip_if is None:
        skip_if = []
        
    reference_max_benchmarks = list(set.intersection(*[
        set(extract_overall_scores(path, skip_if=skip_if, n_samples=n_samples).keys()) for path in name_path_dict.values()
    ]))

    result = {
        "Metric": reference_max_benchmarks
    }
    for key, value in name_path_dict.items():
        key_val_all_benchmarks = extract_overall_scores(value, skip_if=skip_if, n_samples=n_samples)
        key_val_max_benchmarks = {}
        
        for ref_key in reference_max_benchmarks:
            if ref_key in key_val_all_benchmarks:
                key_val_max_benchmarks[ref_key] = key_val_all_benchmarks[ref_key]
            else:
                key_val_max_benchmarks[ref_key] = np.nan  # np.nan or None or 0 or whatever you want to fill missing values with. Here we use np.nan.
        
        result[key] = key_val_max_benchmarks.values()
    
    # print(result)
    # for key, value in result.items():
    #     print(key, len(value))
    
    return create_table(result)

In [3]:
# Comparison with other test-time scaling methods
create_table_from_dict_fillna({
    "base": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/Base_SmolVLM2_2B",
    "answr_lvl_temp_majority_vote": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_answerlevels_selfconsistency/TTAugAdapter_SmolVLM2_2B_8_AnswerLevelTemperatureMajorityVote",
    "answr_lvl_greedy_majority_vote": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_answerlevels_selfconsistency/TTAugAdapter_SmolVLM2_2B_8_AnswerLevelGreedyMajorityVote",
    "answr_lvl_temp_mllm_selector": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_answerlevels/TTAugAdapter_SmolVLM2_2B_8_AnswerLevelTemperatureMLLMSelector",
    "answr_lvl_greedy_mllm_selector": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_answerlevels/TTAugAdapter_SmolVLM2_2B_8_AnswerLevelGreedyMLLMSelector",
    "answr_lvl_temp_confidence_selector": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_answerlevels_confidence/TTAugAdapter_SmolVLM2_2B_8_AnswerLevelTemperatureMLLMSelector",
    "answr_lvl_greedy_mllm_synthesizer": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_answerlevels_mllmsynthesizer/TTAugAdapter_SmolVLM2_2B_8_AnswerLevelTemperatureMLLMSynthesizer",
    "tta8_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/TTAugAdapter_SmolVLM2_2B_8_SimplePara_average",
}, n_samples=1000)

Unnamed: 0,Metric,base,answr_lvl_temp_majority_vote,answr_lvl_greedy_majority_vote,answr_lvl_temp_mllm_selector,answr_lvl_greedy_mllm_selector,answr_lvl_temp_confidence_selector,answr_lvl_greedy_mllm_synthesizer,tta8_av
0,MME-RealWorld-Lite_rating.json (Accuracy),27.8,26.2,30.8,26.4,29.6,29.0,29.2,31.1
1,OCRBench_score.json (Accuracy),72.9,72.6,72.7,71.9,73.1,60.9,71.1,73.4
2,ChartQA_TEST_acc,74.2,74.4,74.8,73.4,70.9,61.1,72.8,75.6
3,GQA_TestDev_Balanced_acc,0.0,0.0,7.6,0.0,3.7,0.0,0.0,5.8
4,OCRVQA_TEST_acc,0.0,0.0,12.0,0.0,4.5,0.2,3.3,11.8
5,AI2D_TEST_acc,68.5,3.1,3.6,69.2,66.6,69.9,68.0,68.8
6,TextVQA_VAL_acc,73.2,72.6,72.3,71.6,72.9,61.6,71.6,72.8
7,AMBER_score,68.7,70.4,72.7,64.5,67.0,58.9,75.8,75.4
8,COCO_VAL_score.json (ROUGE_L),9.1,8.2,21.2,8.4,13.0,8.6,29.5,15.9
9,Mean,43.8,36.4,40.9,42.8,44.6,38.9,46.8,47.9


In [4]:
# Different aggregation methods
create_table_from_dict_fillna({
    "base": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/Base_SmolVLM2_2B",
    "tta8_mostconf": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/TTAugAdapter_SmolVLM2_2B_8_SimplePara_mostconf",
    "tta8_majority": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continue/TTAugAdapter_SmolVLM2_2B_8_SimplePara_majority",
    "tta8_ewm": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continue/TTAugAdapter_SmolVLM2_2B_8_SimplePara_ewm",
    "tta8_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/TTAugAdapter_SmolVLM2_2B_8_SimplePara_average",
}, n_samples=1000)

Unnamed: 0,Metric,base,tta8_mostconf,tta8_majority,tta8_ewm,tta8_av
0,MME-RealWorld-Lite_rating.json (Accuracy),27.8,29.5,30.4,31.0,31.1
1,OCRBench_score.json (Accuracy),72.9,72.0,72.2,73.4,73.4
2,ChartQA_TEST_acc,74.2,73.6,74.8,76.6,75.6
3,GQA_TestDev_Balanced_acc,0.0,6.1,3.4,4.3,5.8
4,OCRVQA_TEST_acc,0.0,3.5,9.0,11.4,11.8
5,AI2D_TEST_acc,68.5,68.7,68.7,68.8,68.8
6,TextVQA_VAL_acc,73.2,70.5,71.5,73.3,72.8
7,AMBER_score,68.7,72.3,71.4,74.6,75.4
8,COCO_VAL_score.json (ROUGE_L),9.1,14.2,18.4,14.6,15.9
9,Mean,43.8,45.6,46.6,47.6,47.9


In [5]:
# Scaling: Number of Augmentations
create_table_from_dict_fillna({
    "base": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/Base_SmolVLM2_2B",
    "tta2_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continuee/TTAugAdapter_SmolVLM2_2B_2_SimplePara_average",
    "tta4_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continuee/TTAugAdapter_SmolVLM2_2B_4_SimplePara_average",
    "tta8_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/TTAugAdapter_SmolVLM2_2B_8_SimplePara_average",
    "tta16_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continuee/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average",
    "tta32_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continuee/TTAugAdapter_SmolVLM2_2B_32_SimplePara_average",
    "tta64_av": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continuee64/TTAugAdapter_SmolVLM2_2B_64_SimplePara_average",
}, n_samples=1000)

Unnamed: 0,Metric,base,tta2_av,tta4_av,tta8_av,tta16_av,tta32_av,tta64_av
0,MME-RealWorld-Lite_rating.json (Accuracy),27.8,31.0,30.8,31.1,31.9,31.8,32.1
1,OCRBench_score.json (Accuracy),72.9,72.1,73.9,73.4,73.7,72.2,72.8
2,ChartQA_TEST_acc,74.2,75.1,75.0,75.6,76.1,76.7,76.3
3,GQA_TestDev_Balanced_acc,0.0,1.1,3.7,5.8,5.5,6.4,5.2
4,OCRVQA_TEST_acc,0.0,1.2,9.4,11.8,12.6,12.7,13.5
5,AI2D_TEST_acc,68.5,69.2,68.9,68.8,69.6,68.5,69.3
6,TextVQA_VAL_acc,73.2,72.0,73.0,72.8,72.4,72.4,72.2
7,AMBER_score,68.7,72.4,74.3,75.4,75.9,76.9,77.4
8,COCO_VAL_score.json (ROUGE_L),9.1,12.1,14.4,15.9,16.9,17.2,16.9
9,Mean,43.8,45.1,47.0,47.9,48.3,48.3,48.4


In [6]:
create_table_from_dict_fillna({
    ## before this, text aug was classical
    "tta16_av_gpt_in_other": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_auggpt16/TTAugAdapter_SmolVLM2_2B_16_AugGPTPara_average",
    "tta16_av_self_in_other": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_selfpara16/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average",
    "tta16_av_classical_wo_consistency": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16classicalwoconsist/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average_wo_consistency",


    ## before this, img aug was hard
    "tta16_av_classical_augmix": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_simple_mid_low_augmix/TTAugAdapter_SmolVLM2_2B_16_SimplePara_augmix_average",
    "tta16_av_classical_low": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_simple_mid_low_augmix/TTAugAdapter_SmolVLM2_2B_16_SimplePara_low_average",
    "tta16_av_classical_medium": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_simple_mid_low_augmix/TTAugAdapter_SmolVLM2_2B_16_SimplePara_medium_average",
    "tta16_av_classical_gen": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_classicalgener/TTAugAdapter_SmolVLM2_2B_16_SimplePara_GenImg_average",


    ## decomposition
    "tta16_av_classical_textonly": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16classical_textonly/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average_textonly",
    "tta16_av_classical_imageonly": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16classical_imageonly/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average_imageonly",
    "tta16_av_classical_low_imageonly": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16classical_imageonly_low/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average_imageonly",


    ## adaptation objectives
    "tta_16_classical_learnedweights_noreg_hypchanged": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_simpleaugs_learnedweights_noreg_hypchanged/TTAugAdapter_SmolVLM2_2B_16_SimplePara_learnedweights",
    "tta_16_classical_learnedmodel_subset_new_2": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_simpleaugs_learnedmodel_subset_new_2/TTAugAdapter_SmolVLM2_2B_16_SimplePara_learnedmodel",
}, n_samples=1000)

Unnamed: 0,Metric,tta16_av_gpt_in_other,tta16_av_self_in_other,tta16_av_classical_wo_consistency,tta16_av_classical_augmix,tta16_av_classical_low,tta16_av_classical_medium,tta16_av_classical_gen,tta16_av_classical_textonly,tta16_av_classical_imageonly,tta16_av_classical_low_imageonly,tta_16_classical_learnedweights_noreg_hypchanged,tta_16_classical_learnedmodel_subset_new_2
0,MME-RealWorld-Lite_rating.json (Accuracy),30.0,25.9,32.1,32.1,31.8,32.5,31.1,31.6,26.6,27.7,30.9,31.4
1,OCRBench_score.json (Accuracy),73.5,72.8,70.6,72.4,73.7,73.3,65.3,73.1,73.3,73.8,73.0,70.5
2,ChartQA_TEST_acc,76.9,76.6,71.4,74.1,77.0,76.4,75.7,75.8,74.7,74.2,76.1,76.7
3,GQA_TestDev_Balanced_acc,0.0,0.0,31.2,3.1,4.1,3.7,2.5,2.0,0.0,0.0,5.2,13.5
4,OCRVQA_TEST_acc,2.6,0.0,0.0,12.9,12.1,10.6,12.0,13.5,0.0,0.0,11.9,13.8
5,AI2D_TEST_acc,69.9,68.4,63.9,68.9,69.1,69.0,67.0,68.1,69.8,69.2,69.7,67.4
6,TextVQA_VAL_acc,73.5,74.0,63.9,72.4,72.6,73.3,71.6,73.0,74.2,73.9,74.2,70.5
7,AMBER_score,68.8,72.9,60.0,77.3,77.0,75.9,76.2,77.3,64.7,67.9,76.9,72.8
8,COCO_VAL_score.json (ROUGE_L),20.6,46.1,13.2,17.8,17.8,17.1,18.0,19.0,8.4,8.8,16.4,35.9
9,Mean,46.2,48.5,45.1,47.9,48.4,48.0,46.6,48.2,43.5,43.9,48.3,50.3


In [7]:
# Appendix: Aggregation at Earlier Layers 
create_table_from_dict_fillna({
    "base": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized/Base_SmolVLM2_2B",
    "tta_16_classical_earlylayer4": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer4",
    "tta_16_classical_earlylayer8": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer8",
    "tta_16_classical_earlylayer10": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer10",
    "tta_16_classical_earlylayer12": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer12",
    "tta_16_classical_earlylayer16": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer16",
    "tta_16_classical_earlylayer20": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer20",
    "tta_16_classical_earlylayer22": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer22",
    "tta_16_classical_earlylayer23": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_16_earlierlayers/TTAugAdapter_SmolVLM2_2B_16_SimplePara_AverageEarlyLayer23",
    "tta_16_av_logits": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continuee/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average",
}, n_samples=1000)

Unnamed: 0,Metric,base,tta_16_classical_earlylayer4,tta_16_classical_earlylayer8,tta_16_classical_earlylayer10,tta_16_classical_earlylayer12,tta_16_classical_earlylayer16,tta_16_classical_earlylayer20,tta_16_classical_earlylayer22,tta_16_classical_earlylayer23,tta_16_av_logits
0,MME-RealWorld-Lite_rating.json (Accuracy),27.8,5.6,24.4,25.4,30.9,30.1,30.7,31.7,30.2,31.9
1,OCRBench_score.json (Accuracy),72.9,24.3,43.5,47.4,49.6,51.2,65.0,70.2,72.8,73.7
2,ChartQA_TEST_acc,74.2,17.5,41.6,39.1,46.5,49.9,69.8,74.3,75.2,76.1
3,GQA_TestDev_Balanced_acc,0.0,2.5,0.7,0.3,0.2,0.1,0.5,5.1,6.8,5.5
4,OCRVQA_TEST_acc,0.0,3.2,0.9,2.7,5.9,4.0,4.0,11.1,12.9,12.6
5,AI2D_TEST_acc,68.5,50.1,61.8,64.4,65.9,68.0,69.6,68.2,69.2,69.6
6,TextVQA_VAL_acc,73.2,15.0,43.0,44.8,49.4,47.0,61.4,69.2,71.8,72.4
7,AMBER_score,68.7,41.3,70.4,68.6,74.4,74.6,73.6,79.7,74.7,75.9
8,COCO_VAL_score.json (ROUGE_L),9.1,8.2,25.1,19.7,18.7,16.8,15.7,16.4,16.5,16.9
9,Mean,43.8,18.6,34.6,34.7,38.0,38.0,43.4,47.3,47.8,48.3


In [8]:
# Cross-Model Generalization: Different Parameter Sizes
create_table_from_dict_fillna({
    "base256m": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continueeeee/Base_SmolVLM2_256M",
    "tta_16_classical_average_256m_repeat": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_simpleaugs_tta_av_othermodels/TTAugAdapter_SmolVLM2_256M_16_SimplePara_average",
    "tta_16_classical_learnedmodel_256m": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_simpleaugs_learnedmodel_subset_new_2_othermodels/TTAugAdapter_SmolVLM2_256M_16_SimplePara_learnedmodel",

    "base500m": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_continueeeee/Base_SmolVLM2_500M",
    "tta_16_classical_average_500m": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_simpleaugs_tta_av_othermodels/TTAugAdapter_SmolVLM2_500M_16_SimplePara_average",
    "tta_16_classical_learnedmodel_500m": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_simpleaugs_learnedmodel_subset_new_2_othermodels/TTAugAdapter_SmolVLM2_500M_16_SimplePara_learnedmodel",
}, n_samples=1000)

Unnamed: 0,Metric,base256m,tta_16_classical_average_256m_repeat,tta_16_classical_learnedmodel_256m,base500m,tta_16_classical_average_500m,tta_16_classical_learnedmodel_500m
0,MME-RealWorld-Lite_rating.json (Accuracy),21.0,21.4,20.7,27.6,27.6,27.2
1,OCRBench_score.json (Accuracy),56.7,53.3,50.3,61.0,60.0,57.6
2,ChartQA_TEST_acc,65.1,59.4,55.1,64.1,64.8,65.5
3,GQA_TestDev_Balanced_acc,0.1,5.8,18.4,0.0,0.0,0.9
4,OCRVQA_TEST_acc,0.2,0.4,0.3,0.0,4.6,5.2
5,AI2D_TEST_acc,37.0,35.4,34.0,56.6,55.3,52.1
6,TextVQA_VAL_acc,47.8,45.1,40.1,59.9,58.0,57.7
7,AMBER_score,29.5,53.3,43.0,55.3,56.1,52.8
8,COCO_VAL_score.json (ROUGE_L),29.0,40.6,38.5,6.2,9.2,31.6
9,Mean,31.8,35.0,33.4,36.7,37.3,38.9


In [9]:
# Cross-Model Generalization: InternVL2
create_table_from_dict_fillna({
    "InternVL2_1B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_differentarchitecture1000/InternVL2_1B",
    "TTAugAdapter_InternVLChat2": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_8augcount_newdatasets_finalized_differentarchitecture1000/TTAugAdapter_InternVLChat2_10_av_noimgaug",
}, n_samples=1000)

Unnamed: 0,Metric,InternVL2_1B,TTAugAdapter_InternVLChat2
0,MME-RealWorld-Lite_rating.json (Accuracy),13.5,13.3
1,OCRBench_score.json (Accuracy),75.7,75.1
2,ChartQA_TEST_acc,72.1,72.1
3,GQA_TestDev_Balanced_acc,52.0,51.3
4,OCRVQA_TEST_acc,43.3,42.0
5,AI2D_TEST_acc,52.8,52.6
6,TextVQA_VAL_acc,69.6,67.6
7,AMBER_score,72.6,75.7
8,COCO_VAL_score.json (ROUGE_L),17.2,24.6
9,Mean,52.1,52.7


In [10]:
# Cross-Model Generalization: Ovis2
create_table_from_dict_fillna({
    "Ovis2_1B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000/Ovis2_1B",
    "TTAug_Ovis2_1B_16_average": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000_img/TTAugAdapter_Ovis2_1B_16_SimplePara_average_high",
    
    "Ovis2": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000/Ovis2",
    "TTAug_Ovis2_2B_16_average": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000_img/TTAugAdapter_Ovis2_2B_16_SimplePara_average_high",

    "Ovis2_4B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000/Ovis2_4B",
    "TTAug_Ovis2_4B_16_average": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000_img/TTAugAdapter_Ovis2_4B_16_SimplePara_average_high",

    "Ovis2_8B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000/Ovis2_8B",
    "TTAug_Ovis2_8B_16_average": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_ovis1000_img/TTAugAdapter_Ovis2_8B_16_SimplePara_average_low",
}, n_samples=1000)

Unnamed: 0,Metric,Ovis2_1B,TTAug_Ovis2_1B_16_average,Ovis2,TTAug_Ovis2_2B_16_average,Ovis2_4B,TTAug_Ovis2_4B_16_average,Ovis2_8B,TTAug_Ovis2_8B_16_average
0,MME-RealWorld-Lite_rating.json (Accuracy),35.5,35.6,38.6,40.5,45.7,44.1,45.7,46.5
1,OCRBench_score.json (Accuracy),88.8,84.9,87.3,86.0,91.2,89.2,89.2,87.2
2,ChartQA_TEST_acc,80.4,81.6,86.6,85.9,87.6,87.8,87.4,87.9
3,GQA_TestDev_Balanced_acc,30.0,54.3,34.5,58.7,40.5,55.7,59.4,64.2
4,OCRVQA_TEST_acc,74.3,70.5,76.7,73.1,80.2,76.9,79.3,78.7
5,AI2D_TEST_acc,76.5,73.3,81.9,82.2,84.9,84.5,87.1,87.2
6,TextVQA_VAL_acc,79.2,77.2,78.8,79.5,83.5,83.9,83.1,84.0
7,AMBER_score,76.1,73.8,84.9,85.9,87.4,87.4,87.3,89.8
8,COCO_VAL_score.json (ROUGE_L),22.7,13.7,17.3,13.1,14.0,12.5,13.8,13.3
9,Mean,62.6,62.8,65.2,67.2,68.3,69.1,70.3,71.0


In [11]:
# Appendix: Reference Baseline Models
create_table_from_dict_fillna({
    "Janus": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/Janus",
    "IDEFICS2": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/IDEFICS2",
    "Molmo": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/molmo",
    "XGenMM": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/XGenMM",
    "LLaVa-onevision-7b": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/llava-onevision-qwen2-7b-si-hf",
    "PaliGemma": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/PaliGemma",
    ## "LLaVA_Next-8B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/LLaVA_Next",
    ## "Pixtral-12B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines_pixtral/Pixtral", # Very slow
    ## "Idefics3-8B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_differentbaselines/Idefics3-8B-Llama3",
}, n_samples=1000)

Unnamed: 0,Metric,Janus,IDEFICS2,Molmo,XGenMM,LLaVa-onevision-7b,PaliGemma
0,MME-RealWorld-Lite_rating.json (Accuracy),23.4,34.3,36.8,35.1,31.1,25.4
1,OCRBench_score.json (Accuracy),58.9,63.4,66.3,55.5,61.2,61.4
2,ChartQA_TEST_acc,31.0,31.6,85.8,65.0,72.3,40.7
3,GQA_TestDev_Balanced_acc,13.7,0.0,55.1,60.2,62.5,61.5
4,OCRVQA_TEST_acc,2.5,0.0,44.9,70.7,69.5,61.2
5,AI2D_TEST_acc,67.5,72.2,80.7,73.5,78.2,67.9
6,TextVQA_VAL_acc,55.0,72.6,81.5,72.8,60.8,70.7
7,AMBER_score,74.8,85.4,85.0,82.1,84.4,84.9
8,COCO_VAL_score.json (ROUGE_L),18.0,24.4,12.1,15.7,13.9,45.9
9,Mean,38.3,42.7,60.9,59.0,59.3,57.7


In [12]:
# Appendix: Qualitative Examples
# create_table_from_dict_fillna({
#     "Base_SmolVLM2_2B": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_VISUALSAMPLES/Base_SmolVLM2_2B",
#     "TTAugAdapter_SmolVLM2_2B_16_SimplePara_average": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_VISUALSAMPLES/TTAugAdapter_SmolVLM2_2B_16_SimplePara_average",
#     "TTAugAdapter_SmolVLM2_2B_16_SimplePara_learnedmodel": "/work3/monka/efficient_test_time_scaling_for_small_vlms/benchmark_results/n_samples_1000/exp_83_newdatasets_finalized_VISUALSAMPLES/TTAugAdapter_SmolVLM2_2B_16_SimplePara_learnedmodel",
# }, n_samples=1)