In [1]:
import pandas as pd
import re
from pathlib import Path

# POINT THIS DIRECTLY TO THE 'Random' FOLDER
DATA_DIR = Path(r"C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Qwen2.5vl_3b_Eval\Random")


def build_round_summary(results):
    """Build the per-model score table from the tallied run results.

    Parameters
    ----------
    results : dict
        ``{model_key: {round_num: correct_count}}`` as filled by the loop
        further down in this cell.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model Name``, one ``Round <n>``
        column per round that has data (in numeric order), and ``Average``.
        An empty frame is returned when ``results`` holds no data.

    Notes
    -----
    Rounds without a CSV are reported with a warning and left out of the
    average; scoring them as 0 would make a missing file look like a run in
    which the model answered nothing correctly.
    """
    table_rows = []
    for model_key, rounds in results.items():
        if not rounds:
            continue

        present = sorted(rounds)
        missing = sorted(set(range(1, present[-1] + 1)).difference(present))
        if missing:
            print(
                f"WARNING: {model_key} has no data for round(s) {missing}; "
                "they are left out of the average."
            )

        row = {"Model Name": model_key}
        for round_num in present:
            row[f"Round {round_num}"] = rounds[round_num]
        row["Average"] = sum(rounds.values()) / len(rounds)
        table_rows.append(row)

    table = pd.DataFrame(table_rows)
    if table.empty:
        return table

    # Sort on the round NUMBER: a plain string sort would put "Round 10"
    # ahead of "Round 2".
    round_cols = sorted(
        (col for col in table.columns if col.startswith("Round ")),
        key=lambda col: int(col.split()[1]),
    )
    return table[["Model Name"] + round_cols + ["Average"]]


# Grab all VLAT run CSVs in that folder
filepaths = sorted(DATA_DIR.glob("vlat_*_run_*.csv"))

print("Found CSV files:")
for fp in filepaths:
    print("  ", fp.name)

# model_key -> {round_num -> number of correct answers in that run}
results = {}

for fp in filepaths:
    filename = fp.name

    # Expect: vlat_qwen2.5vl_3b_run_01.csv
    match = re.match(r"vlat_(.+)_run_(\d+)\.csv", filename)
    if not match:
        print(f"Skipping (pattern didn't match): {filename}")
        continue

    model_key = match.group(1)      # e.g. "qwen2.5vl_3b"
    round_num = int(match.group(2)) # e.g. 1, 2, 3

    df = pd.read_csv(fp)

    if "is_correct" not in df.columns:
        print(f"WARNING: 'is_correct' not found in {filename}, skipping.")
        continue

    # The raw score of a run is the number of rows flagged as correct.
    correct_count = df["is_correct"].sum()

    results.setdefault(model_key, {})[round_num] = correct_count

print("\nresults dict:", results)

summary_df = build_round_summary(results)

if not summary_df.empty:
    print("\nVLAT TABLE FOR THIS MODEL:")
    print(summary_df.to_string(index=False))
else:
    print("\nNo data found – check that there are vlat_*_run_*.csv files in the Random folder.")

Found CSV files:
   vlat_qwen2.5vl_3b_run_01.csv
   vlat_qwen2.5vl_3b_run_02.csv
   vlat_qwen2.5vl_3b_run_03.csv

results dict: {'qwen2.5vl_3b': {1: 23, 2: 25, 3: 23}}

VLAT TABLE FOR THIS MODEL:
  Model Name  Round 1  Round 2  Round 3   Average
qwen2.5vl_3b       23       25       23 23.666667


In [2]:
import pandas as pd
import re
from pathlib import Path

BASE_DIR = Path(r"C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT")


def build_vlat_score_table(results, display_names):
    """Build the VLAT raw-score table from the tallied run results.

    Parameters
    ----------
    results : dict
        ``{model_key: {round_num: correct_count}}``.
    display_names : dict
        Maps a model key to the label shown in the table; keys that are not
        listed are shown unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by ``Model Name``, with one ``Round <n>``
        column per round that has data (in numeric order) and ``Average``.
        An empty frame is returned when ``results`` holds no data.

    Notes
    -----
    A model that lacks a run file no longer aborts the table with a
    ``KeyError``: the gap is reported and the average is taken over the
    rounds that exist.
    """
    table_rows = []
    for model_key, rounds in results.items():
        if not rounds:
            continue

        present = sorted(rounds)
        missing = sorted(set(range(1, present[-1] + 1)).difference(present))
        if missing:
            print(
                f"WARNING: {model_key} has no data for round(s) {missing}; "
                "averaging the rounds that exist."
            )

        row = {"Model Name": display_names.get(model_key, model_key)}
        for round_num in present:
            row[f"Round {round_num}"] = rounds[round_num]
        row["Average"] = sum(rounds.values()) / len(rounds)
        table_rows.append(row)

    table = pd.DataFrame(table_rows)
    if table.empty:
        return table

    # Sort on the round NUMBER: a plain string sort would put "Round 10"
    # ahead of "Round 2".
    round_cols = sorted(
        (col for col in table.columns if col.startswith("Round ")),
        key=lambda col: int(col.split()[1]),
    )
    table = table[["Model Name"] + round_cols + ["Average"]]
    return table.sort_values("Model Name")


# model_key -> {round_num -> number of correct answers in that run}
results = {}

# Recursively find ALL run files in any Random/ folder.
# rglob yields files in no particular order, so sort for a stable listing.
filepaths = sorted(BASE_DIR.rglob("Random/vlat_*_run_*.csv"))

print("Files detected:")
for fp in filepaths:
    print("  ", fp)

for fp in filepaths:
    filename = fp.name

    # Regex for: vlat_<model>_run_<round>.csv
    match = re.match(r"vlat_(.+)_run_(\d+)\.csv", filename)
    if not match:
        print(f"Skipping (regex didn't match): {filename}")
        continue

    model_key = match.group(1)
    round_num = int(match.group(2))

    df = pd.read_csv(fp)

    if "is_correct" not in df.columns:
        print(f"WARNING: is_correct missing in: {filename}")
        continue

    # The raw score of a run is the number of rows flagged as correct.
    correct_count = df["is_correct"].sum()

    results.setdefault(model_key, {})[round_num] = correct_count

print("\nRaw results dict:")
print(results)

# Pretty names for output table
pretty_names = {
    "gemma3_4b": "Gemma 3 4B",
    "gemma3_12b": "Gemma 3 12B",
    "llava13b": "LLaVA 13B",
    "llava7b": "LLaVA 7B",
    "qwen2.5vl_3b": "Qwen 2.5VL 3B",
    "qwen2.5vl_7b": "Qwen 2.5VL 7B",
}

summary_df = build_vlat_score_table(results, pretty_names)

if not summary_df.empty:
    print("\n===== FINAL VLAT SCORE TABLE =====")
    print(summary_df.to_string(index=False))
else:
    print("\n No data collected. Check folder structure.")

Files detected:
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_01.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_02.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_03.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_01.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_02.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_03.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_01.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_02.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_03.csv
   C:\Users\Melita\CSE 4001\VL

In [3]:
import pandas as pd
import re
from pathlib import Path

BASE_DIR = Path(r"C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI")


def build_calvi_score_table(results, display_names):
    """Build the CALVI raw-score table from the tallied run results.

    Parameters
    ----------
    results : dict
        ``{model_key: {round_num: correct_count}}``.
    display_names : dict
        Maps a model key to the label shown in the table; keys that are not
        listed are shown unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by ``Model Name``, with one ``Round <n>``
        column per round that has data (in numeric order) and ``Average``.
        An empty frame is returned when ``results`` holds no data.

    Notes
    -----
    A model that lacks a run file no longer aborts the table with a
    ``KeyError``: the gap is reported and the average is taken over the
    rounds that exist.
    """
    table_rows = []
    for model_key, rounds in results.items():
        if not rounds:
            continue

        present = sorted(rounds)
        missing = sorted(set(range(1, present[-1] + 1)).difference(present))
        if missing:
            print(
                f"WARNING: {model_key} has no data for round(s) {missing}; "
                "averaging the rounds that exist."
            )

        row = {"Model Name": display_names.get(model_key, model_key)}
        for round_num in present:
            row[f"Round {round_num}"] = rounds[round_num]
        row["Average"] = sum(rounds.values()) / len(rounds)
        table_rows.append(row)

    table = pd.DataFrame(table_rows)
    if table.empty:
        return table

    # Sort on the round NUMBER: a plain string sort would put "Round 10"
    # ahead of "Round 2".
    round_cols = sorted(
        (col for col in table.columns if col.startswith("Round ")),
        key=lambda col: int(col.split()[1]),
    )
    table = table[["Model Name"] + round_cols + ["Average"]]
    return table.sort_values("Model Name")


# model_key -> {round_num -> number of correct answers in that run}
results = {}

# Recursively find ALL run files in any Random/ folder.
# rglob yields files in no particular order, so sort for a stable listing.
filepaths = sorted(BASE_DIR.rglob("Random/calvi_*_run_*.csv"))

print("Files detected:")
for fp in filepaths:
    print("  ", fp)

for fp in filepaths:
    filename = fp.name

    # Regex for: calvi_<model>_run_<round>.csv
    match = re.match(r"calvi_(.+)_run_(\d+)\.csv", filename)
    if not match:
        print(f"Skipping (regex didn't match): {filename}")
        continue

    model_key = match.group(1)
    round_num = int(match.group(2))

    df = pd.read_csv(fp)

    if "is_correct" not in df.columns:
        print(f"WARNING: is_correct missing in: {filename}")
        continue

    # The raw score of a run is the number of rows flagged as correct.
    correct_count = df["is_correct"].sum()

    results.setdefault(model_key, {})[round_num] = correct_count

print("\nRaw results dict:")
print(results)

# Pretty names for output table
pretty_names = {
    "gemma3_4b": "Gemma 3 4B",
    "gemma3_12b": "Gemma 3 12B",
    "llava13b": "LLaVA 13B",
    "llava7b": "LLaVA 7B",
    "qwen2.5vl_3b": "Qwen 2.5VL 3B",
    "qwen2.5vl_7b": "Qwen 2.5VL 7B",
}

summary_df = build_calvi_score_table(results, pretty_names)

if not summary_df.empty:
    print("\n===== FINAL CALVI SCORE TABLE =====")
    print(summary_df.to_string(index=False))
else:
    print("\n No data collected. Check folder structure.")

Files detected:
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Gemma3_12b_Eval\Random\calvi_gemma3_12b_run_01.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Gemma3_12b_Eval\Random\calvi_gemma3_12b_run_02.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Gemma3_12b_Eval\Random\calvi_gemma3_12b_run_03.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Gemma3_4b_Eval\Random\calvi_gemma3_4b_run_01.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Gemma3_4b_Eval\Random\calvi_gemma3_4b_run_02.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Gemma3_4b_Eval\Random\calvi_gemma3_4b_run_03.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Llava13b_Eval\Random\calvi_llava13b_run_01.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Llava13b_Eval\Random\calvi_llava13b_run_02.csv
   C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\CALVI\Llava13b_Eval\Random\calvi_llava13b_run_03.csv
   C:\Users\

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import json

# ==== 1. Load VLAT metadata (to get number of choices per question) ====

VLAT_JSON_PATH = Path(r"C:\Users\Melita\CSE 4001\VLM-Eval-Research\data\VLAT\vlat_skip.json")

# Parse the question metadata (UTF-8 JSON with a top-level "questions" list).
vlat_meta = json.loads(VLAT_JSON_PATH.read_text(encoding="utf-8"))

# question id -> number of answer options, not counting an "omit" option
# (compared case-insensitively).
qid_to_C = {
    question["id"]: sum(1 for option in question["options"] if option.lower() != "omit")
    for question in vlat_meta["questions"]
}

# Same mapping as a two-column frame ("id", "C_i") so it can be merged onto
# the per-run result frames.
choices_df = pd.DataFrame(
    [
        {"id": question_id, "C_i": n_choices}
        for question_id, n_choices in qid_to_C.items()
    ]
)

# ==== 2. Compute corrected score for EACH RUN separately ====

BASE_DIR = Path(r"C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT")


def vlat_corrected_score(run_df, choices_df):
    """Return the corrected (guessing-penalised) score of one run.

    Per-item scoring:

    * correct answer            -> +1
    * incorrect answer          -> -1 / (C_i - 1)
    * answer "omit" (any case)  -> 0 (row is ignored)

    Parameters
    ----------
    run_df : pandas.DataFrame
        Result rows of one run. Needs the columns ``id`` and ``is_correct``;
        ``model_answer`` is optional and only used to detect omits.
    choices_df : pandas.DataFrame
        Columns ``id`` and ``C_i`` (number of non-omit options per question).

    Returns
    -------
    float
        Sum of the per-item scores, or NaN (with a printed warning) when a
        penalty cannot be computed because an incorrectly answered question
        has no ``C_i`` or has ``C_i <= 1``.

    Notes
    -----
    A missing ``is_correct`` value counts as incorrect. Casting the column
    with ``astype(bool)`` would turn NaN into True and award a point for it.
    """
    scored = run_df.merge(choices_df, on="id", how="left")

    # drop omits (they get no penalty or reward)
    if "model_answer" in scored.columns:
        # astype(str) keeps missing / non-text answers from breaking .str
        answered = scored["model_answer"].astype(str).str.lower() != "omit"
        scored = scored[answered]

    correct = scored["is_correct"].eq(True).to_numpy(dtype=bool)
    wrong = ~correct
    n_choices = scored["C_i"].to_numpy(dtype=float)

    # NaN > 1 is False, so questions missing from the metadata are caught
    # here together with degenerate ones (C_i <= 1).
    unusable = wrong & ~(n_choices > 1)
    if unusable.any():
        bad_ids = scored.loc[unusable, "id"].tolist()
        print(
            "WARNING: cannot compute a penalty for question id(s) "
            f"{bad_ids} (missing or invalid C_i); corrected score set to NaN."
        )
        return float("nan")

    per_item_score = np.where(correct, 1.0, 0.0)
    per_item_score[wrong] = -1.0 / (n_choices[wrong] - 1.0)
    return float(per_item_score.sum())


# model_key -> {round_num -> corrected_score}
per_round_cs = {}

# rglob yields files in no particular order, so sort for a stable listing.
filepaths = sorted(BASE_DIR.rglob("Random/vlat_*_run_*.csv"))
print("Files detected:")
for fp in filepaths:
    print(" ", fp)

for fp in filepaths:
    filename = fp.name
    match = re.match(r"vlat_(.+)_run_(\d+)\.csv", filename)
    if not match:
        print("Skipping (pattern mismatch):", filename)
        continue

    model_key = match.group(1)
    round_num = int(match.group(2))

    df = pd.read_csv(fp)

    if "is_correct" not in df.columns:
        print(f"WARNING: is_correct missing in: {filename}")
        continue

    CS = vlat_corrected_score(df, choices_df)

    per_round_cs.setdefault(model_key, {})[round_num] = CS

print("\nCorrected scores per round (raw dict):")
print(per_round_cs)

# ==== 3. Build table: Model Name | Round 1 | Round 2 | Round 3 | Average ====

# Display names for the model keys parsed from the run file names; keys that
# are not listed are shown unchanged.
pretty_names = {
    "gemma3_12b": "Gemma 3 12B",
    "gemma3_4b": "Gemma 3 4B",
    "llava13b": "LLaVA 13B",
    "llava7b": "LLaVA 7B",
    "qwen2.5vl_3b": "Qwen 2.5VL 3B",
    "qwen2.5vl_7b": "Qwen 2.5VL 7B",
}

rows = []
for model_key, rounds in per_round_cs.items():
    # Cover every round from 1 (or lower, if such a run exists) up to the
    # highest one seen; rounds without a file show up as NaN.
    first_round = min(1, min(rounds))
    last_round = max(rounds)
    round_scores = {
        r: rounds.get(r, np.nan) for r in range(first_round, last_round + 1)
    }

    # Average over the rounds that have a score; NaN when none does.
    valid_scores = [s for s in round_scores.values() if pd.notna(s)]
    avg_cs = float(np.mean(valid_scores)) if valid_scores else np.nan

    row = {"Model Name": pretty_names.get(model_key, model_key)}
    for r, score in round_scores.items():
        row[f"Round {r}"] = round(score, 2) if pd.notna(score) else np.nan
    row["Average"] = round(avg_cs, 2)

    rows.append(row)

cs_table_df = pd.DataFrame(rows)

print("\n===== FINAL VLAT CORRECTED SCORE TABLE =====")
if cs_table_df.empty:
    # Without this guard sort_values("Model Name") raises KeyError, because a
    # frame built from zero rows has no columns.
    print(" No data collected. Check folder structure.")
else:
    # Models with more rounds add their extra "Round <n>" columns after
    # "Average"; restore the order Model Name | Round 1..n | Average, sorting
    # on the round number rather than on the label text.
    round_cols = sorted(
        (col for col in cs_table_df.columns if col.startswith("Round ")),
        key=lambda col: int(col.split()[1]),
    )
    cs_table_df = cs_table_df[["Model Name"] + round_cols + ["Average"]]
    cs_table_df = cs_table_df.sort_values("Model Name")
    print(cs_table_df.to_string(index=False))

Files detected:
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_01.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_02.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_03.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_01.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_02.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_03.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_01.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_02.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_03.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Res

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import json

# ==== 1. Load VLAT question metadata from JSON ====

VLAT_JSON_PATH = Path(r"C:\Users\Melita\CSE 4001\VLM-Eval-Research\data\VLAT\vlat_skip.json")

# Read and parse the metadata file (UTF-8 JSON with a "questions" list).
vlat_meta = json.loads(VLAT_JSON_PATH.read_text(encoding="utf-8"))

# Mapping: question id -> number of NON-OMIT choices (C_i).
# Any option equal to "omit" (case-insensitive) is not counted as a choice.
qid_to_C = {
    question["id"]: len(
        [choice for choice in question["options"] if choice.lower() != "omit"]
    )
    for question in vlat_meta["questions"]
}

# Two-column frame ("id", "C_i") for merging onto the run results
choices_df = pd.DataFrame(
    [{"id": key, "C_i": count} for key, count in qid_to_C.items()]
)

# ==== 2. Walk all VLAT run CSVs and compute scores ====

BASE_DIR = Path(r"C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT")


def score_vlat_run(run_df, choices_df):
    """Return ``(raw, corrected)`` scores for one VLAT run.

    * raw: number of rows flagged ``is_correct``.
    * corrected: sum of per-question scores, where a correct answer is +1,
      an incorrect answer is ``-1 / (C_i - 1)`` and an "omit" answer (any
      case) is ignored. ``C_i`` is the number of non-omit choices.

    Parameters
    ----------
    run_df : pandas.DataFrame
        Result rows of one run. Needs the columns ``id`` and ``is_correct``;
        ``model_answer`` is optional and only used to detect omits.
    choices_df : pandas.DataFrame
        Columns ``id`` and ``C_i``.

    Returns
    -------
    tuple
        ``(raw, corrected)``. ``corrected`` is NaN (with a printed warning)
        when a penalty cannot be computed because an incorrectly answered
        question has no ``C_i`` or has ``C_i <= 1``.

    Notes
    -----
    A missing ``is_correct`` value counts as incorrect in the corrected
    score, matching the raw count (``sum`` skips NaN). Casting the column
    with ``astype(bool)`` would turn NaN into True and award a point for it.
    """
    # --- Regular raw score: simply count correct answers ---
    raw = run_df["is_correct"].sum()

    # --- Corrected score: per-question scoring with variable C_i ---
    scored = run_df.merge(choices_df, on="id", how="left")

    # Ignore omits completely
    if "model_answer" in scored.columns:
        # astype(str) keeps missing / non-text answers from breaking .str
        answered = scored["model_answer"].astype(str).str.lower() != "omit"
        scored = scored[answered]

    correct = scored["is_correct"].eq(True).to_numpy(dtype=bool)
    wrong = ~correct
    n_choices = scored["C_i"].to_numpy(dtype=float)

    # NaN > 1 is False, so questions missing from the metadata are caught
    # here together with degenerate ones (C_i <= 1).
    unusable = wrong & ~(n_choices > 1)
    if unusable.any():
        bad_ids = scored.loc[unusable, "id"].tolist()
        print(
            "WARNING: cannot compute a penalty for question id(s) "
            f"{bad_ids} (missing or invalid C_i); corrected score set to NaN."
        )
        return raw, float("nan")

    per_item_score = np.where(correct, 1.0, 0.0)
    per_item_score[wrong] = -1.0 / (n_choices[wrong] - 1.0)
    return raw, float(per_item_score.sum())


raw_scores = {}   # model_key -> list of raw scores (R)
corr_scores = {}  # model_key -> list of corrected scores (CS)

# rglob yields files in no particular order; sorting keeps each model's score
# list in file-name order.
filepaths = sorted(BASE_DIR.rglob("Random/vlat_*_run_*.csv"))

print("Files detected:")
for fp in filepaths:
    print(" ", fp)

for fp in filepaths:
    filename = fp.name
    match = re.match(r"vlat_(.+)_run_(\d+)\.csv", filename)
    if not match:
        print("Skipping (pattern mismatch):", filename)
        continue

    model_key = match.group(1)

    df = pd.read_csv(fp)

    if "is_correct" not in df.columns:
        print(f"WARNING: is_correct missing in: {filename}")
        continue

    R_raw, CS = score_vlat_run(df, choices_df)

    raw_scores.setdefault(model_key, []).append(R_raw)
    corr_scores.setdefault(model_key, []).append(CS)

print("\nRaw score lists:", raw_scores)
print("Corrected score lists:", corr_scores)

# ==== 3. Build final table like Table 2 (Regular + Corrected) ====

# Display label for each model key; add_rows falls back to the raw key for
# anything not listed here.
pretty_names = {
    "gemma3_12b": "Gemma 3 12B",
    "gemma3_4b": "Gemma 3 4B",
    "llava13b": "LLaVA 13B",
    "llava7b": "LLaVA 7B",
    "qwen2.5vl_3b": "Qwen 2.5VL 3B",
    "qwen2.5vl_7b": "Qwen 2.5VL 7B",
}

# Table records accumulated by add_rows()
rows = []

def add_rows(score_dict, score_type):
    """Append one summary record per model to the module-level ``rows`` list.

    Parameters
    ----------
    score_dict : dict
        ``{model_key: iterable of per-run scores}``.
    score_type : str
        Label stored in the ``Score Type`` column of every record added.

    Each record holds the display name, the score type, the mean, the
    ``(min, max)`` range as text and the population standard deviation
    (``ddof=0``); all numbers are rounded to two decimals.
    """
    for model_key in score_dict:
        values = list(score_dict[model_key])

        average = float(np.mean(values))
        spread = float(np.std(values, ddof=0))
        lowest = min(values)
        highest = max(values)

        record = {
            "Model": pretty_names.get(model_key, model_key),
            "Score Type": score_type,
            "Mean (M)": round(average, 2),
            "Range": f"({round(lowest, 2)}, {round(highest, 2)})",
            "SD": round(spread, 2),
        }
        rows.append(record)

# Regular row (raw score) and Corrected row
add_rows(raw_scores, "Regular")
add_rows(corr_scores, "Corrected")

table_df = pd.DataFrame(rows)

print("\n===== VLAT REGULAR + CORRECTED TABLE (variable C, omits ignored) =====")
if table_df.empty:
    # A frame built from zero rows has no columns, so sorting it by
    # "Model" / "Score Type" would raise KeyError; report the situation instead.
    print(" No data collected. Check folder structure.")
else:
    # Group each model's rows together, ordered by score type within a model.
    table_df = table_df.sort_values(["Model", "Score Type"])
    print(table_df.to_string(index=False))

Files detected:
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_01.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_02.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_12b_Eval\Random\vlat_gemma3_12b_run_03.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_01.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_02.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Gemma3_4b_Eval\Random\vlat_gemma3_4b_run_03.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_01.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_02.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Research\Output\VLAT\Llava13b_Eval\Random\vlat_llava13b_run_03.csv
  C:\Users\Melita\CSE 4001\VLM-Eval-Res