In [1]:
import pandas as pd
from common.json_utils import load_from_json
from common.model_utils import *
from common.const import *


def col_key(subset, split):
    """Build the status-table column label for a (subset, split) pair.

    The label is the second '-'-separated piece of ``subset``, an underscore,
    and the first five characters of ``split``.
    """
    subset_part = subset.split("-")[1]
    split_part = split[:5]
    return "_".join((subset_part, split_part))


def generate_done(data, q_idx, expected_count: int = 8) -> bool:
    """Report whether generation has finished for one question of a record.

    Args:
        data: A record (mapping) that may hold a "responses_correct" entry,
            indexed by question.
        q_idx: Index of the question to inspect within "responses_correct".
        expected_count: Number of entries the question's list must contain to
            count as finished. Defaults to 8, the value previously hard-coded
            (presumably the number of sampled responses per question -- TODO
            confirm).

    Returns:
        True only when the entry for ``q_idx`` exists, is not None, and has
        exactly ``expected_count`` items; False otherwise.
    """
    if "responses_correct" not in data:
        return False

    try:
        resp_corr_list = data["responses_correct"][q_idx]
    except (IndexError, KeyError):
        # No slot for this question yet: that is "not done", not a crash.
        return False

    if resp_corr_list is None:
        return False

    return len(resp_corr_list) == expected_count


def is_needed(model_name, split) -> bool:
    """Tell whether results for ``model_name`` are required on ``split``.

    Models listed in PROPRIETARY_MODELS, or whose ``model_sort_key(...)[1]``
    exceeds 20 (presumably a size in billions of parameters -- TODO confirm),
    are only required on the "test" split; every other model is required on
    all splits.
    """
    test_only = (
        model_name in PROPRIETARY_MODELS
        or model_sort_key(model_name)[1] > 20
    )
    return split == "test" if test_only else True

# One status column per (subset, split) pair, ordered subset-major.
columns = [col_key(subset, split) for subset in SUBSETS for split in SPLITS]

# Status table: one row per base model plus one "<name>_sft" row per model
# returned by models_to_finetune(); cells start empty and are filled below.
base_models = OPEN_SRC_MODELS + PROPRIETARY_MODELS
sft_rows = [m + "_sft" for m in models_to_finetune()]
df = pd.DataFrame(index=base_models + sft_rows, columns=columns)

# Base-model rows. Each cell becomes "unnecessary" (split not required),
# False (response file missing or incomplete) or True (all done).
for model_name in base_models:
    for subset in SUBSETS:
        for split in SPLITS:
            cell = col_key(subset, split)
            if is_needed(model_name, split):
                filepath = f"./output/{subset}/response/{model_name_to_dirname(model_name)}/base/original_{split}.json"
                if os.path.exists(filepath):
                    data_list = load_from_json(filepath, print_msg=False)
                    # Done only if every record is complete for q_idx 0, 1 and 2.
                    status = all(
                        generate_done(data, q_idx)
                        for q_idx in range(3)
                        for data in data_list
                    )
                else:
                    status = False
            else:
                status = "unnecessary"
            df.loc[model_name, cell] = status

# Fine-tuned ("_sft") rows: the first two entries of SPLITS are marked
# "unnecessary"; only the "test" response file is checked.
for model_name in models_to_finetune():
    sft_row = model_name + "_sft"
    for subset in SUBSETS:
        for split in SPLITS[:2]:
            df.loc[sft_row, col_key(subset, split)] = "unnecessary"

        test_cell = col_key(subset, "test")
        filepath = f"./output/{subset}/response/{model_name_to_dirname(model_name)}/sft/original_test.json"
        if os.path.exists(filepath):
            data_list = load_from_json(filepath, print_msg=False)
            # Done only if every record is complete for q_idx 0, 1 and 2.
            df.loc[sft_row, test_cell] = all(
                generate_done(data, q_idx)
                for q_idx in range(3)
                for data in data_list
            )
        else:
            df.loc[sft_row, test_cell] = False

def color_boolean(val: object) -> str:
    """Return the CSS background rule for one status-table cell.

    Args:
        val: The cell value: the string "unnecessary", a truthy/falsy
            completion flag, or a missing value (None / NaN) for a cell that
            was never filled in.

    Returns:
        "background-color: gray" for "unnecessary", "background-color: green"
        for a truthy flag, and "background-color: red" for a falsy or missing
        value.
    """
    if val == "unnecessary":
        return "background-color: gray"

    # NaN is truthy, so an unfilled cell would otherwise be painted green.
    # NaN is the only value that compares unequal to itself.
    is_missing = val is None or val != val
    if val and not is_missing:
        return "background-color: green"
    return "background-color: red"

# Color every cell through color_boolean. This is the final expression, so the
# resulting Styler is the value the notebook cell displays.
# NOTE(review): Styler.map is the newer name for Styler.applymap -- confirm
# the installed pandas version provides it.
df.style.map(color_boolean)

Unnamed: 0,knowledge_train,knowledge_valid,knowledge_test,reasoning_train,reasoning_valid,reasoning_test
meta-llama/Llama-3.1-8B-Instruct,True,True,True,True,True,True
meta-llama/Llama-3.1-70B-Instruct,unnecessary,unnecessary,True,unnecessary,unnecessary,True
Qwen/Qwen3-4B-Instruct-2507,True,True,True,True,True,True
Qwen/Qwen3-30B-A3B-Instruct-2507,unnecessary,unnecessary,True,unnecessary,unnecessary,True
mistralai/Mistral-7B-Instruct-v0.3,True,True,True,True,True,True
mistralai/Mistral-Small-24B-Instruct-2501,unnecessary,unnecessary,True,unnecessary,unnecessary,True
gemini-2.5-flash-lite,unnecessary,unnecessary,True,unnecessary,unnecessary,True
claude-3-5-sonnet,unnecessary,unnecessary,True,unnecessary,unnecessary,True
llama3.1-405b,unnecessary,unnecessary,True,unnecessary,unnecessary,True
deepseek-r1,unnecessary,unnecessary,True,unnecessary,unnecessary,True
