In [None]:
from pathlib import Path

import pandas as pd
import plotly.express as px

from misc import data_config, model_config

In [None]:
# Mapping from every original model_config column name to its "model_"-prefixed form.
# It also contains an entry for "main", which is simply unused once that column is dropped.
prefixed_columns = {col: f"model_{col}" for col in model_config.columns}

# Keep only the rows whose `main` column evaluates true, discard the flag column itself,
# then apply the "model_" prefix to the remaining columns.
main_model_config = model_config.query("main").drop(columns="main")
main_model_config = main_model_config.rename(columns=prefixed_columns)

# Assign one palette entry per main model, in index order.
# Indexing past the end of the palette raises IndexError, so the number of main models
# must not exceed the palette length.
palette = px.colors.qualitative.Dark24
model_colors = {
    model_key: palette[position]
    for position, model_key in enumerate(main_model_config.index.values)
}

# Comparison groups: group key -> model keys that belong to the group.
# In every group the first member carries the same key as the group itself.
cmp_groups = {
    "llamamoe": ["llamamoe", "llamamoes"],
    "olmoe": ["olmoe", "olmoesft", "olmoedpo", "olmoeins"],
    "jetmoe": ["jetmoe", "jetmoesft", "jetmoechat"],
}

# Display name for each compared model key.
new_name = {
    "llamamoe": "LLaMA-MoE-v1",
    "llamamoes": "LLaMA-MoE-v1-SFT",
    "olmoe": "OLMoE",
    "olmoesft": "OLMoE-SFT",
    "olmoedpo": "OLMoE-DPO",
    "olmoeins": "OLMoE-Instruct",
    "jetmoe": "JetMoE",
    "jetmoesft": "JetMoE-SFT",
    "jetmoechat": "JetMoE-Chat",
}

# Flat list of all compared model keys, group by group, preserving the order above.
cmp_keys = []
for group_members in cmp_groups.values():
    cmp_keys.extend(group_members)

# Mapping from every original model_config column name to its "model_"-prefixed form.
prefixed_columns = {col: f"model_{col}" for col in model_config.columns}

# Pull the compared models out of model_config (rows come back in cmp_keys order),
# drop the `main` flag column and prefix the remaining columns with "model_".
cmp_model_config = model_config.loc[cmp_keys].drop(columns="main")
cmp_model_config = cmp_model_config.rename(columns=prefixed_columns)

# Reverse lookup: model key -> key of the comparison group it belongs to.
group_of = {member: group for group, members in cmp_groups.items() for member in members}

# Annotate each compared model with its group key and its display name.
# "model_name" is overwritten if the rename above already produced such a column.
for member, group in group_of.items():
    cmp_model_config.loc[member, "model_group"] = group
    cmp_model_config.loc[member, "model_name"] = new_name[member]

# Give the group column the same dtype as model_config's index.
# NOTE(review): presumably that dtype is categorical/ordered so groups sort like model
# keys — confirm in `misc`.
key_dtype = model_config.index.dtype
cmp_model_config["model_group"] = cmp_model_config["model_group"].astype(key_dtype)
cmp_model_config

In [None]:
# Prefix every data_config column with "data_"; the index is left unchanged.
data_column_names = {col: f"data_{col}" for col in data_config.columns}
main_data_config = data_config.rename(columns=data_column_names)
main_data_config

In [None]:
root_dir = Path("../output/srp_mpq")

# Load every parquet file in root_dir into a dict keyed by file stem, attaching the
# compared-model metadata by matching the `model` column against cmp_model_config's index.
# pd.merge defaults to an inner join, so rows for models outside cmp_model_config's index
# are dropped here.
dfs = {}
for parquet_path in root_dir.glob("*.parquet"):
    raw = pd.read_parquet(parquet_path)
    dfs[parquet_path.stem] = pd.merge(raw, cmp_model_config, left_on="model", right_index=True)

for key in list(dfs):
    df = dfs[key]
    # Frames that carry a `dataset` column also get the dataset metadata attached.
    if "dataset" in df.columns:
        df = pd.merge(df, main_data_config, left_on="dataset", right_index=True)
        dfs[key] = df
    # Cast the key columns to the dtypes of the corresponding config indexes.
    df["model"] = df["model"].astype(model_config.index.dtype)
    if "dataset" in df.columns:
        df["dataset"] = df["dataset"].astype(data_config.index.dtype)

# Wide view of the "mg" results: one row per (group, model), one column per
# (metric, seg_len) pair. Requires a file named mg.parquet in root_dir.
dfs["mg"].pivot(
    index=["model_group", "model_name"],
    columns="seg_len",
    values=["best_f1", "best_m"],
)

In [None]:
# Rows of the "md" results at seg_len 16, shown as best_f1 per (group, model) row and
# per dataset column.
md_at_seg_len = dfs["md"].query("seg_len == 16")
md_at_seg_len.pivot(
    index=["model_group", "model_name"],
    columns="dataset",
    values="best_f1",
)

In [None]:
sample_seg_len = 16

# "mg" side: keep best_f1 under the name gen_best_f1 so it does not clash with the
# "md" best_f1 column after the merge.
mg_scores = (
    dfs["mg"]
    .drop(columns=["best_m", "ci_lb", "ci_ub"])
    .rename(columns={"best_f1": "gen_best_f1"})
)
# "md" side: drop the columns that are not needed for the comparison.
md_scores = dfs["md"].drop(columns=["act_r", "best_m", "ci_lb", "ci_ub"])

# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): confirm that this shared-column set is the intended join key.
mdf = pd.merge(mg_scores, md_scores)
mdf = mdf.query(f"seg_len == {sample_seg_len}").drop(columns="seg_len")

# Relative difference of best_f1 with respect to gen_best_f1.
mdf["f1_diff"] = mdf["best_f1"].sub(mdf["gen_best_f1"]).div(mdf["gen_best_f1"])
mdf.pivot(
    index=["model_group", "model_name"],
    columns="dataset",
    values="f1_diff",
)