### Check missing runs

In [None]:
from itertools import product
from pathlib import Path

import pandas as pd
import srsly
from tbparse import SummaryReader
from tqdm.auto import tqdm

In [None]:
# Root output directory; its sub-folders are iterated by the scan in the next cell.
path = Path("../outputs/main")

In [None]:
# Walk `path` two levels deep (`path/<dataset>/<experiment>`) and collect:
#   - missing_hparams: experiment folders without an hparams.yaml
#   - unfinished:      experiment folders without a tensorboard_logs.parquet
#   - hparams:         one record per experiment whose hparams.yaml was read
missing_hparams = []
unfinished = []
hparams = []

# Distinct values observed on disk; their cartesian product is later used as
# the grid of expected runs.
DATASETS = set()
DATA_SEEDS = set()
MODEL_SEEDS = set()
INIT_SEEDS = set()
STRATEGIES = set()

for dataset in path.iterdir():
    # ignore .submitit folder and the multirun.yaml file
    if ".submitit" in str(dataset) or not dataset.is_dir():
        continue

    DATASETS.add(dataset.name)
    for experiment in dataset.iterdir():
        # Only directories can be runs. Without this guard a stray file inside
        # the dataset folder would be reported both as missing its hparams and
        # as an unfinished run.
        if not experiment.is_dir():
            continue

        # check experiment metadata
        hparams_file = experiment / "hparams.yaml"
        if not hparams_file.exists():
            missing_hparams.append(experiment)
        else:
            meta = srsly.read_yaml(hparams_file)
            exp_hparam = {
                "experiment": experiment,
                "data_seed": meta["data"]["seed"],
                "model_seed": meta["model"]["seed"],
                "initial_seed": meta["active_data"]["seed"],
                "global_seed": meta["seed"],
                "retriever": meta["index_metric"],
                "dataset_name": meta["dataset"]["name"],
                "model_name": meta["model"]["name"],
                "strategy_name": meta["strategy"]["name"],
            }
            hparams.append(exp_hparam)
            DATA_SEEDS.add(exp_hparam["data_seed"])
            MODEL_SEEDS.add(exp_hparam["model_seed"])
            INIT_SEEDS.add(exp_hparam["initial_seed"])
            STRATEGIES.add(exp_hparam["strategy_name"])

        # read experiment logs: a run without the parquet export is treated as
        # unfinished, independently of whether its hparams.yaml was found
        if not (experiment / "tensorboard_logs.parquet").exists():
            unfinished.append(experiment)

# One row per experiment with a readable hparams.yaml
hparams_df = pd.DataFrame(hparams)

In [None]:
# Expected runs: the cartesian product of every dataset, seed and strategy
# value observed while scanning the output folders.
expected_columns = ["dataset_name", "data_seed", "model_seed", "initial_seed", "strategy_name"]
expected_grid = product(DATASETS, DATA_SEEDS, MODEL_SEEDS, INIT_SEEDS, STRATEGIES)
all_experiments = pd.DataFrame(expected_grid, columns=expected_columns)

# Distinct number of expected runs per (dataset, strategy) group
runs_per_group = all_experiments.groupby(["dataset_name", "strategy_name"]).size()
runs_per_group.unique()  # type: ignore

In [None]:
# Outer-join the expected grid with the runs found on disk, keyed on every grid
# column; the `_merge` indicator records which side each row came from.
key_columns = all_experiments.columns.tolist()
outer = all_experiments.merge(hparams_df, how="outer", on=key_columns, indicator=True)

# Keep the key columns of every row that is not present on both sides
not_in_both = outer["_merge"] != "both"
missing = outer.loc[not_in_both, key_columns]

In [None]:
# Distinct (dataset, strategy) pairs that appear among the rows of `missing`
missing[["dataset_name", "strategy_name"]].drop_duplicates()

In [None]:
# Experiment folders that were found without a tensorboard_logs.parquet file
unfinished

In [None]:
# Show the missing combinations for a hand-picked list of datasets, ordered by
# dataset and then strategy.
names = ["amazon-rel", "wikitoxic-.01", "amazon-agri", "agnews-business-.01"]
in_selected_datasets = missing["dataset_name"].isin(names)
# optional extra filter: in_selected_datasets &= (missing["strategy_name"] != "entropy")
missing.loc[in_selected_datasets].sort_values(by=["dataset_name", "strategy_name"])  # type: ignore

### Tensorboard fix

Build `tensorboard_logs.parquet` from the raw `tb_logs` folder for every run that has logs but no parquet export yet

In [None]:
# For every run folder under the ablations output that has a `tb_logs` folder
# but no parquet export yet, read the scalars and write them to
# `tensorboard_logs.parquet` inside the run folder.
path = Path("../outputs/ablations/")
for group_dir in (entry for entry in path.iterdir() if entry.is_dir()):
    # materialise the listing so that tqdm knows the total
    run_dirs = list(group_dir.iterdir())
    for run_dir in tqdm(run_dirs, desc=group_dir.name):
        parquet_file = run_dir / "tensorboard_logs.parquet"
        tb_logs_path = run_dir / "tb_logs"

        # nothing to do when the export already exists or there are no raw logs
        if parquet_file.exists() or not tb_logs_path.exists():
            continue

        logs = SummaryReader(tb_logs_path)
        logs.scalars.to_parquet(parquet_file)