In [None]:
from pathlib import Path

import duckdb as db
import pandas as pd
import plotnine as pn
from datasets import load_from_disk
from scipy.stats import gmean

In [None]:
# Directory holding the experiment outputs that the cells below read
# (hparams.tsv, metrics.parquet, labelled_ids.parquet, subpool_ids.parquet)
path = Path("../results")

Read the hyper-parameters of all experiments and the metrics, and merge them into a single dataset on which the analysis is performed.

In [None]:
def get_group(s: str) -> str:
    """Derive the experiment-group label from a "/"-separated path.

    The third path component is the group. When the path mentions
    "ablations" or "additional", the fourth component (if any) is appended
    with a "-" separator. Surrounding whitespace is stripped from the result.
    """
    parts = s.split("/")
    if "ablations" in s or "additional" in s:
        group = "-".join(parts[2:4])
    else:
        group = parts[2]
    return group.strip()


def fix_name(s: str) -> str:
    """Make a strategy name follow the "<prefix>_<suffix>" form.

    Names that already contain an underscore are returned untouched; any
    other name gets the "noop_" prefix.
    """
    return s if "_" in s else f"noop_{s}"

In [None]:
# Load the hyper-parameters of every run and tidy them up in a single chain
hparam_df = (
    pd.read_csv(path / "hparams.tsv", sep="\t")
    .assign(
        **{
            # grouping key derived from each run's file path
            "experiment_group": lambda _df: _df["filename"].map(get_group),
            # harmonise the two spellings of the BERT-base checkpoint
            "model.name": lambda _df: _df["model.name"].replace("bert-base", "bert-base-uncased"),
            # make every strategy name follow the "<prefix>_<suffix>" form
            "strategy.name": lambda _df: _df["strategy.name"].map(fix_name),
        }
    )
    # remove columns with one unique value
    .pipe(lambda _df: _df.iloc[:, (_df.nunique() > 1).values])  # type: ignore
    # remove specific columns
    .pipe(
        lambda _df: _df.iloc[
            :,
            ~_df.columns.str.contains(
                "loggers|data_path|learning_rate|run_name|callbacks|seed|prepared_path|processed_path"
            ),
        ]
    )
    # remove columns with all NA
    .dropna(axis=1, how="all")
)

Read metrics

In [None]:
# Load the logged metrics of all runs; they are joined with the hyper-parameters on `filename` below
metric_df = pd.read_parquet(path / "metrics.parquet")

In [None]:
# attach the hyper-parameters to every metric row (left join: metric rows without a match are kept)
df = pd.merge(metric_df, hparam_df, on="filename", how="left")

# aggregate classes into minority and majority
# NOTE: the assignments below are order-dependent: every row starts as
# "f1_majority" and is then overwritten by the more specific rules
df["variable"] = "f1_majority"
# rows whose tag contains "timer" are labelled as time measurements
df.loc[df["tag"].str.contains("timer"), "variable"] = "time"
# minority rows: on amazon-multi every non-timer tag except "test/f1_class4";
# on every other dataset only the "test/f1_class1" tag
df.loc[
    ((df["dataset.name"] == "amazon-multi") & (df["tag"] != "test/f1_class4") & (~df["tag"].str.contains("timer")))
    | ((df["dataset.name"] != "amazon-multi") & (df["tag"] == "test/f1_class1")),
    "variable",
] = "f1_minority"
# the raw tag is dropped once `variable` has been derived from it
df = df.drop(columns=["tag"])

In [None]:
def format_value(m: float, iqr: float) -> str:
    r"""Render a median and its IQR as a LaTeX math string.

    Both numbers are rounded to two decimals and wrapped in ``\float[1]{...}``;
    the IQR is typeset as a ``\pm`` subscript of the median.
    """
    centre = round(m, 2)
    spread = round(iqr, 2)
    pieces = [
        "$",
        rf"\float[1]{{{centre}}}",
        r"_{\pm",
        rf"\float[1]{{{spread}}}",
        "}$",
    ]
    return "".join(pieces)


def format_int(m: int) -> str:
    r"""Render an integer-valued number as a LaTeX macro call.

    Values below 1000 become ``\integer{<value>}``; larger values are expressed
    in thousands, rounded to one decimal, as ``\q{<value / 1000>}{\thousand}``.
    The input is converted with ``int`` first, so integer-valued floats are accepted.
    """
    value = int(m)
    if value < 1000:
        return rf"\integer{{{value}}}"
    # raw string on purpose: in a plain "{\thousand}" literal the "\t" is a TAB
    # character, which corrupts the macro name in the generated LaTeX
    return rf"\q{{{round(value / 1000, ndigits=1)}}}" + r"{\thousand}"


def format_float(m: float) -> str:
    r"""Wrap ``m``, formatted as-is without rounding, in ``\float[1]{...}``."""
    return "\\float[1]{" + f"{m}" + "}"


# Lookup tables translating raw identifiers into the LaTeX snippets used in the exported tables.

# values of the `variable` column -> bold column headers
variable_name = {"f1_majority": r"\textbf{Majority}", "f1_minority": r"\textbf{Minority}", "time": r"\textbf{Time}"}

# values of `dataset.name` -> LaTeX macros
# NOTE(review): the two "f1_*" keys look like leftovers; in this notebook the dict is only
# applied to `dataset.name` values — confirm before removing them
dataset_name = {
    "f1_majority": "Majority",
    "f1_minority": "Minority",
    "agnews-business-.01": r"\agnewsbus",
    "amazon-agri": r"\amazonagri",
    "amazon-multi": r"\amazonmulti",
    "wikitoxic-.01": r"\wikitoxic",
}

# values of `model.name` -> LaTeX macros
model_name = {
    "bert-base-uncased": r"\bertbase",
    "albert-base-v2": r"\albertbase",
    "bert-tiny": r"\berttiny",
    "deberta_v3-base": r"\debertabase",
    "gpt2": r"\gpt",
    "t5-base": r"\tf",
}

# anchor-selection strategy identifiers -> short emphasised labels
anchorstrategy_name = {"kmeans_pp_sampling": r"\myemph{KM++}", "entropy": r"\myemph{Ent}"}

Compute ablations table

In [None]:
# grouping keys: every hyper-parameter column whose name matches the pattern below,
# followed by the identifying columns (the last two, `variable` and `step`, are
# dropped again when computing `max_step`)
cols = hparam_df.columns[hparam_df.columns.str.contains("active_fit.*|strategy.args*")].tolist()
cols += ["experiment_group", "dataset.name", "model.name", "strategy.name", "variable", "step"]

agg_df = (
    # keep the ablation runs (except groups containing 'super') plus the single
    # main-experiment configuration: bert-base-uncased / anchoral_entropy / amazon-agri
    df.query(
        "((experiment_group.str.startswith('ablation')) & (~experiment_group.str.contains('super')))"
        "| ((experiment_group == 'main') & (`model.name` == 'bert-base-uncased')"
        "& (`strategy.name` == 'anchoral_entropy')"
        "& (`dataset.name` == 'amazon-agri'))"
        # "| (experiment_group.str.contains('1000'))"
    )
    # replaces every NaN in the frame with a sentinel; presumably so that rows with
    # missing hyper-parameters are not dropped by the groupby below — TODO confirm
    .fillna(100000)
    # compute median and IQR
    .groupby(cols)["value"]
    .quantile([0.25, 0.5, 0.75])  # type: ignore
    .unstack(-1)
    .assign(iqr=lambda _df: _df[0.75] - _df[0.25])
    .rename(columns={0.5: "median"})
    .drop(columns=[0.25, 0.75])
    .reset_index()
    # filter for last step: `max_step` is a single scalar, the smallest value across
    # groups of each group's largest step
    .assign(max_step=lambda _df: _df.groupby(cols[:-2])["step"].max().min())
)

In [None]:
# sanity check: experiment groups that survived the filter and the distinct `max_step` values
agg_df["experiment_group"].unique().tolist(), agg_df["max_step"].unique().tolist()

In [None]:
# output columns of the ablation table; all but "value" become the index before pivoting
cols = [
    r"$\anchorstrategy_{\mathtt{maj}}$",
    r"$\anchorstrategy_{\mathtt{min}}$",
    r"$\numanchors$",
    r"$\numneighbours$",
    "variable",
    "value",
]

abl_tbl = (
    # keep only the rows at the common last step
    agg_df.query("step == max_step")
    .drop(columns="max_step")
    # build the LaTeX-formatted columns from the raw hyper-parameter columns
    .assign(
        **{
            r"$\anchorstrategy_{\mathtt{maj}}$": lambda _df: _df["strategy.args.anchor_strategy_majority"].map(
                anchorstrategy_name
            ),
            r"$\anchorstrategy_{\mathtt{min}}$": lambda _df: _df["strategy.args.anchor_strategy_minority"].map(
                anchorstrategy_name
            ),
            r"$\numneighbours$": lambda _df: _df["strategy.args.num_neighbours"].map(format_int),
            r"$\numanchors$": lambda _df: _df["strategy.args.num_anchors"].map(format_int),
            # alternative cell format (median with IQR), currently disabled:
            # "value": lambda _df: _df.apply(lambda row: format_value(row["median"], row["iqr"]), axis=1),
            "value": lambda _df: _df["median"].map(format_float),
            "variable": lambda _df: _df["variable"].map(variable_name),
        }
    )
    .loc[:, cols]
    # pivot: one column per variable, one row per hyper-parameter combination
    .set_index(cols[:-1])
    .unstack("variable")
    # drop the "value" level left over from the pivot
    .droplevel(0, axis=1)  # type: ignore
    .reset_index()
)

In [None]:
# The cells hold raw LaTeX macros (e.g. \float[1]{...}) and must be written verbatim.
# pandas < 2.0 escapes special characters by default, hence the explicit escape=False.
abl_tbl.to_latex(path / "ablation_table.tex", index=False, escape=False)

Compute main results table

In [None]:
cols = ["experiment_group", "dataset.name", "model.name", "strategy.name", "step"]

# consider only those steps with at least 3 evaluations
# (strategies whose name contains 'noop' but not 'random' are kept regardless of the count)
dd = (
    df.query("experiment_group.str.contains('main|other_models')")
    # .query("(~`strategy.name`.str.contains('noop')) | (`strategy.name`.str.contains('random'))")
    # number of distinct runs (filenames) available at each step
    .groupby(cols)["filename"]
    .nunique()
    .reset_index()
    .query("(filename >= 3) | ((`strategy.name`.str.contains('noop')) & (~`strategy.name`.str.contains('random')))")
    .loc[:, cols]
)

# within all the steps with at least 3 evaluations, consider the maximum per experiment and per
# dataset-model combination
# - max_step_exp: last retained step of each (group, dataset, model, strategy)
# - min_max_step: smallest max_step_exp within each (group, dataset, model), computed only over
#   strategies that are not 'noop' or that contain 'random'; the remaining rows get NaN here
dd = dd.assign(max_step_exp=lambda _df: _df.groupby(cols[:-1])["step"].transform("max")).assign(
    min_max_step=lambda _df: _df.query(
        "(~`strategy.name`.str.contains('noop')) | (`strategy.name`.str.contains('random'))"
    )
    .groupby(cols[:-2])["max_step_exp"]
    .transform("min")
)
# rows excluded from the computation above fall back to their own last step
dd["min_max_step"] = dd["min_max_step"].fillna(dd["max_step_exp"]).astype(int)
# show which strategies are present
dd["strategy.name"].unique()

In [None]:
cols = ["experiment_group", "dataset.name", "model.name", "strategy.name", "variable", "step"]

# NOTE: rebinds `agg_df`, which previously held the ablation aggregates
agg_df = (
    df.query("experiment_group.str.contains('main|other_models')")
    # compute median and IQR
    .groupby(cols)["value"]
    .quantile([0.25, 0.5, 0.75])  # type: ignore
    .unstack(-1)
    .assign(iqr=lambda _df: _df[0.75] - _df[0.25])
    .rename(columns={0.5: "median"})
    .drop(columns=[0.25, 0.75])
    .reset_index()
)
# time measurements recorded at step >= 195 are re-labelled as step 196
# NOTE(review): presumably so that they line up with the final evaluation step in the
# merges below — confirm where the 195/196 values come from
agg_df.loc[(agg_df["variable"] == "time") & (agg_df["step"] >= 195), "step"] = 196

In [None]:
cols = ["experiment_group", "dataset.name", "model.name", "strategy.name", "step"]

# metrics at the last available step of each experiment
overall_df = (
    dd.query("step == max_step_exp")
    .merge(agg_df, on=cols, how="left")
    .loc[:, cols + ["variable", "median", "iqr"]]
)

# metrics at the step shared by the strategies being compared (budget-matched)
budget_df = (
    dd.query("step == min_max_step")
    .merge(agg_df, on=cols, how="left")
    .loc[:, cols + ["variable", "median", "iqr"]]
)

In [None]:
# overall_df.groupby(cols)["variable"].nunique().reset_index().query("variable < 3")

In [None]:
# budget_df.groupby(cols)["variable"].nunique().reset_index().query("variable < 3")

In [None]:
# df.query("(`dataset.name` == 'amazon-multi') & (`model.name` == 'bert-base-uncased')")["variable"].unique()

In [None]:
cols = ["experiment_group", "model.name", "dataset.name", "strategy.name", "step", "variable"]

pdata = (
    # wide format: one (statistic, variable) column pair per metric
    agg_df.set_index(cols)
    .unstack("variable")
    .reset_index()
    # `strategy.name` has the form "<pool_filtering>_<strategy>"
    .assign(
        strategy=lambda _df: _df["strategy.name"].str.split("_", expand=True)[1],
        pool_filtering=lambda _df: _df["strategy.name"].str.split("_", expand=True)[0],
    )
    # human-readable labels; values missing from a mapping become NaN
    .assign(
        pool_filtering=lambda _df: _df["pool_filtering"].map(
            {"anchoral": "AnchorAL", "seals": "SEALS", "randomsubset": "RandSub"}
        ),
        dataset=lambda _df: _df["dataset.name"].map(
            {
                "agnews-business-.01": "Agnews-Bus",
                "amazon-agri": "Amazon-Agri",
                "amazon-multi": "Amazon-Multi",
                "wikitoxic-.01": "WikiToxic",
            }
        ),
        model=lambda _df: _df["model.name"].map(
            {
                "bert-base-uncased": "BERT-base",
                "albert-base-v2": "ALBERT-base",
                "bert-tiny": "BERT-tiny",
                "deberta_v3-base": "DeBERTa-base",
                "gpt2": "GPT-2",
                "t5-base": "T5-base",
            }
        ),
    )
)
# flatten the two-level column index by joining the levels with "_" (and trimming a trailing "_")
pdata.columns = ["_".join(i).removesuffix("_") for i in pdata.columns]
# drop rows whose pool-filtering prefix is not one of the three mapped names
pdata = pdata.dropna(subset=["pool_filtering"])

In [None]:
# output columns of the main table; the first four identify a row
cols = [
    r"\textbf{Dataset}",
    r"\textbf{Model}",
    r"\textbf{\AL Strategy}",
    r"\textbf{Pool Filtering}",
    r"\textbf{Budget}",
    "variable",
    "value",
]

dfs = []

# build one formatted sub-table per setting; `i` is the top-level column header
for i, d in [(r"\textbf{Overall}", overall_df), (r"\textbf{Budget-Matched}", budget_df)]:
    new_d = (
        d.assign(
            **{
                # cell text: median with the IQR as a subscript
                "value": lambda _df: _df.apply(lambda row: format_value(row["median"], row["iqr"]), axis=1),
                "variable": lambda _df: _df["variable"].map(variable_name),
                r"\textbf{Dataset}": lambda _df: _df["dataset.name"].map(dataset_name),
                r"\textbf{Model}": lambda _df: _df["model.name"].map(model_name),
                # the two halves of `strategy.name` are turned into LaTeX macro names by prefixing "\"
                r"\textbf{\AL Strategy}": lambda _df: "\\" + _df["strategy.name"].str.split("_", expand=True)[1],
                r"\textbf{Pool Filtering}": lambda _df: "\\" + _df["strategy.name"].str.split("_", expand=True)[0],
                # budget is derived from the step as step * 25 + 100
                r"\textbf{Budget}": lambda _df: ((_df["step"] * 25) + 100).map(format_int),
            }
        )
        .loc[:, cols]
        # pivot: one column per variable
        .set_index(cols[:-1])
        .unstack("variable")
        .droplevel(0, axis=1)  # type: ignore
        .reset_index()
        # index by dataset / model / strategy / pool filtering so the two sub-tables align on concat
        .set_index(cols[:4])
    )
    # add the setting name as the first level of the column index
    new_d.columns = pd.MultiIndex.from_product([[i], new_d.columns.tolist()])

    dfs.append(new_d)
# place the two settings side by side
tbl = pd.concat(dfs, axis=1)

In [None]:
# Highlight budget-matched cells: when the budget-matched budget differs from the overall
# budget of the same row, append a \cellcolor{gray!30} command to the budget-matched value.
# Row-wise apply is not elegant, but it works with the two-level column index.
tbl[(r"\textbf{Budget-Matched}", r"\textbf{Budget}")] = tbl.apply(
    lambda row: row[(r"\textbf{Budget-Matched}", r"\textbf{Budget}")]
    if row[(r"\textbf{Overall}", r"\textbf{Budget}")] == row[(r"\textbf{Budget-Matched}", r"\textbf{Budget}")]
    else row[(r"\textbf{Budget-Matched}", r"\textbf{Budget}")] + r" \cellcolor{gray!30}",
    axis=1,
)

In [None]:
# tbl.loc[tbl[(r'\textbf{Overall}', r'\textbf{Budget}')] != tbl[(r'\textbf{Budget-Matched}', r'\textbf{Budget}')]]

In [None]:
# Missing combinations are shown as "-". The cells hold raw LaTeX, so escaping is disabled
# explicitly (pandas < 2.0 escapes special characters by default).
tbl.fillna("-").to_latex(path / "main_table.tex", escape=False)

In [None]:
# Reduced table: entropy strategy with BERT-base on every dataset except AG News
tbl_small = tbl.reset_index().loc[
    lambda _df: (_df[(r"\textbf{\AL Strategy}", "")] == r"\entropy")
    & (_df[(r"\textbf{Model}", "")] == r"\bertbase")
    & (_df[(r"\textbf{Dataset}", "")] != r"\agnewsbus")
]

In [None]:
# Columns are picked by position from the reset-index frame, so this selection must be
# revisited whenever the layout of `tbl` changes. The cells hold raw LaTeX, so escaping
# is disabled explicitly (pandas < 2.0 escapes special characters by default).
(
    tbl_small.loc[:, tbl_small.columns[[0, 3, 4, 5, 6, 7, 9, 10]]]  # type: ignore
    .set_index([r"\textbf{Dataset}"])
    .to_latex(path / "small_table.tex", escape=False)
)

Compute labelling ids

In [None]:
# NOTE: rebinds `df` (until now the metrics frame) to the labelled-ids data
df = pd.read_parquet(path / "labelled_ids.parquet")

In [None]:
# attach the hyper-parameters of each run (left join on `filename`)
df = pd.merge(df, hparam_df, on="filename", how="left")

In [None]:
cols = ["experiment_group", "model.name", "dataset.name", "strategy.name", "labelling_round", "labels"]

dd = (
    # total: sum of `cum_n` over all rows of a run at a given labelling round
    # (computed before any filtering)
    df.assign(total=lambda _df: _df.groupby(["filename", "labelling_round"])["cum_n"].transform("sum"))
    .query("experiment_group.str.contains('main|other_models')")
    # keep only the configurations covered by at least 3 distinct runs
    .assign(n_exp=lambda _df: _df.groupby(cols)["filename"].transform("nunique"))
    .query("(n_exp >= 3)")
    .loc[:, cols + ["n", "cum_n", "total", "filename"]]
)
# every row starts as minority; the majority class is label 4 on amazon-multi and label 0 elsewhere
dd["label"] = "minority"
dd.loc[
    ((dd["dataset.name"] == "amazon-multi") & (dd["labels"] == 4))
    | ((dd["dataset.name"] != "amazon-multi") & (dd["labels"] == 0)),
    "label",
] = "majority"


# replace `cum_n` with its sum per (run, round, minority/majority) and derive the proportion `p`
dd = dd.assign(
    cum_n=lambda _df: _df.groupby(["filename", "labelling_round", "label"])["cum_n"].transform("sum")
).assign(
    # `strategy.name` has the form "<pool_filtering>_<strategy>"
    pool_filtering=lambda _df: _df["strategy.name"].str.split("_", expand=True)[0].str.strip(),
    strategy=lambda _df: _df["strategy.name"].str.split("_", expand=True)[1].str.strip(),
    p=lambda _df: _df["cum_n"] / _df["total"],
)

dd

In [None]:
# datasets present after the filtering above
dd["dataset.name"].unique()

In [None]:
cols = ["dataset.name", "pool_filtering", "labelling_round", "label"]

pdata = (
    # minority proportions of the main experiments, excluding the 'noop' pool filtering
    dd.query("(label == 'minority') & (pool_filtering != 'noop') & (experiment_group == 'main')")
    # geometric mean of the proportion within each group
    .groupby(cols)["p"]
    .agg(gmean)
    .reset_index()
    .assign(
        pool_filtering=lambda _df: _df["pool_filtering"].map(
            {"anchoral": "AnchorAL", "seals": "SEALS", "randomsubset": "RandSub"}
        ),
        dataset=lambda _df: _df["dataset.name"].map(
            {
                "agnews-business-.01": "Agnews-Bus",
                "amazon-agri": "Amazon-Agri",
                "amazon-multi": "Amazon-Multi",
                "wikitoxic-.01": "WikiToxic",
            }
        ),
        # rescale the round index as round * 25 + 100 (the same formula is used for the
        # budget column of the main table; presumably the number of labelled instances)
        labelling_round=lambda _df: (_df["labelling_round"] * 25) + 100,
    )
    # largest rescaled round per (pool filtering, dataset) and its minimum within each dataset
    .assign(max_step=lambda _df: _df.groupby(["pool_filtering", "dataset"])["labelling_round"].transform("max"))
    .assign(min_max_step=lambda _df: _df.groupby(["dataset"])["max_step"].transform("min"))
)

In [None]:
# Minority proportion against the (rescaled) labelling round, one panel per dataset
p = (
    pn.ggplot(
        pdata.query("labelling_round <= 1000"),
        pn.aes("labelling_round", "p", colour="pool_filtering"),
    )
    + pn.geom_line()
    + pn.geom_point()
    + pn.scale_y_continuous(
        breaks=[0.05, 0.1, 0.2, 0.3],
        # print the ticks without the leading zero, e.g. 0.05 -> ".05"
        labels=lambda ticks: ["." + str(tick).split(".")[1] for tick in ticks],
    )
    + pn.coord_cartesian(xlim=[100, 1000])  # type: ignore
    + pn.facet_grid("dataset~.", scales="free_y")
    + pn.theme_bw(base_size=12)
    + pn.theme(legend_position="top", legend_box_spacing=0.01, legend_box_margin=0)
    + pn.labs(x="", y="", colour="")
)

In [None]:
# export the figure as a 4x5 PNG at 300 dpi
p.save("../results/minority_proportions.png", format="png", dpi=300, width=4, height=5)

Subpool analysis

In [None]:
# Path of the sub-pool ids file, kept as a plain string because it is interpolated into the
# SQL text of the DuckDB query below.
# NOTE: `p` is also used for plot objects elsewhere in this notebook, so cells must be run in order.
p = str(path / "subpool_ids.parquet")

In [None]:
# Folder with the prepared datasets, and the sub-folder of each dataset
data_path = Path("../data/prepared")
datasets = {
    "amazon-agri": "amazoncat-agri_bert-base-uncased",
    "amazon-multi": "amazoncat-multi_bert-base-uncased",
    "wikitoxic-.01": "wikitoxic-.01_bert-base-uncased",
    "agnews-business-.01": "agnews-business-.01_bert-base-uncased",
}

# one frame per dataset with the (uid, labels) pairs of its train split, tagged with the dataset name
data = [
    load_from_disk(str(data_path / folder))["train"]
    .select_columns(["uid", "labels"])  # type: ignore
    .to_pandas()
    .assign(name=dataset)  # type: ignore
    for dataset, folder in datasets.items()
]

labels = pd.concat(data)

In [None]:
# Join every sub-pool id with the run's hyper-parameters (main experiments only) and with the
# label of the instance, then derive the pool-filtering prefix and the minority/majority flag.
# The query refers to the pandas frames `hparam_df` and `labels` by name, and `p` must still be
# the parquet path string defined above.
# NOTE: rebinds `tbl`, which previously held the main results table.
tbl = db.sql(
    f"""
with ctx as (
    select *
    from read_parquet('{p}') l left join hparam_df r on r.filename = l.filename
    where experiment_group == 'main'
)
select 
    *, 
    split_part("strategy.name", '_', 1) as pool_filtering,
    case when
        (name = 'amazon-multi' and labels = 4) or (name != 'amazon-multi' and labels = 0) then 'majority'
        else 'minority'
    end as label
from ctx l left join labels r on l."dataset.name" = r.name and l.subpool_ids == r.uid
"""
)

In [None]:
# number of sub-pool instances per run, labelling round and minority/majority flag
count_tbl = db.sql(
    """
select 
    filename, labelling_round, label, count(1) as n
from tbl
group by filename, labelling_round, label
"""
)

In [None]:
# fetch the query result as a pandas DataFrame (rebinds `df`)
df = count_tbl.df()

In [None]:
# attach the hyper-parameters of each run (left join on `filename`)
df = pd.merge(df, hparam_df, on="filename", how="left")

In [None]:
# Share of each class within the sub-pool of every run and labelling round
dd = df.assign(
    # size of the whole sub-pool of a run at a given round
    total=lambda _df: _df.groupby(["filename", "labelling_round"])["n"].transform("sum"),
    # `strategy.name` has the form "<pool_filtering>_<strategy>"
    pool_filtering=lambda _df: _df["strategy.name"].str.split("_", expand=True)[0].str.strip(),
    strategy=lambda _df: _df["strategy.name"].str.split("_", expand=True)[1].str.strip(),
    # evaluated after `total`, so it can use the column created above
    p=lambda _df: _df["n"] / _df["total"],
).sort_values(["filename", "labelling_round"])

In [None]:
# NOTE: rebinds `pdata` with the sub-pool proportions
pdata = (
    # minority share of the sub-pool, averaged with a geometric mean within each
    # (pool filtering, dataset, labelling round) group
    dd.query("label == 'minority'")
    .groupby(["pool_filtering", "dataset.name", "labelling_round"])["p"]
    .agg(gmean)
    .reset_index()
    # human-readable labels for the plot; values missing from a mapping become NaN
    .assign(
        pool_filtering=lambda _df: _df["pool_filtering"].map(
            {"anchoral": "AnchorAL", "seals": "SEALS", "randomsubset": "RandSub"}
        ),
        dataset=lambda _df: _df["dataset.name"].map(
            {
                "agnews-business-.01": "Agnews-Bus",
                "amazon-agri": "Amazon-Agri",
                "amazon-multi": "Amazon-Multi",
                "wikitoxic-.01": "WikiToxic",
            }
        ),
    )
)

In [None]:
# Minority share of the sub-pool against the labelling round, one panel per dataset.
# The facet specification is added exactly once (it used to be added twice, the second
# addition simply repeating the first) and the disabled layers have been removed.
p = (
    pn.ggplot(pdata.query("labelling_round <= 120"), pn.aes("labelling_round", "p", colour="pool_filtering"))
    + pn.geom_line()
    + pn.geom_point()
    + pn.scale_y_continuous(
        breaks=[0.05, 0.1, 0.2, 0.3],
        # print the ticks without the leading zero, e.g. 0.05 -> ".05"
        labels=lambda ticks: ["." + str(tick).split(".")[1] for tick in ticks],
    )
    + pn.facet_grid("dataset~.", scales="free_y")
    + pn.theme_bw(base_size=12)
    + pn.theme(legend_position="top", legend_box_spacing=0.01, legend_box_margin=0)
    + pn.labs(x="", y="", colour="")
)
p

In [None]:
# export the figure as a 4x5 PNG at 300 dpi
p.save("../results/subpool_minority_proportions.png", format="png", dpi=300, width=4, height=5)