In [1]:
import itertools
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import srsly
from datasets import load_from_disk
from matplotlib import ticker
from scipy.special import entr, softmax
from sklearn.manifold import TSNE
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from tqdm.auto import tqdm

# Lift pandas' display limits: frames are shown with all rows and all columns.
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# Root directory that is searched for the runs analysed in this notebook.
path_to_experiment = Path("..") / "outputs" / "multirun" / "reinit_effect" / "patch"
path_to_experiment

PosixPath('../outputs/multirun/reinit_effect/patch')

In [3]:
# Load every "agnews" run found under the experiment root. Each frame is tagged
# with the settings encoded in the directories above its parquet file:
#   parents[1].name -> "<data part>_<model part>", each part ending in "=<integer seed>"
#   parents[2].name -> "<key>=<True|False>", the reinit flag
#   parents[3].name -> the strategy
list_dfs = []
for path in path_to_experiment.rglob("labelled_dataset.parquet"):
    if "agnews" not in str(path):
        continue
    data_seed, model_seed = path.parents[1].name.split("_")
    df = pd.read_parquet(path).assign(
        # Explicit lookup rather than eval(): a directory name is never executed
        # as code, and an unexpected value fails loudly with a KeyError.
        reinit={"True": True, "False": False}[path.parents[2].name.split("=")[1]],
        strategy=path.parents[3].name,
        experiment=path.parents[1].name,
        data_seed=int(data_seed.split("=")[1]),
        model_seed=int(model_seed.split("=")[1]),
    )
    list_dfs.append(df)

In [4]:
# Stack the per-run frames into a single table with a fresh 0..n-1 index.
df = pd.concat(list_dfs, ignore_index=True)

In [5]:
def jaccard_similarity(A, B):
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two collections.

    Parameters
    ----------
    A, B : iterable of hashable
        Collections to compare. Each is converted to a set, so duplicates are
        ignored and lists or arrays are accepted as well as sets.

    Returns
    -------
    float
        A value in [0, 1]. Two empty collections are treated as identical and
        yield 1.0 instead of raising ``ZeroDivisionError``.
    """
    A, B = set(A), set(B)
    union = A | B
    if not union:
        # 0 / 0 case: by convention two empty sets are considered identical.
        return 1.0
    return len(A & B) / len(union)


def compute_jaccard_fn(df, prop):
    """Jaccard similarity of the ``unique_id`` sets for every pair of ``prop`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``prop`` and ``"unique_id"``.
    prop : str
        Column whose distinct values define the sets to compare.

    Returns
    -------
    dict
        Maps ``"<a> -- <b>"`` (with any ``".seed"`` substring removed) to the
        Jaccard similarity of the ids belonging to ``a`` and to ``b``. Pairs
        follow the order in which the values first appear in ``df[prop]``.
    """
    values = df[prop].unique().tolist()

    # Build each value's id set once instead of once per pair it takes part in.
    ids_by_value = {
        value: set(df.loc[df[prop] == value, "unique_id"].unique()) for value in values
    }

    # combinations() over distinct values never yields a pair with a == b,
    # so no self-pair check is needed.
    return {
        f"{a} -- {b}".replace(".seed", ""): jaccard_similarity(
            ids_by_value[a], ids_by_value[b]
        )
        for a, b in itertools.combinations(values, 2)
    }


def compute_jaccard(df, a, b):
    """Jaccard similarity between the ids selected by strategies ``a`` and ``b``.

    Later cells in this notebook call ``compute_jaccard(df_, a, b)`` inside
    ``groupby(...).apply``, so the function has to exist on a fresh kernel.
    The similarity is computed on id *sets*; the earlier commented-out draft
    passed the id arrays to ``jaccard_score``, which compares label vectors
    element-wise rather than measuring set overlap.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"strategy"`` and ``"unique_id"``.
    a, b : str
        Values of the ``"strategy"`` column to compare.

    Returns
    -------
    float
        Result of ``jaccard_similarity`` on the two id sets.
    """
    A = set(df.loc[df["strategy"] == a, "unique_id"].unique())
    B = set(df.loc[df["strategy"] == b, "unique_id"].unique())

    return jaccard_similarity(A, B)

In [6]:
# similarity across seeds: one {pair: jaccard} dict per (strategy, reinit) group
res = (
    df.groupby(["strategy", "reinit"])
    .apply(compute_jaccard_fn, "experiment")
    .to_frame("jaccard")
    .reset_index()
)
# Expand the dicts into one column per experiment pair ...
res = res.join(pd.DataFrame(res.pop("jaccard").tolist()))
# ... then reshape to one row per (strategy, reinit, pair), lowest similarity first.
res = res.melt(
    id_vars=["strategy", "reinit"],
    var_name="experiment-pairs",
    value_name="jaccard_similarity",
)
res = res.sort_values(["strategy", "reinit", "jaccard_similarity"])
res

Unnamed: 0,strategy,reinit,experiment-pairs,jaccard_similarity
6,entropy,False,data=0_model=0 -- data=1994_model=0,0.140851
12,entropy,False,data=0_model=0 -- data=1994_model=1994,0.140851
18,entropy,False,data=0_model=1994 -- data=1994_model=0,0.140851
24,entropy,False,data=0_model=1994 -- data=1994_model=1994,0.140851
0,entropy,False,data=0_model=0 -- data=0_model=1994,1.0
30,entropy,False,data=1994_model=0 -- data=1994_model=1994,1.0
7,entropy,True,data=0_model=0 -- data=1994_model=0,0.277327
13,entropy,True,data=0_model=0 -- data=1994_model=1994,0.277327
19,entropy,True,data=0_model=1994 -- data=1994_model=0,0.277327
25,entropy,True,data=0_model=1994 -- data=1994_model=1994,0.277327


In [7]:
# Preview the combined table (a bare `df.loc` only displays the indexer object).
df.head()

In [None]:
# Mean pairwise similarity per (strategy, reinit). Only the numeric column is
# aggregated: "experiment-pairs" holds strings, for which a mean is undefined.
res.groupby(["strategy", "reinit"])[["jaccard_similarity"]].agg(["mean"])

In [None]:
# similarity across reinit: compare the reinit settings within each (strategy, experiment)
res = (
    df.groupby(["strategy", "experiment"])
    .apply(compute_jaccard_fn, "reinit")
    .to_frame("jaccard")
    .reset_index()
)
# One column per reinit pair, then long format with one row per group and pair.
res = res.join(pd.DataFrame(res.pop("jaccard").tolist())).melt(
    id_vars=["strategy", "experiment"],
    var_name="reinit-pairs",
    value_name="jaccard_similarity",
)
res

In [None]:
# similarity across experiments and round: seed pairs compared within each
# (strategy, reinit, labelling_round) group
res = (
    df.groupby(["strategy", "reinit", "labelling_round"])
    .apply(compute_jaccard_fn, "experiment")
    .to_frame("jaccard")
    .reset_index()
)
# One column per experiment pair, then long format with one row per group and pair.
res = res.join(pd.DataFrame(res.pop("jaccard").tolist())).melt(
    id_vars=["strategy", "reinit", "labelling_round"],
    var_name="experiment-pairs",
    value_name="jaccard_similarity",
)
res

In [None]:
# Spot-check a single row of `res` (positional row 100) as a plain dict.
res.iloc[100].to_dict()

In [None]:
# Pairwise seed similarity per labelling round. The frame is passed by keyword,
# as in the final figure of this notebook, so it can never be taken for a
# positional x argument.
sns.lineplot(
    data=res,
    x="labelling_round",
    y="jaccard_similarity",
    hue="strategy",
    style="reinit",
)

In [None]:
# similarity across reinit and rounds: compare the reinit settings within each
# (strategy, experiment, labelling_round) group. Grouping by labelling_round is
# what makes this per-round; without it the cell only repeats the
# "similarity across reinit" computation.
res = (
    df.groupby(["strategy", "experiment", "labelling_round"])
    .apply(lambda df_: compute_jaccard_fn(df_, "reinit"))
    .to_frame("jaccard")
    .reset_index()
)
# Expand the per-group {pair: similarity} dicts into columns, then go long.
res = res.join(pd.DataFrame(res.pop("jaccard").values.tolist()))
res = res.melt(
    id_vars=["strategy", "experiment", "labelling_round"],
    var_name="reinit-pairs",
    value_name="jaccard_similarity",
)
res

In [None]:
# Jaccard similarity between the ids queried by each pair of strategies,
# one value per (experiment, reinit) group.
a = (
    df.groupby(["experiment", "reinit"])
    .apply(compute_jaccard, "entropy", "margin_confidence")
    .to_frame("jaccard")
)
b = (
    df.groupby(["experiment", "reinit"])
    .apply(compute_jaccard, "entropy", "random")
    .to_frame("jaccard")
)
c = (
    df.groupby(["experiment", "reinit"])
    .apply(compute_jaccard, "margin_confidence", "random")
    .to_frame("jaccard")
)
# Join on the shared group index. The suffixes keep the three identically named
# columns apart; the un-suffixed one (from `a`) is then renamed.
dd = a.join(b, rsuffix="_er")
dd = dd.join(c, rsuffix="_mr")
dd = dd.rename(columns={"jaccard": "jaccard_em"}).reset_index()
dd

In [None]:
def compute_jaccard_strategy(df):
    """Jaccard similarity between the ids of the ``reinit == True`` rows and
    those of the ``reinit == False`` rows of ``df``.

    ``df`` must contain the columns ``"reinit"`` and ``"unique_id"``.
    """
    reinit_flag = df["reinit"]
    ids_with_reinit = set(df.loc[reinit_flag.eq(True), "unique_id"].unique())
    ids_without_reinit = set(df.loc[reinit_flag.eq(False), "unique_id"].unique())
    return jaccard_similarity(ids_with_reinit, ids_without_reinit)


def compute_cosine(df):
    """Cosine similarity between the mean embedding of the ``reinit == True``
    rows and the mean embedding of the ``reinit == False`` rows of ``df``.

    The rows of the module-level ``embeddings`` array are selected with the
    distinct ``unique_id`` values of each group, so ``embeddings`` must already
    be loaded when this function is called.

    Returns the similarity as a Python scalar.
    """

    def centroid(flag):
        # Mean over the selected embedding rows, shaped (1, -1) as a single sample.
        ids = df.loc[df["reinit"] == flag, "unique_id"].unique()
        return embeddings[ids].mean(0).reshape(1, -1)

    return cosine_similarity(centroid(True), centroid(False)).item()

In [None]:
# For each strategy: compute_cosine per experiment, summarised by the mean and
# standard deviation over experiments.
m = {}
for strategy in df["strategy"].unique():
    m[strategy] = (
        df[df["strategy"].eq(strategy)]
        .groupby("experiment")
        .apply(compute_cosine)
        .agg(["mean", "std"])
    )

# One row per strategy, printed as a markdown table.
print(pd.DataFrame(m).T.to_markdown())

In [None]:
# Jaccard similarity between the ids queried by each pair of strategies,
# one value per (labelling_round, experiment, reinit) group.
a = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(compute_jaccard, "entropy", "margin_confidence")
    .to_frame("jaccard")
)
b = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(compute_jaccard, "entropy", "random")
    .to_frame("jaccard")
)
c = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(compute_jaccard, "margin_confidence", "random")
    .to_frame("jaccard")
)
# Join on the shared group index. The suffixes keep the three identically named
# columns apart; the un-suffixed one (from `a`) is then renamed.
dd = a.join(b, rsuffix="_er")
dd = dd.join(c, rsuffix="_mr")
dd = dd.rename(columns={"jaccard": "jaccard_em"}).reset_index()
# Long format: one row per group and strategy pair, with readable pair labels.
dd = dd.melt(id_vars=["labelling_round", "experiment", "reinit"]).rename(
    columns={"reinit": "re-initialise", "variable": "pairs"}
)
dd["pairs"] = dd["pairs"].map(
    {
        "jaccard_em": "margin-entropy",
        "jaccard_mr": "margin-random",
        "jaccard_er": "entropy-random",
    }
)
dd

In [None]:
# Build a three-colour diverging palette; the returned palette is the cell's displayed value.
sns.diverging_palette(150, 40, l=65, center="dark", n=3)

In [None]:
# Slice out one run: a single experiment, the entropy strategy, labelling round 1.
a = df[
    (df["labelling_round"] == 1)
    & (df["strategy"] == "entropy")
    & (df["experiment"] == "data.seed=0_model.seed=0")
]

In [None]:
# unique_id values of the slice, split by the reinit flag (b: False, c: True).
b = a.loc[a["reinit"].eq(False), "unique_id"]
c = a.loc[a["reinit"].eq(True), "unique_id"]

In [None]:
# Array loaded from disk and indexed with `unique_id` values, both by the cells
# below and by `compute_cosine`. NOTE(review): `compute_cosine` is called in an
# earlier cell, so on a fresh kernel this load has to be run before that cell.
embeddings = np.load("../data/processed/agnews/ag_news_index.npy")

In [None]:
# Raw array of the unique_id values selected with reinit == False.
b.values

In [None]:
# Cosine similarity between the mean embedding of the ids in `b` and of those in `c`.
cosine_similarity(
    X=embeddings[b].mean(axis=0).reshape(1, -1),
    Y=embeddings[c].mean(axis=0).reshape(1, -1),
)

In [None]:
# `compute_cosine` is already defined in full earlier in this notebook; the
# unfinished `def` that stood here had no colon or body and was a SyntaxError.
# Apply the function to the slice `a` selected above instead.
# NOTE(review): presumably this should agree with the manual centroid
# computation in the previous cell — confirm.
compute_cosine(a)

In [None]:
# Rebuild the per-round strategy comparison: `a`, `b` and `c` were reassigned by
# the intermediate cells, so all three are computed again here.
# One Jaccard value per (labelling_round, experiment, reinit) group and strategy pair.
a = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(compute_jaccard, "entropy", "margin_confidence")
    .to_frame("jaccard")
)
b = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(compute_jaccard, "entropy", "random")
    .to_frame("jaccard")
)
c = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(compute_jaccard, "margin_confidence", "random")
    .to_frame("jaccard")
)
# Join on the shared group index; suffixes disambiguate the "jaccard" columns.
dd = a.join(b, rsuffix="_er")
dd = dd.join(c, rsuffix="_mr")
dd = dd.rename(columns={"jaccard": "jaccard_em"}).reset_index()
# Long format with readable labels for the strategy pairs.
dd = dd.melt(id_vars=["labelling_round", "experiment", "reinit"]).rename(
    columns={"reinit": "re-initialise", "variable": "pairs"}
)
dd["pairs"] = dd["pairs"].map(
    {
        "jaccard_em": "margin-entropy",
        "jaccard_mr": "margin-random",
        "jaccard_er": "entropy-random",
    }
)
dd

In [None]:
# Figure: Jaccard similarity per labelling round between the sets queried by
# each pair of strategies, with line style distinguishing re-initialisation.
plt.style.use("bmh")
sns.set_context("paper")
palette = sns.diverging_palette(150, 40, l=60, center="dark", n=3)

fig, ax = plt.subplots()

sns.lineplot(
    data=dd,
    x="labelling_round",
    y="value",
    hue="pairs",
    style="re-initialise",
    palette=palette,
    ax=ax,
)

fig.set_dpi(800)
fig.suptitle("AGNEWS")
ax.set(
    title="pair-wise similarity between queried sets",
    ylabel="jaccard similarity",
    xlabel="labelling round",
    ylim=(0.0, 0.025),
)
ax.xaxis.set_minor_locator(ticker.MultipleLocator(5))
# Place the legend outside the axes, anchored at the top-right corner.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1), fontsize=10)
sns.despine()
plt.show()