In [1]:
import ast
import itertools
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import srsly
from datasets import load_from_disk
from matplotlib import ticker
from scipy.special import entr, softmax
from sklearn.manifold import TSNE
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from tqdm.auto import tqdm

pd.set_option("display.max_columns", None)

In [127]:
# Scratch input: a two-element score vector (presumably the logits of a binary
# classifier — confirm) used to try out the margin computation in the next cells.
a = np.array([-0.09692922, 0.21142264])

In [128]:
# Normalise the scores with softmax.
# NOTE(review): this rebinds `a`, so re-running the cell applies softmax to an
# already-normalised vector.
a = softmax(a)

In [129]:
# Ascending order, so the two largest entries end up last.
a = np.sort(a)

In [130]:
# Gap between the last and second-to-last entry, i.e. the two largest values
# once `a` has been sorted in ascending order.
a[-1] - a[-2]

0.15296583592121704

In [46]:
# Root folder that is searched recursively for the per-run parquet files.
path_to_experiment = Path("..") / "outputs" / "multirun" / "reinit_study"
path_to_experiment

PosixPath('../outputs/multirun/reinit_study')

In [47]:
dataset = "imdb"

# Collect one frame per run. Metadata is read positionally from the file path:
# parents[1] is the run folder ("data=<seed>_model=<seed>_<timestamp>"),
# parents[2] is "<key>=<value>" for the reinit flag, parents[3] is the strategy.
list_dfs = []
for path in path_to_experiment.rglob("labelled_dataset.parquet"):
    # Keep only the runs whose path mentions the selected dataset.
    if dataset not in str(path):
        continue

    run_name = path.parents[1].name
    # maxsplit=2 keeps the trailing timestamp in one piece even if it ever
    # contains an underscore.
    data_seed, model_seed, _ = run_name.split("_", 2)

    # literal_eval parses "True"/"False" (or any other literal) without
    # executing arbitrary code taken from a directory name, unlike eval().
    reinit = ast.literal_eval(path.parents[2].name.split("=")[1])

    df = pd.read_parquet(path).assign(
        reinit=reinit,
        strategy=path.parents[3].name,
        experiment=run_name,
        data_seed=int(data_seed.split("=")[1]),
        model_seed=int(model_seed.split("=")[1]),
    )
    list_dfs.append(df)

In [48]:
# Stack the per-run frames into one frame with a fresh 0..n-1 index.
df = pd.concat(list_dfs, ignore_index=True)

In [49]:
# Display the combined frame of all loaded runs.
df

Unnamed: 0,unique_id,is_labelled,is_validation,labelling_round,labels,reinit,strategy,experiment,data_seed,model_seed
0,15706,True,True,-1,1,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
1,11042,True,False,48,0,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
2,11925,True,True,71,0,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
3,16429,True,False,55,1,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
4,9136,True,False,26,0,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
...,...,...,...,...,...,...,...,...,...,...
20795,16039,True,True,94,1,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994
20796,14471,True,False,89,1,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994
20797,21539,True,False,39,1,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994
20798,2532,True,True,74,0,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994


In [34]:
def jaccard_similarity(A, B):
    """Return the Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of two collections.

    Args:
        A: Iterable of hashable items; duplicates are ignored.
        B: Iterable of hashable items; duplicates are ignored.

    Returns:
        A float in ``[0, 1]``. Two empty collections are treated as identical
        and yield ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    A, B = set(A), set(B)

    union = A | B
    if not union:
        # Both inputs are empty: 0 / 0 is undefined, so use the usual
        # convention that two empty sets are identical.
        return 1.0

    return len(A & B) / len(union)


def compute_jaccard_fn(df, prop):
    """Jaccard similarity of the ``unique_id`` sets for every pair of ``prop`` values.

    Args:
        df: Frame with a ``unique_id`` column and a column named ``prop``.
        prop: Name of the column whose distinct values are compared pairwise.

    Returns:
        Dict keyed by ``"<value_a> -- <value_b>"`` (with any ``".seed"``
        substring removed) that maps to the Jaccard similarity of the two id sets.
    """
    # Build the id set of every distinct value once, in order of first appearance.
    id_sets = [
        (value, set(df.loc[df[prop] == value, "unique_id"].unique()))
        for value in df[prop].unique().tolist()
    ]

    return {
        f"{left} -- {right}".replace(".seed", ""): jaccard_similarity(
            ids_left, ids_right
        )
        for (left, ids_left), (right, ids_right) in itertools.combinations(id_sets, 2)
        # Pairs whose two values compare equal are skipped.
        if not (left == right)
    }

In [35]:
# Display the combined frame again before the checks below.
df

Unnamed: 0,unique_id,is_labelled,is_validation,labelling_round,labels,reinit,strategy,experiment,data_seed,model_seed
0,15706,True,True,-1,1,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
1,11042,True,False,48,0,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
2,11925,True,True,71,0,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
3,16429,True,False,55,1,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
4,9136,True,False,26,0,False,random,data=1994_model=0_2023-03-06T19-17-30,1994,0
...,...,...,...,...,...,...,...,...,...,...
20795,16039,True,True,94,1,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994
20796,14471,True,False,89,1,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994
20797,21539,True,False,39,1,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994
20798,2532,True,True,74,0,True,random,data=0_model=1994_2023-03-06T19-17-30,0,1994


In [36]:
# Pairwise overlap of the ids with labelling_round == -1 (presumably the initial
# pool — confirm) between experiments.
ids = (
    df.loc[df["labelling_round"] == -1]
    .groupby("experiment")["unique_id"]
    .agg(list)
    .tolist()
)
# Compare every pair of experiments actually present instead of assuming there
# are exactly four of them with equally long id lists.
for ids_left, ids_right in itertools.combinations(ids, 2):
    print(jaccard_similarity(ids_left, ids_right))

1.0
1.0
1.0
1.0
1.0
1.0


In [37]:
# similarity across seeds
# Jaccard similarity between every pair of experiments, computed separately for
# each (strategy, reinit) group and restricted to labelling rounds > 0.
per_group = (
    df.loc[df["labelling_round"] > 0]
    .groupby(["strategy", "reinit"])
    .apply(lambda df_: compute_jaccard_fn(df_, "experiment"))
    .to_frame("jaccard")
    .reset_index()
)

# Expand the per-group dicts into one column per experiment pair.
pair_columns = pd.DataFrame(per_group["jaccard"].values.tolist())

res = (
    per_group.drop(columns="jaccard")
    .join(pair_columns)
    .melt(
        id_vars=["strategy", "reinit"],
        var_name="experiment-pairs",
        value_name="jaccard_similarity",
    )
    .sort_values(["strategy", "reinit", "jaccard_similarity"])
)
res

Unnamed: 0,strategy,reinit,experiment-pairs,jaccard_similarity
0,random,False,data=1994_model=0_2023-03-06T19-17-30 -- data=...,1.0
2,random,False,data=1994_model=0_2023-03-06T19-17-30 -- data=...,1.0
4,random,False,data=1994_model=0_2023-03-06T19-17-30 -- data=...,1.0
6,random,False,data=1994_model=1994_2023-03-06T19-17-30 -- da...,1.0
8,random,False,data=1994_model=1994_2023-03-06T19-17-30 -- da...,1.0
10,random,False,data=0_model=0_2023-03-06T19-17-30 -- data=0_m...,1.0
1,random,True,data=1994_model=0_2023-03-06T19-17-30 -- data=...,1.0
3,random,True,data=1994_model=0_2023-03-06T19-17-30 -- data=...,1.0
5,random,True,data=1994_model=0_2023-03-06T19-17-30 -- data=...,1.0
7,random,True,data=1994_model=1994_2023-03-06T19-17-30 -- da...,1.0


In [45]:
# Distinct strategy names present in the loaded runs.
df["strategy"].unique()

array(['random'], dtype=object)

In [42]:
# List of unique_id values (in row order) for every (reinit, strategy, experiment) group.
df_agg = df.groupby(["reinit", "strategy", "experiment"])["unique_id"].agg(list)

reinit  strategy  experiment                              
False   random    data=0_model=0_2023-03-06T19-17-30          [15706, 11042, 11925, 16429, 9136, 5283, 14587...
                  data=0_model=1994_2023-03-06T19-17-30       [15706, 11042, 11925, 16429, 9136, 5283, 14587...
                  data=1994_model=0_2023-03-06T19-17-30       [15706, 11042, 11925, 16429, 9136, 5283, 14587...
                  data=1994_model=1994_2023-03-06T19-17-30    [15706, 11042, 11925, 16429, 9136, 5283, 14587...
True    random    data=0_model=0_2023-03-06T19-17-30          [15706, 11042, 11925, 16429, 9136, 5283, 14587...
                  data=0_model=1994_2023-03-06T19-17-30       [15706, 11042, 11925, 16429, 9136, 5283, 14587...
                  data=1994_model=0_2023-03-06T19-17-30       [15706, 11042, 11925, 16429, 9136, 5283, 14587...
                  data=1994_model=1994_2023-03-06T19-17-30    [15706, 11042, 11925, 16429, 9136, 5283, 14587...
Name: unique_id, dtype: object

In [38]:
# Per-group id lists as a flat frame with columns
# strategy / reinit / experiment / unique_id.
# NOTE(review): the scratch name `a` is rebound to unrelated objects in other cells.
a = (
    df.groupby(["strategy", "reinit", "experiment"])["unique_id"]
    .agg(list)
    .reset_index()
)

In [39]:
# Sorted ids per labelling round, one column per experiment, for the random
# strategy. The reinit flag is not part of the pivot, so rows from both
# settings are pooled into the same cell.
is_random = df["strategy"] == "random"
df.loc[is_random, ["experiment", "unique_id", "labelling_round"]].pivot_table(
    index="labelling_round",
    columns="experiment",
    aggfunc=lambda ex: sorted(ex),
)

Unnamed: 0_level_0,unique_id,unique_id,unique_id,unique_id
experiment,data=0_model=0_2023-03-06T19-17-30,data=0_model=1994_2023-03-06T19-17-30,data=1994_model=0_2023-03-06T19-17-30,data=1994_model=1994_2023-03-06T19-17-30
labelling_round,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
-1,"[92, 92, 511, 511, 536, 536, 715, 715, 1075, 1...","[92, 92, 511, 511, 536, 536, 715, 715, 1075, 1...","[92, 92, 511, 511, 536, 536, 715, 715, 1075, 1...","[92, 92, 511, 511, 536, 536, 715, 715, 1075, 1..."
0,"[777, 777, 1446, 1446, 1858, 1858, 2169, 2169,...","[777, 777, 1446, 1446, 1858, 1858, 2169, 2169,...","[777, 777, 1446, 1446, 1858, 1858, 2169, 2169,...","[777, 777, 1446, 1446, 1858, 1858, 2169, 2169,..."
1,"[73, 73, 1136, 1136, 2386, 2386, 3316, 3316, 3...","[73, 73, 1136, 1136, 2386, 2386, 3316, 3316, 3...","[73, 73, 1136, 1136, 2386, 2386, 3316, 3316, 3...","[73, 73, 1136, 1136, 2386, 2386, 3316, 3316, 3..."
2,"[2047, 2047, 2859, 2859, 5533, 5533, 5935, 593...","[2047, 2047, 2859, 2859, 5533, 5533, 5935, 593...","[2047, 2047, 2859, 2859, 5533, 5533, 5935, 593...","[2047, 2047, 2859, 2859, 5533, 5533, 5935, 593..."
3,"[1365, 1365, 4551, 4551, 5659, 5659, 5766, 576...","[1365, 1365, 4551, 4551, 5659, 5659, 5766, 576...","[1365, 1365, 4551, 4551, 5659, 5659, 5766, 576...","[1365, 1365, 4551, 4551, 5659, 5659, 5766, 576..."
...,...,...,...,...
95,"[172, 172, 1321, 1321, 4330, 4330, 4867, 4867,...","[172, 172, 1321, 1321, 4330, 4330, 4867, 4867,...","[172, 172, 1321, 1321, 4330, 4330, 4867, 4867,...","[172, 172, 1321, 1321, 4330, 4330, 4867, 4867,..."
96,"[665, 665, 704, 704, 4637, 4637, 5958, 5958, 6...","[665, 665, 704, 704, 4637, 4637, 5958, 5958, 6...","[665, 665, 704, 704, 4637, 4637, 5958, 5958, 6...","[665, 665, 704, 704, 4637, 4637, 5958, 5958, 6..."
97,"[900, 900, 2337, 2337, 3887, 3887, 3990, 3990,...","[900, 900, 2337, 2337, 3887, 3887, 3990, 3990,...","[900, 900, 2337, 2337, 3887, 3887, 3990, 3990,...","[900, 900, 2337, 2337, 3887, 3887, 3990, 3990,..."
98,"[1013, 1013, 1052, 1052, 1481, 1481, 1706, 170...","[1013, 1013, 1052, 1052, 1481, 1481, 1706, 170...","[1013, 1013, 1052, 1052, 1481, 1481, 1706, 170...","[1013, 1013, 1052, 1052, 1481, 1481, 1706, 170..."


In [37]:
# One id array per row of `a` that belongs to the random strategy.
random_id_lists = a.loc[a["strategy"] == "random", "unique_id"]
seqs = [np.array(id_list) for id_list in random_id_lists]

In [38]:
# Are the first two id sequences identical (same ids in the same order)?
# array_equal also handles sequences of different length, where an element-wise
# `==` cannot be broadcast.
np.array_equal(seqs[0], seqs[1])

True

In [43]:
# Overlap between the first and the third id sequence.
# Dedicated names are used because `a` holds the grouped frame that the cell
# building `seqs` reads from; rebinding it here would break a re-run of that cell.
ids_first, ids_third = set(seqs[0]), set(seqs[2])
jaccard_similarity(ids_first, ids_third)

0.08514190317195326

In [None]:
# Rows of the random strategy for which reinit is True.
random_with_reinit = (df["strategy"] == "random") & df["reinit"].eq(True)
df.loc[random_with_reinit]

In [None]:
# Mean pairwise similarity per (strategy, reinit). Only the numeric column is
# aggregated: `res` also carries the string pair labels, which cannot be
# averaged (recent pandas versions raise instead of silently dropping them).
res.groupby(["strategy", "reinit"])[["jaccard_similarity"]].agg(["mean"])

In [None]:
# similarity across reinit
# For each (strategy, experiment) group: Jaccard similarity between the ids
# collected under the different reinit values.
per_run = (
    df.groupby(["strategy", "experiment"])
    .apply(lambda df_: compute_jaccard_fn(df_, "reinit"))
    .to_frame("jaccard")
    .reset_index()
)

# Expand the per-group dicts into one column per reinit pair.
pair_columns = pd.DataFrame(per_run["jaccard"].values.tolist())

res = (
    per_run.drop(columns="jaccard")
    .join(pair_columns)
    .melt(
        id_vars=["strategy", "experiment"],
        var_name="reinit-pairs",
        value_name="jaccard_similarity",
    )
)
res

In [None]:
# similarity across experiments and round
# Jaccard similarity between every pair of experiments, computed separately for
# each (strategy, reinit, labelling_round) group.
per_round = (
    df.groupby(["strategy", "reinit", "labelling_round"])
    .apply(lambda df_: compute_jaccard_fn(df_, "experiment"))
    .to_frame("jaccard")
    .reset_index()
)

# Expand the per-group dicts into one column per experiment pair.
pair_columns = pd.DataFrame(per_round["jaccard"].values.tolist())

res = (
    per_round.drop(columns="jaccard")
    .join(pair_columns)
    .melt(
        id_vars=["strategy", "reinit", "labelling_round"],
        var_name="experiment-pairs",
        value_name="jaccard_similarity",
    )
)
res

In [None]:
# Spot-check one row of `res` as a plain dict (position 100 looks arbitrary —
# it requires `res` to have more than 100 rows).
res.iloc[100].to_dict()

In [None]:
# Similarity per labelling round; colour encodes the strategy and line style
# the reinit flag.
sns.lineplot(
    data=res,
    x="labelling_round",
    y="jaccard_similarity",
    hue="strategy",
    style="reinit",
)

In [None]:
# similarity across reinit and rounds
# labelling_round is part of the grouping so that the reinit-vs-no-reinit
# similarity is computed once per round; without it this cell is an exact
# repeat of the per-(strategy, experiment) computation.
res = (
    df.groupby(["strategy", "experiment", "labelling_round"])
    .apply(lambda df_: compute_jaccard_fn(df_, "reinit"))
    .to_frame("jaccard")
    .reset_index()
)
# Expand the per-group dicts into one column per reinit pair.
res = res.join(pd.DataFrame(res.pop("jaccard").values.tolist()))
res = res.melt(
    id_vars=["strategy", "experiment", "labelling_round"],
    var_name="reinit-pairs",
    value_name="jaccard_similarity",
)
res

In [None]:
# One value per (experiment, reinit) group for three pairs of strategy names
# (presumably the Jaccard similarity between the two strategies' ids — confirm).
# NOTE(review): `compute_jaccard` is not defined in any cell visible in this
# notebook (only `compute_jaccard_fn` and `compute_jaccard_strategy` are), so
# this cell appears to rely on a definition that no longer exists.
# NOTE(review): rebinds the scratch names `a`, `b` and `c`.
a = (
    df.groupby(["experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "entropy", "margin_confidence"))
    .to_frame("jaccard")
)
b = (
    df.groupby(["experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "entropy", "random"))
    .to_frame("jaccard")
)
c = (
    df.groupby(["experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "margin_confidence", "random"))
    .to_frame("jaccard")
)
# The joins leave the column of `a` as "jaccard" and suffix the clashing ones
# ("jaccard_er" from `b`, "jaccard_mr" from `c`); the rename then turns the
# remaining "jaccard" into "jaccard_em".
dd = (
    a.join(b, rsuffix="_er")
    .join(c, rsuffix="_mr")
    .rename(columns={"jaccard": "jaccard_em"})
    .reset_index()
)
dd

In [None]:
def compute_jaccard_strategy(df):
    """Jaccard similarity between the ids with ``reinit == True`` and ``reinit == False``.

    Args:
        df: Frame with a ``reinit`` column and a ``unique_id`` column.

    Returns:
        The Jaccard similarity of the two ``unique_id`` sets.
    """

    def _ids_where(flag):
        # Distinct ids of the rows whose reinit value equals `flag`.
        return set(df.loc[df["reinit"] == flag, "unique_id"].unique())

    return jaccard_similarity(_ids_where(True), _ids_where(False))


def compute_cosine(df):
    """Cosine similarity between the mean embeddings of the reinit / no-reinit ids.

    Reads the module-level ``embeddings`` array, which is indexed with the
    ``unique_id`` values and must therefore be loaded before this is called.

    Args:
        df: Frame with a ``reinit`` column and a ``unique_id`` column.

    Returns:
        The cosine similarity as a Python scalar.
    """

    def _mean_embedding(flag):
        # Average the embeddings of the distinct ids whose reinit value equals
        # `flag`, shaped (1, dim) as cosine_similarity expects 2-D input.
        selected = df.loc[df["reinit"] == flag, "unique_id"].unique()
        return embeddings[selected].mean(0).reshape(1, -1)

    return cosine_similarity(_mean_embedding(True), _mean_embedding(False)).item()

In [None]:
# Mean and standard deviation, over experiments, of the value returned by
# compute_cosine, reported per strategy as a markdown table.
m = {}
for strategy in df["strategy"].unique():
    per_experiment = (
        df.loc[df["strategy"] == strategy].groupby("experiment").apply(compute_cosine)
    )
    m[strategy] = per_experiment.agg(["mean", "std"])

print(pd.DataFrame(m).T.to_markdown())

In [None]:
# One value per (labelling_round, experiment, reinit) group for three pairs of
# strategy names, reshaped to long format for plotting.
# NOTE(review): `compute_jaccard` is not defined in any cell visible in this
# notebook (only `compute_jaccard_fn` and `compute_jaccard_strategy` are), so
# this cell appears to rely on a definition that no longer exists.
# NOTE(review): rebinds the scratch names `a`, `b` and `c`.
a = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "entropy", "margin_confidence"))
    .to_frame("jaccard")
)
b = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "entropy", "random"))
    .to_frame("jaccard")
)
c = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "margin_confidence", "random"))
    .to_frame("jaccard")
)
# The joins leave the column of `a` as "jaccard" and suffix the clashing ones
# ("jaccard_er" from `b`, "jaccard_mr" from `c`); the rename then turns the
# remaining "jaccard" into "jaccard_em".
dd = (
    a.join(b, rsuffix="_er")
    .join(c, rsuffix="_mr")
    .rename(columns={"jaccard": "jaccard_em"})
    .reset_index()
)
# Long format: one row per (round, experiment, reinit, pair) with the score in "value".
dd = dd.melt(id_vars=["labelling_round", "experiment", "reinit"])
dd = dd.rename(columns={"reinit": "re-initialise", "variable": "pairs"})
# Replace the column-derived codes with readable pair labels.
dd["pairs"] = dd["pairs"].map(
    {
        "jaccard_em": "margin-entropy",
        "jaccard_mr": "margin-random",
        "jaccard_er": "entropy-random",
    }
)
dd

In [None]:
# Preview a three-colour diverging palette with a dark centre.
sns.diverging_palette(150, 40, l=65, center="dark", n=3)

In [None]:
# Rows of one experiment for the entropy strategy at labelling round 1.
# NOTE(review): the label "data.seed=0_model.seed=0" does not follow the
# "data=<seed>_model=<seed>_<timestamp>" experiment names shown in the frame
# outputs of this notebook, so this filter may come back empty — confirm.
a = df.loc[
    (df["experiment"] == "data.seed=0_model.seed=0")
    & (df["strategy"] == "entropy")
    & (df["labelling_round"] == 1)
]

In [None]:
# Ids of the selection in `a`, split by reinit flag: `b` without, `c` with.
b = a.loc[a["reinit"].eq(False), "unique_id"]
c = a.loc[a["reinit"].eq(True), "unique_id"]

In [None]:
# Precomputed embedding array; other cells index it with unique_id values.
# NOTE(review): the file is the AG News index, while `dataset` is set to "imdb"
# where the runs are loaded — confirm the two are meant to match.
embeddings = np.load("../data/processed/agnews/ag_news_index.npy")

In [None]:
# Raw id values of the reinit == False selection.
b.values

In [None]:
# Cosine similarity between the mean embedding of the ids in `b` and of the
# ids in `c`; each mean is reshaped to (1, dim) because the function takes 2-D input.
mean_embedding_b = embeddings[b].mean(0).reshape(1, -1)
mean_embedding_c = embeddings[c].mean(0).reshape(1, -1)
cosine_similarity(mean_embedding_b, mean_embedding_c)

In [None]:
# NOTE: an unfinished `def compute_cosine(df)` stub used to live in this cell.
# It was a SyntaxError (no colon, no body) that stopped "Run All"; the complete
# `compute_cosine` is defined in the cell that also defines
# `compute_jaccard_strategy`.

In [None]:
# One value per (labelling_round, experiment, reinit) group for three pairs of
# strategy names, reshaped to long format; `dd` feeds the line plot that follows.
# NOTE(review): this cell is a verbatim repeat of an earlier cell in this notebook.
# NOTE(review): `compute_jaccard` is not defined in any cell visible in this
# notebook (only `compute_jaccard_fn` and `compute_jaccard_strategy` are), so
# this cell appears to rely on a definition that no longer exists.
a = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "entropy", "margin_confidence"))
    .to_frame("jaccard")
)
b = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "entropy", "random"))
    .to_frame("jaccard")
)
c = (
    df.groupby(["labelling_round", "experiment", "reinit"])
    .apply(lambda df_: compute_jaccard(df_, "margin_confidence", "random"))
    .to_frame("jaccard")
)
# The joins leave the column of `a` as "jaccard" and suffix the clashing ones
# ("jaccard_er" from `b`, "jaccard_mr" from `c`); the rename then turns the
# remaining "jaccard" into "jaccard_em".
dd = (
    a.join(b, rsuffix="_er")
    .join(c, rsuffix="_mr")
    .rename(columns={"jaccard": "jaccard_em"})
    .reset_index()
)
# Long format: one row per (round, experiment, reinit, pair) with the score in "value".
dd = dd.melt(id_vars=["labelling_round", "experiment", "reinit"])
dd = dd.rename(columns={"reinit": "re-initialise", "variable": "pairs"})
# Replace the column-derived codes with readable pair labels.
dd["pairs"] = dd["pairs"].map(
    {
        "jaccard_em": "margin-entropy",
        "jaccard_mr": "margin-random",
        "jaccard_er": "entropy-random",
    }
)
dd

In [None]:
# Line plot of the long-format `dd` frame: one colour per strategy pair, one
# line style per re-initialise value.
plt.style.use("bmh")
sns.set_context("paper")
palette = sns.diverging_palette(150, 40, l=60, center="dark", n=3)

fig, ax = plt.subplots()

sns.lineplot(
    data=dd,
    x="labelling_round",
    y="value",
    hue="pairs",
    style="re-initialise",
    palette=palette,
    ax=ax,
)

# Figure-level settings. NOTE(review): the title is hard-coded.
fig.dpi = 800
fig.suptitle("AGNEWS")

# Axes labels and limits in a single call.
ax.set(
    title="pair-wise similarity between queried sets",
    ylabel="jaccard similarity",
    xlabel="labelling round",
    ylim=(0.0, 0.025),
)
ax.xaxis.set_minor_locator(ticker.MultipleLocator(5))

# Place the legend outside the axes, anchored to the top-right corner.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1), fontsize=10)
sns.despine()
plt.show()