# Harris' Segmentation Scheme Analysis

## Imports and setup

In [None]:
import itertools
import os
import sys
import glob
import time
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from joblib import Parallel
from matplotlib.ticker import PercentFormatter
from tqdm.notebook import tqdm

# Workaround so we can re-use the metric functions
module_path = os.path.abspath(os.path.join("../"))
if module_path not in sys.path:
    sys.path.append(module_path)

import trgl.utils.harris_segmentation as has
from trgl.utils.harris_segmentation import EntropyCalculator

In [None]:
# Global seaborn theme: pastel palette drawn on a white grid.
sns.set(palette="pastel")
sns.set_style("whitegrid")
palette = sns.color_palette()

import matplotlib.pylab as pylab

# Font sizes (matplotlib rcParams keys) applied to every figure below.
params = dict(
    [
        ("legend.title_fontsize", "32"),
        ("legend.fontsize", "24"),
        ("axes.labelsize", "32"),
        ("axes.titlesize", "32"),
        ("xtick.labelsize", "22"),
        ("ytick.labelsize", "26"),
    ]
)
pylab.rcParams.update(params)

# Marker and hatch styles available to the plotting cells.
markers = list("o+*^vx")
hatches = list("\\+-|x/")

## Data Loading

In [None]:
all_files = glob.glob(os.path.join("./data/v2/", "*.json"))
# For now select a random 5% subset for analysis:
# there's no way we can analyse thousands of files.
rng = np.random.default_rng()
# replace=False is required: Generator.choice samples WITH replacement by
# default, which would load (and later analyse) the same file several times.
all_files = rng.choice(all_files, size=int(len(all_files) * 0.05), replace=False)

# Evaluation datasets that are analysed; files for any other dataset are skipped.
analysed_eval_datasets = ("trg_previous", "trg_hard", "rg_classic", "rg_hard")

# `li` collects one dataframe per kept file and `params` the settings parsed
# from the matching file name; both lists stay index-aligned.
li = []
params = []
for filename in tqdm(all_files):
    # The file name encodes the run settings as "-"-separated fields.
    split = filename.split("-")
    run_id = split[1]
    architecture = split[2]
    training_dataset = split[3]
    # Boolean settings are recovered by looking for the text "True" in the field.
    net_temporal = "True" in split[4]
    loss_temporal = "True" in split[5]
    purely_temporal = "True" in split[6]
    attention_sender = "True" in split[7]
    attention_receiver = "True" in split[8]
    eval_dataset = split[9]
    # No need to analyse test datasets for language
    if eval_dataset not in analysed_eval_datasets:
        continue
    params.append(
        [
            run_id,
            training_dataset,
            net_temporal,
            loss_temporal,
            purely_temporal,
            attention_sender,
            attention_receiver,
            eval_dataset,
        ]
    )
    df = pd.read_json(filename, orient="index")
    li.append(df)

In [None]:
# This code is commented out so that we do not hammer the WandB servers every time we run the analysis.
# We ran it only once and saved the result to a pickle.

# import wandb
# wandb.login()
# api = wandb.Api(timeout=60)
#
# runs = api.runs("user/TRGL")
# summary_list, config_list, name_list = [], [], []
# for run in tqdm(runs):
#     summary_list.append(
#         run.history(
#             samples=400,
#         )
#     )
#
#     config_list.append({k: v for k, v in run.config.items()})
#
#     name_list.append(run.name.split("-")[1])
#
# runs_full_df = pd.DataFrame(
#     {
#         "summary": summary_list,
#         "config": config_list,
#         "name": name_list,
#     }
# )
#
# runs_full_df.to_pickle("./data/v2/runs_full_df.pickle")

In [None]:
# Load the cached run export (the same path the commented-out WandB cell
# writes to) instead of querying the API again.
runs_full_df = pd.read_pickle("./data/v2/runs_full_df.pickle")
# Keep only the three columns used below, in a fixed order.
runs_full_df = runs_full_df[["summary", "config", "name"]]

In [None]:
def df_stats(x: pd.DataFrame):
    """Summarise the validation-accuracy history of a single run.

    Args:
        x: History dataframe with at least the columns ``"epoch"`` and
            ``"val_acc"``. Rows whose ``val_acc`` is null are ignored.

    Returns:
        ``None`` when no row has a validation accuracy. Otherwise a dict with
        the epoch and accuracy at: the best accuracy (``max_acc_*``), the first
        row reaching 0.75 (``over_75_*``), the first row reaching 0.85
        (``over_85_*``) and the row with the highest epoch (``end_acc_*``).
        A threshold that is never reached is reported as ``-1`` for both its
        epoch and its value.
    """
    # Some are empty, we'll drop them later
    x = x[x["val_acc"].notnull()]
    if x.empty:
        return None

    epochs = x["epoch"]
    accuracies = x["val_acc"]
    try:
        max_acc_index = accuracies.idxmax()
    except (ValueError, TypeError):
        # Nothing usable to take a maximum of; treated like an empty history.
        return None
    end_acc_index = epochs.idxmax()

    def first_reaching(level: float):
        """Return (epoch, accuracy) of the first row with val_acc >= level."""
        reached = accuracies.ge(level)
        if not reached.any():
            return -1, -1
        # idxmax on a boolean series yields the label of the first True.
        # The label is used directly rather than tested for truthiness, so a
        # threshold crossed at index label 0 is no longer reported as missing.
        first_label = reached.idxmax()
        return epochs.loc[first_label], accuracies.loc[first_label]

    over_75_epoch, over_75_value = first_reaching(0.75)
    over_85_epoch, over_85_value = first_reaching(0.85)

    stats_dict = {
        "max_acc_epoch": epochs.loc[max_acc_index],
        "max_acc_value": accuracies.loc[max_acc_index],
        "over_75_epoch": over_75_epoch,
        "over_75_value": over_75_value,
        "over_85_epoch": over_85_epoch,
        "over_85_value": over_85_value,
        "end_acc_epoch": epochs.loc[end_acc_index],
        "end_acc_value": accuracies.loc[end_acc_index],
    }

    return stats_dict

In [None]:
# Collapse every run history into its summary statistics (see df_stats).
# NOTE(review): assigning back into the column realigns on the index, so rows
# removed by dropna() come back as NaN — confirm this is intended.
runs_full_df["summary"] = runs_full_df["summary"].apply(df_stats).dropna()
# Expand the dict-valued columns into flat columns: config first, summary
# statistics second, so the resulting column order is unchanged.
for dict_column in ("config", "summary"):
    df_temp = pd.json_normalize(runs_full_df.pop(dict_column))
    runs_full_df = runs_full_df.join(df_temp)
# Runs are looked up by name from here on.
runs_full_df = runs_full_df.set_index("name")

In [None]:
# One record per loaded file, keyed "match_<position>": the settings parsed
# from the file name (see the layout of each `params` entry) plus the
# "message" and "target" columns of the file's dataframe.
matches = {
    f"match_{position}": {
        "run_id": settings[0],
        "game_type": settings[7],
        "training_dataset": settings[1],
        "net_temporal": settings[2],
        "loss_temporal": settings[3],
        "purely_temporal": settings[4],
        "attention_sender": settings[5],
        "attention_receiver": settings[6],
        "messages": {},
        "lang": frame["message"].to_numpy(),
        "meanings": frame["target"].to_numpy(),
    }
    for position, (settings, frame) in enumerate(zip(params, li))
}

In [None]:
# Run-level settings copied from the run table onto every match.
# Repeat chance was incorrect (flipped) for our first runs, that is why there is inversion later
config_keys = (
    "temporal",
    "max_epochs",
    "max_length",
    "num_features",
    "vocab_size",
    "num_objects",
    "repeat_chance",
    "temporal_loss",
    "length_penalty",
    "gs_temperature",
    "num_properties",
    "num_distractors",
    "prev_horizon",
)
for idx in tqdm(range(len(matches))):
    match_record = matches[f"match_{idx}"]
    run_id = match_record["run_id"]
    # A list-valued label always returns a DataFrame, whether the run id occurs
    # once or several times in the index; `.iloc[0]` then takes the first
    # matching row explicitly. The previous `...[column][0]` relied on the
    # positional fallback of Series.__getitem__, which pandas has deprecated,
    # and it failed when the run id was unique.
    run_rows = runs_full_df.loc[[f"{run_id}"]]
    for key in config_keys:
        match_record[key] = run_rows[key].iloc[0]

## HAS Analysis

In [None]:
# Function for parallel processing
# Will compute most HAS related metrics
def process_match_has(match: dict):
    """Compute the segmentation metrics for one match.

    Args:
        match: Match record; the keys ``"lang"``, ``"meanings"`` and
            ``"max_length"`` are read.

    Returns:
        Dict of results. For every threshold ``t`` it holds the keys
        ``thr-t-boundaries``, ``thr-t-vocab_size``, ``thr-t-trained_map_len``,
        ``thr-t-zla``, ``thr-t-zipf``, ``thr-t-thr_to_topsims`` and
        ``thr-t-thr_to_random_seg_topsims``, plus the threshold-independent
        ``entropies`` and ``plain_topsim``.
    """

    def get_map_from_len_to_freq_percentage(
        lang,
        max_len: int,
        threshold: float,
    ):
        """Map each length 1..max_len to the share of segments of that length."""
        calculator = EntropyCalculator(lang, threshold=threshold)
        len_count = Counter(
            len(segment)
            for segment in itertools.chain.from_iterable(calculator.segments)
        )
        total_value = sum(len_count.values())
        return {
            length: len_count[length] / total_value
            for length in range(1, max_len + 1)
        }

    language = match["lang"]
    meanings = match["meanings"]
    entr_calc = EntropyCalculator(language)
    results = {}
    for thr in (0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2):
        entr_calc.threshold = thr
        prefix = f"thr-{thr}"

        # Mean number of boundaries
        results[f"{prefix}-boundaries"] = entr_calc.mean_n_boundaries

        # Vocab Size
        results[f"{prefix}-vocab_size"] = entr_calc.vocab_size

        # Sigurd Style ZLA
        results[f"{prefix}-trained_map_len"] = get_map_from_len_to_freq_percentage(
            lang=language,
            max_len=match["max_length"],
            threshold=thr,
        )

        # ZLA: for every segment in descending frequency order, the mean length
        # of all segments sharing that segment's frequency.
        ranked_segments = Counter(
            itertools.chain.from_iterable(entr_calc.segments)
        ).most_common()
        lens_by_freq = defaultdict(list)
        for word, freq in ranked_segments:
            lens_by_freq[freq].append(len(word))
        results[f"{prefix}-zla"] = [
            np.mean(lens_by_freq[freq]) for _, freq in ranked_segments
        ]

        # Zipf: segment frequencies in descending order, taken from a freshly
        # built calculator at this threshold.
        fresh_segments = EntropyCalculator(language, threshold=thr).segments
        results[f"{prefix}-zipf"] = [
            count
            for _, count in Counter(
                itertools.chain.from_iterable(fresh_segments)
            ).most_common()
        ]

        # Topographic Similarities
        results[f"{prefix}-thr_to_topsims"] = has.compute_topsim(
            entr_calc.hashed_segments, meanings
        )
        results[f"{prefix}-thr_to_random_seg_topsims"] = has.compute_topsim(
            entr_calc.hashed_random_segments, meanings
        )

    # Entropies: conditional entropy values ordered by their key.
    entropy_items = sorted(EntropyCalculator(language).conditional_entropy.items())
    results["entropies"] = [value for _key, value in entropy_items]

    # Plain Top Sim
    results["plain_topsim"] = has.compute_topsim(language, meanings)

    # Drop the local reference to the calculator before returning.
    del entr_calc
    return results

In [None]:
# Smoke test: write directly to sys.__stdout__, the interpreter's original
# standard output — presumably so the text shows up in the terminal running
# the kernel rather than in the cell output (confirm).
# NOTE(review): `sys` is already imported at the top of the notebook, so this
# re-import is redundant but harmless.
import sys

sys.__stdout__.write("foo\n")

In [None]:
start_time = time.perf_counter()

# Each task is a (function, args, kwargs) triple, the form joblib's `delayed`
# helper produces; one task per match, in the iteration order of `matches`.
tasks = ((process_match_has, (matches[match],), {}) for match in matches)
results = Parallel(n_jobs=os.cpu_count(), verbose=10)(tasks)

# Merge the metrics of the x-th task into the x-th match record.
for x, _ in enumerate(li):
    matches[f"match_{x}"].update(results[x])

finish_time = time.perf_counter()
timing_message = (
    f"Computing strategy stats finished in {finish_time-start_time} seconds"
)
# Report both through print and on the interpreter's original stdout.
print(timing_message)
sys.__stdout__.write(timing_message + "\n")
del results

# Save to pickle as computation takes a very long time.
# Transposed so that every match is a row and every field a column.
matches_df = pd.DataFrame(matches).T
matches_df.to_pickle("./data/v2/matches_has.pickle")

In [None]:
# Reload the match table from the pickle written by the previous cell. The
# content should be identical, so the expensive computation above can be
# skipped when re-running the notebook from this point.
matches_df = pd.read_pickle("./data/v2/matches_has.pickle")

In [None]:
# Order the rows by network type; the plotting cells below hard-code their
# legend labels, presumably relying on this order (confirm).
network_type_columns = ["net_temporal", "loss_temporal", "training_dataset"]
matches_df = matches_df.sort_values(by=network_type_columns)

## Plots

### Plot N Hypothetical Boundaries

In [None]:
# Preprocess dataframe to fit our graph nicely
# Rows kept: both agents use attention and the game type is not one of the
# two "analysis_*" variants. The flags are compared explicitly because
# matches_df is built through a transpose and its columns are likely object
# dtype (confirm).
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected = (
    (matches_df["game_type"] != "analysis_always_same")
    & (matches_df["game_type"] != "analysis_never_same")
    & (matches_df["attention_sender"] == True)
    & (matches_df["attention_receiver"] == True)
)
# Long format: one row per (match, threshold) with the mean boundary count.
matches_df_fig1 = (
    matches_df.loc[selected]
    .filter(like="boundaries")
    .assign(**{column: matches_df[column] for column in id_columns})
    .melt(id_vars=id_columns, var_name="boundary", value_name="number")
)
# Column names look like "thr-<value>-boundaries"; keep only the value.
matches_df_fig1["boundary"] = matches_df_fig1["boundary"].apply(
    lambda column_name: column_name.split("-")[1]
)


def _hatch_outline(patch, hatch):
    """Replace a patch's solid fill by a hatch drawn in the former fill colour."""
    patch.set_hatch(hatch)
    patch.set_edgecolor(patch.get_facecolor())
    patch.set_facecolor("none")


fig, ax = plt.subplots(figsize=(16, 8))
network_type = matches_df_fig1[
    ["net_temporal", "loss_temporal", "training_dataset"]
].apply(tuple, axis=1)
sns.boxplot(
    data=matches_df_fig1,
    y="number",
    x="boundary",
    hue=network_type,
    ax=ax,
).set(
    xlabel="Threshold Value",
    ylabel="Mean N of HypoBoundaries",
)
# NOTE(review): labels are hard-coded and assume all eight network types are
# present, in the order produced by sorting matches_df — confirm.
labels = [
    "Non-Temporal Net w/o t-loss trained on RGs",
    "Non-Temporal Net w/o t-loss trained on TRGs",
    "Non-Temporal Net trained on RGs",
    "Non-Temporal Net trained on TRGs",
    "Temporal Net w/o t-loss trained on RGs",
    "Temporal Net w/o t-loss trained on TRGs",
    "Temporal Net trained on RGs",
    "Temporal Net trained on TRGs",
]
hatches = ["//", "..", "xx", "OO", "\\\\", "**", "||", "--"]
patches = [patch for patch in ax.patches if isinstance(patch, mpl.patches.PathPatch)]
# the number of patches should be evenly divisible by the number of hatches
# NOTE(review): cycling the hatches assumes the boxes are stored hue-by-hue
# within each threshold; verify against the installed seaborn version.
h = hatches * (len(patches) // len(hatches))
for patch, hatch in zip(patches, h):
    _hatch_outline(patch, hatch)
h, _ = ax.get_legend_handles_labels()
l = ax.legend(
    h,
    labels,
    title="Network Type",
    ncols=2,
    bbox_to_anchor=(0.5, 1.5),
    loc="upper center",
)
# Give the legend swatches the same hatch styling as the boxes.
for lp, hatch in zip(l.get_patches(), hatches):
    _hatch_outline(lp, hatch)
fig.savefig("n_hypothetical_boundaries_at.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Preprocess dataframe to fit our graph nicely
# Rows kept: neither agent uses attention and the game type is not one of the
# two "analysis_*" variants. The flags are compared explicitly because
# matches_df is built through a transpose and its columns are likely object
# dtype (confirm).
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected = (
    (matches_df["game_type"] != "analysis_always_same")
    & (matches_df["game_type"] != "analysis_never_same")
    & (matches_df["attention_sender"] == False)
    & (matches_df["attention_receiver"] == False)
)
# Long format: one row per (match, threshold) with the mean boundary count.
matches_df_fig1 = (
    matches_df.loc[selected]
    .filter(like="boundaries")
    .assign(**{column: matches_df[column] for column in id_columns})
    .melt(id_vars=id_columns, var_name="boundary", value_name="number")
)
# Column names look like "thr-<value>-boundaries"; keep only the value.
matches_df_fig1["boundary"] = matches_df_fig1["boundary"].apply(
    lambda column_name: column_name.split("-")[1]
)


def _hatch_outline(patch, hatch):
    """Replace a patch's solid fill by a hatch drawn in the former fill colour."""
    patch.set_hatch(hatch)
    patch.set_edgecolor(patch.get_facecolor())
    patch.set_facecolor("none")


fig, ax = plt.subplots(figsize=(16, 8))
network_type = matches_df_fig1[
    ["net_temporal", "loss_temporal", "training_dataset"]
].apply(tuple, axis=1)
sns.boxplot(
    data=matches_df_fig1,
    y="number",
    x="boundary",
    hue=network_type,
    ax=ax,
).set(
    xlabel="Threshold Value",
    ylabel="Mean N of HypoBoundaries",
)
# NOTE(review): labels are hard-coded and assume all eight network types are
# present, in the order produced by sorting matches_df — confirm.
labels = [
    "Non-Temporal Net w/o t-loss trained on RGs",
    "Non-Temporal Net w/o t-loss trained on TRGs",
    "Non-Temporal Net trained on RGs",
    "Non-Temporal Net trained on TRGs",
    "Temporal Net w/o t-loss trained on RGs",
    "Temporal Net w/o t-loss trained on TRGs",
    "Temporal Net trained on RGs",
    "Temporal Net trained on TRGs",
]
hatches = ["//", "..", "xx", "OO", "\\\\", "**", "||", "--"]
patches = [patch for patch in ax.patches if isinstance(patch, mpl.patches.PathPatch)]
# the number of patches should be evenly divisible by the number of hatches
# NOTE(review): cycling the hatches assumes the boxes are stored hue-by-hue
# within each threshold; verify against the installed seaborn version.
h = hatches * (len(patches) // len(hatches))
for patch, hatch in zip(patches, h):
    _hatch_outline(patch, hatch)
h, _ = ax.get_legend_handles_labels()
l = ax.legend(
    h,
    labels,
    title="Network Type",
    ncols=2,
    bbox_to_anchor=(0.5, 1.5),
    loc="upper center",
)
# Give the legend swatches the same hatch styling as the boxes.
for lp, hatch in zip(l.get_patches(), hatches):
    _hatch_outline(lp, hatch)
fig.savefig("n_hypothetical_boundaries_noat.pdf", bbox_inches="tight")
plt.show()

### Plot vocab size

In [None]:
# Preprocess dataframe to fit our graph nicely
# Rows kept: both agents use attention and the game type is not one of the
# two "analysis_*" variants. The flags are compared explicitly because
# matches_df is built through a transpose and its columns are likely object
# dtype (confirm).
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected = (
    (matches_df["game_type"] != "analysis_always_same")
    & (matches_df["game_type"] != "analysis_never_same")
    & (matches_df["attention_sender"] == True)
    & (matches_df["attention_receiver"] == True)
)
# The second filter keeps only the per-threshold "thr-<value>-vocab_size"
# columns and drops the run-level "vocab_size" setting.
matches_df_fig2 = (
    matches_df.loc[selected]
    .filter(like="vocab_size")
    .filter(like="thr")
    .assign(**{column: matches_df[column] for column in id_columns})
    .melt(id_vars=id_columns, var_name="boundary", value_name="number")
)
# Column names look like "thr-<value>-vocab_size"; keep only the value.
matches_df_fig2["boundary"] = matches_df_fig2["boundary"].apply(
    lambda column_name: column_name.split("-")[1]
)


def _hatch_outline(patch, hatch):
    """Replace a patch's solid fill by a hatch drawn in the former fill colour."""
    patch.set_hatch(hatch)
    patch.set_edgecolor(patch.get_facecolor())
    patch.set_facecolor("none")


fig, ax = plt.subplots(figsize=(16, 8))
network_type = matches_df_fig2[
    ["net_temporal", "loss_temporal", "training_dataset"]
].apply(tuple, axis=1)
sns.boxplot(
    data=matches_df_fig2,
    y="number",
    x="boundary",
    hue=network_type,
    ax=ax,
).set(
    xlabel="Threshold Value",
    ylabel="Vocab Size",
)
# NOTE(review): labels are hard-coded and assume all eight network types are
# present, in the order produced by sorting matches_df — confirm.
labels = [
    "Non-Temporal Net w/o t-loss trained on RGs",
    "Non-Temporal Net w/o t-loss trained on TRGs",
    "Non-Temporal Net trained on RGs",
    "Non-Temporal Net trained on TRGs",
    "Temporal Net w/o t-loss trained on RGs",
    "Temporal Net w/o t-loss trained on TRGs",
    "Temporal Net trained on RGs",
    "Temporal Net trained on TRGs",
]
hatches = ["//", "..", "xx", "OO", "\\\\", "**", "||", "--"]
patches = [patch for patch in ax.patches if isinstance(patch, mpl.patches.PathPatch)]
# the number of patches should be evenly divisible by the number of hatches
# NOTE(review): cycling the hatches assumes the boxes are stored hue-by-hue
# within each threshold; verify against the installed seaborn version.
h = hatches * (len(patches) // len(hatches))
for patch, hatch in zip(patches, h):
    _hatch_outline(patch, hatch)
h, _ = ax.get_legend_handles_labels()
l = ax.legend(
    h,
    labels,
    title="Network Type",
    ncols=2,
    bbox_to_anchor=(0.5, 1.5),
    loc="upper center",
)
# Give the legend swatches the same hatch styling as the boxes.
for lp, hatch in zip(l.get_patches(), hatches):
    _hatch_outline(lp, hatch)
fig.savefig("vocab_size_at.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Preprocess dataframe to fit our graph nicely
# Rows kept: neither agent uses attention and the game type is not one of the
# two "analysis_*" variants. The flags are compared explicitly because
# matches_df is built through a transpose and its columns are likely object
# dtype (confirm).
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected = (
    (matches_df["game_type"] != "analysis_always_same")
    & (matches_df["game_type"] != "analysis_never_same")
    & (matches_df["attention_sender"] == False)
    & (matches_df["attention_receiver"] == False)
)
# The second filter keeps only the per-threshold "thr-<value>-vocab_size"
# columns and drops the run-level "vocab_size" setting.
matches_df_fig2 = (
    matches_df.loc[selected]
    .filter(like="vocab_size")
    .filter(like="thr")
    .assign(**{column: matches_df[column] for column in id_columns})
    .melt(id_vars=id_columns, var_name="boundary", value_name="number")
)
# Column names look like "thr-<value>-vocab_size"; keep only the value.
matches_df_fig2["boundary"] = matches_df_fig2["boundary"].apply(
    lambda column_name: column_name.split("-")[1]
)


def _hatch_outline(patch, hatch):
    """Replace a patch's solid fill by a hatch drawn in the former fill colour."""
    patch.set_hatch(hatch)
    patch.set_edgecolor(patch.get_facecolor())
    patch.set_facecolor("none")


fig, ax = plt.subplots(figsize=(16, 8))
network_type = matches_df_fig2[
    ["net_temporal", "loss_temporal", "training_dataset"]
].apply(tuple, axis=1)
sns.boxplot(
    data=matches_df_fig2,
    y="number",
    x="boundary",
    hue=network_type,
    ax=ax,
).set(
    xlabel="Threshold Value",
    ylabel="Vocab Size",
)
# NOTE(review): labels are hard-coded and assume all eight network types are
# present, in the order produced by sorting matches_df — confirm.
labels = [
    "Non-Temporal Net w/o t-loss trained on RGs",
    "Non-Temporal Net w/o t-loss trained on TRGs",
    "Non-Temporal Net trained on RGs",
    "Non-Temporal Net trained on TRGs",
    "Temporal Net w/o t-loss trained on RGs",
    "Temporal Net w/o t-loss trained on TRGs",
    "Temporal Net trained on RGs",
    "Temporal Net trained on TRGs",
]
hatches = ["//", "..", "xx", "OO", "\\\\", "**", "||", "--"]
patches = [patch for patch in ax.patches if isinstance(patch, mpl.patches.PathPatch)]
# the number of patches should be evenly divisible by the number of hatches
# NOTE(review): cycling the hatches assumes the boxes are stored hue-by-hue
# within each threshold; verify against the installed seaborn version.
h = hatches * (len(patches) // len(hatches))
for patch, hatch in zip(patches, h):
    _hatch_outline(patch, hatch)
h, _ = ax.get_legend_handles_labels()
l = ax.legend(
    h,
    labels,
    title="Network Type",
    ncols=2,
    bbox_to_anchor=(0.5, 1.5),
    loc="upper center",
)
# Give the legend swatches the same hatch styling as the boxes.
for lp, hatch in zip(l.get_patches(), hatches):
    _hatch_outline(lp, hatch)
fig.savefig("vocab_size_noat.pdf", bbox_inches="tight")
plt.show()

### Plot ZLA

In [None]:
# Preprocess dataframe to fit our graph nicely
# Rows kept: both agents use attention and the game type is not one of the
# two "analysis_*" variants.
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected = (
    (matches_df["game_type"] != "analysis_always_same")
    & (matches_df["game_type"] != "analysis_never_same")
    & (matches_df["attention_sender"] == True)
    & (matches_df["attention_receiver"] == True)
)
# Long format: one row per (match, threshold) holding that match's ZLA list.
matches_df_fig3 = (
    matches_df.loc[selected]
    .filter(like="zla")
    .assign(**{column: matches_df[column] for column in id_columns})
    .melt(id_vars=id_columns, var_name="threshold", value_name="zla")
)
# Column names look like "thr-<value>-zla"; keep only the value.
matches_df_fig3["threshold"] = matches_df_fig3["threshold"].apply(
    lambda column_name: column_name.split("-")[1]
)

thresholds = (0, 0.5, 1.5, 2)
max_rank = 200

fig, axes = plt.subplots(figsize=(32, 20), nrows=2, ncols=2)
axes = axes.reshape(-1)
# NOTE(review): `horizon` only selects the subplot; the data is not filtered
# by it, so all four panels show the same curves — confirm the intent.
for horizon in range(1, 5):
    panel = axes[horizon - 1]
    for thr in thresholds:
        data_list = matches_df_fig3[matches_df_fig3["threshold"] == f"{thr}"]["zla"]
        # NOTE(review): every element of data_list is one match's list of mean
        # segment lengths, so `y` holds one value per match (cut to the first
        # `max_rank` matches), not one per frequency rank as the x-label
        # suggests — confirm.
        y = np.array(
            [np.mean([e for e in x if e is not None]) for x in data_list][:max_rank]
        )
        y_sem = np.array(
            [
                has.standard_error_of_mean([e for e in x if e is not None])
                for x in data_list
            ][:max_rank]
        )
        x = np.arange(np.size(y)) + 1
        sns.lineplot(x=x, y=y, label="network type, threshold", ax=panel)
        # Take the band colour from the line just drawn on THIS subplot; the
        # previous code read it from `ax`, a leftover axes of an earlier figure.
        panel.fill_between(
            x,
            y - y_sem,
            y + y_sem,
            color=panel.get_lines()[-1].get_color(),
            alpha=0.3,
        )
    panel.legend(bbox_to_anchor=(0.5, -0.25), loc="upper center")
    panel.set_xlabel("Frequency Rank")
    panel.set_ylabel("Hypo-segment Length")
    panel.set_xscale("log")
    panel.set_yscale("log")
fig.savefig("zla_attval_at.pdf", bbox_inches="tight")

In [None]:
# Preprocess dataframe to fit our graph nicely
# Rows kept: neither agent uses attention and the game type is not one of the
# two "analysis_*" variants.
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected = (
    (matches_df["game_type"] != "analysis_always_same")
    & (matches_df["game_type"] != "analysis_never_same")
    & (matches_df["attention_sender"] == False)
    & (matches_df["attention_receiver"] == False)
)
# Long format: one row per (match, threshold) holding that match's ZLA list.
matches_df_fig3 = (
    matches_df.loc[selected]
    .filter(like="zla")
    .assign(**{column: matches_df[column] for column in id_columns})
    .melt(id_vars=id_columns, var_name="threshold", value_name="zla")
)
# Column names look like "thr-<value>-zla"; keep only the value.
matches_df_fig3["threshold"] = matches_df_fig3["threshold"].apply(
    lambda column_name: column_name.split("-")[1]
)

thresholds = (0, 0.5, 1.5, 2)
max_rank = 200

fig, axes = plt.subplots(figsize=(32, 20), nrows=2, ncols=2)
axes = axes.reshape(-1)
# NOTE(review): `horizon` only selects the subplot; the data is not filtered
# by it, so all four panels show the same curves — confirm the intent.
for horizon in range(1, 5):
    panel = axes[horizon - 1]
    for thr in thresholds:
        data_list = matches_df_fig3[matches_df_fig3["threshold"] == f"{thr}"]["zla"]
        # NOTE(review): every element of data_list is one match's list of mean
        # segment lengths, so `y` holds one value per match (cut to the first
        # `max_rank` matches), not one per frequency rank as the x-label
        # suggests — confirm.
        y = np.array(
            [np.mean([e for e in x if e is not None]) for x in data_list][:max_rank]
        )
        y_sem = np.array(
            [
                has.standard_error_of_mean([e for e in x if e is not None])
                for x in data_list
            ][:max_rank]
        )
        x = np.arange(np.size(y)) + 1
        sns.lineplot(x=x, y=y, label="network type, threshold", ax=panel)
        # Take the band colour from the line just drawn on THIS subplot; the
        # previous code read it from `ax`, a leftover axes of an earlier figure.
        panel.fill_between(
            x,
            y - y_sem,
            y + y_sem,
            color=panel.get_lines()[-1].get_color(),
            alpha=0.3,
        )
    panel.legend(bbox_to_anchor=(0.5, -0.25), loc="upper center")
    panel.set_xlabel("Frequency Rank")
    panel.set_ylabel("Hypo-segment Length")
    panel.set_xscale("log")
    panel.set_yscale("log")
fig.savefig("zla_attval_noat.pdf", bbox_inches="tight")

### Plot Conditional Entropy

In [None]:
# Runs where both sender and receiver use attention.
# `.loc` must be indexed with square brackets; calling it with parentheses
# treats the mask as an axis argument and fails.
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected_runs = matches_df.loc[
    (matches_df["attention_sender"] == True)
    & (matches_df["attention_receiver"] == True)
]
# One row per selected run, one column per entry of its "entropies" list.
matches_df_fig4 = pd.DataFrame([pd.Series(x) for x in selected_runs["entropies"]])
matches_df_fig4.columns = ["entropy_{}".format(x + 1) for x in matches_df_fig4.columns]
# Re-use the index of the SELECTED rows so the metadata joined below belongs
# to the right run (the first N labels of matches_df are generally other runs).
matches_df_fig4 = matches_df_fig4.set_index(selected_runs.index)
matches_df_fig4 = matches_df_fig4.assign(
    **{column: matches_df[column] for column in id_columns}
)

matches_df_fig4 = matches_df_fig4.melt(
    id_vars=id_columns,
    var_name="position",
    value_name="value",
)
# "entropy_<k>" -> k, as an integer so positions are ordered numerically on
# the x axis (as strings, "10" would sort before "2").
matches_df_fig4["position"] = matches_df_fig4["position"].apply(
    lambda column_name: int(column_name.split("_")[1])
)

fig, ax = plt.subplots(figsize=(14, 8))
# The hue must come from the dataframe being plotted; the previous code built
# it from matches_df_fig1, which belongs to a different figure.
network_type = matches_df_fig4[
    ["net_temporal", "loss_temporal", "training_dataset"]
].apply(tuple, axis=1)
sns.lineplot(
    data=matches_df_fig4,
    x="position",
    y="value",
    hue=network_type,
    ax=ax,
).set(
    xlabel="Character Position",
    ylabel="Conditional Entropy",
)
# NOTE(review): labels are hard-coded and assume all eight network types are
# present, in the order produced by sorting matches_df — confirm.
labels = [
    "Non-Temporal Net w/o t-loss trained on RGs",
    "Non-Temporal Net w/o t-loss trained on TRGs",
    "Non-Temporal Net trained on RGs",
    "Non-Temporal Net trained on TRGs",
    "Temporal Net w/o t-loss trained on RGs",
    "Temporal Net w/o t-loss trained on TRGs",
    "Temporal Net trained on RGs",
    "Temporal Net trained on TRGs",
]
h, _ = ax.get_legend_handles_labels()
l = ax.legend(
    h,
    labels,
    title="Network Type",
    ncols=2,
    bbox_to_anchor=(0.5, 1.5),
    loc="upper center",
)
fig.savefig("conditional_entropy_at.pdf", bbox_inches="tight")

In [None]:
# Runs where neither sender nor receiver uses attention.
# `.loc` must be indexed with square brackets; calling it with parentheses
# treats the mask as an axis argument and fails.
id_columns = ["game_type", "net_temporal", "loss_temporal", "training_dataset"]
selected_runs = matches_df.loc[
    (matches_df["attention_sender"] == False)
    & (matches_df["attention_receiver"] == False)
]
# One row per selected run, one column per entry of its "entropies" list.
matches_df_fig4 = pd.DataFrame([pd.Series(x) for x in selected_runs["entropies"]])
matches_df_fig4.columns = ["entropy_{}".format(x + 1) for x in matches_df_fig4.columns]
# Re-use the index of the SELECTED rows so the metadata joined below belongs
# to the right run (the first N labels of matches_df are generally other runs).
matches_df_fig4 = matches_df_fig4.set_index(selected_runs.index)
matches_df_fig4 = matches_df_fig4.assign(
    **{column: matches_df[column] for column in id_columns}
)

matches_df_fig4 = matches_df_fig4.melt(
    id_vars=id_columns,
    var_name="position",
    value_name="value",
)
# "entropy_<k>" -> k, as an integer so positions are ordered numerically on
# the x axis (as strings, "10" would sort before "2").
matches_df_fig4["position"] = matches_df_fig4["position"].apply(
    lambda column_name: int(column_name.split("_")[1])
)

fig, ax = plt.subplots(figsize=(14, 8))
# The hue must come from the dataframe being plotted; the previous code built
# it from matches_df_fig1, which belongs to a different figure.
network_type = matches_df_fig4[
    ["net_temporal", "loss_temporal", "training_dataset"]
].apply(tuple, axis=1)
sns.lineplot(
    data=matches_df_fig4,
    x="position",
    y="value",
    hue=network_type,
    ax=ax,
).set(
    xlabel="Character Position",
    ylabel="Conditional Entropy",
)
# NOTE(review): labels are hard-coded and assume all eight network types are
# present, in the order produced by sorting matches_df — confirm.
labels = [
    "Non-Temporal Net w/o t-loss trained on RGs",
    "Non-Temporal Net w/o t-loss trained on TRGs",
    "Non-Temporal Net trained on RGs",
    "Non-Temporal Net trained on TRGs",
    "Temporal Net w/o t-loss trained on RGs",
    "Temporal Net w/o t-loss trained on TRGs",
    "Temporal Net trained on RGs",
    "Temporal Net trained on TRGs",
]
h, _ = ax.get_legend_handles_labels()
l = ax.legend(
    h,
    labels,
    title="Network Type",
    ncols=2,
    bbox_to_anchor=(0.5, 1.5),
    loc="upper center",
)
fig.savefig("conditional_entropy_noat.pdf", bbox_inches="tight")

### Plot Sample Utterance

In [None]:
random_seed: int = 0

rng = np.random.default_rng(random_seed)

threshold = 0

# Seed the language draw as well: the output file name advertises
# `random_seed`, so the sampled language has to be reproducible too.
entr = EntropyCalculator(
    matches_df["lang"].sample(random_state=random_seed).to_numpy()[0],
    threshold=threshold,
)
# Pick one utterance of that language together with its boundaries.
utter_id: int = rng.choice(range(len(entr.data)))
utter = entr.data[utter_id]
boundaries = entr.boundaries[utter_id]

fig, ax = plt.subplots(
    1,
    1,
    sharex="all",
    sharey="all",
    figsize=(14, 8),
)
# Dashed vertical line half a step before every boundary position.
for boundary in sorted(boundaries):
    ax.plot([boundary - 0.5] * 2, [-0.3, 4], linestyle="--", color="black")
for i, boundary in enumerate(sorted(boundaries)):
    # Move `bottom` leftwards until the branching entropy of the span ending
    # at the boundary exceeds that of the span one symbol shorter by more
    # than the threshold.
    bottom = boundary - 2
    while not (
        entr.branching_entropy[utter[bottom:boundary]]
        - entr.branching_entropy[utter[bottom : boundary - 1]]
        > entr.threshold
    ):
        assert bottom >= 0
        bottom -= 1
    # Growing prefixes utter[bottom:bottom+1] ... utter[bottom:boundary] and
    # the branching entropy of each of them.
    anno_data = [utter[bottom : up + 1] for up in range(bottom, boundary)]
    x_plot_data = [x + 0.5 for x in range(bottom, boundary)]
    y_plot_data = [entr.branching_entropy[x] for x in anno_data]
    ax.plot(
        x_plot_data,
        y_plot_data,
        marker="v^<>"[i % 4],
        label=(f"transition for boundary {i+1}"),
        linewidth=3,
    )
    # Annotate every point of a short transition; for longer ones only the
    # first and the last two points.
    for idx in reversed(range(len(anno_data)) if len(anno_data) < 3 else (0, -2, -1)):
        anno = anno_data[idx]
        x, y = x_plot_data[idx], y_plot_data[idx]
        anno_str = (
            "$h("
            + (
                ",".join(map(str, anno))
                if len(anno) < 5
                else f"{anno[0]},\\ldots,{anno[-2]},{anno[-1]}"
            )
            + ")$"
        )
        ax.annotate(
            anno_str,
            (x, y),
            (x + 0.25, y + 0.25),
            bbox=dict(
                boxstyle="round",
                facecolor="white",
                alpha=0.5,
            ),
            arrowprops=dict(
                arrowstyle="->",
                connectionstyle="arc3",
            ),
        )
ax.legend(bbox_to_anchor=(0.5, 1.25), loc="upper center", ncol=2)
# Label the x axis with the symbols of the utterance itself.
ax.set_xticks(list(range(len(utter))))
ax.set_xticklabels([str(u) for u in utter])
ax.set_xlabel("message")
ax.set_ylabel("$h$")
fig.tight_layout()
fig.savefig(
    f"branching_entropy_sample_thr{threshold}_seed{random_seed}.pdf",
    bbox_inches="tight",
)

### Plot TopSim

In [None]:
# Preprocess dataframe to fit our graph nicely
# Rows kept: both agents use attention and the game type is not one of the
# two "analysis_*" variants.
attention_runs = matches_df.loc[
    (matches_df["game_type"] != "analysis_always_same")
    & (matches_df["game_type"] != "analysis_never_same")
    & (matches_df["attention_sender"] == True)
    & (matches_df["attention_receiver"] == True)
]
# TopSim of the unsegmented messages.
matches_df_fig6_t = attention_runs.filter(like="plain_topsim")

# Per-threshold TopSim of the segmented messages, plus the run metadata.
matches_df_fig6 = attention_runs.filter(like="to_topsims").assign(
    game_type=matches_df["game_type"],
    loss_temporal=matches_df["loss_temporal"],
    net_temporal=matches_df["net_temporal"],
    training_dataset=matches_df["training_dataset"],
)
matches_df_fig6 = pd.concat([matches_df_fig6, matches_df_fig6_t], axis=1)


thresholds = (0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2)
xlabel: str = "$threshold$"
ylabel: str = "TopSim"

fig, ax = plt.subplots(figsize=(14, 8))
# The plain TopSim is drawn at x = -0.25, left of the thresholds, and that
# tick is labelled as minus infinity below.
topsim_columns = ["plain_topsim"] + [
    f"thr-{thr}-thr_to_topsims" for thr in thresholds
]
data_lists = [matches_df_fig6[column].to_numpy() for column in topsim_columns]
x = np.array([-0.25, *thresholds])
y = np.array([np.mean(d) for d in data_lists])
# Standard error of the mean of every column.
y_sem = np.array([np.std(d, ddof=1) / np.sqrt(np.size(d)) for d in data_lists])
sns.lineplot(
    x=x,
    y=y,
    label="a",
    marker=markers[0],
)
ax.fill_between(
    x,
    y - y_sem,
    y + y_sem,
    color=ax.get_lines()[-1].get_color(),
    alpha=0.3,
)
ax.legend()
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_xticks((-0.25, *thresholds))
ax.set_xticklabels(("$-\\infty$", *thresholds))
fig.savefig("topsim_at.pdf", bbox_inches="tight")

In [None]:
# Preprocess dataframe to fit our graph nicely
# Select the rows once: drop the two "analysis_*" game types and keep only
# runs where neither the sender nor the receiver uses attention.
fig6_rows = matches_df.loc[
    ~matches_df["game_type"].isin(["analysis_always_same", "analysis_never_same"])
    & matches_df["attention_sender"].eq(False)
    & matches_df["attention_receiver"].eq(False)
]

matches_df_fig6_t = fig6_rows.filter(like="plain_topsim")
matches_df_fig6 = fig6_rows.filter(like="to_topsims").assign(
    game_type=fig6_rows["game_type"],
    loss_temporal=fig6_rows["loss_temporal"],
    net_temporal=fig6_rows["net_temporal"],
    training_dataset=fig6_rows["training_dataset"],
)
matches_df_fig6 = pd.concat([matches_df_fig6, matches_df_fig6_t], axis=1)


thresholds = (0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2)
xlabel: str = "$threshold$"
ylabel: str = "TopSim"

fig, ax = plt.subplots(figsize=(14, 8))
# The first series is the `plain_topsim` column, drawn at x = -0.25 and
# labelled "-infinity" on the axis; the others are one column per threshold.
data_lists = [matches_df_fig6["plain_topsim"].to_numpy()] + [
    matches_df_fig6[f"thr-{thr}-thr_to_topsims"].to_numpy() for thr in thresholds
]
x = np.array([-0.25] + list(thresholds))
y = np.array([np.mean(d) for d in data_lists])
# Standard error of the mean: sample standard deviation over sqrt(n).
y_sem = np.array([np.std(d, ddof=1) / np.sqrt(np.size(d)) for d in data_lists])
# Draw on `ax` explicitly instead of relying on the implicit current axes.
sns.lineplot(
    x=x,
    y=y,
    label="a",
    marker=markers[0],
    ax=ax,
)
# Shade mean +/- SEM in the same colour as the line that was just drawn.
ax.fill_between(
    x,
    y - y_sem,
    y + y_sem,
    color=ax.get_lines()[-1].get_color(),
    alpha=0.3,
)
ax.legend()
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_xticks((-0.25,) + tuple(thresholds))
ax.set_xticklabels(("$-\\infty$",) + tuple(thresholds))
fig.savefig("topsim_noat.pdf", bbox_inches="tight")

### Plot TopSim Compared to Random Baseline

In [None]:
# Preprocess dataframe to fit our graph nicely
# Both curves must describe the same runs, so select the rows once: drop the
# two "analysis_*" game types and keep only runs where both the sender and
# the receiver use attention. (Previously only the random-baseline frame was
# restricted by attention, so the two curves came from different run sets.)
fig7_rows = matches_df.loc[
    ~matches_df["game_type"].isin(["analysis_always_same", "analysis_never_same"])
    & matches_df["attention_sender"].eq(True)
    & matches_df["attention_receiver"].eq(True)
]

# First we create the random seg topsims
matches_df_fig7_1 = fig7_rows.filter(like="seg_topsims").assign(
    game_type=fig7_rows["game_type"],
    loss_temporal=fig7_rows["loss_temporal"],
    net_temporal=fig7_rows["net_temporal"],
    training_dataset=fig7_rows["training_dataset"],
)

# Then threshold topsims
matches_df_fig7_2 = fig7_rows.filter(like="to_topsims").assign(
    game_type=fig7_rows["game_type"],
    loss_temporal=fig7_rows["loss_temporal"],
    net_temporal=fig7_rows["net_temporal"],
    training_dataset=fig7_rows["training_dataset"],
)

thresholds = (0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2)
xlabel: str = "$threshold$"
ylabel: str = "TopSim"
# NOTE: `attval_format` and `verbose` are not read anywhere in this cell; they
# are kept only so the notebook-level names stay defined.
attval_format: str = "$(n_{{att}},n_{{val}})={}$"
verbose: bool = True


fig, ax = plt.subplots(figsize=(14, 8))
data_lists = [
    matches_df_fig7_2[f"thr-{thr}-thr_to_topsims"].to_numpy() for thr in thresholds
]
random_seg_data_lists = [
    matches_df_fig7_1[f"thr-{thr}-thr_to_random_seg_topsims"].to_numpy()
    for thr in thresholds
]
x = np.array(list(thresholds))
# Per-threshold mean and standard error of the mean (sample std / sqrt(n)).
y = np.array([np.mean(d) for d in data_lists])
y_sem = np.array([np.std(d, ddof=1) / np.sqrt(np.size(d)) for d in data_lists])
y_random = np.array([np.mean(d) for d in random_seg_data_lists])
y_random_sem = np.array(
    [np.std(d, ddof=1) / np.sqrt(np.size(d)) for d in random_seg_data_lists]
)
ax.plot(
    x,
    y,
    label="a",
    marker="o",
)
# Shade mean +/- SEM in the colour of the line that was just drawn.
ax.fill_between(
    x,
    y - y_sem,
    y + y_sem,
    color=ax.get_lines()[-1].get_color(),
    alpha=0.3,
)
ax.plot(
    x,
    y_random,
    label="a" + " (random boundary)",
    marker="D",
)
ax.fill_between(
    x,
    y_random - y_random_sem,
    y_random + y_random_sem,
    color=ax.get_lines()[-1].get_color(),
    alpha=0.3,
)
ax.legend()
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
# The old name carried a literal, never-interpolated "{attval}" placeholder
# (plain string, and no `attval` exists in this cell); it has been dropped.
fig.savefig("topsim_vs_random_baseline_at.pdf", bbox_inches="tight")

In [None]:
# Preprocess dataframe to fit our graph nicely
# Both curves must describe the same runs, so select the rows once: drop the
# two "analysis_*" game types and keep only runs where neither the sender nor
# the receiver uses attention. (Previously only the random-baseline frame was
# restricted by attention, so the two curves came from different run sets.)
fig7_rows = matches_df.loc[
    ~matches_df["game_type"].isin(["analysis_always_same", "analysis_never_same"])
    & matches_df["attention_sender"].eq(False)
    & matches_df["attention_receiver"].eq(False)
]

# First we create the random seg topsims
matches_df_fig7_1 = fig7_rows.filter(like="seg_topsims").assign(
    game_type=fig7_rows["game_type"],
    loss_temporal=fig7_rows["loss_temporal"],
    net_temporal=fig7_rows["net_temporal"],
    training_dataset=fig7_rows["training_dataset"],
)

# Then threshold topsims
matches_df_fig7_2 = fig7_rows.filter(like="to_topsims").assign(
    game_type=fig7_rows["game_type"],
    loss_temporal=fig7_rows["loss_temporal"],
    net_temporal=fig7_rows["net_temporal"],
    training_dataset=fig7_rows["training_dataset"],
)

thresholds = (0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2)
xlabel: str = "$threshold$"
ylabel: str = "TopSim"
# NOTE: `attval_format` and `verbose` are not read anywhere in this cell; they
# are kept only so the notebook-level names stay defined.
attval_format: str = "$(n_{{att}},n_{{val}})={}$"
verbose: bool = True


fig, ax = plt.subplots(figsize=(14, 8))
data_lists = [
    matches_df_fig7_2[f"thr-{thr}-thr_to_topsims"].to_numpy() for thr in thresholds
]
random_seg_data_lists = [
    matches_df_fig7_1[f"thr-{thr}-thr_to_random_seg_topsims"].to_numpy()
    for thr in thresholds
]
x = np.array(list(thresholds))
# Per-threshold mean and standard error of the mean (sample std / sqrt(n)).
y = np.array([np.mean(d) for d in data_lists])
y_sem = np.array([np.std(d, ddof=1) / np.sqrt(np.size(d)) for d in data_lists])
y_random = np.array([np.mean(d) for d in random_seg_data_lists])
y_random_sem = np.array(
    [np.std(d, ddof=1) / np.sqrt(np.size(d)) for d in random_seg_data_lists]
)
ax.plot(
    x,
    y,
    label="a",
    marker="o",
)
# Shade mean +/- SEM in the colour of the line that was just drawn.
ax.fill_between(
    x,
    y - y_sem,
    y + y_sem,
    color=ax.get_lines()[-1].get_color(),
    alpha=0.3,
)
ax.plot(
    x,
    y_random,
    label="a" + " (random boundary)",
    marker="D",
)
ax.fill_between(
    x,
    y_random - y_random_sem,
    y_random + y_random_sem,
    color=ax.get_lines()[-1].get_color(),
    alpha=0.3,
)
ax.legend()
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
# The old name carried a literal, never-interpolated "{attval}" placeholder
# (plain string, and no `attval` exists in this cell); it has been dropped.
fig.savefig("topsim_vs_random_baseline_noat.pdf", bbox_inches="tight")