# Printing and Plotting Results

Contains two sections:
1. How to create a summary table.
2. How to create plots showing the ranks of the similarity measures.

## Summary Table

This section of the notebook creates the overview table in our paper. The code can easily be adjusted to also output more detailed tables.

In [None]:
import re
from pathlib import Path

import pandas as pd
import pandas.io.formats.style

import matplotlib.pyplot as plt

from repsim.benchmark.paths import BASE_PATH


# Maps the measure names found in the "similarity_measure" column of the result files
# to the short labels used in the table and in the plots.
# Names missing from this dict become NaN when the column is passed through `.map`.
measure_to_abbrv = {
    "AlignedCosineSimilarity": "AlignCos",
    "CKA": "CKA",
    "ConcentricityDifference": "ConcDiff",
    "DistanceCorrelation": "DistCorr",
    "EigenspaceOverlapScore": "EOS",
    "GeometryScore": "GS",
    "Gulp": "GULP",
    "HardCorrelationMatch": "HardCorr",
    "IMDScore": "IMD",
    "JaccardSimilarity": "Jaccard",
    "LinearRegression": "LinReg",
    "MagnitudeDifference": "MagDiff",
    "OrthogonalAngularShapeMetricCentered": "AngShape",
    "OrthogonalProcrustesCenteredAndNormalized": "OrthProc",
    "PWCCA": "PWCCA",
    "PermutationProcrustes": "PermProc",
    "ProcrustesSizeAndShapeDistance": "ProcDist",
    "RSA": "RSA",
    "RSMNormDifference": "RSMDiff",
    "RankSimilarity": "RankSim",
    "SVCCA": "SVCCA",
    "SecondOrderCosineSimilarity": "2nd-Cos",
    "SoftCorrelationMatch": "SoftCorr",
    "UniformityDifference": "UnifDiff",
    "RTD": "RTD",
}

# Abbreviated measures grouped by family. The flattened (abbreviation, family) pairs
# below follow the insertion order of this dict and of each list.
_abbrvs_by_family = {
    "Alignment": ["AlignCos", "HardCorr", "AngShape", "LinReg", "OrthProc", "PermProc", "ProcDist", "SoftCorr"],
    "RSM": ["EOS", "CKA", "DistCorr", "GULP", "RSA", "RSMDiff"],
    "Statistic": ["MagDiff", "ConcDiff", "UnifDiff"],
    "Topology": ["GS", "IMD", "RTD"],
    "Neighbors": ["Jaccard", "RankSim", "2nd-Cos"],
    "CCA": ["PWCCA", "SVCCA"],
}

# List of (abbreviation, family) tuples, later turned into a lookup dataframe.
measure_types = [
    (abbrv, family)
    for family, abbrvs in _abbrvs_by_family.items()
    for abbrv in abbrvs
]

# Order in which the measure families appear in the table rows.
measure_type_order = [
    "CCA",
    "Alignment",
    "RSM",
    "Neighbors",
    "Topology",
    "Statistic",
]


Step 1: Load all results.

In [None]:
# Load every NLP result file and tag its rows with metadata parsed from the file name:
#   Setting - the part of the file name before the first underscore
#   Dataset - "sst2" or "mnli", which must appear as its own underscore-delimited part
#   Token   - the part between the last underscore and the first following dot
cleaned_dfs = []
nlp_root = BASE_PATH /"paper_results" / "nlp_iclr"
pattern = r'(?<=_)sst2(?=_)|(?<=_)mnli(?=_)'  # loop-invariant, so defined once

# Path.glob yields files in a filesystem-dependent order. Sorting keeps the row order of
# the concatenated frame (and everything derived from it, e.g. which duplicate row a later
# drop_duplicates keeps) reproducible across machines.
for path in sorted(nlp_root.glob("*.csv")):
    df = pd.read_csv(path, index_col=0)
    setting = path.name.split("_")[0]

    match = re.search(pattern, path.name)
    assert match is not None, f"{path} was not matched to a dataset"
    dataset = match.group(0)

    token = path.name.split("_")[-1].split(".")[0]

    if "smollm" in path.name:
        # not true, but we want to group standard non-aggregated token results for the llm with the cls token results for bert and albert
        token = "cls"

    df["Token"] = token
    df["Setting"] = setting
    df["Dataset"] = dataset
    cleaned_dfs.append(df)

# pd.concat raises a ValueError if no result file was found.
data = pd.concat(cleaned_dfs).reset_index(drop=True)
nlp_data = data


In [None]:
# Load every graph result file (skipping "*backup.csv") and tag its rows with the
# setting and dataset parsed from the file name.
cleaned_dfs = []
root = BASE_PATH /"paper_results" /"graph"

# Keyword in the file name -> short setting name used in the rest of the notebook.
# Both are loop-invariant, so they are defined once instead of once per file.
pattern_to_setting = {
    "augmentation": "aug",
    "label_test": "mem",
    "layer_test": "mono",
    "output_correlation": "correlation",
    "shortcut": "sc",
}
setting_pattern = r"augmentation|label_test|layer_test|output_correlation|shortcut"
dataset_pattern = r"(?<=_)cora(?=_)|(?<=_)flickr(?=_)|(?<=_)ogbn-arxiv(?=_)"

# Sorted so that the row order of the concatenated frame does not depend on the
# filesystem-dependent order in which Path.glob yields files.
for path in sorted(root.glob("*.csv")):
    if path.name.endswith("backup.csv"):
        continue

    df = pd.read_csv(path, index_col=0)

    # Fail with a readable message instead of an AttributeError on `None.group`.
    match = re.search(setting_pattern, path.name)
    assert match is not None, f"{path} was not matched to a setting"
    setting = pattern_to_setting[match.group(0)]

    match = re.search(dataset_pattern, path.name)
    assert match is not None, f"{path} was not matched to a dataset"
    dataset = match.group(0)

    df["Setting"] = setting
    df["Dataset"] = dataset
    cleaned_dfs.append(df)

# pd.concat raises a ValueError if no result file was found.
data = pd.concat(cleaned_dfs).reset_index(drop=True)
graph_data = data

In [None]:
# Load every vision result file and tag its rows with the setting and dataset parsed
# from the file name.
cleaned_dfs = []
root = BASE_PATH /"paper_results" /"vision_cameraready"

# Keyword in the file name -> short setting name used in the rest of the notebook.
# Both are loop-invariant, so they are defined once instead of once per file.
pattern_to_setting = {
    "aug": "aug",
    "augment": "aug",
    "mem": "mem",
    "randomlabel": "mem",
    "mono": "mono",
    "correlation": "correlation",
    "output": "correlation",
    "sc": "sc",
    "shortcut": "sc",
}
# re.search returns the left-most match and tries the alternatives in the order written,
# so "aug" always wins over "augment"; both map to the same setting.
# NOTE(review): short keywords such as "sc" or "mem" match anywhere in the file name -
# confirm that no file name contains them outside of the setting part.
setting_pattern = r"aug|augment|mem|randomlabel|mono|correlation|output|sc|shortcut"
dataset_pattern = r"(?<=_)in100(?=_)|(?<=_)c100(?=_)|in100(?=_)|c100(?=_)|C100(?=_)"

# Sorted so that the row order of the concatenated frame does not depend on the
# filesystem-dependent order in which Path.glob yields files.
for path in sorted(root.glob("*.csv")):
    df = pd.read_csv(path, index_col=0)

    # Fail with a readable message instead of an AttributeError on `None.group`.
    match = re.search(setting_pattern, path.name)
    assert match is not None, f"{path} was not matched to a setting"
    setting = pattern_to_setting[match.group(0)]

    match = re.search(dataset_pattern, path.name)
    assert match is not None, f"{path} was not matched to a dataset"
    dataset = match.group(0)
    if dataset == "C100":
        # normalise the one upper-case spelling that occurs in file names
        dataset = "c100"

    df["Setting"] = setting
    df["Dataset"] = dataset
    cleaned_dfs.append(df)

# pd.concat raises a ValueError if no result file was found.
data = pd.concat(cleaned_dfs).reset_index(drop=True)
vision_data = data

## Clean and Pivot

Step 2: Combine data into a big dataframe, clean up column names etc., and select data to be shown in table.

In [None]:
# ======================================================================================================================
# Stack the results of all three domains and give the measure columns readable names
# ======================================================================================================================
data = pd.concat([nlp_data, graph_data, vision_data])
print(data.columns)

readable_column_names = {
    "functional_similarity_measure": "Functional Similarity Measure",
    "similarity_measure": "Representational Similarity Measure",
    "quality_measure": "Quality Measure",
}
data = data.rename(columns=readable_column_names)

# For the correlation experiments the number to report is stored in the "corr" column.
is_correlation = data["Setting"].eq("correlation")
data.loc[is_correlation, "value"] = data.loc[is_correlation, "corr"]

# Correlation with accuracy differences is treated as its own setting.
is_acc_correlation = is_correlation & data["Functional Similarity Measure"].eq("AbsoluteAccDiff")
data.loc[is_acc_correlation, "Setting"] = "acc_corr"

# ======================================================================================================================
# Drop everything that is not shown in the table. The individual criteria only read
# columns that are not modified any more, so they can be combined into a single mask.
# ======================================================================================================================
setting = data["Setting"]
quality = data["Quality Measure"]

not_shown = (
    # remaining "correlation" rows: keep only JSD as functional measure
    (setting.eq("correlation") & data["Functional Similarity Measure"].ne("JSD"))
    # grounding-by-design tests are evaluated with AUPRC
    | (setting.isin(["aug", "mem", "sc"]) & quality.ne("AUPRC"))
    # correlation tests are evaluated with Spearman correlation
    | (setting.isin(["correlation", "acc_corr"]) & quality.ne("spearmanr"))
    # layer monotonicity is evaluated with "correlation" (alternative: "violation_rate")
    | (setting.isin(["mono"]) & quality.ne("correlation"))
    # mean-aggregated token representations are not part of the table
    | data["Token"].isin(["mean"])
)
data = data.loc[~not_shown]


# ----------------------------------------------------------------------------------------------------------------------
# Clean up names etc.
# ----------------------------------------------------------------------------------------------------------------------


def beautify_df(data):
    """Replace raw identifiers by the labels used in the paper table.

    Maps measure, architecture, domain, dataset, setting and quality-measure values to
    their display labels, renames the columns to their table headers, and adds the
    columns "Measure Type" (measure family) and "Type" (grounding category).
    Values without an entry in the respective mapping become NaN.

    The input frame is not modified; all changes are made on a copy.

    Args:
        data: Results with the columns "Representational Similarity Measure",
            "architecture", "domain", "Dataset", "Setting", "Quality Measure" and "value".

    Returns:
        A tuple ``(beautified, column_order)``: the relabelled dataframe and the list of
        test labels in the order in which they should appear in the table.
    """
    # Work on a copy: the caller usually passes a filtered slice, and writing into it
    # would both mutate the caller's data and trigger SettingWithCopyWarning.
    data = data.copy()

    data.loc[:, "Representational Similarity Measure"] = data["Representational Similarity Measure"].map(
        measure_to_abbrv
    )
    data.loc[:, "architecture"] = data["architecture"].map(
        {
            "smollm2-1.7b": "SmolLM2",
            "albert-base-v2": "ALBERT",
            "BERT-L": "BERT",
            "GCN": "GCN",
            "GAT": "GAT",
            "GraphSAGE": "SAGE",
            "VGG11": "VGG11",
            "VGG19": "VGG19",
            "ResNet18": "RNet18",
            "ResNet34": "RNet34",
            "ResNet101": "RNet101",
            "ViT_B32": "ViT_B32",
            "ViT_L32": "ViT_L32",
            "PGNN": "P-GNN",
        }
    )
    data.loc[:, "domain"] = data["domain"].map({"NLP": "Text", "GRAPHS": "Graph", "VISION": "Vision"})
    data.loc[:, "Dataset"] = data["Dataset"].map(
        {
            "mnli_aug_rate0": "MNLI",
            "mnli_mem_rate0": "MNLI",
            "mnli": "MNLI",
            "sst2_sc_rate0558": "SST2",
            "sst2_mem_rate0": "SST2",
            "sst2_sft": "SST2",
            "sst2_sft_sc_rate0558": "SST2",
            "mnli_sc_rate0354": "MNLI",
            "sst2_aug_rate0": "SST2",
            "sst2": "SST2",
            "flickr": "flickr",
            "ogbn-arxiv": "arXiv",
            "cora": "Cora",
            "in100": "IN100",
            "c100": "CIFAR100",
        }
    )
    data.loc[:, "Setting"] = data["Setting"].map(
        {
            "aug": "Augmentation",
            "mem": "Random Labels",
            "correlation": "JSD Corr.",
            "acc_corr": "Acc Corr.",
            "mono": "Layer Mono.",
            "sc": "Shortcuts",
        }
    )
    column_order = ["Acc Corr.", "JSD Corr.", "Random Labels", "Shortcuts", "Augmentation", "Layer Mono."]
    # NOTE(review): assigning through `.loc[:, col]` may keep the existing object dtype in
    # recent pandas versions instead of making the column categorical - confirm if the
    # categorical ordering is relied upon anywhere.
    data.loc[:, "Setting"] = pd.Categorical(
        data["Setting"],
        categories=column_order,
        ordered=True,
    )
    data.loc[:, "Quality Measure"] = data["Quality Measure"].map(
        {"violation_rate": "Conformity Rate", "AUPRC": "AUPRC", "spearmanr": "Spearman", "correlation": "Spearman"}
    )
    # A violation rate v is reported as conformity rate 1 - v.
    data.loc[data["Quality Measure"] == "Conformity Rate", "value"] = (
        1 - data.loc[data["Quality Measure"] == "Conformity Rate", "value"]
    )  # must be run in conjunction with the above renaming

    data = data.rename(
        columns={
            "domain": "Domain",
            "architecture": "Arch.",
            "Representational Similarity Measure": "Sim Meas.",
            "Quality Measure": "Eval.",
            "Setting": "Test",
        }
    )
    # Attach the measure family; the left join keeps measures without a family (NaN).
    data = pd.merge(data, pd.DataFrame.from_records(measure_types, columns=["Sim Meas.", "Measure Type"]), how="left", on="Sim Meas.")
    data.loc[:, "Measure Type"] = pd.Categorical(data["Measure Type"], categories=measure_type_order, ordered=True)
    # Top-level grouping of the table columns.
    data.loc[data.Test.isin(["Acc Corr.", "JSD Corr."]), "Type"] = "Grounding by Prediction"
    data.loc[data.Test.isin(["Random Labels", "Shortcuts", "Augmentation", "Layer Mono."]), "Type"] = (
        "Grounding by Design"
    )
    return data, column_order


data, column_order = beautify_df(data)

# ======================================================================================================================
# Aggregated overview table: one dataset/architecture combination per domain
# ======================================================================================================================
# (MNLI can be used instead of SST2 as the text dataset.)
overview_datasets = ["SST2", "flickr", "IN100"]
overview_archs = ["SAGE", "BERT", "RNet18"]
idx = data["Dataset"].isin(overview_datasets) & data["Arch."].isin(overview_archs)

# Rows: measure family and measure. Columns: the experiment hierarchy.
# The reindex calls bring families, tests and grounding types into presentation order.
pivot = (
    pd.pivot_table(
        data.loc[idx],
        index=["Measure Type", "Sim Meas."],
        columns=["Type", "Test", "Eval.", "Domain", "Dataset", "Arch."],
        values="value",
    )
    .sort_values(by=["Measure Type", "Sim Meas."], axis="index")
    .reindex(measure_type_order, axis="index", level=0)
    .reindex(column_order, axis="columns", level="Test")
    .reindex(["Grounding by Prediction", "Grounding by Design"], axis="columns", level="Type")
)
pivot

### Turn values into strings

In [None]:
# Flatten the pivot table into long format; after reset_index the values are in the column labelled 0.
unpivot = pivot.unstack().unstack().dropna().reset_index()
# Column 1 holds the values formatted as strings with exactly two decimals.
# (The former intermediate `astype("str")` assignment was overwritten immediately and has been removed.)
unpivot.loc[:, 1] = unpivot.loc[:, 0].apply(lambda x: f"{round(x, ndigits=2):.2f}")

# Rebuild the table from the formatted strings and restore the order of the measure families.
pivot = unpivot.pivot(
    index=["Measure Type", "Sim Meas."],
    columns=["Type", "Test", "Eval.", "Domain", "Dataset", "Arch."],
    values=1,
)
pivot = pivot.reindex(measure_type_order, axis="index", level=0)

pivot

In [None]:
# Highlight the best values by bolding
for column in pivot.columns:
    col = pivot.loc[:, column].astype("float")
    idx = col == col.max()
    pivot.loc[idx, column] = pivot.loc[idx, column].apply(lambda s: r"\textbf{" + s + "}")
pivot

### Significance Indicators

In [None]:
# Same dataset/architecture selection as for the overview table, restricted to the two
# correlation tests (MNLI can be used instead of SST2 as the text dataset).
in_overview = data["Dataset"].isin(["SST2", "flickr", "IN100"]) & data["Arch."].isin(["SAGE", "BERT", "RNet18"])
is_correlation_test = data.Test.isin(["Acc Corr.", "JSD Corr."])
idx = in_overview & is_correlation_test
data_corr = data.loc[idx].copy()


def pval_str(pval):
    r"""Return the LaTeX significance superscript for a p-value.

    ``pval <= 0.01`` gives two stars and ``pval <= 0.05`` gives one star padded with a
    phantom star. Anything else, including NaN and non-float inputs, gives a phantom of
    two stars.
    """
    not_significant = r"$^{\phantom{**}}$"
    if not isinstance(pval, float):
        return not_significant
    if pval <= 0.01:
        return r"$^{**}$"
    elif pval <= 0.05:
        return r"$^{*\phantom{*}}$"
    return not_significant

def significance_via_text_style(pval):
    r"""Return the ``[prefix, suffix]`` LaTeX wrapper that encodes significance by underlining.

    ``pval <= 0.01`` gives a double underline, ``pval <= 0.05`` a single underline, and
    anything else a pair of empty strings.
    """
    levels = (
        (0.01, [r"\underline{\underline{", r"}}"]),
        (0.05, [r"\underline{", r"}"]),
    )
    for threshold, wrapper in levels:
        if pval <= threshold:
            return wrapper
    return ["", ""]

# Cell text = correlation with two decimals, followed by the significance superscript.
# (significance_via_text_style could be used instead to encode significance by underlining.)
formatted_values = data_corr["value"].apply(lambda x: f"{round(x, ndigits=2):.2f}")
significance_marks = data_corr["pval"].apply(pval_str)
data_corr["val_comb"] = formatted_values + significance_marks
data_corr

# Same layout as the overview table, with rows and columns brought into presentation order.
pivot_corr = data_corr.pivot(
    index=["Measure Type", "Sim Meas."],
    columns=["Type", "Test", "Eval.", "Domain", "Dataset", "Arch."],
    values=["val_comb"],
)
pivot_corr = pivot_corr.sort_values(by=["Measure Type", "Sim Meas."])
pivot_corr = pivot_corr.reindex(measure_type_order, axis="index", level=0)
pivot_corr = pivot_corr.reindex(column_order, axis="columns", level="Test")
pivot_corr = pivot_corr.reindex(["Graph", "Text", "Vision"], axis="columns", level="Domain")
# Drop the outermost column level that `values=["val_comb"]` introduced.
pivot_corr = pivot_corr.loc[:, "val_comb"]
pivot_corr

def floatify(s: str) -> str:
    r"""Turn a string like '-0.10$^{\phantom{**}}$' into '-0.10'.

    Returns everything before the first '$'. A string without any '$' is returned
    unchanged (slicing with the -1 that `str.find` reports would cut off its last
    character). Non-string inputs, e.g. NaN for empty table cells, are passed through.
    """
    if not isinstance(s, str):
        return s
    number, _, _ = s.partition("$")
    return number

def separate_significance_indicator(s: str) -> str:
    r"""Turn a string like '-0.10$^{\phantom{**}}$' into '$^{\phantom{**}}$'.

    Returns everything from the first '$' onwards. A string without any '$' has no
    indicator, so the empty string is returned (slicing with the -1 that `str.find`
    reports would return its last character). Non-string inputs, e.g. NaN for empty
    table cells, are passed through.
    """
    if not isinstance(s, str):
        return s
    marker_start = s.find("$")
    if marker_start == -1:
        return ""
    return s[marker_start:]

# Bold the largest correlation of every column while keeping its significance marker
# outside of the \textbf{...}.
for column in pivot_corr.columns:
    cells = pivot_corr.loc[:, column]
    numeric_values = cells.apply(floatify).astype("float")
    markers = cells.apply(separate_significance_indicator)
    is_best = numeric_values == numeric_values.max()
    bolded = numeric_values.apply(lambda x: r"\textbf{" + f"{x:.2f}" + "}") + markers
    # Only the rows flagged as best take over the bolded text.
    pivot_corr.loc[is_best, column] = bolded

pivot_corr

In [None]:
# Leftover dtype experiments, kept for reference:
# pivot.loc[:, ("Grounding by Prediction")].astype("str", copy=False)
# pivot.loc[:, ("Grounding by Prediction", "Acc Corr.", "Spearman", "Graph", "flickr", "SAGE")] = pivot.loc[:, ("Grounding by Prediction", "Acc Corr.", "Spearman", "Graph", "flickr", "SAGE")].astype("str")
# pivot.loc[:, ("Grounding by Prediction")].dtypes

# Overwrite the "Grounding by Prediction" columns of the overview table with the
# strings from pivot_corr, which carry the significance markers.
pivot.loc[:, ("Grounding by Prediction")] = pivot_corr
pivot

Step 3: Convert into a LaTeX table.

In [None]:
# Render the string-valued overview table as LaTeX and post-process the output line by line.
# NOTE(review): all positions below (lines 2, 5-7, 11, the last 4 lines, the first 18
# characters) are hard-coded against the layout that `to_latex` produces for this table -
# re-check them whenever the table structure or the pandas version changes.
styled = pd.io.formats.style.Styler(
    pivot,
    precision=2,
)

# Highlight top value
# latex_str = styled.highlight_max(axis=0, props="textbf:--rwrap;").to_latex(
#     hrules=True,
#     position="t",
#     label="tab:result_overview",
# )
latex_str = styled.to_latex(hrules=True, position="t", label="tab:result_overview",)


# ----- Manual modifications --------
# From here on `latex_str` is a list of lines; it is joined back into one string at the end.
latex_str = latex_str.split("\n")

# Center headers
# Replaces every "{r}" by "{c}", but only on the lines with index 5, 6 and 7.
pattern = r"\{r\}"
replacement = r"{c}"
latex_str = [re.sub(pattern, replacement, line) if i in [5, 6, 7] else line for i, line in enumerate(latex_str)]

# Remove measure row
# Drops the line at index 11 - presumably the row holding the index names; verify against the output.
latex_str.pop(11)

# Add vertical bars
# Keeps the first 18 characters of line `line_no` and appends six "|rrr" column groups plus the closing brace.
line_no = 2
# line_no = 3
mod_line = latex_str[line_no][:18] + "".join(["|rrr"] * 6) + "}"
latex_str[line_no] = mod_line

# Make the left-most cells white
# Prefixes every second line, starting at index 11, with \cellcolor{white}; the last four lines are left untouched.
latex_str = [
    r"\cellcolor{white}" + line if i >= 11 and (i - 11) % 2 == 0 else line for i, line in enumerate(latex_str[:-4])
] + latex_str[-4:]
latex_str = "\n".join(latex_str)
print(latex_str)

## Rankplots

Requires the section above to be run as well.

In [None]:
import seaborn as sns

# Global seaborn theme for the plots below: "paper" context, white style, fonts scaled by 1.5.
sns.set_theme("paper", style="white", font_scale=1.5)


Combine data similarly to before, but do not filter out specific parts.

In [None]:
from IPython.display import display

In [None]:
# Rank all similarity measures within every experiment.
# Start again from the raw results of all domains; unlike for the summary table, no
# datasets or architectures are filtered out.
data = pd.concat([nlp_data, graph_data, vision_data])
data = data.rename(
    columns={
        "functional_similarity_measure": "Functional Similarity Measure",
        "similarity_measure": "Representational Similarity Measure",
        "quality_measure": "Quality Measure",
    }
)
# The three frames each carry their own 0..n index; reset it so that row labels are unique.
data = data.reset_index()


# For the correlation experiments the number to rank by is stored in the "corr" column.
idx = data.Setting == "correlation"
data.loc[idx, "value"] = data.loc[idx, "corr"]

# Keep one quality measure per test. The copy makes the filtered frame independent of
# the original so that the assignments below do not write into a slice.
idx = data["Quality Measure"].isin(["AUPRC", "spearmanr", "correlation"])
data = data.loc[idx].copy()

# Every functional similarity measure of the correlation test becomes its own setting.
idx = data.Setting == "correlation"
data.loc[idx, "Setting"] = data.loc[idx, "Setting"] + data.loc[idx, "Functional Similarity Measure"]

# Only the layer monotonicity test keeps its per-model results; all other tests get the placeholder "agg".
idx = ~(data.Setting == "mono")
data.loc[idx, "model"] = "agg"

# groupby drops rows with NaN keys, so give rows without token information an explicit label.
idx = data.Token.isna()
data.loc[idx, "Token"] = "NA"

# Results for the "mean" token are deliberately not removed here:
# idx = data.Token.isin(["mean"])
# data = data.loc[~idx]

# Rank the measures within every experiment: rank 1 is the largest value, ties share
# the smallest rank, NaN values stay unranked.
data["rank"] = data.groupby(["domain", "Setting", "Dataset", "architecture", "model", "Token"], as_index=True)["rank" if False else "value"].rank(
    ascending=False, method="min", na_option="keep"
)
display(data[(data.Setting == "mono") & (data["Representational Similarity Measure"] == "RTD")].head())


# combine layer mono results to equally weight experiments
# Average the rank of each measure over the individual models. The similarity measure
# has to be part of the grouping keys: without it, the mean is taken over all measures
# of an experiment and every measure ends up with the same rank.
idx = (data.model != "agg") & (~data["rank"].isna())
data.loc[idx, "rank"] = (
    data[idx]
    .groupby(["domain", "Setting", "Dataset", "architecture", "Token", "Representational Similarity Measure"])["rank"]
    .transform("mean")
)
# Keep one row per measure and experiment.
# NOTE(review): "Token" is not part of the subset, so rows that differ only in their
# token (e.g. "cls" vs. "mean") are collapsed to whichever comes first - confirm that
# this is intended.
data = data.drop_duplicates(subset=["domain", "Setting", "Dataset", "architecture", "Representational Similarity Measure", "Functional Similarity Measure", "Quality Measure"])


# Replace raw identifiers by the labels used in the plots. Values without an entry in
# the respective mapping become NaN.
data.loc[:, "Representational Similarity Measure"] = data["Representational Similarity Measure"].map(measure_to_abbrv)
data.loc[:, "architecture"] = data["architecture"].map(
    {
        "smollm2-1.7b": "SmolLM2",
        "albert-base-v2": "ALBERT",
        "BERT-L": "BERT",
        "GCN": "GCN",
        "GAT": "GAT",
        "GraphSAGE": "SAGE",
        "VGG11": "VGG11",
        "VGG19": "VGG19",
        "ResNet18": "RNet18",
        "ResNet34": "RNet34",
        "ResNet101": "RNet101",
        "ViT_B32": "ViT_B32",
        "ViT_L32": "ViT_L32",
        # Same entry as in beautify_df; without it the P-GNN rows lose their architecture label.
        "PGNN": "P-GNN",
    }
)
# The plots call the NLP domain "Language" (the summary table uses "Text").
data.loc[:, "domain"] = data["domain"].map({"NLP": "Language", "GRAPHS": "Graph", "VISION": "Vision"})
data.loc[:, "Dataset"] = data["Dataset"].map(
    {
        "mnli_aug_rate0": "MNLI",
        "mnli_mem_rate0": "MNLI",
        "mnli": "MNLI",
        "sst2_sc_rate0558": "SST2",
        "sst2_mem_rate0": "SST2",
        "sst2_sft": "SST2",
        "sst2_sft_sc_rate0558": "SST2",
        "mnli_sc_rate0354": "MNLI",
        "sst2_aug_rate0": "SST2",
        "sst2": "SST2",
        "flickr": "flickr",
        "ogbn-arxiv": "arXiv",
        "cora": "Cora",
        "in100": "IN100",
        "c100": "CIFAR100",
    }
)
# The "correlation<functional measure>" keys match the setting names that the rank
# computation builds by concatenating setting and functional similarity measure.
data.loc[:, "Setting"] = data["Setting"].map(
    {
        "aug": "Augmentation",
        "mem": "Random Labels",
        "correlationJSD": "JSD Corr.",
        "correlationAbsoluteAccDiff": "Acc Corr.",
        "correlationDisagreement": "Disagr. Corr.",
        "acc_corr": "Acc Corr.",
        "mono": "Layer Mono.",
        "sc": "Shortcuts",
    }
)


data.loc[:, "Quality Measure"] = data["Quality Measure"].map(
    {"violation_rate": "Conformity Rate", "AUPRC": "AUPRC", "spearmanr": "Spearman", "correlation": "Spearman"}
)
# A violation rate v is reported as conformity rate 1 - v.
data.loc[data["Quality Measure"] == "Conformity Rate", "value"] = (
    1 - data.loc[data["Quality Measure"] == "Conformity Rate", "value"]
)  # must be run in conjunction with the above renaming

data = data.rename(
    columns={
        "domain": "Modality",
        "architecture": "Arch.",
        "Representational Similarity Measure": "Sim Meas.",
        "Quality Measure": "Eval.",
        "Setting": "Scenario",
    }
)

data = data.sort_values(by=["Sim Meas."])

In [None]:
# Toy example for method="min" with ascending=False: the single 3 gets rank 1 and all
# six tied 2s share rank 2.
fake = pd.DataFrame({"a": [2] * 6 + [3]})
fake.rank(method="min", ascending=False)

### Summary

Rank measures.

In [None]:
# Mean and median rank of every measure within each modality.
avg_ranks = (
    data.groupby(["Modality", "Sim Meas."])["rank"]
    .agg(avg_rank="mean", med_rank="median")
    .reset_index()
)
avg_ranks

Create plots.

In [None]:
# Attach the summary ranks (used for sorting) and the measure family (used for colouring).
measure_type_df = pd.DataFrame.from_records(measure_types, columns=["Sim Meas.", "Measure Type"])
plot_data = (
    pd.merge(data, avg_ranks)
    .sort_values(by=["med_rank", "avg_rank"])
    .merge(measure_type_df, how="left", on="Sim Meas.")
)

fig, axes = plt.subplots(1, 3, sharey=False, figsize=(7*0.8*3, 7))

# One panel per modality; the value is the panel title.
panel_titles = {"Graph": "Graphs", "Language": "Language", "Vision": "Vision"}

for ax, (mod, title) in zip(axes, panel_titles.items()):
    # Only the last panel carries the legend, which is then moved next to the figure.
    is_legend_panel = mod == "Vision"

    sns.boxplot(
        data=plot_data[plot_data.Modality == mod],
        x="rank",
        y="Sim Meas.",
        hue="Measure Type",
        hue_order=["Neighbors", "RSM", "Alignment", "Topology", "CCA", "Statistic"],
        palette="colorblind",
        legend=is_legend_panel,
        ax=ax,
    )
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

    ax.set_xlabel("Rank")
    ax.set_ylabel("Similarity Measures")

    fig.tight_layout()

    ax.set_title(title)

    if is_legend_panel:
        sns.move_legend(ax, loc="right", bbox_to_anchor=(1.45,0.5))
    # The whole figure is written after every panel, under a per-modality file name.
    fig.savefig(BASE_PATH / "figs" / f"aggregated_ver_{mod}.pdf", bbox_inches="tight")