# Printing and Plotting Results

## Load all data

In [None]:
import re

import pandas as pd
import pandas.io.formats.style

from repsim.benchmark.paths import BASE_PATH
from IPython.display import display

# Full name of each representational similarity measure -> short label used in the tables.
# Applied to the "Representational Similarity Measure" column in `beautify_df`.
measure_to_abbrv = {
    "AlignedCosineSimilarity": "AlignCos",
    "CKA": "CKA",
    "ConcentricityDifference": "ConcDiff",
    "DistanceCorrelation": "DistCorr",
    "EigenspaceOverlapScore": "EOS",
    "GeometryScore": "GS",
    "Gulp": "GULP",
    "HardCorrelationMatch": "HardCorr",
    "IMDScore": "IMD",
    "JaccardSimilarity": "Jaccard",
    "LinearRegression": "LinReg",
    "MagnitudeDifference": "MagDiff",
    "OrthogonalAngularShapeMetricCentered": "AngShape",
    "OrthogonalProcrustesCenteredAndNormalized": "OrthProc",
    "PWCCA": "PWCCA",
    "PermutationProcrustes": "PermProc",
    "ProcrustesSizeAndShapeDistance": "ProcDist",
    "RSA": "RSA",
    "RSMNormDifference": "RSMDiff",
    "RankSimilarity": "RankSim",
    "SVCCA": "SVCCA",
    "SecondOrderCosineSimilarity": "2nd-Cos",
    "SoftCorrelationMatch": "SoftCorr",
    "UniformityDifference": "UnifDiff",
    "RTD": "RTD",
}

# Abbreviated measure names grouped by the family (measure type) they belong to.
# Both the order of the families and the order inside each family are significant:
# they define the order of the flattened records below.
_measures_by_type = {
    "Alignment": [
        "AlignCos",
        "HardCorr",
        "AngShape",
        "LinReg",
        "OrthProc",
        "PermProc",
        "ProcDist",
        "SoftCorr",
    ],
    "RSM": ["EOS", "CKA", "DistCorr", "GULP", "RSA", "RSMDiff"],
    "Statistic": ["MagDiff", "ConcDiff", "UnifDiff"],
    "Topology": ["GS", "IMD", "RTD"],
    "Neighbors": ["Jaccard", "RankSim", "2nd-Cos"],
    "CCA": ["PWCCA", "SVCCA"],
}

# Flat list of (abbreviated measure name, measure type) records.
measure_types = [
    (abbrv, family)
    for family, members in _measures_by_type.items()
    for abbrv in members
]

# Order in which the measure types appear as row groups of the result tables
# (used to reindex the first index level of the pivot tables).
measure_type_order = ["CCA", "Alignment", "RSM", "Neighbors", "Topology", "Statistic"]


Step 1: Load all results.

In [None]:
# Load every NLP result CSV and tag its rows with the token aggregation, the test
# setting and the dataset, all of which are parsed from the file name.
cleaned_dfs = []
nlp_root = BASE_PATH /"paper_results" / "nlp_iclr"
# The dataset name has to appear as its own underscore-delimited part of the file name.
# Compiled once, outside the loop.
pattern = re.compile(r'(?<=_)sst2(?=_)|(?<=_)mnli(?=_)')
for path in nlp_root.glob("*.csv"):
    df = pd.read_csv(path, index_col=0)
    # The first underscore-separated part of the file name is used as the setting.
    setting = path.name.split("_")[0]

    match = pattern.search(path.name)
    if match is None:
        # Explicit error instead of a bare assert: it names the offending file and is
        # not stripped when Python runs with -O.
        raise ValueError(f"{path} does not contain a known dataset name (sst2 or mnli)")
    dataset = match.group(0)

    # The last underscore-separated part, up to the first ".", is used as the token.
    token = path.name.split("_")[-1].split(".")[0]
    if "smollm" in path.name:
        # not true, but we want to group standard non-aggregated token results for the llm with the cls token results for bert and albert
        token = "cls"

    df["Token"] = token
    df["Setting"] = setting
    df["Dataset"] = dataset
    cleaned_dfs.append(df)

data = pd.concat(cleaned_dfs).reset_index(drop=True)
nlp_data = data


In [None]:
# Load every graph result CSV (skipping backups) and tag its rows with the test
# setting and the dataset, both parsed from the file name.
cleaned_dfs = []
root = BASE_PATH /"paper_results" /"graph"
# Experiment keyword found in the file name -> short setting identifier.
# Loop-invariant, so it is built once instead of once per file.
pattern_to_setting = {
    "augmentation": "aug",
    "label_test": "mem",
    "layer_test": "mono",
    "output_correlation": "correlation",
    "shortcut": "sc",
}
setting_pattern = re.compile(r"augmentation|label_test|layer_test|output_correlation|shortcut")
# The dataset name has to appear as its own underscore-delimited part of the file name.
dataset_pattern = re.compile(r"(?<=_)cora(?=_)|(?<=_)flickr(?=_)|(?<=_)ogbn-arxiv(?=_)")
for path in root.glob("*.csv"):
    if path.name.endswith("backup.csv"):
        continue

    df = pd.read_csv(path, index_col=0)

    match = setting_pattern.search(path.name)
    if match is None:
        # Previously this fell through to `match.group(0)` and failed with an
        # AttributeError that did not mention the file.
        raise ValueError(f"{path} was not matched to a setting")
    setting = pattern_to_setting[match.group(0)]

    match = dataset_pattern.search(path.name)
    if match is None:
        raise ValueError(f"{path} was not matched to a dataset")
    dataset = match.group(0)

    df["Setting"] = setting
    df["Dataset"] = dataset
    cleaned_dfs.append(df)

data = pd.concat(cleaned_dfs).reset_index(drop=True)
graph_data = data

In [None]:
# Sanity checks on the Cora output-correlation results (Spearman only).
# NOTE(review): the result of the first expression (row counts per architecture /
# functional measure / similarity measure) is not assigned or displayed explicitly;
# in a notebook cell only the last expression is rendered — confirm this is intended.
graph_data[(graph_data.representation_dataset=="cora") & (graph_data.Setting == "correlation") & (graph_data.quality_measure == "spearmanr")].groupby(["architecture", "functional_similarity_measure","similarity_measure"]).count()
# Rows for a single measure/architecture combination (AlignedCosineSimilarity, GCN).
graph_data[(graph_data.representation_dataset=="cora") & (graph_data.Setting == "correlation") & (graph_data.quality_measure == "spearmanr") & (graph_data.similarity_measure == "AlignedCosineSimilarity") & (graph_data.architecture == "GCN")]

In [None]:
# Load every vision result CSV and tag its rows with the test setting and the
# dataset, both parsed from the file name.
cleaned_dfs = []
root = BASE_PATH /"paper_results" /"vision_cameraready"
# Keyword found in the file name -> short setting identifier.
# Loop-invariant, so it is built once instead of once per file.
pattern_to_setting = {
    "aug": "aug",
    "augment": "aug",
    "mem": "mem",
    "randomlabel": "mem",
    "mono": "mono",
    "correlation": "correlation",
    "output": "correlation",
    "sc": "sc",
    "shortcut": "sc",
}
# NOTE(review): the keywords are unanchored, so the leftmost occurrence of any of
# them anywhere in the file name decides the setting (short ones such as "sc" can
# match inside longer words) — verify against the actual file names.
setting_pattern = re.compile(r"aug|augment|mem|randomlabel|mono|correlation|output|sc|shortcut")
dataset_pattern = re.compile(r"(?<=_)in100(?=_)|(?<=_)c100(?=_)|in100(?=_)|c100(?=_)|C100(?=_)")
for path in root.glob("*.csv"):
    df = pd.read_csv(path, index_col=0)

    match = setting_pattern.search(path.name)
    if match is None:
        # Previously this fell through to `match.group(0)` and failed with an
        # AttributeError that did not mention the file.
        raise ValueError(f"{path} was not matched to a setting")
    setting = pattern_to_setting[match.group(0)]

    match = dataset_pattern.search(path.name)
    if match is None:
        # The old assertion message said "setting" although this check is about the dataset.
        raise ValueError(f"{path} was not matched to a dataset")
    # The pattern accepts "C100" as well as "c100"; normalise to lower case.
    dataset = match.group(0).lower()

    df["Setting"] = setting
    df["Dataset"] = dataset
    cleaned_dfs.append(df)

data = pd.concat(cleaned_dfs).reset_index(drop=True)
vision_data = data

In [None]:
# ----------------------------------------------------------------------------------------------------------------------
# Combine data
# ----------------------------------------------------------------------------------------------------------------------
# NOTE: each of the three frames was reset to a fresh 0..n-1 index before, and the concat
# keeps those labels, so index labels repeat in `data`. The selections below therefore
# rely on boolean masks rather than on index labels.
data = pd.concat([nlp_data, graph_data, vision_data])
print(data.columns)


data = data.rename(
    columns={
        "functional_similarity_measure": "Functional Similarity Measure",
        "similarity_measure": "Representational Similarity Measure",
        "quality_measure": "Quality Measure",
    }
)

# Copy the scores of the correlation experiments from "corr" into "value", the column
# that holds the result scores of all other experiments.
idx = data.Setting == "correlation"
data.loc[idx, "value"] = data.loc[idx, "corr"]

# Exclude the Kendall's tau and Pearson evaluations of the output correlation experiments;
# only the rows with quality measure "spearmanr" are kept.
idx = (data.Setting == "correlation") & (data["Quality Measure"] != "spearmanr")
data = data.loc[~idx]

# Append the functional similarity measure to the setting name (e.g. "correlation" + "JSD")
# to be able to distinguish correlation results with different functional similarity measures easily
idx = data.Setting == "correlation"
data.loc[idx, "Setting"] = data.loc[idx, "Setting"] + data.loc[idx, "Functional Similarity Measure"]


# ----------------------------------------------------------------------------------------------------------------------
# Clean up names etc.
# ----------------------------------------------------------------------------------------------------------------------


def beautify_df(data):
    """Relabel the combined results for presentation and attach measure-type metadata.

    The label columns of ``data`` are overwritten in place with their display names
    (values without an entry in the respective mapping become NaN), scores whose
    evaluation is "Conformity Rate" (raw name "violation_rate") are replaced by
    ``1 - value``, the columns are renamed to their table headers, and the columns
    "Measure Type" and "Type" are added.

    Args:
        data: combined long-format results with the columns "Representational
            Similarity Measure", "architecture", "domain", "Dataset", "Setting",
            "Quality Measure" and "value".

    Returns:
        A tuple ``(data, column_order)``: the relabelled frame and the list of test
        names in display order.
    """
    architecture_labels = {
        "smollm2-1.7b": "SmolLM2",
        "albert-base-v2": "ALBERT",
        "BERT-L": "BERT",
        "GCN": "GCN",
        "GAT": "GAT",
        "GraphSAGE": "SAGE",
        "PGNN": "PGNN",
        "VGG11": "VGG11",
        "VGG19": "VGG19",
        "ResNet18": "RNet18",
        "ResNet34": "RNet34",
        "ResNet101": "RNet101",
        "ViT_B32": "ViT B32",
        "ViT_L32": "ViT L32",
    }
    modality_labels = {"NLP": "Text", "GRAPHS": "Graph", "VISION": "Vision"}
    dataset_labels = {
        "mnli_aug_rate0": "MNLI",
        "mnli_mem_rate0": "MNLI",
        "mnli": "MNLI",
        "sst2_sc_rate0558": "SST2",
        "sst2_mem_rate0": "SST2",
        "sst2_sft": "SST2",
        "sst2_sft_sc_rate0558": "SST2",
        "mnli_sc_rate0354": "MNLI",
        "sst2_aug_rate0": "SST2",
        "sst2": "SST2",
        "flickr": "Flickr",
        "ogbn-arxiv": "OGBN-Arxiv",
        "cora": "Cora",
        "in100": "IN100",
        "c100": "CIFAR100",
    }
    test_labels = {
        "aug": "Augmentation",
        "mem": "Random Labels",
        "correlationJSD": "JSD Corr.",
        "correlationAbsoluteAccDiff": "Acc. Corr.",
        "correlationDisagreement": "Disagr. Corr.",
        "mono": "Layer Mono.",
        "sc": "Shortcuts",
    }
    eval_labels = {
        "violation_rate": "Conformity Rate",
        "AUPRC": "AUPRC",
        "spearmanr": "Spearman",
        "correlation": "Spearman",
    }

    # Display order of the tests: prediction-grounded tests first, then design-grounded ones.
    prediction_tests = ["Acc. Corr.", "JSD Corr.", "Disagr. Corr."]
    design_tests = ["Random Labels", "Shortcuts", "Augmentation", "Layer Mono."]
    column_order = prediction_tests + design_tests

    # Replace raw identifiers by their display labels, one column at a time.
    relabelings = (
        ("Representational Similarity Measure", measure_to_abbrv),
        ("architecture", architecture_labels),
        ("domain", modality_labels),
        ("Dataset", dataset_labels),
        ("Setting", test_labels),
    )
    for column, labels in relabelings:
        data.loc[:, column] = data[column].map(labels)

    data.loc[:, "Setting"] = pd.Categorical(data["Setting"], categories=column_order, ordered=True)
    data.loc[:, "Quality Measure"] = data["Quality Measure"].map(eval_labels)

    # Violation rates are reported as conformity rates; this relies on the relabelling
    # of "Quality Measure" directly above.
    is_conformity = data["Quality Measure"] == "Conformity Rate"
    data.loc[is_conformity, "value"] = 1 - data.loc[is_conformity, "value"]

    header_names = {
        "domain": "Modality",
        "architecture": "Arch.",
        "Representational Similarity Measure": "Sim Meas.",
        "Quality Measure": "Eval.",
        "Setting": "Test",
    }
    data = data.rename(columns=header_names)

    # Attach the measure type of every similarity measure.
    type_lookup = pd.DataFrame.from_records(measure_types, columns=["Sim Meas.", "Measure Type"])
    data = pd.merge(data, type_lookup, how="left", on="Sim Meas.")
    data.loc[:, "Measure Type"] = pd.Categorical(data["Measure Type"], categories=measure_type_order, ordered=True)

    # Group the tests into the two grounding categories.
    data.loc[data.Test.isin(prediction_tests), "Type"] = "Grounding by Prediction"
    data.loc[data.Test.isin(design_tests), "Type"] = "Grounding by Design"
    return data, column_order


# Relabel the combined results; `column_order` is the display order of the tests.
data, column_order = beautify_df(data)


In [None]:
# Show the cleaned long-format results.
data

## Helper Functions for Tables

In [None]:
def pval_str(pval):
    r"""Return the LaTeX superscript that marks the significance level of ``pval``.

    Two stars are returned for ``pval <= 0.01`` and one star (padded with a phantom
    star) for ``pval <= 0.05``. Everything else, including values that are not
    ``float`` instances, yields an invisible placeholder as wide as two stars.
    """
    no_marker = r"$^{\phantom{**}}$"
    if not isinstance(pval, float):
        return no_marker
    if pval <= 0.01:
        return r"$^{**}$"
    return r"$^{*\phantom{*}}$" if pval <= 0.05 else no_marker

def floatify(s: str) -> str:
    r"""Turn a string like '-0.10$^{\phantom{**}}$' into '-0.10'.

    Everything from the first "$" onwards is dropped. A string without any "$" is
    returned unchanged (slicing at ``s.find("$")`` used to cut off its last
    character, because ``find`` returns -1 in that case). Non-string inputs are
    returned as they are.
    """
    if not isinstance(s, str):
        return s
    # partition() yields the whole string as the head when the separator is missing.
    return s.partition("$")[0]

def separate_significance_indicator(s: str) -> str:
    r"""Turn a string like '-0.10$^{\phantom{**}}$' into '$^{\phantom{**}}$'.

    Returns the part of ``s`` starting at the first "$". A string without any "$"
    has no indicator, so the empty string is returned (slicing at ``s.find("$")``
    used to return its last character, because ``find`` returns -1 in that case).
    Non-string inputs are returned as they are.
    """
    if not isinstance(s, str):
        return s
    _, dollar, tail = s.partition("$")
    # `dollar` is empty when no "$" was found, which makes the result empty as well.
    return dollar + tail

def texify(pivot, out_path, caption, label, resizebox_width=1.0):
    r"""Render ``pivot`` as a LaTeX table and write it to ``out_path``.

    The table is produced with the pandas ``Styler`` (two decimals, hrules, position
    "h") and then post-processed line by line: a ``\centering`` / ``\resizebox`` /
    ``\rowcolors`` preamble is inserted, multicolumn headers are centred, the header
    rows containing "Sim Meas." and "Modality" are removed, header rows are coloured
    white, the ``\multirow`` cell of every measure type is moved to the last row of
    its group, and the column layout of the tabular is rebuilt from the "Dataset"
    header row.

    NOTE(review): several steps address lines by fixed position (3, 6, 7, the last
    2 and 4 lines) and therefore depend on the exact output of ``Styler.to_latex``
    for these arguments — re-check after pandas upgrades.

    Args:
        pivot: table to render; its first index level is expected to be rendered as
            ``\multirow`` cells (one per measure type).
        out_path: path of the ``.tex`` file to write (opened with mode "w").
        caption: caption passed to ``to_latex``.
        label: label passed to ``to_latex``.
        resizebox_width: width of the ``\resizebox`` as a fraction of ``\linewidth``.

    Raises:
        ValueError: if no line contains "Sim Meas." or "Modality".
    """
    def find_line_index(lines, search_items: list[str]):
        """Return the index of the first line containing any of ``search_items``."""
        for i, line in enumerate(lines):
            if any(item in line for item in search_items):
                return i
        raise ValueError(f"Could not find line with any of {search_items}")

    def find_measure_type_rows(lines):
        r"""Return ``(line index, row count, cell text)`` for every line with a ``\multirow[c]`` cell."""
        pattern = r"\\multirow\[c\]{(\d+)}\{([^}]+)\}\{([^}]+)\}"

        results = []

        # Iterate through each line and search for the pattern
        for index, line in enumerate(lines):
            match = re.search(pattern, line)
            if match:
                # Extract the integer and append the index and integer to results
                results.append((index, int(match.group(1)), match.group(3)))

        return results

    def parse_table_layout(line):
        r"""Build the tabular column specification from one ``&``-separated header row."""
        # Remove \rowcolor{white} and split by & to get columns
        row = line.replace('\\rowcolor{white}', '').strip()
        columns = [col.strip() for col in row.split('&')]

        # First two columns are always left-aligned
        layout = ['ll|']

        # Process remaining columns
        for col in columns[2:]:
            if col.startswith('\\multicolumn'):
                # Extract number of columns from \multicolumn{N}
                num_cols = int(col.split('{')[1].split('}')[0])
                # Add right-aligned columns for multicolumn
                layout.append('r' * num_cols)
            else:
                # Single column, right-aligned with vertical line
                layout.append('r')
        # Groups are joined with "|"; the leading 'll|' entry thus yields a double bar.
        return '|'.join(layout)

    # Convert into latex file
    styled = pd.io.formats.style.Styler(
        pivot,
        precision=2,
    )

    # column_format is left empty here; the tabular line is replaced further below.
    latex_str = styled.to_latex(
        hrules=True,
        position="h",
        label=label,
        caption=caption,
        column_format="",
    )
    # print(latex_str)

    # ----- Manual modifications --------
    lines = latex_str.split("\n")
    # print(lines[:15])

    # Add opening of resizebox
    # (three lines are inserted after the first three lines of the generated table)
    lines = lines[:3] + [r"\centering"] + [r"\resizebox{" + str(resizebox_width) + r"\linewidth}{!}{"] + [r"\rowcolors{2}{white}{Gray}"] + lines[3:]
    # print("\n".join([f"{i}: {line}" for i, line in enumerate(lines)]))

    # Center headers
    pattern = r"\{r\}"
    replacement = r"{c}"
    lines = [re.sub(pattern, replacement, line) if "multicolumn" in line else line for i, line in enumerate(lines)]
    # print("\n".join([f"{i}: {line}" for i, line in enumerate(lines)]))

    # Remove measure row
    lines.pop(find_line_index(lines, ["Sim Meas."]))
    # print("\n".join([f"{i}: {line}" for i, line in enumerate(lines)]))

    # Remove modality row
    lines.pop(find_line_index(lines, ["Modality"]))
    # print("\n".join([f"{i}: {line}" for i, line in enumerate(lines)]))

    # Remove Arch. row
    # lines.pop(10)
    # print("\n".join([f"{i}: {line}" for i, line in enumerate(lines)]))

    # find first line with data to make all headers white
    # NOTE(review): index 7 is presumably the first line after \begin{tabular} — confirm.
    first_header_line = 7
    # Raises IndexError if no \multirow cell is present.
    first_data_line = find_measure_type_rows(lines)[0][0]
    lines = lines[:first_header_line] + [r"\rowcolor{white}" + line for line in lines[first_header_line:first_data_line]] + lines[first_data_line:]

    # Add multirow for measure types
    # The \multirow cell is removed from the first row of each group and re-added, with a
    # negated row count, in front of the group's last row.
    multirow_lines = find_measure_type_rows(lines)
    for i, (line_idx, lines_to_move, category_name) in enumerate(multirow_lines):
        lines[line_idx] = lines[line_idx].replace(r"\multirow[c]{"+ str(lines_to_move) + "}{*}{"+ category_name + "}", "")
        if i == len(multirow_lines) - 1:  # final category should have no following \midrule
            lines[line_idx + lines_to_move - 1] = r"\multirow[c]{"+ str(-1*lines_to_move) + "}{*}{"+ category_name + "}"+ lines[line_idx + lines_to_move - 1]
        else:
            lines[line_idx + lines_to_move - 1] = r"\multirow[c]{"+ str(-1*lines_to_move) + "}{*}{"+ category_name + "}"+ lines[line_idx + lines_to_move - 1] + r"\midrule"

    # Make the measure type col white
    # (every line from the first data line up to the last four lines gets the prefix)
    final_rows_to_exclude = 4
    lines = lines[:first_data_line] + [
        r"\cellcolor{white}" + line for line in lines[first_data_line:-final_rows_to_exclude]
    ] + lines[-final_rows_to_exclude:]

    # fix the column layout
    # The layout is derived from the header row starting with "\rowcolor{white} & Dataset"
    # (IndexError if there is none) and written over line 6, the \begin{tabular} line.
    col_layout = parse_table_layout([line for line in lines if line.startswith(r"\rowcolor{white} & Dataset")][0])
    lines[6] = r"\begin{tabular}{" + col_layout + "}"

    # Add closing of resizebox
    lines = lines[:-2] + [r"}"] + lines[-2:]

    #
    latex_str = "\n".join(lines)
    # print(latex_str)

    with open(out_path, "w") as f:
        f.write(latex_str)


## Language Tables

In [None]:
# Build the string-valued result table for the language domain: one row per
# (measure type, measure), one column per combination of the levels in `col_levels`.
# Select language results
datasets = ["MNLI", "SST2"]
archs = ["BERT", "ALBERT", "SmolLM2"]
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs)
# Tests whose cells get a significance marker derived from the "pval" column.
tests_with_pvals = ["Acc. Corr.", "JSD Corr.", "Disagr. Corr."]
verbose = False
col_levels = ["Type", "Token", "Test", "Eval.", "Dataset", "Modality", "Arch."]

# Create pivot table
pivot = pd.pivot_table(
    data.loc[idx],  # type: ignore
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values="value",
)
# Bring rows and columns into the display order.
pivot = pivot.sort_values(by=["Measure Type", "Sim Meas."])
pivot = pivot.reindex(measure_type_order, axis="index", level=0)
pivot = pivot.reindex(column_order, axis="columns", level="Test")
pivot = pivot.reindex(["cls", "mean"], axis="columns", level="Token")
pivot = pivot.reindex(archs, axis="columns", level="Arch.")
pivot = pivot.reindex(datasets, axis="columns", level="Dataset")
pivot = pivot.reindex(["Grounding by Prediction", "Grounding by Design"], axis="columns", level="Type")
if verbose:
    display(pivot.head())

# Turn values into strings for manipulation with significance markers
unpivot = pivot.unstack().unstack().dropna().reset_index()  # values will be in col "0"
# Column 1 receives the formatted strings; the first assignment is overwritten by the second.
unpivot.loc[:, 1] = unpivot.loc[:, 0].astype("str")
unpivot.loc[:, 1] = unpivot.loc[:, 0].apply(lambda x: f"{round(x, 2):.2f}")
pivot = unpivot.pivot(index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values=1,
).sort_values(
    by=["Measure Type", "Sim Meas."]).reindex(
    measure_type_order, axis="index", level=0
)
# (bare expression; has no effect in the middle of a cell)
unpivot
if verbose:
    display(unpivot.head(3))

# Highlight the best values by bolding
# (every cell that equals the column maximum is wrapped in \textbf{...})
for column in pivot.columns:
    col = pivot.loc[:, column].astype("float")
    idx = col == col.max()
    pivot.loc[idx, column] = pivot.loc[idx, column].apply(lambda s: r"\textbf{" + s + "}")
if verbose:
    display(pivot.head(3))


# Add significance markers
# 1) select data that should get markers
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs) & data.Test.isin(tests_with_pvals)
data_corr = data.loc[idx].copy()

# 2) Create new column with value and marker
data_corr["val_comb"] = data_corr["value"].apply(lambda x: f"{round(x, ndigits=2):.2f}") + data_corr["pval"].apply(pval_str)
if verbose:
    display(data_corr.head(3))

# 3) Create pivot table for values with markers that can be inserted into the main pivot table
pivot_corr = data_corr.pivot(
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values=["val_comb"],
).sort_values(
    by=["Measure Type", "Sim Meas."],
).reindex(
    measure_type_order, axis="index", level=0
).reindex(
    column_order, axis="columns", level="Test"
).loc[:, "val_comb"]
if verbose:
    display(pivot_corr.head())

# 4) Highlight the best scores by bolding
# The number and the marker are split so that only the number is wrapped in \textbf{...}.
for column in pivot_corr.columns:
    col = pivot_corr.loc[:, column].apply(floatify).astype("float")
    identifiers = pivot_corr.loc[:, column].apply(separate_significance_indicator)
    idx = col == col.max()
    new_col = col.apply(lambda x: f"{x:.2f}").apply(lambda s: r"\textbf{" + s + "}") + identifiers
    pivot_corr.loc[idx, column] = new_col

# Insert into main pivot
# (replaces the "Grounding by Prediction" columns by their versions with markers)
pivot.loc[:, ("Grounding by Prediction")] = pivot_corr
display(pivot.head())


# Write one LaTeX table per test and token aggregation (cls / mean) for the language domain.
# NOTE(review): the commented-out call below passes a column layout string as fifth
# argument, where `texify` takes `resizebox_width` — it looks outdated.
# texify(pivot, "tables/nlp_everything.tex", r"\emph{Results of Test 1-6 for the language domain}. In all cases, we use BERT models.", "tab:nlp_results", "l||rr|rr|rr||rr|rr|rr|rr|rr|rr|rr|rr")
texify(
    pivot.loc[:, ("Grounding by Prediction", "cls")],
    "tables/nlp_test_1_cls.tex",
    r"Results of Test 1 (\emph{Correlation to Accuracy Difference}) and Test 2 (\emph{Correlation to Output Difference}) for the language domain using CLS token representations.",
    "tab:nlp_test_1_cls",
)
texify(
    pivot.loc[:, ("Grounding by Prediction", "mean")],
    "tables/nlp_test_1_mean.tex",
    r"Results of Test 1 (\emph{Correlation to Accuracy Difference}) and Test 2 (\emph{Correlation to Output Difference}) for the language domain using mean-pooled token representations.",
    "tab:nlp_test_1_mean",
)

texify(
    pivot.loc[:, ("Grounding by Design", "cls", "Random Labels")],
    "tables/nlp_test_3_cls.tex",
    r"Results of Test 3 (\emph{Label Randomization}) for the language domain using CLS token representations.",
    "tab:nlp_test_3_cls",
)
texify(
    pivot.loc[:, ("Grounding by Design", "mean", "Random Labels")],
    "tables/nlp_test_3_mean.tex",
    r"Results of Test 3 (\emph{Label Randomization}) for the language domain using mean-pooled token representations.",
    "tab:nlp_test_3_mean",
)

texify(
    pivot.loc[:, ("Grounding by Design", "cls", "Shortcuts")],
    "tables/nlp_test_4_cls.tex",
    r"Results of Test 4 (\emph{Shortcut Affinity}) for the language domain using CLS token representations.",
    "tab:nlp_test_4_cls",
)
texify(
    pivot.loc[:, ("Grounding by Design", "mean", "Shortcuts")],
    "tables/nlp_test_4_mean.tex",
    r"Results of Test 4 (\emph{Shortcut Affinity}) for the language domain using mean-pooled token representations.",
    "tab:nlp_test_4_mean",
)

texify(
    pivot.loc[:, ("Grounding by Design", "cls", "Augmentation")],
    "tables/nlp_test_5_cls.tex",
    r"Results of Test 5 (\emph{Augmentation}) for the language domain using CLS token representations.",
    "tab:nlp_test_5_cls",
)
texify(
    pivot.loc[:, ("Grounding by Design", "mean", "Augmentation")],
    "tables/nlp_test_5_mean.tex",
    r"Results of Test 5 (\emph{Augmentation}) for the language domain using mean-pooled token representations.",
    "tab:nlp_test_5_mean",
)
texify(
    pivot.loc[:, ("Grounding by Design", "cls", "Layer Mono.")],
    "tables/nlp_test_6_cls.tex",
    r"Results of Test 6 (\emph{Layer Monotonicity}) for the language domain using CLS token representations.",
    "tab:nlp_test_6_cls",
)
texify(
    pivot.loc[:, ("Grounding by Design", "mean", "Layer Mono.")],
    "tables/nlp_test_6_mean.tex",
    r"Results of Test 6 (\emph{Layer Monotonicity}) for the language domain using mean-pooled token representations.",
    "tab:nlp_test_6_mean",
)

## Vision Tables - IN100

In [None]:
# Build the string-valued result table for the vision domain on ImageNet-100: one row
# per (measure type, measure), one column per combination of the levels in `col_levels`.
# Select vision results (IN100)
datasets = ["IN100"]
archs = ["RNet18", "RNet34", "RNet101", "VGG11", "VGG19", "ViT B32", "ViT L32"]
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs)
# Tests whose cells get a significance marker derived from the "pval" column.
tests_with_pvals = ["Acc. Corr.", "JSD Corr.", "Disagr. Corr."]
col_levels = ["Type", "Test", "Eval.", "Dataset", "Modality", "Arch."]


# Create pivot table
pivot = pd.pivot_table(
    data.loc[idx],  # type: ignore
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values="value",
)
# Bring rows and columns into the display order.
pivot = pivot.sort_values(by=["Measure Type", "Sim Meas."])
pivot = pivot.reindex(measure_type_order, axis="index", level=0)
pivot = pivot.reindex(column_order, axis="columns", level="Test")
pivot = pivot.reindex(archs, axis="columns", level="Arch.")
pivot = pivot.reindex(datasets, axis="columns", level="Dataset")
pivot = pivot.reindex(["Grounding by Prediction", "Grounding by Design"], axis="columns", level="Type")
display(pivot.head(3))

# Turn values into strings for manipulation with significance markers
unpivot = pivot.unstack().unstack().dropna().reset_index()  # values will be in col "0"
# Column 1 receives the formatted strings; the first assignment is overwritten by the second.
unpivot.loc[:, 1] = unpivot.loc[:, 0].astype("str")
unpivot.loc[:, 1] = unpivot.loc[:, 0].apply(lambda x: f"{round(x, 2):.2f}")
pivot = unpivot.pivot(
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values=1,
).sort_values(
    by=["Measure Type", "Sim Meas."]
).reindex(
    measure_type_order, axis="index", level=0
)
# (bare expression; has no effect in the middle of a cell)
unpivot
display(pivot.head(3))

# Highlight the best values by bolding
# (every cell that equals the column maximum is wrapped in \textbf{...})
for column in pivot.columns:
    col = pivot.loc[:, column].astype("float")
    idx = col == col.max()
    pivot.loc[idx, column] = pivot.loc[idx, column].apply(lambda s: r"\textbf{" + s + "}")
display(pivot.head(3))


# Add significance markers
# 1) select data that should get markers
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs) & data.Test.isin(tests_with_pvals)
data_corr = data.loc[idx].copy()

# 2) Create new column with value and marker
data_corr["val_comb"] = data_corr["value"].apply(lambda x: f"{round(x, ndigits=2):.2f}") + data_corr["pval"].apply(
    pval_str
)
display(data_corr.head(3))

# 3) Create pivot table for values with markers that can be inserted into the main pivot table
pivot_corr = (
    data_corr.pivot(
        index=["Measure Type", "Sim Meas."],
        columns=col_levels,
        values=["val_comb"],
    )
    .sort_values(by=["Measure Type", "Sim Meas."])
    .reindex(measure_type_order, axis="index", level=0)
    .reindex(column_order, axis="columns", level="Test")
    .loc[:, "val_comb"]
)
display(pivot_corr.head())

# 4) Highlight the best scores by bolding
# The number and the marker are split so that only the number is wrapped in \textbf{...}.
for column in pivot_corr.columns:
    col = pivot_corr.loc[:, column].apply(floatify).astype("float")
    identifiers = pivot_corr.loc[:, column].apply(separate_significance_indicator)
    idx = col == col.max()
    new_col = col.apply(lambda x: f"{x:.2f}").apply(lambda s: r"\textbf{" + s + "}") + identifiers
    pivot_corr.loc[idx, column] = new_col


# Insert into main pivot
# (replaces the "Grounding by Prediction" columns by their versions with markers)
pivot.loc[:, ("Grounding by Prediction")] = pivot_corr

# # Fix order of models
# pivot = pivot.reindex(archs, axis="columns", level="Arch.")

display(pivot.head())

In [None]:
# Write one LaTeX table per test for the vision domain on ImageNet-100.
# The Test 1 table is rendered at 0.7 of the line width, the others at the default width.
texify(
    pivot.loc[:, ("Grounding by Prediction", "Acc. Corr.")],
    "tables/vision_test_1.tex",
    r"Results of Test 1 (\emph{Correlation to Accuracy Difference}) for the vision domain on ImageNet-100.",
    "tab:vision_results_test_1",
    resizebox_width=0.7,
)

texify(
    pivot.loc[:, ("Grounding by Prediction", ["JSD Corr.", "Disagr. Corr."])],
    "tables/vision_test_2.tex",
    r"Results of Test 2 (\emph{Correlation to Output Difference}) for the vision domain on ImageNet-100.",
    "tab:vision_results_test_2",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Random Labels")],
    "tables/vision_test_3.tex",
    r"Results of Test 3 (\emph{Label Randomization}) for the vision domain on ImageNet-100.",
    "tab:vision_results_test_3",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Shortcuts")],
    "tables/vision_test_4.tex",
    r"Results of Test 4 (\emph{Shortcut Affinity}) for the vision domain on ImageNet-100.",
    "tab:vision_results_test_4",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Augmentation")],
    "tables/vision_test_5.tex",
    r"Results of Test 5 (\emph{Augmentation}) for the vision domain on ImageNet-100.",
    "tab:vision_results_test_5",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Layer Mono.")],
    "tables/vision_test_6.tex",
    r"Results of Test 6 (\emph{Layer Monotonicity}) for the vision domain on ImageNet-100.",
    "tab:vision_results_test_6",
)

## Vision Tables - CIFAR100


In [None]:
# Build the string-valued result table for the vision domain on CIFAR-100: one row
# per (measure type, measure), one column per combination of the levels in `col_levels`.
# Select vision results (CIFAR100)
datasets = ["CIFAR100"]
archs = ["RNet18", "RNet34", "RNet101", "VGG11", "VGG19", "ViT B32", "ViT L32"]
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs)
# Tests whose cells get a significance marker derived from the "pval" column.
tests_with_pvals = ["Acc. Corr.", "JSD Corr.", "Disagr. Corr."]
col_levels = ["Type", "Test", "Eval.", "Dataset", "Modality", "Arch."]


# Create pivot table
pivot = pd.pivot_table(
    data.loc[idx],  # type: ignore
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values="value",
)
# Bring rows and columns into the display order.
pivot = pivot.sort_values(by=["Measure Type", "Sim Meas."])
pivot = pivot.reindex(measure_type_order, axis="index", level=0)
pivot = pivot.reindex(column_order, axis="columns", level="Test")
pivot = pivot.reindex(archs, axis="columns", level="Arch.")
pivot = pivot.reindex(datasets, axis="columns", level="Dataset")
pivot = pivot.reindex(["Grounding by Prediction", "Grounding by Design"], axis="columns", level="Type")
display(pivot.head(3))

# Turn values into strings for manipulation with significance markers
unpivot = pivot.unstack().unstack().dropna().reset_index()  # values will be in col "0"
# Column 1 receives the formatted strings; the first assignment is overwritten by the second.
unpivot.loc[:, 1] = unpivot.loc[:, 0].astype("str")
unpivot.loc[:, 1] = unpivot.loc[:, 0].apply(lambda x: f"{round(x, 2):.2f}")
pivot = unpivot.pivot(
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values=1,
).sort_values(
    by=["Measure Type", "Sim Meas."]
).reindex(
    measure_type_order, axis="index", level=0
)
# (bare expression; has no effect in the middle of a cell)
unpivot
display(pivot.head(3))

# Highlight the best values by bolding
# (every cell that equals the column maximum is wrapped in \textbf{...})
for column in pivot.columns:
    col = pivot.loc[:, column].astype("float")
    idx = col == col.max()
    pivot.loc[idx, column] = pivot.loc[idx, column].apply(lambda s: r"\textbf{" + s + "}")
display(pivot.head(3))


# Add significance markers
# 1) select data that should get markers
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs) & data.Test.isin(tests_with_pvals)
data_corr = data.loc[idx].copy()

# 2) Create new column with value and marker
data_corr["val_comb"] = data_corr["value"].apply(lambda x: f"{round(x, ndigits=2):.2f}") + data_corr["pval"].apply(
    pval_str
)
display(data_corr.head(3))

# 3) Create pivot table for values with markers that can be inserted into the main pivot table
pivot_corr = (
    data_corr.pivot(
        index=["Measure Type", "Sim Meas."],
        columns=col_levels,
        values=["val_comb"],
    )
    .sort_values(by=["Measure Type", "Sim Meas."])
    .reindex(measure_type_order, axis="index", level=0)
    .reindex(column_order, axis="columns", level="Test")
    .loc[:, "val_comb"]
)
display(pivot_corr.head())

# 4) Highlight the best scores by bolding
# The number and the marker are split so that only the number is wrapped in \textbf{...}.
for column in pivot_corr.columns:
    col = pivot_corr.loc[:, column].apply(floatify).astype("float")
    identifiers = pivot_corr.loc[:, column].apply(separate_significance_indicator)
    idx = col == col.max()
    new_col = col.apply(lambda x: f"{x:.2f}").apply(lambda s: r"\textbf{" + s + "}") + identifiers
    pivot_corr.loc[idx, column] = new_col


# Insert into main pivot
# (replaces the "Grounding by Prediction" columns by their versions with markers)
pivot.loc[:, ("Grounding by Prediction")] = pivot_corr

# # Fix order of models
# pivot = pivot.reindex(archs, axis="columns", level="Arch.")

display(pivot.head())

In [None]:
# Write one LaTeX table per test for the vision domain on CIFAR-100.
# The Test 1 table is rendered at 0.7 of the line width, the others at the default width.
texify(
    pivot.loc[:, ("Grounding by Prediction", "Acc. Corr.")],
    "tables/vision_test_1_cifar.tex",
    r"Results of Test 1 (\emph{Correlation to Accuracy Difference}) for the vision domain on CIFAR-100.",
    "tab:vision_results_test_1_cifar",
    resizebox_width=0.7,
)

texify(
    pivot.loc[:, ("Grounding by Prediction", ["JSD Corr.", "Disagr. Corr."])],
    "tables/vision_test_2_cifar.tex",
    r"Results of Test 2 (\emph{Correlation to Output Difference}) for the vision domain on CIFAR-100.",
    "tab:vision_results_test_2_cifar",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Random Labels")],
    "tables/vision_test_3_cifar.tex",
    r"Results of Test 3 (\emph{Label Randomization}) for the vision domain on CIFAR-100.",
    "tab:vision_results_test_3_cifar",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Shortcuts")],
    "tables/vision_test_4_cifar.tex",
    r"Results of Test 4 (\emph{Shortcut Affinity}) for the vision domain on CIFAR-100.",
    "tab:vision_results_test_4_cifar",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Augmentation")],
    "tables/vision_test_5_cifar.tex",
    r"Results of Test 5 (\emph{Augmentation}) for the vision domain on CIFAR-100.",
    "tab:vision_results_test_5_cifar",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Layer Mono.")],
    "tables/vision_test_6_cifar.tex",
    r"Results of Test 6 (\emph{Layer Monotonicity}) for the vision domain on CIFAR-100.",
    "tab:vision_results_test_6_cifar",
)

## Graph Tables

In [None]:
# Build the string-valued result table for the graph domain: one row per
# (measure type, measure), one column per combination of the levels in `col_levels`.
# Select graph results
datasets = ["Cora", "Flickr", "OGBN-Arxiv"]
archs = ["GCN", "SAGE", "GAT", "PGNN"]
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs)
# Tests whose cells get a significance marker derived from the "pval" column.
tests_with_pvals = ["Acc. Corr.", "JSD Corr.", "Disagr. Corr."]
col_levels = ["Type", "Test", "Eval.", "Dataset", "Modality", "Arch."]
verbose = False

# Create pivot table
pivot = pd.pivot_table(
    data.loc[idx],  # type: ignore
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values="value",
)
# Bring rows and columns into the display order.
pivot = pivot.sort_values(by=["Measure Type", "Sim Meas."])
pivot = pivot.reindex(measure_type_order, axis="index", level=0)
pivot = pivot.reindex(column_order, axis="columns", level="Test")
pivot = pivot.reindex(archs, axis="columns", level="Arch.")
pivot = pivot.reindex(datasets, axis="columns", level="Dataset")
pivot = pivot.reindex(["Grounding by Prediction", "Grounding by Design"], axis="columns", level="Type")
if verbose:
    display(pivot.head())

# Turn values into strings for manipulation with significance markers
unpivot = pivot.unstack().unstack().dropna().reset_index()  # values will be in col "0"
# Column 1 receives the formatted strings; the first assignment is overwritten by the second.
unpivot.loc[:, 1] = unpivot.loc[:, 0].astype("str")
unpivot.loc[:, 1] = unpivot.loc[:, 0].apply(lambda x: f"{round(x, 2):.2f}")
pivot = unpivot.pivot(index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values=1,
).sort_values(
    by=["Measure Type", "Sim Meas."]).reindex(
    measure_type_order, axis="index", level=0
)
# (bare expression; has no effect in the middle of a cell)
unpivot
if verbose:
    display(unpivot.head(3))

# Highlight the best values by bolding
# (every cell that equals the column maximum is wrapped in \textbf{...})
for column in pivot.columns:
    col = pivot.loc[:, column].astype("float")
    idx = col == col.max()
    pivot.loc[idx, column] = pivot.loc[idx, column].apply(lambda s: r"\textbf{" + s + "}")
if verbose:
    display(pivot.head(3))


# Add significance markers
# 1) select data that should get markers
idx = data["Dataset"].isin(datasets) & data["Arch."].isin(archs) & data.Test.isin(tests_with_pvals)
data_corr = data.loc[idx].copy()

# 2) Create new column with value and marker
data_corr["val_comb"] = data_corr["value"].apply(lambda x: f"{round(x, ndigits=2):.2f}") + data_corr["pval"].apply(pval_str)
if verbose:
    display(data_corr.head(3))

# 3) Create pivot table for values with markers that can be inserted into the main pivot table
pivot_corr = data_corr.pivot(
    index=["Measure Type", "Sim Meas."],
    columns=col_levels,
    values=["val_comb"],
).sort_values(
    by=["Measure Type", "Sim Meas."],
).reindex(
    measure_type_order, axis="index", level=0
).reindex(
    column_order, axis="columns", level="Test"
).loc[:, "val_comb"]
if verbose:
    display(pivot_corr.head())

# 4) Highlight the best scores by bolding
# The number and the marker are split so that only the number is wrapped in \textbf{...}.
for column in pivot_corr.columns:
    col = pivot_corr.loc[:, column].apply(floatify).astype("float")
    identifiers = pivot_corr.loc[:, column].apply(separate_significance_indicator)
    idx = col == col.max()
    new_col = col.apply(lambda x: f"{x:.2f}").apply(lambda s: r"\textbf{" + s + "}") + identifiers
    pivot_corr.loc[idx, column] = new_col

# Insert into main pivot
# (replaces the "Grounding by Prediction" columns by their versions with markers)
pivot.loc[:, ("Grounding by Prediction")] = pivot_corr

display(pivot.head())

In [None]:
# Write one LaTeX table per test for the graph domain.
# The Test 1 table is rendered at 0.7 of the line width, the others at the default width.
texify(
    pivot.loc[:, ("Grounding by Prediction", "Acc. Corr.")],
    "tables/graph_test_1.tex",
    r"Results of Test 1 (\emph{Correlation to Accuracy Difference}) for the graph domain.",
    "tab:graph_results_test_1",
    resizebox_width=0.7,
)

texify(
    pivot.loc[:, ("Grounding by Prediction", ["JSD Corr.", "Disagr. Corr."])],
    "tables/graph_test_2.tex",
    r"Results of Test 2 (\emph{Correlation to Output Difference}) for the graph domain.",
    "tab:graph_results_test_2",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Random Labels")],
    "tables/graph_test_3.tex",
    r"Results of Test 3 (\emph{Label Randomization}) for the graph domain.",
    "tab:graph_results_test_3",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Shortcuts")],
    "tables/graph_test_4.tex",
    r"Results of Test 4 (\emph{Shortcut Affinity}) for the graph domain.",
    "tab:graph_results_test_4",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Augmentation")],
    "tables/graph_test_5.tex",
    r"Results of Test 5 (\emph{Augmentation}) for the graph domain.",
    "tab:graph_results_test_5",
)

texify(
    pivot.loc[:, ("Grounding by Design", "Layer Mono.")],
    "tables/graph_test_6.tex",
    r"Results of Test 6 (\emph{Layer Monotonicity}) for the graph domain.",
    "tab:graph_results_test_6",
)