# Reproduce plots

In [None]:
# IPython shell escape: print the version of the `python` executable found on the shell PATH.
!python --version

In [None]:
# Trivial expression; presumably a quick check that the kernel executes cells.
1+1

In [None]:
import pandas as pd
import seaborn as sns
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import matplotlib as mpl
import os.path as osp


# Reference width in inches from which the figure functions below derive their
# panel sizes (presumably the text width of the paper template -- confirm).
template_textwidth_inches = 5.50107
# NOTE(review): not referenced by the cells visible in this notebook export.
golden_ratio = 1.61803398875
# Maps full model identifiers to the shorter labels shown on the heatmap axes.
long_to_short_name = {
    "RedPajama-INCITE-7B-Base": "RedPajama",
    "bloom-7b1": "bloom",
    "falcon-7b": "falcon",
    "galactica-6.7b": "galactica",
    "gpt-j-6b": "gpt-j",
    "llama-7b": "llama",
    "mpt-7b": "mpt",
    "open-llama-7b": "open-llama",
    "opt-6.7b": "opt",
    "pythia-6.9b-deduped": "pythia-deduped",
    "stablelm-base-alpha-7b": "stablelm-alpha",
    "CodeLlama-7b-hf": "CodeLlama",
    "CodeLlama-7b-Python-hf": "CodeLlama-Python",
}

# Directory that holds the parquet result files loaded below.
parquet_basepath = (
    "results/zenodo"  # TODO: change this to wherever you downloaded the parquet files
)

def prepare_df(path):
    """Read the parquet file at ``path``, show its first rows, and return it.

    Args:
        path: Location of the parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        The DataFrame loaded from ``path``.
    """
    frame = pd.read_parquet(path)
    # Render a short preview (``DataFrame.head()``) in the notebook output.
    preview = frame.head()
    display(preview)
    return frame

In [None]:
# Load the Winogrande result tables (one parquet file per setting: otis, otistr).
df_otis = prepare_df(osp.join(parquet_basepath, "winogrande_otis.parquet"))
df_otistr = prepare_df(osp.join(parquet_basepath, "winogrande_otistr.parquet"))

# Load the HumanEval result tables for the same two settings.
human_otis = prepare_df(osp.join(parquet_basepath, "humaneval_otis.parquet"))
human_otistr = prepare_df(osp.join(parquet_basepath, "humaneval_otistr.parquet"))

## Fig 1a (Winogrande, OTIS)

In [None]:
# Notebook-level alias for the Winogrande OTIS frame.
df = df_otis


def winogrande_otis_paper_figure(df, score="score"):
    """Plot the Winogrande (OTIS) similarity heatmaps, one panel per measure.

    Each panel shows the strictly lower triangle of the model-by-model matrix
    built from the rows of ``df`` that belong to one measure.

    Args:
        df: Long-format results with the columns ``measure``, ``model1``,
            ``model2`` and the column named by ``score``.
        score: Name of the column whose values are plotted.

    Returns:
        The matplotlib figure holding all panels.
    """
    measures = [
        "Pipeline(normalize_matrix_norm+orthogonal_procrustes{})",
        "aligned_cossim",
        "rsm_norm_diff",
        "Pipeline(+jaccard_similarity{'k': 10})",
    ]
    measures_short_names = [
        "Orthogonal Procrustes",
        "Aligned Cossim",
        "Norm RSM-Diff (Cosine)",
        "Jaccard (k=10)",
    ]
    # Panels for these measures use the reversed colormap. Presumably they are
    # distances, so that brighter still means "more similar" as stated by the
    # colorbar label -- confirm.
    reverse_cmap_measures = [
        "Pipeline(normalize_matrix_norm+orthogonal_procrustes{})",
        "rsm_norm_diff",
    ]
    cbar_width_scaler = 1.3
    width_one_axis = template_textwidth_inches / len(measures) * cbar_width_scaler * 2

    fig, ax = plt.subplots(
        1,
        len(measures),
        figsize=(len(measures) * width_one_axis, width_one_axis),
        squeeze=False,
    )
    for i, (measure, measure_name) in enumerate(zip(measures, measures_short_names)):
        # Filter once per measure; the selection feeds both labels and graph.
        pairs = df.loc[df["measure"] == measure, ["model1", "model2", score]]
        ticklabels = sorted(
            set(pd.unique(pairs["model1"])).union(set(pd.unique(pairs["model2"])))
        )

        G = nx.from_pandas_edgelist(
            pairs.sort_values(by=["model1", "model2"], axis=0),
            source="model1",
            target="model2",
            edge_attr=score,
        )
        # ``toarray`` always yields a plain ndarray (``todense`` returns
        # ``np.matrix`` for scipy sparse matrices); the float cast guarantees
        # that NaN can be stored even if the scores are integers.
        data = (
            nx.adjacency_matrix(G, weight=score, nodelist=ticklabels)
            .toarray()
            .astype(float)
        )
        # The graph is undirected, so the matrix is symmetric: blank out the
        # diagonal and the upper triangle (NaN cells are not drawn).
        mask = np.triu(np.ones_like(data, dtype=bool), k=0)
        data[mask] = np.nan
        # Drop the first row and the last column, which are now entirely NaN.
        data = data[1:, :-1]

        ticklabels = [long_to_short_name.get(l, l) for l in ticklabels]
        xticklabels = ticklabels[:-1]
        # Only the leftmost panel carries y tick labels.
        yticklabels = ticklabels[1:] if i == 0 else [""] * (len(ticklabels) - 1)
        cmap = "rocket_r" if measure in reverse_cmap_measures else "rocket"
        _ = sns.heatmap(
            data,
            ax=ax[0, i],
            xticklabels=xticklabels,
            yticklabels=yticklabels,
            cmap=cmap,
            annot=False,
            annot_kws=dict(fontsize="xx-small"),
            square=False,
        )
        ax[0, i].set_title(measure_name)
        # Only the colorbar of the last panel gets the explanatory label.
        if (i + 1) == len(measures):
            with mpl.rc_context({"text.usetex": True}):
                ax[0, i].collections[0].colorbar.set_label(
                    r"$\leftarrow$ less similar       more similar $\rightarrow$"
                    + "\n(darker)            (brighter)"
                )
    return fig


# Build Fig 1a from the Winogrande OTIS results.
fig = winogrande_otis_paper_figure(
    df_otis
)
fig.show()
# Uncomment to write the figure to disk as a PDF.
# fig.savefig("figures/repsim_otis_hm.pdf", bbox_inches="tight")

## Fig 1b (HumanEval, OTIS)

In [None]:
def humaneval_otis_paper_figure(df, score="score"):
    """Plot the HumanEval (OTIS) similarity heatmaps, one panel per measure.

    Models are shown in the fixed order given by ``model_order`` below. Each
    panel shows the strictly lower triangle of the model-by-model matrix built
    from the rows of ``df`` that belong to one measure.

    Args:
        df: Long-format results with the columns ``measure``, ``model1``,
            ``model2`` and the column named by ``score``. It is copied, so the
            caller's frame is not modified.
        score: Name of the column whose values are plotted.

    Returns:
        The matplotlib figure holding all panels.
    """
    measures = [
        "Pipeline(normalize_matrix_norm+orthogonal_procrustes{})",
        "aligned_cossim",
        "rsm_norm_diff",
        "Pipeline(+jaccard_similarity{'k': 10})",
    ]
    measures_short_names = [
        "Orthogonal Procrustes",
        "Aligned Cossim",
        "Norm RSM-Diff (Cosine)",
        "Jaccard (k=10)",
    ]
    # Panels for these measures use the reversed colormap. Presumably they are
    # distances, so that brighter still means "more similar" as stated by the
    # colorbar label -- confirm.
    reverse_cmap_measures = [
        "Pipeline(normalize_matrix_norm+orthogonal_procrustes{})",
        "rsm_norm_diff",
    ]
    model_order = [
        "RedPajama-INCITE-7B-Base",
        "bloom-7b1",
        "falcon-7b",
        "galactica-6.7b",
        "gpt-j-6b",
        "llama-7b",
        "mpt-7b",
        "open-llama-7b",
        "opt-6.7b",
        "pythia-6.9b-deduped",
        "stablelm-base-alpha-7b",
        "CodeLlama-7b-hf",
        "CodeLlama-7b-Python-hf",
    ]

    # Ordered categoricals make ``sort_values`` follow ``model_order``.
    df = df.copy()
    df["model1"] = pd.Categorical(df["model1"], categories=model_order, ordered=True)
    df["model2"] = pd.Categorical(df["model2"], categories=model_order, ordered=True)

    cbar_width_scaler = 1.3
    width_one_axis = template_textwidth_inches / len(measures) * cbar_width_scaler * 2
    fig, ax = plt.subplots(
        1,
        len(measures),
        figsize=(len(measures) * width_one_axis * 1.1, width_one_axis),
        squeeze=False,
    )
    for i, (measure, measure_name) in enumerate(zip(measures, measures_short_names)):
        print(measure)

        # Filter once per measure; the selection feeds both labels and graph.
        pairs = df.loc[df["measure"] == measure, ["model1", "model2", score]]
        # Restrict the labels to models that occur for *this* measure: a
        # nodelist entry that is missing from the graph makes
        # ``nx.adjacency_matrix`` raise. The set is built once, not per model.
        present = set(pairs["model1"].unique()).union(set(pairs["model2"].unique()))
        ticklabels = [s for s in model_order if s in present]

        G = nx.from_pandas_edgelist(
            pairs.sort_values(by=["model1", "model2"], axis=0),
            source="model1",
            target="model2",
            edge_attr=score,
        )
        # ``toarray`` always yields a plain ndarray (``todense`` returns
        # ``np.matrix`` for scipy sparse matrices); the float cast guarantees
        # that NaN can be stored even if the scores are integers.
        data = (
            nx.adjacency_matrix(G, weight=score, nodelist=ticklabels)
            .toarray()
            .astype(float)
        )
        # The graph is undirected, so the matrix is symmetric: blank out the
        # diagonal and the upper triangle (NaN cells are not drawn).
        mask = np.triu(np.ones_like(data, dtype=bool), k=0)
        data[mask] = np.nan
        # Drop the first row and the last column, which are now entirely NaN.
        data = data[1:, :-1]

        ticklabels = [long_to_short_name.get(l, l) for l in ticklabels]
        xticklabels = ticklabels[:-1]
        # Only the leftmost panel carries y tick labels.
        yticklabels = ticklabels[1:] if i == 0 else [""] * (len(ticklabels) - 1)
        cmap = "rocket_r" if measure in reverse_cmap_measures else "rocket"
        _ = sns.heatmap(
            data,
            ax=ax[0, i],
            xticklabels=xticklabels,
            yticklabels=yticklabels,
            cmap=cmap,
            annot=False,
            annot_kws=dict(fontsize="xx-small"),
            square=False,
        )
        ax[0, i].set_title(measure_name)
        # Only the colorbar of the last panel gets the explanatory label.
        if (i + 1) == len(measures):
            with mpl.rc_context({"text.usetex": True}):
                ax[0, i].collections[0].colorbar.set_label(
                    r"$\leftarrow$ less similar       more similar $\rightarrow$"
                    + "\n(darker)            (brighter)"
                )
    return fig


# Build Fig 1b from the HumanEval OTIS results.
fig = humaneval_otis_paper_figure(human_otis)
fig.show()
# Uncomment to write the figure to disk as a PDF.
# fig.savefig("figures/repsim_humaneval_otis_hm.pdf", bbox_inches="tight")

## Fig 2a (Winogrande, OTISTR)

In [None]:
def winogrande_otistr_paper_figure(df, score="score"):
    """Plot the Winogrande (OTISTR) similarity heatmaps, one panel per measure.

    Each panel shows the strictly lower triangle of the model-by-model matrix
    built from the rows of ``df`` that belong to one measure.

    Args:
        df: Long-format results with the columns ``measure``, ``model1``,
            ``model2`` and the column named by ``score``.
        score: Name of the column whose values are plotted.

    Returns:
        The matplotlib figure holding all panels.
    """
    measures = [
        "Pipeline(center_columns+normalize_matrix_norm+orthogonal_procrustes{})",
        "Pipeline(center_columns+aligned_cossim{})",
        "Pipeline(center_columns+normalize_matrix_norm+rsm_norm_diff{'inner': 'euclidean'})",
        "Pipeline(center_columns+jaccard_similarity{})",
        "Pipeline(normalize_matrix_norm+representational_similarity_analysis{'inner': 'euclidean', 'outer': 'spearman'})",
        "centered_kernel_alignment",
    ]
    measures_short_names = [
        "Orthogonal\nProcrustes",
        "Aligned Cossim",
        "Norm RSM-Diff\n(Euclidean)",
        "Jaccard (k=10)",
        "RSA\n(Euclidean, Spearman)",
        "CKA",
    ]
    # Panels for these measures use the reversed colormap. Presumably they are
    # distances, so that brighter still means "more similar" as stated by the
    # colorbar label -- confirm.
    reverse_cmap_measures = [
        "Pipeline(center_columns+normalize_matrix_norm+orthogonal_procrustes{})",
        "Pipeline(center_columns+normalize_matrix_norm+rsm_norm_diff{'inner': 'euclidean'})",
    ]
    cbar_width_scaler = 1.3
    width_one_axis = template_textwidth_inches / len(measures) * cbar_width_scaler * 2

    fig, ax = plt.subplots(
        1,
        len(measures),
        figsize=(len(measures) * width_one_axis, width_one_axis),
        squeeze=False,
    )
    for i, (measure, measure_name) in enumerate(zip(measures, measures_short_names)):
        print(measure)
        # Filter once per measure; the selection feeds both labels and graph.
        pairs = df.loc[df["measure"] == measure, ["model1", "model2", score]]
        ticklabels = sorted(
            set(pd.unique(pairs["model1"])).union(set(pd.unique(pairs["model2"])))
        )

        G = nx.from_pandas_edgelist(
            pairs.sort_values(by=["model1", "model2"], axis=0),
            source="model1",
            target="model2",
            edge_attr=score,
        )
        # ``toarray`` always yields a plain ndarray (``todense`` returns
        # ``np.matrix`` for scipy sparse matrices); the float cast guarantees
        # that NaN can be stored even if the scores are integers.
        data = (
            nx.adjacency_matrix(G, weight=score, nodelist=ticklabels)
            .toarray()
            .astype(float)
        )
        # The graph is undirected, so the matrix is symmetric: blank out the
        # diagonal and the upper triangle (NaN cells are not drawn).
        mask = np.triu(np.ones_like(data, dtype=bool), k=0)
        data[mask] = np.nan
        # Drop the first row and the last column, which are now entirely NaN.
        data = data[1:, :-1]

        ticklabels = [long_to_short_name.get(l, l) for l in ticklabels]
        xticklabels = ticklabels[:-1]
        # Only the leftmost panel carries y tick labels.
        yticklabels = ticklabels[1:] if i == 0 else [""] * (len(ticklabels) - 1)
        cmap = "rocket_r" if measure in reverse_cmap_measures else "rocket"
        _ = sns.heatmap(
            data,
            ax=ax[0, i],
            xticklabels=xticklabels,
            yticklabels=yticklabels,
            cmap=cmap,
            annot=False,
            annot_kws=dict(fontsize="xx-small"),
            square=False,
        )
        ax[0, i].set_title(measure_name)
        # Only the colorbar of the last panel gets the explanatory label.
        if (i + 1) == len(measures):
            with mpl.rc_context({"text.usetex": True}):
                ax[0, i].collections[0].colorbar.set_label(
                    r"$\leftarrow$ less similar       more similar $\rightarrow$"
                    + "\n(darker)            (brighter)"
                )
    return fig


# Build Fig 2a from the Winogrande OTISTR results.
fig = winogrande_otistr_paper_figure(df_otistr)
fig.show()
# Uncomment to write the figure to disk as a PDF.
# fig.savefig("figures/repsim_otistr_hm.pdf", bbox_inches="tight")

## Fig 2b (HumanEval, OTISTR)

In [None]:
def humaneval_otistr_paper_figure(df, score="score"):
    """Plot the HumanEval (OTISTR) similarity heatmaps, one panel per measure.

    Models are shown in the fixed order given by ``model_order`` below. Each
    panel shows the strictly lower triangle of the model-by-model matrix built
    from the rows of ``df`` that belong to one measure.

    Args:
        df: Long-format results with the columns ``measure``, ``model1``,
            ``model2`` and the column named by ``score``. It is copied, so the
            caller's frame is not modified.
        score: Name of the column whose values are plotted.

    Returns:
        The matplotlib figure holding all panels.
    """
    measures = [
        "Pipeline(center_columns+normalize_matrix_norm+orthogonal_procrustes{})",
        "Pipeline(center_columns+aligned_cossim{})",
        "Pipeline(center_columns+normalize_matrix_norm+rsm_norm_diff{'inner': 'euclidean'})",
        "Pipeline(center_columns+jaccard_similarity{})",
        "Pipeline(normalize_matrix_norm+representational_similarity_analysis{'inner': 'euclidean', 'outer': 'spearman'})",
        "centered_kernel_alignment",
    ]
    measures_short_names = [
        "Orthogonal\nProcrustes",
        "Aligned Cossim",
        "Norm RSM-Diff\n(Euclidean)",
        "Jaccard (k=10)",
        "RSA\n(Euclidean, Spearman)",
        "CKA",
    ]
    # Panels for these measures use the reversed colormap. Presumably they are
    # distances, so that brighter still means "more similar" as stated by the
    # colorbar label -- confirm.
    reverse_cmap_measures = [
        "Pipeline(center_columns+normalize_matrix_norm+orthogonal_procrustes{})",
        "Pipeline(center_columns+normalize_matrix_norm+rsm_norm_diff{'inner': 'euclidean'})",
    ]
    model_order = [
        "RedPajama-INCITE-7B-Base",
        "bloom-7b1",
        "falcon-7b",
        "galactica-6.7b",
        "gpt-j-6b",
        "llama-7b",
        "mpt-7b",
        "open-llama-7b",
        "opt-6.7b",
        "pythia-6.9b-deduped",
        "stablelm-base-alpha-7b",
        "CodeLlama-7b-hf",
        "CodeLlama-7b-Python-hf",
    ]

    # Ordered categoricals make ``sort_values`` follow ``model_order``.
    df = df.copy()
    df["model1"] = pd.Categorical(df["model1"], categories=model_order, ordered=True)
    df["model2"] = pd.Categorical(df["model2"], categories=model_order, ordered=True)

    cbar_width_scaler = 1.3
    width_one_axis = template_textwidth_inches / len(measures) * cbar_width_scaler * 2
    fig, ax = plt.subplots(
        1,
        len(measures),
        figsize=(len(measures) * width_one_axis * 1.1, width_one_axis),
        squeeze=False,
    )
    for i, (measure, measure_name) in enumerate(zip(measures, measures_short_names)):
        print(measure)
        # Filter once per measure; the selection feeds both labels and graph.
        pairs = df.loc[df["measure"] == measure, ["model1", "model2", score]]
        # Restrict the labels to models that occur for *this* measure: a
        # nodelist entry that is missing from the graph makes
        # ``nx.adjacency_matrix`` raise. The set is built once, not per model.
        present = set(pairs["model1"].unique()).union(set(pairs["model2"].unique()))
        ticklabels = [s for s in model_order if s in present]

        G = nx.from_pandas_edgelist(
            pairs.sort_values(by=["model1", "model2"], axis=0),
            source="model1",
            target="model2",
            edge_attr=score,
        )
        # ``toarray`` always yields a plain ndarray (``todense`` returns
        # ``np.matrix`` for scipy sparse matrices); the float cast guarantees
        # that NaN can be stored even if the scores are integers.
        data = (
            nx.adjacency_matrix(G, weight=score, nodelist=ticklabels)
            .toarray()
            .astype(float)
        )
        # The graph is undirected, so the matrix is symmetric: blank out the
        # diagonal and the upper triangle (NaN cells are not drawn).
        mask = np.triu(np.ones_like(data, dtype=bool), k=0)
        data[mask] = np.nan
        # Drop the first row and the last column, which are now entirely NaN.
        data = data[1:, :-1]

        ticklabels = [long_to_short_name.get(l, l) for l in ticklabels]
        xticklabels = ticklabels[:-1]
        # Only the leftmost panel carries y tick labels.
        yticklabels = ticklabels[1:] if i == 0 else [""] * (len(ticklabels) - 1)
        cmap = "rocket_r" if measure in reverse_cmap_measures else "rocket"
        _ = sns.heatmap(
            data,
            ax=ax[0, i],
            xticklabels=xticklabels,
            yticklabels=yticklabels,
            cmap=cmap,
            annot=False,
            annot_kws=dict(fontsize="xx-small"),
            square=False,
        )
        ax[0, i].set_title(measure_name)
        # Only the colorbar of the last panel gets the explanatory label.
        if (i + 1) == len(measures):
            with mpl.rc_context({"text.usetex": True}):
                ax[0, i].collections[0].colorbar.set_label(
                    r"$\leftarrow$ less similar       more similar $\rightarrow$"
                    + "\n(darker)            (brighter)"
                )
    return fig

# Build Fig 2b from the HumanEval OTISTR results.
fig = humaneval_otistr_paper_figure(
    human_otistr
)
fig.show()
# Uncomment to write the figure to disk as a PDF.
# fig.savefig("figures/repsim_humaneval_otistr_hm.pdf", bbox_inches="tight")

## Correlations

In [None]:
from scipy.stats import spearmanr
import scipy.spatial.distance

### Correlations across datasets per measures

In [None]:
# The two result tables that are compared: Winogrande (OTIS) vs. HumanEval (OTIS).
df1 = df_otis
df2 = human_otis


def all_models(df):
    """Return the set of every model name found in ``model1`` or ``model2``."""
    first_side = set(df["model1"].unique())
    second_side = set(df["model2"].unique())
    return first_side | second_side


# Models that occur in both result tables.
models_in_both_dfs = all_models(df1).intersection(all_models(df2))


def cross_dataset_correlation(df1, df2, distance_measures, score="score"):
    """Correlate pairwise model scores between two result tables, per measure.

    For every measure present in both frames, the scores of all model pairs
    whose two models occur in both frames are collected from each frame and
    compared with Spearman and Pearson correlation.

    Args:
        df1: Long-format results with the columns ``measure``, ``model1``,
            ``model2`` and the column named by ``score``.
        df2: Second results frame with the same columns.
        distance_measures: Measures whose scores are negated before the
            correlations are computed.
        score: Name of the column holding the pairwise values.

    Returns:
        A dict mapping each shared measure to
        ``{"spearman": <statistic>, "pearson": <1x1 ndarray>}``. The Pearson
        entry is ``1 - cdist(..., metric="correlation")`` and therefore keeps
        the ``(1, 1)`` array shape returned by ``cdist``.
    """

    def _models(df):
        # Every model that appears on either side of a pair.
        return set(pd.unique(df["model1"])).union(pd.unique(df["model2"]))

    def _select(df, measure):
        # Rows of one measure whose two models are shared by both frames.
        keep = (
            df["model1"].isin(shared_models)
            & df["model2"].isin(shared_models)
            & (df["measure"] == measure)
        )
        return df.loc[keep, ["model1", "model2", score]]

    def get_array(pairs):
        # Flatten the strictly lower triangle of the model-by-model matrix.
        ticklabels = sorted(_models(pairs))
        G = nx.from_pandas_edgelist(
            pairs.sort_values(by=["model1", "model2"], axis=0),
            source="model1",
            target="model2",
            edge_attr=score,
        )
        # ``toarray`` always yields a plain 2-D ndarray; ``todense`` may give
        # ``np.matrix``, whose ``flatten`` stays 2-D and breaks the
        # correlation calls below.
        data = (
            nx.adjacency_matrix(G, weight=score, nodelist=ticklabels)
            .toarray()
            .astype(float)
        )
        # The graph is undirected (symmetric matrix), so the lower triangle
        # holds every pair once. Indexing it directly keeps the row-major
        # order and, unlike filtering NaNs, cannot shift pairs out of
        # alignment when a score itself is NaN.
        rows, cols = np.tril_indices(data.shape[0], k=-1)
        return data[rows, cols]

    # Derive the shared models from the arguments instead of relying on a
    # notebook-level global that may belong to different frames.
    shared_models = _models(df1).intersection(_models(df2))

    # Sorted for a reproducible iteration (and output) order.
    measures = sorted(
        set(pd.unique(df1["measure"])).intersection(pd.unique(df2["measure"]))
    )
    corrs = {}
    for measure in measures:
        data1 = get_array(_select(df1, measure))
        data2 = get_array(_select(df2, measure))
        if measure in distance_measures:
            # Negating both arrays leaves either correlation unchanged; kept
            # so the compared values are oriented as similarities.
            data1, data2 = -1 * data1, -1 * data2

        corrs[measure] = {
            "spearman": spearmanr(data1, data2).statistic,
            "pearson": 1
            - scipy.spatial.distance.cdist(
                data1.reshape(1, -1), data2.reshape(1, -1), metric="correlation"
            ),
        }
    return corrs


# Correlate Winogrande and HumanEval scores; the listed measures are passed as
# ``distance_measures``.
res = cross_dataset_correlation(
    df1,
    df2,
    distance_measures=[
        "Pipeline(normalize_matrix_norm+orthogonal_procrustes{})",
        "rsm_norm_diff",
    ],
)
# Report the correlations of every measure.
for measure, corrs in res.items():
    print(measure)
    print(corrs)
# Mean Spearman correlation over all measures; shown as the cell output.
avg_spearman_corr = sum(entry["spearman"] for entry in res.values()) / len(res)
avg_spearman_corr