In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from src.plots.latex import update_rcParams, HUE_ORDER, set_size

In [None]:
# Load the public-split results and print the number of rows that were read.
df = pd.read_parquet(Path("../reports/publicsplit.parquet"))
print(len(df))

# Derive the signed and the absolute difference between the Acc1 and Acc2 columns.
df = df.assign(
    acc_diff=lambda frame: frame["Acc1"] - frame["Acc2"],
    abs_acc_diff=lambda frame: frame["acc_diff"].abs(),
)
df.head()

The following cells are only needed if you want to build the full table, including the results on CPU and with fixed seeds.
Otherwise, skip them and continue with the next section ("Create a summary table").

In [None]:

# Load the results stored in public_gatcpu.parquet and label every row as
# "GAT (CPU)" so they form their own group in the summary table.
cpu = pd.read_parquet(Path("../reports/public_gatcpu.parquet"))
cpu = cpu.assign(
    Model="GAT (CPU)",
    acc_diff=lambda frame: frame["Acc1"] - frame["Acc2"],
    # `df` carries an abs_acc_diff column as well; without it these rows would
    # only contribute NaN to any summary built on abs_acc_diff after merging.
    abs_acc_diff=lambda frame: frame["acc_diff"].abs(),
)
cpu.head()

In [None]:
# Merge the CPU results into the main frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# result and also works on older pandas versions.
# NOTE: re-running this cell adds the same rows again.
df = pd.concat([df, cpu], ignore_index=True)

In [None]:
fixedseed = pd.read_parquet("../reports/public_fixedseed.parquet")


def convert_name(s: str):
    """Relabel the base model names so the fixed-seed runs get their own group.

    "GAT2017" and "GCN2017" are mapped to "GAT (fixed)" and "GCN (fixed)";
    any other name is returned unchanged.
    """
    fixed_names = {
        "GAT2017": "GAT (fixed)",
        "GCN2017": "GCN (fixed)",
    }
    return fixed_names.get(s, s)


fixedseed = fixedseed.assign(
    Model=lambda frame: frame["Model"].apply(convert_name),
    acc_diff=lambda frame: frame["Acc1"] - frame["Acc2"],
    # `df` carries an abs_acc_diff column as well; without it these rows would
    # only contribute NaN to any summary built on abs_acc_diff after merging.
    abs_acc_diff=lambda frame: frame["acc_diff"].abs(),
)
fixedseed.head()

In [None]:
# Merge the fixed-seed results into the main frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# result and also works on older pandas versions.
# NOTE: re-running this cell adds the same rows again.
df = pd.concat([df, fixedseed], ignore_index=True)

In [None]:

# Read from the same relative reports directory as the other cells instead of a
# machine-specific absolute path, so the notebook runs from any checkout.
fixedseed_cpu = pd.read_parquet("../reports/public_fixedseed_cpu.parquet")


def convert_name(s: str):
    """Relabel the base model names so the fixed-seed CPU runs get their own group.

    "GAT2017" and "GCN2017" are mapped to "GAT (fixed, CPU)" and
    "GCN (fixed, CPU)"; any other name is returned unchanged.
    """
    fixed_cpu_names = {
        "GAT2017": "GAT (fixed, CPU)",
        "GCN2017": "GCN (fixed, CPU)",
    }
    return fixed_cpu_names.get(s, s)


fixedseed_cpu = fixedseed_cpu.assign(
    Model=lambda frame: frame["Model"].apply(convert_name),
    acc_diff=lambda frame: frame["Acc1"] - frame["Acc2"],
    # `df` carries an abs_acc_diff column as well; without it these rows would
    # only contribute NaN to any summary built on abs_acc_diff after merging.
    abs_acc_diff=lambda frame: frame["acc_diff"].abs(),
)
fixedseed_cpu.head()

In [None]:
# Merge the fixed-seed CPU results into the main frame.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# result and also works on older pandas versions.
# NOTE: re-running this cell adds the same rows again.
df = pd.concat([df, fixedseed_cpu], ignore_index=True)

## Create a summary table

In [None]:
def _mean_times_100(values):
    """Mean of ``values`` scaled by 100."""
    return 100 * np.mean(values)


def _std_times_100(values):
    """Standard deviation of ``values`` (NumPy default settings) scaled by 100."""
    return 100 * np.std(values)


# Every summary is grouped by (Dataset, Model).
group_keys = ["Dataset", "Model"]

# Per-metric mean and spread of the "Value" column, one column per Metric.
meantab = pd.pivot_table(
    df,
    values="Value",
    index=group_keys,
    columns=["Metric"],
    aggfunc=_mean_times_100,
)
stdtab = pd.pivot_table(
    df,
    values="Value",
    index=group_keys,
    columns=["Metric"],
    aggfunc=_std_times_100,
)

# Accuracy summaries: both the "Acc-mean" and the "Acc-std" column are averaged.
meanacctab = pd.pivot_table(
    df,
    values="Acc-mean",
    index=group_keys,
    aggfunc=_mean_times_100,
)
stdacctab = pd.pivot_table(
    df,
    values="Acc-std",
    index=group_keys,
    aggfunc=_mean_times_100,
)

# Average absolute accuracy difference: avg lower bound for PI.
meandifftab = pd.pivot_table(
    df,
    values="abs_acc_diff",
    index=group_keys,
    aggfunc=_mean_times_100,
)


In [None]:
def mean_conv(x):
    """Format a mean with one decimal, followed by the LaTeX plus-minus sign."""
    return f"{x:0.1f} $\\pm$"


def std_conv(x):
    """Format a standard deviation with one decimal, preceded by a space."""
    return f" {x:0.1f}"


# Concatenate the formatted means and standard deviations cell by cell.
tab = meantab.applymap(mean_conv) + stdtab.applymap(std_conv)

# Build the accuracy column the same way and attach it under the name "Acc-mean".
acc_mean_cells = meanacctab.applymap(mean_conv)["Acc-mean"]
acc_std_cells = stdacctab.applymap(std_conv)["Acc-std"]
tempseries = (acc_mean_cells + acc_std_cells).rename("Acc-mean")
tab = tab.join(tempseries)


def col_order(idx):
    """Sort key that ranks column labels in the order used by the summary table.

    Meant to be passed as ``key`` to ``DataFrame.sort_index(axis=1, ...)``.

    Parameters
    ----------
    idx : iterable of str
        Column labels to rank.

    Returns
    -------
    pd.Index
        One integer rank per label, in input order. Labels that are not part
        of the predefined ordering are ranked after all known labels instead
        of raising ``KeyError``.
    """
    col_to_idx = {
        "Acc-mean": 0,
        "PI": 1,
        "NormPI": 2,
        "True PI": 3,
        "False PI": 4,
        "MAE": 5,
        "SymKL": 6,
    }
    # Rank shared by every label that has no explicit position.
    unknown_rank = len(col_to_idx)
    return pd.Index([col_to_idx.get(s, unknown_rank) for s in idx])


def row_order(idx):
    """Sort key that ranks dataset names in the order used by the summary table.

    Meant to be passed as ``key`` to ``DataFrame.sort_index(axis=0, level=0, ...)``.

    Parameters
    ----------
    idx : iterable of str
        Dataset names to rank.

    Returns
    -------
    pd.Index
        One integer rank per name, in input order. Names that are not part of
        the predefined ordering are ranked after all known datasets instead of
        raising ``KeyError``.
    """
    dataset_to_idx = {
        "CiteSeer": 0,
        "Pubmed": 1,
        "CS": 2,
        "Physics": 3,
        "Computers": 4,
        "Photo": 5,
        "WikiCS": 6,
    }
    # Rank shared by every dataset that has no explicit position.
    unknown_rank = len(dataset_to_idx)
    return pd.Index([dataset_to_idx.get(s, unknown_rank) for s in idx])


# Display names for the table header and for the model level of the row index.
col_to_name = {
    "Acc-mean": "Accuracy",
    "PI": "$d$",
    "NormPI": "$d_{norm}$",
    "True PI": "$d_{True}$",
    "False PI": "$d_{False}$",
    "MAE": "MAE",
    "SymKL": "SymKL",
}
model_to_name = {"GAT2017": "GAT", "GCN2017": "GCN"}

# Order columns and datasets first (the sort keys work on the raw labels),
# then switch to the display names and drop the SymKL column from the table.
tab = (
    tab.sort_index(axis=1, key=col_order)
    .sort_index(axis=0, key=row_order, level=0)
    .rename(col_to_name, axis="columns")
    .rename(model_to_name, axis="rows", level=1)
    .drop("SymKL", axis="columns")
)

# Write the LaTeX table; escape=False keeps the math markup in the cells intact.
tab.to_latex(
    "../reports/tab1.tex",
    sparsify=True,
    escape=False,
    multirow=True,
    multicolumn=False,
    column_format="llcccccc",
)

# Last expression of the cell, so the notebook actually renders the table.
tab



## Poster Figure

In [None]:
# Flatten the pivot tables so seaborn can address Dataset/Model as columns.
tmpdf1 = meantab.reset_index()
tmpdf2 = meandifftab.reset_index()
datasets = ["CS", "Photo", "Pubmed"]

# Matplotlib 3.6 renamed the bundled "seaborn" style to "seaborn-v0_8" and later
# removed the old name; use whichever one this installation provides.
seaborn_style = "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"

with plt.style.context(seaborn_style):
    with update_rcParams(
        {
            "font.size": 20,
            "axes.labelsize": 18,
            "legend.fontsize": 15,
            "xtick.labelsize": 18,
            "ytick.labelsize": 18,
        }
    ):
        width, height = set_size(1690, fraction=0.25)

        # Base layer: mean PI per dataset for the two models.
        g = sns.catplot(
            data=tmpdf1[tmpdf1.Dataset.isin(datasets)],
            x="PI",
            y="Dataset",
            hue="Model",
            kind="bar",
            hue_order=["GCN2017", "GAT2017"],
            legend=False,
            height=height,
            aspect=width / height,  # golden ratio
        )

        # Overlay: mean absolute accuracy difference on the same axes,
        # drawn with its own palette.
        sns.barplot(
            data=tmpdf2[tmpdf2.Dataset.isin(datasets)],
            x="abs_acc_diff",
            y="Dataset",
            hue="Model",
            hue_order=["GCN2017", "GAT2017"],
            palette={"GCN2017": "#647796", "GAT2017": "#6b9475"},
            ax=g.ax,
        )

        # Keep only the first two legend handles and label them GCN / GAT.
        handles, _ = g.ax.get_legend_handles_labels()
        g.ax.legend(
            handles[0:2],
            ["GCN", "GAT"],
            ncol=1,
            loc="best",
            fancybox=False,
            frameon=True,
        )

        # Raw string: "\%" is not a valid Python escape sequence, and the
        # backslash has to reach the label text unchanged.
        g.set_xlabels(r"Disagreement (\%)")
        g.set_ylabels("")
        g.savefig("../reports/poster/bar.pdf")
