# Correlating Node Properties with Subgroup Prediction Disagreement

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.plots.latex import update_rcParams, HUE_ORDER, set_size

## EDA

In [None]:
# Load the precomputed node-property table and take a first look at it.
nodeprops_path = "../reports/nodeprops.parquet"
df = pd.read_parquet(nodeprops_path)
print(len(df))  # number of rows
df.head()
# Last expression of the cell: the distinct values of the "Metric" column.
df["Metric"].unique()

In [None]:
# Two-stage aggregation of "SubgroupAcc1":
#   1. average within every (Dataset, Model, Property, Bin_val, InitSeed1) group,
#   2. take the mean and standard deviation of those per-InitSeed1 averages
#      within every (Dataset, Model, Property, Bin_val) group.
# The result is indexed by those four keys and has the columns "mean" and "std".
group_keys = ["Dataset", "Model", "Property", "Bin_val"]
per_seed_acc = df.groupby(group_keys + ["InitSeed1"], as_index=False)[
    "SubgroupAcc1"
].mean()
# Aggregators are named by string: passing np.mean / np.std is deprecated in
# pandas >= 2.1, and the strings keep the current behaviour and column names.
meanaccs = per_seed_acc.groupby(group_keys)["SubgroupAcc1"].agg(["mean", "std"])
meanaccs.head()

In [None]:
# Broadcast each group's mean subgroup accuracy from `meanaccs` back onto the
# matching rows of `df`, stored in the column "MeanSubgroupAcc1".
# Series.items() is used because Series.iteritems() was removed in pandas 2.0;
# both yield (index, value) pairs.
for (dataset, model, prop, binval), mean in meanaccs["mean"].items():
    row_mask = (
        (df.Dataset == dataset)
        & (df.Model == model)
        & (df.Bin_val == binval)
        & (df.Property == prop)
    )
    df.loc[row_mask, "MeanSubgroupAcc1"] = mean

df.head()

In [None]:
# Peek at the first (index tuple, mean) pair of the aggregated accuracies.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
next(iter(meanaccs["mean"].items()))

In [None]:
# Average "Value" and "Bin_size" within every
# (Dataset, Model, Property, Bin_val, Metric) group ...
metric_keys = ["Dataset", "Model", "Property", "Bin_val", "Metric"]
aggvals = df.groupby(metric_keys)[["Value", "Bin_size"]].mean()
# ... and attach the columns of `meanaccs` by merging on the index of both frames.
aggvals = aggvals.merge(meanaccs, left_index=True, right_index=True)

In [None]:
# Fixed orderings of property names and dataset names.
# NOTE(review): presumably meant for the row/column order of facet plots; they
# are not referenced by the cells visible here.
row_order = [
    "class",
    "pagerank",
    "degree",
]
col_order = [
    "CiteSeer",
    "Pubmed",
    "CS",
    "Physics",
    "Computers",
    "Photo",
    "WikiCS",
]

In [None]:
# Figure dimensions for fraction=1; `height` is reused as the facet height of
# the catplot further down.
width, height = set_size(fraction=1)

In [None]:
# Point plot of the "PI" metric per bin: one facet per (Property, Model)
# combination and one hue per dataset.
# matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and 3.8 removed
# the old name, so use whichever one this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
with plt.style.context(seaborn_style):
    metric = "PI"
    g = sns.catplot(
        data=df[df.Metric == metric],
        x="Bin",
        y="Value",
        hue="Dataset",
        col="Model",
        row="Property",
        kind="point",
        sharey=False,
        sharex=False,
        aspect=1.61,
        height=height,
        hue_order=HUE_ORDER,
    )
    # `style=` and `yscale=` are not catplot/pointplot parameters (depending on
    # the seaborn version they are ignored or rejected), so the logarithmic
    # y-axis is set on the facet axes instead.
    # NOTE(review): per-dataset markers would need pointplot's `markers=` list
    # with one entry per hue level — confirm the length of HUE_ORDER first.
    g.set(yscale="log")

## Paper Plots

In [None]:
# Paper figures: for every (property, dataset) pair, draw a box plot of the
# "PI" metric per bin with one box per model and save it to
# ../reports/<prop>_<dataset>.pdf. Only the first dataset's figure keeps a legend.
properties = [("pagerank", "PageRank"), ("clustering", "Clustering Coefficient"), ("kcore", "K-core")]
datasets = ["Photo", "Physics", "CS"]
metric = "PI"
font_scale = 2
# Legend text is looked up from the actual hue label, so the legend cannot be
# mislabeled if the models appear in a different order in the data.
model_labels = {"GCN2017": "GCN", "GAT2017": "GAT"}
# Font-size overrides are the same for every figure, so build them once.
rc_overrides = {
    "axes.labelsize": 8 * font_scale,
    "font.size": 8 * font_scale,
    "legend.fontsize": 6 * font_scale,
    "xtick.labelsize": 6 * font_scale,
    "ytick.labelsize": 6 * font_scale,
}
nrows, ncols = 1, 1
# matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and 3.8 removed
# the old name, so use whichever one this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"

for prop, prop_name in properties:
    for i, dataset in enumerate(datasets):
        with plt.style.context(seaborn_style), update_rcParams(rc_overrides):
            width, height = set_size(fraction=1)
            fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(width, height))

            pf = df.loc[
                (df["Metric"] == metric)
                & (df["Dataset"] == dataset)
                & (df["Property"] == prop)
            ]
            sns.boxplot(
                data=pf,
                x="Bin",
                y="Value",
                hue="Model",
                ax=ax,
            )
            ax.set_title(f"{prop_name} | {dataset}")
            ax.set_ylabel("Disagreement $d$")
            ax.set_xlabel("Septile")
            # Replace the bin labels with the septile numbers 1..7.
            ax.set_xticklabels([1, 2, 3, 4, 5, 6, 7])

            if i == 0:
                handles, labels = ax.get_legend_handles_labels()
                ax.legend(handles, [model_labels.get(label, label) for label in labels])
            else:
                legend = ax.get_legend()
                # No legend exists if nothing was drawn; nothing to remove then.
                if legend is not None:
                    legend.remove()

            fig.savefig(f"../reports/{prop}_{dataset}.pdf", bbox_inches="tight")

In [None]:
# Paper figures: for every (property, dataset) pair, draw a box plot of the
# "False PI" metric per bin with one box per model and save it to
# ../reports/<prop>_<dataset>_fpi.pdf. Only the first dataset's figure keeps a legend.
properties = [("pagerank", "PageRank"), ("clustering", "Clustering Coefficient"), ("kcore", "K-core")]
datasets = ["Photo", "Physics", "CS"]
metric = "False PI"
font_scale = 2
# Legend text is looked up from the actual hue label, so the legend cannot be
# mislabeled if the models appear in a different order in the data.
model_labels = {"GCN2017": "GCN", "GAT2017": "GAT"}
# Font-size overrides are the same for every figure, so build them once.
rc_overrides = {
    "axes.labelsize": 8 * font_scale,
    "font.size": 8 * font_scale,
    "legend.fontsize": 6 * font_scale,
    "xtick.labelsize": 6 * font_scale,
    "ytick.labelsize": 6 * font_scale,
}
nrows, ncols = 1, 1
# matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and 3.8 removed
# the old name, so use whichever one this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"

for prop, prop_name in properties:
    for i, dataset in enumerate(datasets):
        with plt.style.context(seaborn_style), update_rcParams(rc_overrides):
            width, height = set_size(fraction=1)
            fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(width, height))

            pf = df.loc[
                (df["Metric"] == metric)
                & (df["Dataset"] == dataset)
                & (df["Property"] == prop)
            ]
            sns.boxplot(
                data=pf,
                x="Bin",
                y="Value",
                hue="Model",
                ax=ax,
            )
            ax.set_title(f"{prop_name} | {dataset}")
            ax.set_ylabel("False Disagr. $d_{False}$")
            ax.set_xlabel("Septile")
            # Replace the bin labels with the septile numbers 1..7.
            ax.set_xticklabels([1, 2, 3, 4, 5, 6, 7])

            if i == 0:
                handles, labels = ax.get_legend_handles_labels()
                ax.legend(handles, [model_labels.get(label, label) for label in labels])
            else:
                legend = ax.get_legend()
                # No legend exists if nothing was drawn; nothing to remove then.
                if legend is not None:
                    legend.remove()

            fig.savefig(f"../reports/{prop}_{dataset}_fpi.pdf", bbox_inches="tight")

In [None]:
# Appendix figures: for every (metric, property, dataset) combination, draw a
# box plot of the metric per bin with one box per model and save it to
# ../reports/appendix/<prop>_<dataset>_<metric>.pdf. Figures are closed after
# saving; only the first dataset's figure keeps a legend.
properties = [("pagerank", "PageRank"), ("clustering", "Clustering Coefficient"), ("kcore", "K-core")]
datasets = ["Photo", "Physics", "CS", "Pubmed", "CiteSeer", "Computers", "WikiCS"]
font_scale = 2
# Y-axis label for each metric name found in df["Metric"].
metric_to_name = {
    "PI": "Disagreement $d$",
    "NormPI": "Norm. Dis. $d_{Norm}$",
    "False PI": "False Dis. $d_{False}$",
    "True PI": "True Dis. $d_{True}$",
    "MAE": "MAE",
    "SymKL": "Symmetric KL-Div",
}
# Legend text is looked up from the actual hue label, so the legend cannot be
# mislabeled if the models appear in a different order in the data.
model_labels = {"GCN2017": "GCN", "GAT2017": "GAT"}
# Font-size overrides are the same for every figure, so build them once.
rc_overrides = {
    "axes.labelsize": 8 * font_scale,
    "font.size": 8 * font_scale,
    "legend.fontsize": 6 * font_scale,
    "xtick.labelsize": 6 * font_scale,
    "ytick.labelsize": 6 * font_scale,
}
nrows, ncols = 1, 1
# matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and 3.8 removed
# the old name, so use whichever one this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"


for metric in ["PI", "False PI", "True PI", "NormPI", "MAE"]:
    for prop, prop_name in properties:
        for i, dataset in enumerate(datasets):
            with plt.style.context(seaborn_style), update_rcParams(rc_overrides):
                width, height = set_size(fraction=1)
                fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(width, height))

                pf = df.loc[
                    (df["Metric"] == metric)
                    & (df["Dataset"] == dataset)
                    & (df["Property"] == prop)
                ]
                sns.boxplot(
                    data=pf,
                    x="Bin",
                    y="Value",
                    hue="Model",
                    ax=ax,
                )
                ax.set_title(f"{prop_name} | {dataset}")
                ax.set_ylabel(metric_to_name[metric])
                ax.set_xlabel("Septile")
                try:
                    ax.set_xticklabels([1, 2, 3, 4, 5, 6, 7])
                except ValueError:
                    # Deliberate best effort: when the axis does not have
                    # exactly seven ticks, keep the tick labels seaborn drew.
                    pass

                if i == 0:
                    handles, labels = ax.get_legend_handles_labels()
                    ax.legend(handles, [model_labels.get(label, label) for label in labels])
                else:
                    legend = ax.get_legend()
                    # No legend exists if nothing was drawn; nothing to remove then.
                    if legend is not None:
                        legend.remove()

                fig.savefig(f"../reports/appendix/{prop}_{dataset}_{metric}.pdf", bbox_inches="tight")
                # Close this specific figure so the many appendix figures do
                # not accumulate in memory.
                plt.close(fig)

In [None]:
# Mean "Bin_size" per (Bin, Bin_val) of the "clustering" property, shown for
# the Photo dataset and the GAT2017 model.
clustering_rows = df[df.Property == "clustering"]
mean_bin_sizes = clustering_rows.groupby(["Dataset", "Model", "Bin", "Bin_val"])[["Bin_size"]].mean()
mean_bin_sizes.loc[("Photo", "GAT2017")]

In [None]:
# Inspect the aggregated rows of the "class" property for the GAT2017 model.
# pd.IndexSlice is bound here so that this cell does not rely on a later cell
# having been executed first.
idx = pd.IndexSlice
aggvals.head()
aggvals.loc[idx[:, "GAT2017":"GAT2017", "class", :]]

In [None]:
# Scatter grid of the aggregated "PI" value against the "mean" column of
# `aggvals`: one row per property, one column per dataset, coloured by model,
# with independent axes per facet. The grid is written to ./cache.
idx = pd.IndexSlice

pi_rows = aggvals.loc[idx[:, :, :, :, "PI"]]
independent_axes = {"sharex": False, "sharey": False}
g = sns.relplot(
    data=pi_rows,
    kind="scatter",
    x="mean",
    y="Value",
    hue="Model",
    row="Property",
    col="Dataset",
    facet_kws=independent_axes,
)
g.savefig("./cache/props_acc_class.png")

In [None]:
# For the GAT2017 model, draw one scatter plot per property of the aggregated
# "PI" value against the mean subgroup accuracy ("mean" column of `aggvals`),
# with one colour and marker per dataset, saved to
# ../reports/<prop>_<model>_acctopi.pdf.
idx = pd.IndexSlice
model = "GAT2017"
# Assigned before the rcParams overrides that read it, so the cell does not
# depend on a value left behind by an earlier cell.
font_scale = 2
datasets = ["CiteSeer", "Pubmed", "CS", "Physics", "Computers", "Photo", "WikiCS"]
markers = ["X", "v", "o", "D", "s", "P", "^"]
# One marker per dataset, paired by position.
dataset_markers = dict(zip(datasets, markers))
properties = [("pagerank", "PageRank"), ("clustering", "Clustering Coefficient"), ("kcore", "K-core"), ("class", "Class")]
nrows, ncols = 1, 1
rc_overrides = {
    "axes.labelsize": 8 * font_scale,
    "font.size": 8 * font_scale,
    "legend.fontsize": 6 * font_scale,
    "xtick.labelsize": 6 * font_scale,
    "ytick.labelsize": 6 * font_scale,
}
# matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and 3.8 removed
# the old name, so use whichever one this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"

with plt.style.context(seaborn_style), update_rcParams(rc_overrides):
    width, height = set_size(fraction=1)

    for prop, prop_name in properties:
        fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(width, height))
        pf = aggvals.loc[idx[:, model, prop, :, "PI"]]
        sns.scatterplot(
            data=pf,
            y="Value",
            x="mean",
            hue="Dataset",
            style="Dataset",
            ax=ax,
            alpha=0.8,
            hue_order=HUE_ORDER,
            legend=True,
            markers=dataset_markers,
        )
        ax.set_title(prop_name, fontdict={"size": 12})
        ax.set_ylabel("Disagreement $d$")
        ax.set_xlabel("Mean Subgroup Accuracy")

        handles, labels = ax.get_legend_handles_labels()
        # The "class" plot uses a single-column legend, the others two columns.
        ax.legend(
            labels=labels,
            handles=handles,
            ncol=1 if prop == "class" else 2,
            loc="best",
            columnspacing=0,
            fontsize=10,
            frameon=True,
        )
        fig.savefig(f"../reports/{prop}_{model}_acctopi.pdf", bbox_inches="tight")

                

In [None]:
# Appendix figures: for every (model, metric, property) combination, draw a
# scatter plot of the aggregated metric value against the mean subgroup
# accuracy ("mean" column of `aggvals`), with one colour and marker per
# dataset, saved to ../reports/appendix/<prop>_<model>_accto_<metric>.pdf.
idx = pd.IndexSlice

# Y-axis label for each metric name.
metric_to_name = {
    "PI": "Disagreement $d$",
    "NormPI": "Norm. Dis. $d_{Norm}$",
    "False PI": "False Dis. $d_{False}$",
    "True PI": "True Dis. $d_{True}$",
    "MAE": "MAE",
    "SymKL": "Symmetric KL-Div",
}
# Assigned before the rcParams overrides that read it, so the cell does not
# depend on a value left behind by an earlier cell.
font_scale = 2
datasets = ["CiteSeer", "Pubmed", "CS", "Physics", "Computers", "Photo", "WikiCS"]
markers = ["X", "v", "o", "D", "s", "P", "^"]
# One marker per dataset, paired by position.
dataset_markers = dict(zip(datasets, markers))
properties = [("pagerank", "PageRank"), ("clustering", "Clustering Coefficient"), ("kcore", "K-core"), ("class", "Class")]
nrows, ncols = 1, 1
rc_overrides = {
    "axes.labelsize": 8 * font_scale,
    "font.size": 8 * font_scale,
    "legend.fontsize": 6 * font_scale,
    "xtick.labelsize": 6 * font_scale,
    "ytick.labelsize": 6 * font_scale,
}
# matplotlib 3.6 renamed the "seaborn" style to "seaborn-v0_8" and 3.8 removed
# the old name, so use whichever one this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"


for model in ["GAT2017", "GCN2017"]:
    for metric in ["PI", "False PI", "True PI", "NormPI", "MAE"]:
        with plt.style.context(seaborn_style), update_rcParams(rc_overrides):
            width, height = set_size(fraction=1)

            for prop, prop_name in properties:
                fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(width, height))
                pf = aggvals.loc[idx[:, model, prop, :, metric]]
                sns.scatterplot(
                    data=pf,
                    y="Value",
                    x="mean",
                    hue="Dataset",
                    style="Dataset",
                    ax=ax,
                    alpha=0.8,
                    hue_order=HUE_ORDER,
                    legend=True,
                    markers=dataset_markers,
                )
                ax.set_title(prop_name, fontdict={"size": 12})
                ax.set_ylabel(metric_to_name[metric])
                ax.set_xlabel("Mean Subgroup Accuracy")

                handles, labels = ax.get_legend_handles_labels()
                # The "class" plot uses a single-column legend, the others two columns.
                ax.legend(
                    labels=labels,
                    handles=handles,
                    ncol=1 if prop == "class" else 2,
                    loc="best",
                    columnspacing=0,
                    fontsize=10,
                    frameon=True,
                )
                fig.savefig(f"../reports/appendix/{prop}_{model}_accto_{metric}.pdf", bbox_inches="tight")
                # The loops create 40 figures; close each one after saving so
                # they do not stay open for the rest of the session.
                plt.close(fig)

                        