In [None]:
import os
import json
import warnings

In [None]:
import numpy as np
import scipy as sp
import scipy.stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# File formats every figure is exported in; each one gets its own sub-directory.
SAVING_FORMATS = ["pdf", "svg", "png"]

# Input locations (result CSVs, best.json) and the root folder for exported images.
internal_valid_results_folder = "results/"
results_folder = "results/metabric"
images_folder = "images/metabric"

# Make sure the image root and one sub-folder per export format exist.
os.makedirs(images_folder, exist_ok=True)
for fmt in SAVING_FORMATS:
    os.makedirs(os.path.join(images_folder, fmt), exist_ok=True)

In [None]:
# Metrics whose "Train <m>" / "Test <m>" columns have their sign flipped after loading.
INVERTED_METRICS = ["MSE"]

# Result files (inside `results_folder`) that are concatenated into one frame.
results_to_gather = [
    'clf_meta_results_202303311447.csv',
    'clf_meta_results_202304020106.csv',
    'clf_meta_results_202304020436.csv',
    'clf_meta_results_202304022211.csv',
    'clf_meta_results_202304061522.csv',
    'clf_meta_results_202304070049.csv'
]

test_results = pd.concat(
    [
        pd.read_csv(os.path.join(results_folder, f), index_col=0) for f in results_to_gather
    ],
    ignore_index=True
).reset_index()

for m in INVERTED_METRICS:
    test_results[f"Train {m}"] = -test_results[f"Train {m}"]
    test_results[f"Test {m}"] = -test_results[f"Test {m}"]

test_results = test_results.rename(columns={"AUC": "ROC AUC"})

# All substitutions below are literal; `regex=False` makes that explicit instead of
# relying on the pandas default, which differs between major versions.
test_results["Classifier"] = (
    test_results["Classifier"]
    .str.replace("SVC", "SVM", regex=False)
    .str.replace("LogisticRegression", "LR", regex=False)
    .str.replace("RandomForestClassifier", "RF", regex=False)
)
test_results["Space Type"] = (
    test_results["Space Type"]
    .str.replace("Latent", "$z$", regex=False)
    .str.replace("Pathway Activity", "$a$", regex=False)
)
# Rows in the latent space whose model name contains "VAE" are relabelled from $z$ to $μ$.
test_results.loc[
    np.logical_and(
        test_results["Space Type"] == "$z$",
        test_results["Model"].str.contains("VAE", regex=False),
    ),
    "Space Type",
] = "$μ$"

test_results.head()

In [None]:
def sep_model_and_feat(k):
    """Split a ``"<head>-<model> <rest...>"`` key into ``(model, feature)``.

    The key is cut on ``"-"``. The first word of the second dash-separated
    segment is the model; the feature is the first segment followed by a space
    and the remaining words of the second segment. Segments after the second
    dash are ignored.

    If ``k`` is a ``pd.Index`` the function is applied to each of its values
    and a list of ``(model, feature)`` tuples is returned.
    """
    if isinstance(k, pd.Index):
        return [sep_model_and_feat(entry) for entry in k.values]

    dash_parts = k.split("-")
    head = dash_parts[0]
    model, *trailing_words = dash_parts[1].split(" ")
    feature = head + " " + " ".join(trailing_words)
    return model, feature

In [None]:
def preprocess_for_latex(series:pd.Series) -> pd.Series:
    """Return a copy of a string series with model-name fragments made LaTeX-ready.

    The substitutions are literal (no regular expressions) and are applied in
    the order listed: Greek letters become math-mode macros, the ``-step-`` /
    ``-smooth-`` markers become math symbols and ``(Hallmark Genes)`` is
    abbreviated to ``(HG)``.
    """
    literal_substitutions = (
        ("β", "$\\beta$"),
        ("μ", "$\\mu$"),
        ("-step-", "-$\\mathbbm{1}$-"),
        ("-smooth-", "-$\\sigma$-"),
        ("(Hallmark Genes)", "(HG)"),
    )
    converted = series
    for needle, replacement in literal_substitutions:
        converted = converted.str.replace(needle, replacement, regex=False)
    return converted

In [None]:
# Columns reported in the LaTeX table.
table_cols = ["Model Size", "Test MSE", "Accuracy", "Precision", "Recall", "F1", "ROC AUC"]

# Work on a copy whose model names are already LaTeX-formatted.
table_results = test_results.copy()
table_results["Model"] = preprocess_for_latex(table_results["Model"])
table_results["Model Class"] = preprocess_for_latex(table_results["Model Class"])

grouped_table = table_results.groupby(
        ["Classifier", "Model", "Space Type"]
    )[table_cols]

# Plain per-group medians. Previously this was a bare mid-cell expression whose value
# was discarded; it is kept in a variable so it can be inspected. The string alias
# "median" replaces `np.median`, which newer pandas deprecates as an `agg` argument.
median_table = grouped_table.agg("median")

# Each cell is formatted as "median (inter-quartile range)" with three decimals.
print(
    grouped_table.agg(
        lambda x: f"{np.quantile(x, q=0.50):.3f} ({np.quantile(x, q=0.75)-np.quantile(x, q=0.25):.3f})"
    ).sort_index(inplace=False).style.to_latex(
        hrules=True,
    )
)

In [None]:
# See viz-brca.ipynb
# Load the best (model, classifier) entry per key; only the read happens while the
# file is open.
with open(os.path.join(internal_valid_results_folder, "best.json"), "r") as best_f:
    best = json.load(best_f)

# Each stored name has the form "<part0>-<classifier>-<remaining parts>". Rewrite every
# entry as [LaTeX-formatted model name without the classifier, classifier, score].
for k in best:
    model_and_clf_name, best_score = best[k]
    model_and_clf_name = preprocess_for_latex(pd.Series([model_and_clf_name]))[0]
    name_parts = model_and_clf_name.split("-")
    best[k] = [
        "-".join([name_parts[0], *name_parts[2:]]),
        name_parts[1],
        best_score,
    ]

# Reference snapshot of the structure of `best`. It used to be a bare dict literal,
# i.e. a no-op expression, and contained the invalid escape sequences '\m' and '\s';
# it is kept here as a comment with the backslashes escaped.
# {
#     'AE ': ('AE-[128, 64]', "LR", 0.7416842803471456),
#     'VAE ': ('VAE-$\\mathbbm{1}$-$\\beta$1-[128, 64]', "LR", 0.5126946725701951),
#     'PAAE (KEGG)': ('PAAE-[32]-[64] (KEGG)', "LR", 0.9743640361931267),
#     'PAAE (HG)': ('PAAE-[32]-[64] (HG)', "LR", 0.9695108184255021),
#     'PAVAE (KEGG)': ('PAVAE-$\\sigma$-$\\beta$1-[]-[128, 64] (KEGG)', "SVM", 0.8640078506643825),
#     'PAVAE (HG)': ('PAVAE-$\\mathbbm{1}$-$\\beta$1-[]-[128, 64] (HG)', "SVM", 0.9260142637119084),
# }
best

In [None]:
def _median_and_iqr(x):
    """Format a sample as "median (inter-quartile range)" with three decimals."""
    median = np.quantile(x, q=0.50)
    iqr = np.quantile(x, q=0.75) - np.quantile(x, q=0.25)
    return f"{median:.3f} ({iqr:.3f})"


# For every best (model, classifier) pair print its stored score, then the ROC AUC
# summary of the matching rows of `table_results`, one row per space type.
_group_keys = ["Model", "Classifier", "Space Type"]
for mdl_cls, (mdl, clf, auc) in best.items():
    print(mdl_cls, mdl, clf, f"{auc:.3f}")

    is_best_pair = (table_results["Model"] == mdl) & (table_results["Classifier"] == clf)
    auc_summary = (
        table_results.loc[is_best_pair, _group_keys + ["ROC AUC"]]
        .groupby(_group_keys)
        .agg(_median_and_iqr)
        .reset_index()
    )
    print(auc_summary)
    print()

In [None]:
# Column used to pick the best configuration of each model class, and its direction.
col_to_rank = "ROC AUC"
bigger_is_better = True
argbest_fn = (lambda series: series.argmax()) if bigger_is_better else (lambda series: series.argmin())
table_cols = ["Model Size", "Test MSE", "Accuracy", "Precision", "Recall", "F1", "ROC AUC"]
table_results = test_results.copy()
table_results["Model"] = preprocess_for_latex(table_results["Model"])
table_results["Model Class"] = preprocess_for_latex(table_results["Model Class"])

group_keys = ["Classifier", "Model", "Space Type"]

# Median of every reported column per (classifier, model, space type).
# The string alias "median" replaces `np.median`, which newer pandas deprecates
# as an `agg` argument.
median_aggregates = table_results.groupby(
        group_keys
    )[table_cols].agg(
        "median"
    ).reset_index()

# Re-attach the model class, which the groupby dropped (the last occurrence wins
# if a model name appears with several classes).
median_aggregates["Model Class"] = median_aggregates["Model"].map(
    dict(zip(table_results["Model"], table_results["Model Class"]))
)

# Keep, for each model class, the single row with the best median of `col_to_rank`.
# `col_to_rank` was previously defined but ignored in favour of a hard-coded "ROC AUC".
best_of_each_class = []
for m in median_aggregates["Model Class"].unique():
    best_of_this_class = median_aggregates[median_aggregates["Model Class"]==m].set_index(group_keys)
    best_of_each_class.append(best_of_this_class.iloc[[argbest_fn(best_of_this_class[col_to_rank])],:])

best_of_each_class_df = pd.concat(best_of_each_class)

# LaTeX table restricted to the selected rows; cells are "median (inter-quartile range)".
print(
    table_results.groupby(
        group_keys
    )[table_cols].agg(
        lambda x: f"{np.quantile(x, q=0.50):.3f} ({np.quantile(x, q=0.75)-np.quantile(x, q=0.25):.3f})"
    ).sort_index(inplace=False).loc[best_of_each_class_df.index].style.to_latex(
        hrules=True,
    )
)

In [None]:
# Scatter plot of ROC AUC against model size: one panel per classifier.
unique_clf = test_results["Classifier"].unique()
n_clf = len(unique_clf)

fig, axes = plt.subplots(ncols = n_clf, sharex=True, sharey=True)
# With a single classifier `plt.subplots` returns a bare Axes; normalise to an array.
axes = np.atleast_1d(axes)

for axi, clf in enumerate(unique_clf):
    plot_results = test_results[test_results["Classifier"]==clf]
    model_class = plot_results["Model Class"]
    # Keep the PAAE (KEGG), AE and VAE model classes.
    # `np.logical_or` is a binary ufunc: a third positional argument is taken as `out`,
    # so the previous three-argument call never OR-ed in the VAE mask. Chaining `|`
    # combines all three conditions.
    plot_results = plot_results[
        model_class.str.startswith("PAAE (KEGG)")
        | model_class.str.startswith("AE")
        | model_class.str.startswith("VAE")
    ]

    is_last_panel = (axi == n_clf - 1)
    p = sns.scatterplot(
        data=plot_results,
        x="Model Size",
        y="ROC AUC",
        hue="Model Class",
        style="Space Type",
        ax=axes[axi],
        legend=is_last_panel,
        )
    p.set_xscale("log")
    # Only the last panel carries the legend; it is moved outside the axes.
    if is_last_panel: sns.move_legend(p, "center left", bbox_to_anchor=(1, 0.5))
    axes[axi].set_title(clf)

# Save once, after every panel has been drawn (previously re-saved on each iteration).
for fmt in SAVING_FORMATS:
    fig.savefig(os.path.join(images_folder,fmt,f"clfbrca-ext-paramplot.{fmt}"), bbox_inches="tight")

In [None]:
# Scatter plot of ROC AUC against the number of layers: one panel per classifier.
unique_clf = test_results["Classifier"].unique()
n_clf = len(unique_clf)

fig, axes = plt.subplots(ncols = n_clf, sharex=True, sharey=True)
# With a single classifier `plt.subplots` returns a bare Axes; normalise to an array.
axes = np.atleast_1d(axes)

for axi, clf in enumerate(unique_clf):
    clf_results = test_results[test_results["Classifier"]==clf]
    # "Nonlinear Layers" is the number of commas in the model name. `assign` returns a
    # new frame, so no column is written into a slice of `test_results`.
    plot_results = clf_results.assign(**{"Nonlinear Layers": clf_results["Model"].str.count(",")})
    model_class = plot_results["Model Class"]
    # Keep the PAAE (KEGG), AE and VAE model classes.
    # `np.logical_or` is a binary ufunc: a third positional argument is taken as `out`,
    # so the previous three-argument call never OR-ed in the VAE mask. Chaining `|`
    # combines all three conditions.
    plot_results = plot_results[
        model_class.str.startswith("PAAE (KEGG)")
        | model_class.str.startswith("AE")
        | model_class.str.startswith("VAE")
    ]

    is_last_panel = (axi == n_clf - 1)
    p = sns.scatterplot(
        data=plot_results,
        x="Nonlinear Layers",
        y="ROC AUC",
        hue="Model Class",
        style="Space Type",
        ax=axes[axi],
        legend=is_last_panel,
        )
    # Only the last panel carries the legend; it is moved outside the axes.
    if is_last_panel: sns.move_legend(p, "center left", bbox_to_anchor=(1, 0.5))
    axes[axi].set_title(clf)

# Save once, after every panel has been drawn (previously re-saved on each iteration).
for fmt in SAVING_FORMATS:
    fig.savefig(os.path.join(images_folder,fmt,f"clfbrca-ext-layerplot.{fmt}"), bbox_inches="tight")

In [None]:
# Normalisation-type identifier (the name suggests log2(FPKM + 1e-3) — TODO confirm).
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
paper_normtype = "log2p1e-3_fpkm"

In [None]:
# Configurations reported in the paper, as (Model, Classifier, Space Type) tuples.
# They are matched against the corresponding columns of `test_results`, so the names
# use the raw (non-LaTeX) model strings and the "$z$" / "$a$" / "$μ$" space labels.
paper_models = [
    ("AE-[128, 64]","LR","$z$"),
    ("PAAE-[32]-[64] (KEGG)","LR","$z$"),
    ("PAAE-[32]-[64] (KEGG)","LR","$a$"),
    ("PAAE-[32]-[64] (Hallmark Genes)","LR","$z$"),
    ("PAAE-[32]-[64] (Hallmark Genes)","LR","$a$"),
    ("PAVAE-smooth-β1-[]-[128, 64] (KEGG)","SVM","$μ$"),
    ("PAVAE-smooth-β1-[]-[128, 64] (KEGG)","SVM","$a$"),
    ("PAVAE-step-β1-[]-[128, 64] (Hallmark Genes)","SVM","$μ$"),
    ("PAVAE-step-β1-[]-[128, 64] (Hallmark Genes)","SVM","$a$"),
    ("VAE-step-β1-[128, 64]","LR","$μ$"),
]

In [None]:
# Every distinct model name present in the results.
all_models = set(test_results["Model"].unique())

# Each entry is (is_exclusion, set_name, set of (Model, Classifier, Space Type) tuples).
DEFAULT_MODEL_REPORTING_DEFINITIONS = [
    (False, "paper", set(paper_models)),
]


def DEFAULT_HUE_FN(this_df):
    """Return the text before the first "-" of each model name (used as hue)."""
    return this_df["Model"].str.split("-").str[0]


# Fixed hue order, so the same model family gets the same colour in every plot.
DEFAULT_HUE_ORDER = sorted(DEFAULT_HUE_FN(test_results).unique())

def plot_fn(plot_var,
        clusion_names_to_show = None,
        figsize_to_show = "wideadj",
        model_reporting_definitions = DEFAULT_MODEL_REPORTING_DEFINITIONS,
        hue_fn = DEFAULT_HUE_FN,
        hue_order = DEFAULT_HUE_ORDER,
        plot_legend=False,
        sort_by_var=False,
        sort_descending=True,
        savefigs=True,
        xlim=None,
        plot_kwargs=None):
    """Draw horizontal box plots of one column of ``test_results`` per configuration.

    For every ``(is_exclusion, set_name, model_set)`` entry of
    ``model_reporting_definitions`` the rows of the global ``test_results`` whose
    (Model, Classifier, Space Type) tuple is in ``model_set`` are selected (or the
    complement if ``is_exclusion``). One box per configuration is then drawn in
    three figure sizes ("default", "thinadj", "wideadj").

    Parameters
    ----------
    plot_var : name of the column shown on the x axis; rows where it is NaN or
        infinite are dropped.
    clusion_names_to_show : set names whose figure is displayed, or None for all.
    figsize_to_show : which of the three figure sizes is displayed.
    model_reporting_definitions : list of selection definitions, see above.
    hue_fn : callable mapping the selected frame to the hue series.
    hue_order : order of the hue levels passed to seaborn.
    plot_legend : keep the legend if True, otherwise replace it by an empty one.
    sort_by_var : order the boxes by the median of ``plot_var`` (ties by label).
    sort_descending : direction of that ordering.
    savefigs : save every figure size in every format of ``SAVING_FORMATS``.
    xlim : optional x-axis limits.
    plot_kwargs : extra keyword arguments forwarded to ``sns.boxplot``.
    """
    # None instead of a shared mutable `{}` default.
    if plot_kwargs is None:
        plot_kwargs = {}

    key_cols = ["Model", "Classifier", "Space Type"]
    plot_col = "(Model, Classifier, Space)"
    file_var = plot_var.lower().replace('-','_').replace(' ','_')

    for is_exclusion, set_name, model_set in model_reporting_definitions:

        is_included = test_results[key_cols].apply(lambda x: tuple(x) in model_set, axis=1)
        if is_exclusion: is_included = np.logical_not(is_included)

        this_data = test_results[is_included].replace([np.inf,-np.inf],np.nan).dropna(subset=[plot_var]).copy()
        # Nothing to plot for this set: skip before doing any further work.
        if this_data.shape[0]<=0:
            continue

        # Label of each box: the string form of the configuration tuple.
        this_data[plot_col] = this_data[key_cols].apply(lambda x: str(tuple(x)), axis=1)

        this_models = this_data[plot_col].unique()
        num_models = len(this_models)
        if sort_by_var:
            medians = this_data.groupby(plot_col)[plot_var].median()
            # Sort by (median, label) so ties are broken by label.
            this_models = sorted(
                this_models,
                key=lambda label: (medians[label], label),
                reverse=sort_descending,
            )

        # Use the caller-supplied hue function (the parameter used to be ignored in
        # favour of a hard-coded copy of the default).
        this_hue = hue_fn(this_data)

        for figsizename, figsize in [
            ("default",None),
            ("thinadj",(3,6/20*num_models)),
            ("wideadj",(7,6/20*num_models)),
        ]:
            show_this = (
                (clusion_names_to_show is None or set_name in clusion_names_to_show)
                and figsize_to_show==figsizename
            )
            # A figure that is neither saved nor shown has no observable effect.
            if not (savefigs or show_this):
                continue

            fig = plt.figure(figsize=figsize)

            sns.boxplot(data = this_data,
                x = plot_var, y = plot_col,
                hue=this_hue, dodge=False,
                hue_order=hue_order,
                order=this_models,
                **plot_kwargs)
            if xlim is not None:
                plt.xlim(xlim)
            if not plot_legend:
                plt.legend([],[], frameon=False)
            if savefigs:
                for fmt in SAVING_FORMATS:
                    fig.savefig(os.path.join(images_folder,fmt,f"clfbrca-ext-boxplot-{file_var}-{set_name}-{figsizename}.{fmt}"), bbox_inches="tight")
            if show_this:
                print(set_name, figsizename)
                plt.show()
            plt.close(fig)

In [None]:
# ROC AUC box plots for the default ("paper") model set, boxes ordered by descending median.
plot_fn("ROC AUC", sort_by_var=True, sort_descending=True)

In [None]:
# Accuracy box plots, boxes ordered by descending median.
plot_fn("Accuracy", sort_by_var=True, sort_descending=True)

In [None]:
# F1 box plots, boxes ordered by descending median.
plot_fn("F1", sort_by_var=True, sort_descending=True)

In [None]:
# Train MSE box plots (the column's sign was flipped when the results were loaded); boxes are not sorted.
plot_fn("Train MSE")

In [None]:
# Test MSE box plots with the x axis limited to [0, 200].
plot_fn("Test MSE", xlim=[0,200])

In [None]:
# Fit Time box plots, displayed only (no files are written).
plot_fn("Fit Time", savefigs=False)

In [None]:
def get_comparison_table(
        this_data,
        plot_var,
        col_name="ROC AUC",
        table_is_sorted=True,
        bigger_is_better=True,
        cmp_col_fn = lambda df: df["Model"]
        ):
    """Pairwise Wilcoxon signed-rank comparison of groups of ``this_data``.

    Rows are grouped by the key series returned by ``cmp_col_fn(this_data)``.
    Every ordered pair of groups (i, j) is compared on ``plot_var``.

    Parameters
    ----------
    this_data : DataFrame holding ``plot_var`` and ``col_name``.
    plot_var : column the Wilcoxon tests and the median comparison are run on.
    col_name : column whose per-group median (and non-null count) is reported and
        used to order the groups.
    table_is_sorted : order the groups by the median of ``col_name``.
    bigger_is_better : if True the largest median comes first.
    cmp_col_fn : callable returning the group key of every row.

    Returns
    -------
    model_name_and_median : list of ``(group, median of col_name, count of col_name)``.
    df_model_comparison : DataFrame of p-values; row = group i, column = group j.
    df_model_comparison_plot : same layout, cells formatted as
        ``"<symbol> (<p-value>)"`` where the symbol is ``$>$`` / ``$<$`` when
        p < 0.05 (by comparing the medians of ``plot_var``), ``$\\approx$``
        otherwise, and ``-`` when no p-value could be computed.

    Notes
    -----
    The two samples of a pair are passed to ``scipy.stats.wilcoxon`` as they are.
    NOTE(review): this presumably pairs observations by row order within each
    group — confirm that groups are stored in matching order.
    A pair for which the test raises an exception gets a NaN p-value.
    """
    cmp_col = cmp_col_fn(this_data)
    comparison_models = cmp_col.unique()
    model_name_and_median = [
        (
            m,
            this_data.loc[cmp_col==m, col_name].median(),
            this_data.loc[cmp_col==m, col_name].count(),
        )
        for m in comparison_models
    ]
    if table_is_sorted:
        model_name_and_median = sorted(
            model_name_and_median,
            key=lambda x: x[1],
            reverse=bigger_is_better
        )

    ordered_names = [n for (n, *_) in model_name_and_median]

    # Extract each group's sample once instead of re-filtering the frame for every pair.
    values_by_name = {n: this_data.loc[cmp_col==n, plot_var] for n in ordered_names}

    # One list per column (group j); the entries are appended in row (group i) order.
    model_comparison_df_p_value_data = {n: [] for n in ordered_names}
    model_comparison_df_plot_data = {n: [] for n in ordered_names}

    for ni in ordered_names:
        values_i = values_by_name[ni]
        median_i = values_i.median()
        for nj in ordered_names:
            values_j = values_by_name[nj]
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    wcx = sp.stats.wilcoxon(values_i, values_j, nan_policy="omit")[1]
            except Exception:
                # Best effort: an incomparable pair is reported as "-". Only `Exception`
                # is caught, so KeyboardInterrupt / SystemExit still propagate.
                wcx = np.nan
            i_bigger_than_j = median_i > values_j.median()
            cmp_symbol = "-" if np.isnan(wcx)  else (('$>$' if i_bigger_than_j else '$<$') if wcx<0.05 else "$\\approx$")
            p_value_str = "-" if np.isnan(wcx) else (f"$p={wcx:.3f}$" if wcx>=1e-3 else "$p<10^{-3}$")

            model_comparison_df_p_value_data[nj].append(wcx)
            model_comparison_df_plot_data[nj].append(f"{cmp_symbol} ({p_value_str})")

    df_model_comparison = pd.DataFrame(model_comparison_df_p_value_data, index=ordered_names)
    df_model_comparison_plot = pd.DataFrame(model_comparison_df_plot_data, index=ordered_names)
    return model_name_and_median, df_model_comparison, df_model_comparison_plot

In [None]:
def cmp_col_fn(df):
    """Return the (Model, Classifier, Space Type) tuple of every row as the group key."""
    return df[["Model","Classifier","Space Type"]].apply(lambda x: tuple(x), axis=1)


# Restrict the results to the configurations listed in `paper_models`.
is_paper_row = cmp_col_fn(test_results).isin(set(paper_models))
paper_test_results = test_results[is_paper_row].reset_index()

# Pairwise ROC AUC comparison between those configurations.
model_name_and_median, auc_cmp_df, auc_cmp_plt_df = get_comparison_table(
    paper_test_results,
    "ROC AUC",
    cmp_col_fn=cmp_col_fn,
)
print(model_name_and_median)
auc_cmp_plt_df

In [None]:
# Print the formatted comparison table as LaTeX source.
print(auc_cmp_plt_df.style.to_latex())

In [None]:
def cmp_col_fn(df):
    """Return the (Model, Classifier, Space Type) tuple of every row as the group key."""
    return df[["Model","Classifier","Space Type"]].apply(lambda x: tuple(x), axis=1)


paper_test_results = test_results[cmp_col_fn(test_results).isin(set(paper_models))].reset_index()

# (metric, whether a larger value is better)
metrics_and_direction = [
    ("Model Size", False),
    ("Test MSE", False),
    ("Accuracy", True),
    ("Precision", True),
    ("Recall", True),
    ("F1", True),
    ("ROC AUC", True),
]

# For each metric print one line: the best-ranked configuration, followed by the
# next-ranked ones for as long as their p-value against the best exceeds 0.05.
# Every entry is (configuration, median, count, p-value vs. the best).
for metric, bigger_is_better in metrics_and_direction:
    model_name_and_median, auc_cmp_df, auc_cmp_plt_df = get_comparison_table(
        paper_test_results,
        metric,
        col_name=metric,
        cmp_col_fn=cmp_col_fn,
        bigger_is_better=bigger_is_better,
    )
    entries = [(*model_name_and_median[0], auc_cmp_df.iloc[0,0])]
    for i in range(1, len(model_name_and_median)):
        if not (auc_cmp_df.iloc[0,i] > 0.05):
            break
        entries.append((*model_name_and_median[i], auc_cmp_df.iloc[0,i]))
    print(metric, *entries)

In [None]:
# Pairwise ROC AUC comparison over all results, grouped by the "Model" column
# (the default `cmp_col_fn` of `get_comparison_table`).
model_name_and_median, auc_cmp_df, auc_cmp_plt_df = get_comparison_table(test_results, "ROC AUC")
print(model_name_and_median)
auc_cmp_plt_df

In [None]:
# Print the formatted comparison table as LaTeX source.
print(auc_cmp_plt_df.style.to_latex())