In [None]:
import os
import json
import warnings

In [None]:
import numpy as np
import scipy as sp
import scipy.stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot generation is switched off by default: the figures are quite big and
# were not used for anything. Set this to True to generate them.
make_plots = False

SAVING_FORMATS = ["pdf", "svg", "png"]
results_folder = "results"
images_folder = "images"

# Create the image root folder plus one sub-folder per saving format.
os.makedirs(images_folder, exist_ok=True)
for fmt in SAVING_FORMATS:
    os.makedirs(os.path.join(images_folder, fmt), exist_ok=True)

In [None]:
# The "Test <metric>" column of each of these metrics has its sign flipped after loading.
INVERTED_METRICS = ["AE_Loss", "AE_MSE", "AE_R2"]

# Had to run each model separately due to memory constraints, hence one CSV per run.
results_to_gather = [
    "clf_valid_results_202302241145.csv",
    "clf_valid_results_202302241540.csv",
    "clf_valid_results_202302251829.csv",
    "clf_valid_results_202302271642.csv",
]

# Read every run and stack them into a single frame with a fresh 0..n-1 index.
loaded_runs = [
    pd.read_csv(os.path.join(results_folder, f), index_col=0)
    for f in results_to_gather
]
test_results = pd.concat(loaded_runs, ignore_index=True).reset_index()

for m in INVERTED_METRICS:
    inverted_col = f"Test {m}"
    test_results[inverted_col] = -test_results[inverted_col]

# Drop the "Test " prefix from column names; autoencoder metrics additionally
# turn "Test AE_<x>" into "AE <x>".
column_renames = {}
for old in test_results.columns:
    if old.startswith("Test AE_"):
        column_renames[old] = old.replace("Test AE_", "AE ")
    elif old.startswith("Test "):
        column_renames[old] = old.replace("Test ", "")
test_results = test_results.rename(columns=column_renames).rename(columns={"AUC": "ROC AUC"})

# Shorten the classifier part of the model labels.
classifier_abbreviations = {
    "-SVC": "-SVM",
    "-LogisticRegression": "-LR",
    "-RandomForestClassifier": "-RF",
}
model_labels = test_results["Model"]
for long_suffix, short_suffix in classifier_abbreviations.items():
    model_labels = model_labels.str.replace(long_suffix, short_suffix)
test_results["Model"] = model_labels

# "AE Type" = text before the first hyphen, a space, and the parenthesised
# part of the label (empty string when the label has no parentheses).
ae_family = test_results["Model"].str.split("-").str[0]
ae_variant = test_results["Model"].str.extract("(\\(.*\\))").replace(np.nan, "").squeeze()
test_results["AE Type"] = ae_family + " " + ae_variant

test_results

In [None]:
# For every AE type, pick the model whose mean ROC AUC (over all of its rows)
# is highest, then persist the winners as JSON.
best_models_per_type = {}
for typ in test_results["AE Type"].unique():
    candidates = test_results.loc[test_results["AE Type"] == typ, "Model"].unique()

    # Seed with the first candidate; a later one only wins with a strictly
    # higher mean, so ties keep the earliest model.
    best = candidates[0]
    best_auc = test_results.loc[test_results["Model"] == best, "ROC AUC"].mean()
    for this in candidates[1:]:
        this_auc = test_results.loc[test_results["Model"] == this, "ROC AUC"].mean()
        if this_auc > best_auc:
            best, best_auc = this, this_auc

    best_models_per_type[typ] = (best, best_auc)

best_json_path = os.path.join(results_folder, "best.json")
with open(best_json_path, "w") as best_f:
    json.dump(best_models_per_type, best_f)

best_models_per_type

In [None]:
def sep_model_and_feat(k):
    """Turn a model label into a ``(model, feature)`` pair usable as a sort key.

    A label is read as ``"<prefix>-<model>[ <suffix>]"``: ``model`` is the
    first space-delimited token after the first hyphen, and ``feature`` is
    ``"<prefix> <suffix>"`` (with a trailing space when there is no suffix).

    Only the first hyphen separates prefix and model, so hyphens inside the
    suffix (e.g. ``"(lr=1e-3)"``) are preserved instead of truncating it.

    Parameters
    ----------
    k : str or pd.Index
        A single label, or an index of labels (as passed by
        ``sort_index(key=...)``).

    Returns
    -------
    tuple[str, str] or list[tuple[str, str]]
        ``(model, feature)`` for a string; a list of such pairs, in order,
        for a ``pd.Index``.

    Raises
    ------
    IndexError
        If a label contains no hyphen.
    """
    if isinstance(k, pd.Index):
        return [sep_model_and_feat(label) for label in k.values]

    # maxsplit=1: everything after the first hyphen belongs to the model part.
    parts = k.split("-", 1)
    prefix, rest = parts[0], parts[1]  # parts[1] raises IndexError without a hyphen
    model, _, suffix = rest.partition(" ")
    feature = prefix + " " + suffix
    return model, feature

In [None]:
# LaTeX table: "median (IQR)" of every metric per model, for one input normalization.
table_cols = ["AE MSE", "Accuracy", "Precision", "Recall", "F1", "ROC AUC"]
table_results = test_results.copy()
table_results["Model"] = table_results["Model"].str.replace("β","$\\beta$")


def _median_iqr_cell(x):
    """Format the values of one metric for one model as "median (IQR)"."""
    median = np.quantile(x, q=0.50)
    iqr = np.quantile(x, q=0.75) - np.quantile(x, q=0.25)
    return f"{median:.3f} ({iqr:.3f})"


is_table_normalization = table_results["Input Normalization"] == "log2p1e-3_fpkm"
median_iqr_table = (
    table_results[is_table_normalization]
    .groupby("Model")[table_cols]
    .agg(_median_iqr_cell)
)
# Rows are ordered by the (model, feature) key derived from each label.
median_iqr_table = median_iqr_table.sort_index(key=sep_model_and_feat, inplace=False)
print(median_iqr_table.style.to_latex(hrules=True))

In [None]:
# LaTeX table: "mean ± std" of every metric per model, for one input normalization.
table_cols = ["AE MSE", "Accuracy", "Precision", "Recall", "F1", "ROC AUC"]
table_results = test_results.copy()
table_results["Model"] = table_results["Model"].str.replace("β","$\\beta$")


def _mean_std_cell(x):
    """Format the values of one metric for one model as a LaTeX "mean ± std"."""
    return f"${np.mean(x):.3f}\\pm {np.std(x):.3f}$"


is_table_normalization = table_results["Input Normalization"] == "log2p1e-3_fpkm"
mean_std_table = (
    table_results[is_table_normalization]
    .groupby("Model")[table_cols]
    .agg(_mean_std_cell)
)
# Rows are ordered by the (model, feature) key derived from each label.
mean_std_table = mean_std_table.sort_index(key=sep_model_and_feat, inplace=False)
print(mean_std_table.style.to_latex(hrules=True))

In [None]:
# Display the distinct model labels present in the gathered results.
test_results["Model"].unique()

In [None]:
# An "Input Normalization" value; the same literal is used to filter the LaTeX
# summary tables earlier in this notebook.
# NOTE(review): presumably the normalization reported in the paper - confirm.
paper_normtype = "log2p1e-3_fpkm"

In [None]:
all_models = set(test_results["Model"].unique())

# Each entry is (is_exclusion, set_name, model_set): one entry for every model,
# then one per classifier, selected by the marker found in the model label.
DEFAULT_MODEL_REPORTING_DEFINITIONS = [(False, "all", all_models)] + [
    (False, tag, {m for m in all_models if marker in m})
    for tag, marker in [("svm", "-SVM"), ("rf", "-RF"), ("lr", "-LR")]
]

# Hue group of a row: the part of its model label before the first hyphen.
DEFAULT_HUE_FN = lambda this_df: this_df["Model"].str.split("-").str[0]
DEFAULT_HUE_ORDER = sorted(DEFAULT_HUE_FN(test_results).unique())

def plot_fn(plot_var,
        normtypes_to_show = None,
        clusion_names_to_show = None,
        figsize_to_show = "wideadj",
        model_reporting_definitions = DEFAULT_MODEL_REPORTING_DEFINITIONS,
        hue_fn = DEFAULT_HUE_FN,
        hue_order = DEFAULT_HUE_ORDER,
        plot_legend=False,
        sort_by_var=False,
        sort_descending=True,
        savefigs=True,
        xlim=None,
        plot_kwargs={}):
    """Draw horizontal per-model box plots of ``plot_var`` from ``test_results``.

    A figure is considered for every combination of input normalization,
    model-reporting definition and figure-size variant (``"default"``,
    ``"thinadj"``, ``"wideadj"``). Rows with ``External Split == 0`` and rows
    whose ``plot_var`` is NaN or infinite are left out. Combinations without
    any remaining row are skipped, and a figure is only rendered when it will
    be saved and/or shown.

    Parameters
    ----------
    plot_var : str
        Column of ``test_results`` plotted on the x axis.
    normtypes_to_show : collection of str or None
        Input normalizations whose figures are displayed; ``None`` means all.
    clusion_names_to_show : collection of str or None
        Names of the reporting definitions whose figures are displayed;
        ``None`` means all.
    figsize_to_show : str
        Name of the single figure-size variant that is displayed.
    model_reporting_definitions : list of (bool, str, set or dict)
        ``(is_exclusion, set_name, model_set)`` triples. ``model_set`` selects
        the models (or, when ``is_exclusion`` is true, the models to leave
        out). A dict ``model_set`` used for inclusion also renames the models
        through its values.
    hue_fn : callable
        Receives the filtered data frame and returns the hue value per row.
    hue_order : list
        Order of the hue levels passed to seaborn.
    plot_legend : bool
        When false, the legend is replaced by an empty, frameless one.
    sort_by_var : bool
        Order the models by the median of ``plot_var`` (ties by model name).
    sort_descending : bool
        Direction of that ordering.
    savefigs : bool
        Save each figure in every format of ``SAVING_FORMATS`` under
        ``images_folder``.
    xlim : tuple or None
        Passed to ``plt.xlim``.
    plot_kwargs : dict
        Extra keyword arguments for ``sns.boxplot``. Only unpacked, never
        mutated, so the shared default is safe.
    """
    # Filename-safe form of the metric name.
    plot_var_slug = plot_var.lower().replace('-','_').replace(' ','_')

    # Normalizations are visited ordered by their last character, then by name.
    for normtype in sorted(test_results["Input Normalization"].unique(), key=lambda x: x[-1]+x):
        is_normtype = test_results["Input Normalization"]==normtype
        show_normtype = normtypes_to_show is None or normtype in normtypes_to_show

        for is_exclusion, set_name, model_set in model_reporting_definitions:
            is_included = test_results["Model"].isin(model_set)
            if is_exclusion:
                is_included = np.logical_not(is_included)

            keep_rows = np.logical_and(test_results["External Split"]!=0, np.logical_and(is_normtype, is_included))
            this_data = test_results[keep_rows].replace([np.inf,-np.inf],np.nan).dropna(subset=[plot_var]).copy()
            if this_data.shape[0]<=0:
                continue

            # Rename before deriving the plotting order, so that ``order`` and
            # the "Model" column always refer to the same labels.
            if isinstance(model_set, dict) and not is_exclusion:
                this_data.loc[:,"Model"] = this_data["Model"].map(model_set)

            this_models = this_data["Model"].unique()
            if sort_by_var:
                median_and_name = [
                    (this_data.loc[this_data["Model"]==m, plot_var].median(), m)
                    for m in this_models
                ]
                median_and_name.sort(reverse=sort_descending)
                this_models = [name for _, name in median_and_name]

            this_hue = hue_fn(this_data)
            num_models = len(this_models)
            show_set = clusion_names_to_show is None or set_name in clusion_names_to_show

            for figsizename, figsize in [
                ("default",None),
                ("thinadj",(3,6/20*num_models)),
                ("wideadj",(7,6/20*num_models)),
            ]:
                show_this = show_set and show_normtype and figsize_to_show==figsizename
                if not savefigs and not show_this:
                    # Nothing would be written or displayed: skip the rendering.
                    continue

                plt.figure(figsize=figsize)
                try:
                    sns.boxplot(data = this_data,
                        x = plot_var, y = "Model",
                        hue=this_hue, dodge=False,
                        hue_order=hue_order,
                        order=this_models,
                        **plot_kwargs)
                    plt.xlim(xlim)
                    if not plot_legend:
                        plt.legend([],[], frameon=False)
                    if savefigs:
                        for fmt in SAVING_FORMATS:
                            plt.savefig(os.path.join(images_folder,fmt,f"clfbrca-boxplot-{plot_var_slug}-{normtype}-{set_name}-{figsizename}.{fmt}"), bbox_inches="tight")
                    if show_this:
                        print(normtype, set_name, figsizename)
                        plt.show()
                finally:
                    # Always release the figure, even if drawing or saving failed.
                    plt.close()

In [None]:
# ROC AUC box plots, models ordered by descending median; only when plotting is enabled.
if make_plots:
    plot_fn("ROC AUC", sort_by_var=True, sort_descending=True)

In [None]:
# Accuracy box plots, models ordered by descending median; only when plotting is enabled.
if make_plots:
    plot_fn("Accuracy", sort_by_var=True, sort_descending=True)

In [None]:
# F1 box plots, models ordered by descending median; only when plotting is enabled.
if make_plots:
    plot_fn("F1", sort_by_var=True, sort_descending=True)

In [None]:
# AE MSE box plots with the default (unsorted) model order; only when plotting is enabled.
if make_plots:
    plot_fn("AE MSE")

In [None]:
# Fit Time box plots, displayed but not written to disk; only when plotting is enabled.
if make_plots:
    plot_fn("Fit Time", savefigs=False)

In [None]:
def get_comparison_table(this_data, plot_var, col_name="ROC AUC", table_is_sorted=True, bigger_is_better=True):
    """Compare every pair of models with a Wilcoxon signed-rank test.

    Parameters
    ----------
    this_data : pd.DataFrame
        Must contain a "Model" column plus the ``plot_var`` and ``col_name``
        columns.
    plot_var : str
        Column whose values are tested pairwise between models. Samples are
        paired by row position within each model.
    col_name : str
        Column used only for the per-model median/count summary and for the
        ordering of the tables.
    table_is_sorted : bool
        Order models by their ``col_name`` median.
    bigger_is_better : bool
        When sorting, put the largest median first.

    Returns
    -------
    model_name_and_median : list of (name, median, count)
        Per-model summary of ``col_name``, in table order.
    df_model_comparison : pd.DataFrame
        p-values; row ``i``, column ``j`` holds the test of model ``i``
        against model ``j``. NaN where the test could not be computed
        (for instance samples of different length).
    df_model_comparison_plot : pd.DataFrame
        LaTeX-ready strings ``"<symbol> (<p-value>)"``. The symbol is ``$>$``
        or ``$<$`` (row median versus column median of ``plot_var``) when
        ``p < 0.05``, ``$\\approx$`` otherwise, and ``-`` when the p-value
        is NaN.
    """
    comparison_models = this_data["Model"].unique()
    model_name_and_median = [
        (
            m,
            this_data.loc[this_data["Model"]==m, col_name].median(),
            this_data.loc[this_data["Model"]==m, col_name].count(),
        )
        for m in comparison_models
    ]
    if table_is_sorted:
        model_name_and_median = sorted(
            model_name_and_median,
            key=lambda x: x[1],
            reverse=bigger_is_better
        )

    ordered_names = [name for (name, *_) in model_name_and_median]

    # Per-model samples and medians are computed once instead of once per pair.
    values_by_model = {
        name: this_data.loc[this_data["Model"]==name, plot_var] for name in ordered_names
    }
    median_by_model = {name: values.median() for name, values in values_by_model.items()}

    p_value_columns = {name: [] for name in ordered_names}
    label_columns = {name: [] for name in ordered_names}

    for ni in ordered_names:
        for nj in ordered_names:
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    wcx = sp.stats.wilcoxon(
                        values_by_model[ni].to_numpy(),
                        values_by_model[nj].to_numpy(),
                    )[1]
            except Exception:
                # Best effort: a pair that cannot be tested is reported as "-".
                # Unlike a bare ``except``, this lets KeyboardInterrupt through.
                wcx = np.nan

            i_bigger_than_j = median_by_model[ni] > median_by_model[nj]
            if np.isnan(wcx):
                cmp_symbol = "-"
                p_value_str = "-"
            else:
                if wcx < 0.05:
                    cmp_symbol = '$>$' if i_bigger_than_j else '$<$'
                else:
                    cmp_symbol = "$\\approx$"
                p_value_str = f"$p={wcx:.3f}$" if wcx>=1e-3 else "$p<10^{-3}$"

            # Column nj collects one entry per row model ni.
            p_value_columns[nj].append(wcx)
            label_columns[nj].append(f"{cmp_symbol} ({p_value_str})")

    df_model_comparison = pd.DataFrame(p_value_columns, index=ordered_names)
    df_model_comparison_plot = pd.DataFrame(label_columns, index=ordered_names)
    return model_name_and_median, df_model_comparison, df_model_comparison_plot

In [None]:
# Pairwise Wilcoxon comparison of the models on ROC AUC, computed on every row
# of test_results (no normalization or split filtering is applied here).
model_name_and_median, auc_cmp_df, auc_cmp_plt_df = get_comparison_table(test_results, "ROC AUC")
# Per-model (name, median, count) summary, in table order.
print(model_name_and_median)
auc_cmp_plt_df

In [None]:
# Emit the formatted comparison table as LaTeX source.
print(auc_cmp_plt_df.style.to_latex())