## Overfitting Exploration

### Useful Preliminaries

In [None]:
import os
import sys
sys.path.append("..")  # add project root

import shutil
import re
from argparse import ArgumentParser
from pickle import dump, load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import zarr
import dask.array as da

from ray import tune

from sklearn.metrics import balanced_accuracy_score, roc_auc_score

from src.data_utils import *
from src.constants import *
from src.tuner import train_cv, RayAdaptiveRepeatedCVSearch

from pprint import pprint

In [None]:
# Render floats in DataFrame output with thousands separators and three decimals.
pd.options.display.float_format = '{:10,.3f}'.format

In [None]:
# Seed NumPy's global RNG so NumPy-driven randomness below is repeatable.
np.random.seed(420)

In [None]:
# Use seaborn's theme with the "talk" plotting context for all figures.
sns.set_theme(context="talk")

In [None]:
# path constants
# Root directory; per-dataset training outputs are looked up beneath it.
train_dir = "/home/mr2238/scratch_pi_np442/mr2238/accelerate/total/"

In [None]:
# check img directory exists, if not make it
# Figures produced by this notebook are saved under img_dir.
img_dir = "/home/mr2238/project_pi_np442/mr2238/accelerate/imgs/overfit"
os.makedirs(img_dir, exist_ok=True)

### Loading Model and Results

In [None]:
# Choose the dataset and tuning run inspected in this section.
dataset_name = "smooth_downsample_w_300s_hr_rso2r_rso2l_spo2_abp"
run_name = "2.0rapid"
# When True, the "models_debug_<run_name>" directory is used instead of "models_<run_name>".
small = False
model_name = f"models{'_debug' if small else ''}_{run_name}"

In [None]:
# Directory holding the stored results of the selected run for the selected dataset.
model_store = os.path.join(train_dir, dataset_name, model_name)
print(model_store)

In [None]:
# Show what is stored for this run.
print(os.listdir(model_store))

In [None]:
# Load one Ray Tune ExperimentAnalysis per entry of model_store, keyed by the
# entry name. Entries ending in ".pkl" are skipped.
model_states = {}
for f in os.listdir(model_store):
    if f.endswith(".pkl"):
        continue
    experiment_path = os.path.join(model_store, f)
    try:
        model_states[f] = tune.ExperimentAnalysis(
            experiment_checkpoint_path=experiment_path
        )
    except ValueError:
        # An entry that is not a loadable experiment should not abort the
        # whole cell; report it and move on.
        print(f"Could not find experiment at {experiment_path}, skipping.")

In [None]:
# TODO: load the test metrics here as well, or move that step to eval.py

### Plot Best Results

In [None]:
# Preview the first nine result columns of every loaded experiment.
for experiment_name in model_states:
    experiment_state = model_states[experiment_name]
    print(experiment_name)
    print(experiment_state.results_df.columns[:9])

In [None]:
# gather results
def gather_results(model_states, metric, others_to_fetch):
    """Collect the best trial of every experiment into one DataFrame.

    Parameters
    ----------
    model_states : dict
        Maps a model name to an object exposing a ``results_df`` DataFrame
        with one row per trial.
    metric : str
        Column whose maximum selects the best trial of each experiment.
    others_to_fetch : sequence of str
        Additional columns to carry over for the selected trial.

    Returns
    -------
    pandas.DataFrame
        One row per experiment that could be summarised, with columns
        ``["model", metric, *others_to_fetch]``. When no experiment could be
        summarised the frame is empty but still has those columns.
    """
    of_interest = ["model", metric, *others_to_fetch]
    rows = []
    for name, state in model_states.items():
        df = state.results_df
        try:
            # .assign builds a new frame, so the label is never written into
            # a slice of the experiment's own results_df.
            best = df.loc[[df[metric].idxmax()]].assign(model=name)
            rows.append(best[of_interest])
        except (KeyError, ValueError, TypeError) as err:
            # Missing column, no trials, or a non-comparable metric: skip this
            # experiment but say so instead of hiding every possible error.
            print(f"Skipping {name}: {type(err).__name__}: {err}")
            continue
    if not rows:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=of_interest)
    return pd.concat(rows, ignore_index=True)

In [None]:
# Extra metric columns reported next to the selection metric.
others = ['mean_train_auc', 'std_val_auc', 'std_train_auc',
       'mean_val_auc', 'mean_val_balanced_accuracy',
       'std_val_balanced_accuracy', 'std_train_balanced_accuracy']

In [None]:
# Best trial per model, selected by mean *training* balanced accuracy.
r = gather_results(model_states, 'mean_train_balanced_accuracy', others)
print(r)

### Plot results per model

##### Prelims

In [None]:
# list training dirs
dataset_names = os.listdir(train_dir)
# NOTE(review): same value as the run_name set in the loading section; keep the two in sync.
run_name = "2.0rapid"
len(dataset_names)

In [None]:
# Leave the "debug" entry, when present, out of the datasets to scan.
if "debug" in dataset_names:
    dataset_names.remove("debug")

In [None]:
# Save this run's figures in a run-specific subdirectory of img_dir.
# Guarded so that re-running the cell does not keep nesting
# <run_name>/<run_name>/... below the image directory.
if os.path.basename(os.path.normpath(img_dir)) != run_name:
    img_dir = os.path.join(img_dir, run_name)
os.makedirs(img_dir, exist_ok=True)


In [None]:
# loop through training dirs, pick out training results per model
def model_path_iter(dataset_names, run_name, base_dir=None):
    """Yield the path of every stored model result that belongs to a run.

    Parameters
    ----------
    dataset_names : iterable of str
        Names of the dataset directories to scan below the root directory.
    run_name : str
        Only model directories whose name contains this string are scanned.
    base_dir : str, optional
        Root directory holding the dataset directories. Defaults to the
        notebook-level ``train_dir``.

    Yields
    ------
    tuple of (str, bool)
        Path of an entry inside a matching model directory (entries ending
        in ".pkl" are skipped), and whether the model directory name
        contains "debug".
    """
    root = train_dir if base_dir is None else base_dir
    for ds in dataset_names:
        ds_path = os.path.join(root, ds)
        if not os.path.isdir(ds_path):
            # stray files next to the dataset directories would make listdir fail
            continue
        # sorted: os.listdir order is arbitrary, which would make the order
        # of the collected results differ between runs
        for model_dir in sorted(os.listdir(ds_path)):
            md_path = os.path.join(ds_path, model_dir)
            if run_name not in model_dir or not os.path.isdir(md_path):
                continue
            is_debug = "debug" in model_dir
            # grab model paths
            for m in sorted(os.listdir(md_path)):
                if not m.endswith(".pkl"):
                    yield os.path.join(md_path, m), is_debug

In [None]:
# Metric columns kept from each experiment's results table.
of_interest = ['mean_val_auc', 'mean_train_auc', 'std_val_auc', 'std_train_auc',
       'mean_train_balanced_accuracy', 'mean_val_balanced_accuracy',
       'std_val_balanced_accuracy', 'std_train_balanced_accuracy']

##### DF

In [None]:
# Metric whose maximum picks the best trial of each experiment.
selector = "mean_val_auc"

In [None]:
# Build one summary row per stored experiment: its best finished trial
# according to `selector`, tagged with model name, dataset and debug flag.
large_results = []
for m, d in model_path_iter(dataset_names, run_name):
    # grab results_df
    try:
        state = tune.ExperimentAnalysis(experiment_checkpoint_path=m)
    except ValueError:
        print(f"Could not find experiment at {m}, skipping.")
        continue
    df = state.results_df
    if df.shape[1] == 0:
        continue

    # Keep finished trials only. The explicit copy means the columns added
    # below are written to an independent frame, not to a slice of results_df.
    df = df.loc[df['done'] == True, of_interest].copy()
    if df.empty or df[selector].isna().all():
        # idxmax has nothing to select from
        print(f"No finished trial with a {selector} value at {m}, skipping.")
        continue

    # add debug flag to df
    df['debug'] = d
    # add model_name
    df['model'] = os.path.basename(m)
    # add dataset_name (two levels above the model path)
    df['dataset'] = os.path.basename(os.path.dirname(os.path.dirname(m)))

    # select the best trial based on `selector`; indexing with a one-element
    # list keeps a one-row frame with the original column dtypes
    best_row = df.loc[[df[selector].idxmax()]]

    # combine into one dataset
    large_results.append(best_row)

if not large_results:
    raise ValueError(
        f"No usable experiment results found for run {run_name!r} under {train_dir}"
    )
large_result_df = pd.concat(large_results)
print(large_result_df.shape)

In [None]:
# Stored model names present in the results, and how many distinct ones there are.
pprint(large_result_df.model.unique())
len(large_result_df.model.unique())

In [None]:
# Substring rewrites applied to the stored model names: each key found in a
# name is replaced by its value (several entries map a key to itself).
mapping = {
    "separate_decomp": "separatepca", 
    "pca": "pca", 
    "raw": "raw", 
    "chronos": "chronos", 
    "design": "design", 
    "design_w": "whiten"}
# NOTE(review): modelnames is not referenced by any cell visible in this notebook.
modelnames = [
            "log_reg",
            "svm",
            "knn",
            "rand_forest",
            "decision_tree",
            "xgb",
            "rocket",
            "kn_multivar",
        ]

In [None]:
# Keep the stored name untouched in "name_m"; the pickle-loading helpers
# further down build file names from it.
large_result_df["name_m"] = large_result_df["model"]
# Rewrite the "_separate_pca" spelling so the "separate_decomp" mapping entry applies to it.
large_result_df["model"] = large_result_df["model"].str.replace("_separate_pca", "_separate_decomp")
# Replacements run in the dict's insertion order, one key after another.
for k, v in mapping.items():
    large_result_df["model"] = large_result_df["model"].str.replace(pat=k, repl=v)
# NOTE(review): the rename makes this cell non-re-runnable without rebuilding
# large_result_df first ("model" no longer holds the stored name afterwards).
large_result_df = large_result_df.rename(columns={"model": "model_name"})
pprint(large_result_df.columns)

In [None]:
# Split "<model>_<datamode>" on the last underscore into two columns, then
# drop the combined column. Guarded so that the cell can be re-run after the
# column has already been dropped.
if "model_name" in large_result_df.columns:
    large_result_df[["model", "datamode"]] = (
        large_result_df["model_name"]
          .str.rsplit("_", n=1, expand=True)
    )
    # drop() returns a new frame; without the assignment the column stays.
    large_result_df = large_result_df.drop(columns="model_name")
large_result_df["model"]

In [None]:
# Distinct model names after the split.
print(large_result_df["model"].unique())

In [None]:
# Distinct data modes after the split.
print(large_result_df["datamode"].unique())

In [None]:
# Long format: one row per (best model, metric), with the train and validation
# AUC stacked into a single "auc" column labelled by "metric".
long_df = large_result_df.melt(
    id_vars=["dataset", "debug", "model", "datamode"],
    value_vars=["mean_train_auc", "mean_val_auc"],
    var_name="metric",
    value_name="auc"
)

##### Graph DF

In [None]:
# plot all model performances on scatter plot
# One train-vs-validation AUC scatter per data mode. Points below the dashed
# y = x line score lower on validation than on training.
# (No plt.figure() before the loop: it only produced an extra, empty figure.)
for embedding in large_result_df.datamode.dropna().unique():
    fig, ax = plt.subplots(figsize=(16, 12))
    d = (
        large_result_df[large_result_df.datamode == embedding]
        .sort_values(by=["mean_val_auc", "mean_train_auc"], ascending=False)
    )
    sns.scatterplot(
        data=d,
        x="mean_train_auc",
        y="mean_val_auc",
        hue="model",
        style="dataset",
        s=150,
        alpha=0.7,
        ax=ax,
    )
    # add y=x line
    ax.plot([0, 1], [0, 1], ls='--', c='gray')
    ax.set_title(f"{embedding.upper()} model performances")
    ax.set_xlabel("Mean Train AUC")
    ax.set_ylabel("Mean Val AUC")
    # legend outside the axes, to the right
    ax.legend(loc="upper left", bbox_to_anchor=(1.02, 1))
    ax.set_ylim(0.3, 0.8)
    ax.set_xlim(0.5, 1.0)

    img_name = f"{embedding}_all_small_models_performance.png"
    fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')


In [None]:
# plot models in val AUC vs val balanced acc
# One scatter per data mode.
# (No plt.figure() before the loop: it only produced an extra, empty figure.)
for embedding in large_result_df.datamode.dropna().unique():
    fig, ax = plt.subplots(figsize=(16, 12))
    d = large_result_df[large_result_df.datamode == embedding]
    sns.scatterplot(
        data=d,
        x="mean_val_auc",
        y="mean_val_balanced_accuracy",
        hue="model",
        style="dataset",
        s=150,
        alpha=0.7,
        ax=ax,
    )
    ax.set_title(f"{embedding.upper()} model performances")
    ax.set_xlabel("Mean Val AUC")
    ax.set_ylabel("Mean Val Balanced Accuracy")
    # legend outside the axes, to the right
    ax.legend(loc="upper left", bbox_to_anchor=(1.02, 1))
    ax.set_ylim(0.45, 0.7)
    ax.set_xlim(0.45, 0.7)

    img_name = f"{embedding}_all_small_models_pareto.png"
    fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')

In [None]:
# Validation AUROC against validation balanced accuracy for every best model,
# all data modes pooled into a single figure.
fig, ax = plt.subplots(figsize=(16, 12))
d = large_result_df
g = sns.scatterplot(
    data=d,
    x="mean_val_auc",
    y="mean_val_balanced_accuracy",
    hue="model",
    s=50,
    alpha=0.7,
    edgecolor="black",
    ax=ax,
)
g.set_xlabel("Mean Validation AUROC")
g.set_ylabel("Mean Validation Balanced Accuracy")
# legend outside the axes, to the right; axis limits are left automatic
g.legend(loc="upper left", bbox_to_anchor=(1.02, 1))

img_name = "all_small_models_pareto.png"
fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')

In [None]:
# Training AUROC against validation AUROC for every best model, all data
# modes pooled into a single figure.
fig, ax = plt.subplots(figsize=(16, 12))
d = large_result_df
g = sns.scatterplot(
    data=d,
    x="mean_train_auc",
    y="mean_val_auc",
    hue="model",
    s=50,
    alpha=0.7,
    edgecolor="black",
    ax=ax,
)
# dashed y = x reference line
g.plot([0, 1], [0, 1], ls='--', c='gray')
g.set_xlabel("Mean Training AUROC")
g.set_ylabel("Mean Validation AUROC")
# legend outside the axes, to the right
g.legend(loc="upper left", bbox_to_anchor=(1.02, 1))
g.set_ylim(0.3, 0.8)
g.set_xlim(0.45, 1.0)

img_name = "all_small_models_performance.png"
fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')


In [None]:
# plot
# Per dataset: grouped bars of train vs. validation AUC, one panel per model
# with the data modes on the x axis. Models from debug runs get a "*" suffix.
for ds in long_df.dataset.unique():
    plot_df = long_df[(long_df.dataset == ds)].copy()
    if plot_df.empty:
        continue
    plot_df["model"] = np.where(plot_df["debug"], plot_df["model"] + "*", plot_df["model"])

    # catplot is figure-level and creates its own figure; a plt.figure() call
    # before it would only leave an extra empty figure per dataset.
    g = sns.catplot(
        data=plot_df,
        x="datamode",
        y="auc",
        hue="metric",
        col="model",
        kind="bar",
        dodge=True,
        height=4,
        aspect=1.2,
        col_wrap=3,
        sharex=False,
    )
    g.set_titles("{col_name}")
    g.set_axis_labels("", "AUC")
    g.set(ylim=(0, 1))

    plt.suptitle(f"{ds}", y=1.04)
    # Figures are only displayed. To save them as well:
    # plt.savefig(os.path.join(img_dir, f"{ds}.png"), bbox_inches='tight')
    plt.show()


In [None]:
# Close the current matplotlib figure.
plt.close()

In [None]:
# display top 5 per group
# i.e. the five highest AUC rows for each metric (train AUC, validation AUC).
for s in long_df.metric.unique():
    print(f"Top 5 for {s}:")
    print(long_df[long_df.metric == s].sort_values(by="auc", ascending=False).head(5))

In [None]:
# Show full dataset names (no column-width truncation), then list the datasets
# of the ten rows with the highest validation AUC (ties broken by train AUC).
pd.set_option("display.max_colwidth", None)
print(large_result_df.sort_values(by=["mean_val_auc", "mean_train_auc"], ascending=False).dataset.head(10))

In [None]:
# determine which datasets have highest performance
# First: best validation balanced accuracy per dataset; second: mean
# validation AUC per dataset. Both are limited to the top 50.
groups = large_result_df.groupby(['dataset'])
print(groups['mean_val_balanced_accuracy'].max().sort_values(ascending=False)[:50])
print(groups['mean_val_auc'].mean().sort_values(ascending=False)[:50])

In [None]:
# determine which datasets have highest performance
# Best validation AUC per (model, dataset), sorted from best to worst.
# The table gets its own name: `of_interest` already holds the list of metric
# columns used when the results table is built, and rebinding it to a
# DataFrame would break that cell on re-run.
groups = large_result_df.groupby(['model', 'dataset'])
best_auc_by_model_dataset = (
    groups['mean_val_auc'].max().sort_values(ascending=False).reset_index()
)
pd.set_option("display.max_colwidth", None)
# Top five datasets for a few selected models.
for model_label in ("xgb", "rocket", "rand_forest"):
    is_model = best_auc_by_model_dataset.model == model_label
    print(best_auc_by_model_dataset[is_model].dataset[:5])

### Models


#### Pareto Frontier Best Models

In [None]:
# pareto frontier
def identify_pareto(scores):
    """Flag the rows of ``scores`` that lie on the Pareto front.

    A row is dominated when some row is at least as large in every column
    and strictly larger in at least one. Larger values are better.

    Parameters
    ----------
    scores : numpy.ndarray
        2-D array, one row per candidate and one column per objective.

    Returns
    -------
    numpy.ndarray
        Boolean mask with one entry per row; True where the row is not
        dominated by any row.
    """
    n_points = scores.shape[0]
    on_front = np.ones(n_points, dtype=bool)
    for idx in range(n_points):
        candidate = scores[idx]
        # Compare the candidate against all rows at once.
        at_least_as_good = (scores >= candidate).all(axis=1)
        strictly_better = (scores > candidate).any(axis=1)
        if np.any(at_least_as_good & strictly_better):
            on_front[idx] = False
    return on_front

In [None]:
# Flag the rows that are not dominated by any other row in
# (validation AUC, validation balanced accuracy).
large_result_df['is_pareto'] = identify_pareto(large_result_df[["mean_val_auc", "mean_val_balanced_accuracy"]].to_numpy())

In [None]:
# Interval half-widths of the form 1.96 * std / sqrt(n).
# NOTE(review): n = 15 is hard-coded; presumably the number of CV scores behind
# each std. Confirm against the tuner settings.
large_result_df['ci_val_auc'] = (large_result_df['std_val_auc'] / np.sqrt(15)) * 1.96
large_result_df['ci_val_balanced_accuracy'] = (large_result_df['std_val_balanced_accuracy'] / np.sqrt(15)) * 1.96

In [None]:
# Ten rows with the highest validation AUC.
large_result_df.sort_values(by=["mean_val_auc"], ascending=False)[:10]

In [None]:
# Ranking score: sum of both validation means and both interval half-widths.
large_result_df["pareto"] = large_result_df["mean_val_auc"] + large_result_df['ci_val_auc'] + large_result_df['ci_val_balanced_accuracy'] + large_result_df["mean_val_balanced_accuracy"]
large_result_df.sort_values(by=["pareto"], ascending=False)[:10]

In [None]:
# How many rows are on the Pareto front and how many are not.
large_result_df["is_pareto"].value_counts()

#### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Do not truncate long string columns (such as dataset names) in displayed frames.
pd.set_option("display.max_colwidth", None)

In [None]:
def load_confusion(row, base_dir=train_dir, run_name=run_name):
    """Load the pickled confusion matrix stored for one model.

    Parameters
    ----------
    row : pandas.Series or similar
        Must expose ``debug`` (truthy for debug runs), ``dataset`` (dataset
        directory name) and ``name_m`` (stored model name).
    base_dir : str
        Root directory holding the dataset directories.
    run_name : str
        Run identifier appended to the models directory name.

    Returns
    -------
    object
        Whatever was pickled in ``<name_m>_confmat.pkl`` inside
        ``<base_dir>/<dataset>/models[_debug]_<run_name>``.
    """
    models_dir_name = "models_debug" if bool(row.debug) else "models"
    models_dir_name = f"{models_dir_name}_{run_name}"
    model_path = os.path.join(base_dir, row.dataset, models_dir_name)
    print(f"loading from: {model_path}")

    model_name = str(row.name_m)
    confmat_file = os.path.join(model_path, f"{model_name}_confmat.pkl")

    # NOTE: unpickling can execute arbitrary code; only load files from
    # trusted locations.
    # The context manager closes the file handle even if unpickling fails.
    with open(confmat_file, "rb") as fh:
        return load(fh)

In [None]:
# Confusion matrices, as fractions of the total count, for the top-ranked
# model under each of the two ranking columns.
for ranking_column in ("pareto", "mean_val_auc"):
    ranked = large_result_df.sort_values(by=[ranking_column], ascending=False)
    conf = load_confusion(row=ranked.iloc[0])
    print(conf / conf.sum())

#### Running Best Models on Whole Data

In [None]:
def load_model(row, base_dir=train_dir, run_name=run_name):
    """Load the pickled model object stored for one results row.

    Parameters
    ----------
    row : pandas.Series or similar
        Must expose ``debug`` (truthy for debug runs), ``dataset`` (dataset
        directory name) and ``name_m`` (stored model name).
    base_dir : str
        Root directory holding the dataset directories.
    run_name : str
        Run identifier appended to the models directory name.

    Returns
    -------
    object
        Whatever was pickled in ``<name_m>.pkl`` inside
        ``<base_dir>/<dataset>/models[_debug]_<run_name>``.
    """
    models_dir_name = "models_debug" if bool(row.debug) else "models"
    models_dir_name = f"{models_dir_name}_{run_name}"
    model_path = os.path.join(base_dir, row.dataset, models_dir_name)
    print(f"loading from: {model_path}")

    model_name = str(row.name_m)
    model_file = os.path.join(model_path, f"{model_name}.pkl")

    # NOTE: unpickling can execute arbitrary code; only load files from
    # trusted locations.
    # The context manager closes the file handle even if unpickling fails.
    with open(model_file, "rb") as fh:
        return load(fh)

In [None]:
# Independent copy of the Pareto-front rows, so columns added later do not
# touch large_result_df.
pareto_models = large_result_df[large_result_df["is_pareto"]].copy()
pareto_models.shape

In [None]:
# Load the stored model for every Pareto-front row; m_list follows the row
# order of pareto_models.
m_list = []
for i in range(pareto_models.shape[0]):
    m = load_model(pareto_models.iloc[i])
    m_list.append(m)


In [None]:
# load datasets
def get_data(row, base_dir=train_dir):
    """Load the train and test features and labels for one results row.

    Parameters
    ----------
    row : pandas.Series or similar
        Must expose ``datamode`` (selects which feature array to read) and
        ``dataset`` (dataset directory name).
    base_dir : str
        Root directory holding the dataset directories.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. The feature arrays are dask
        arrays opened from zarr stores; the labels are the ``"in?"`` column
        of ``labels.pkl`` cast to int.

    Raises
    ------
    ValueError
        If ``row.datamode`` is not one of the known data modes.
    """
    zarr_by_mode = {
        "raw": "x.zarr",
        "pca": "pca_x.zarr",
        "fpca": "fpca_x.zarr",
        "separatepca": "separate_decomp_x.zarr",
        "chronos": "chronos_x.zarr",
        "design": "design_x.zarr",
        "whiten": "white_design_x.zarr",
    }
    data_mode = row.datamode
    print(data_mode)
    if data_mode not in zarr_by_mode:
        # An explicit error instead of an unbound file name further down.
        raise ValueError(
            f"Unknown datamode {data_mode!r}; expected one of {sorted(zarr_by_mode)}"
        )
    f = zarr_by_mode[data_mode]

    def _load_split(split):
        # Read the features and integer labels of one split ("train" or "test").
        # base_dir is honoured here, not the notebook-level train_dir.
        split_dir = os.path.join(base_dir, row.dataset, "permanent", split)
        X = da.from_zarr(os.path.join(split_dir, f))
        labels = pd.read_pickle(os.path.join(split_dir, "labels.pkl"))
        return X, labels["in?"].astype(int)

    X_train, y_train = _load_split("train")
    X_test, y_test = _load_split("test")

    return X_train, y_train, X_test, y_test

In [None]:
# Refit every Pareto-front model on the training split and score it on the
# test split, using fixed cutoffs (0.5 on probabilities, 0 on decision scores).
test_acc = []
test_auc = []

n_models = pareto_models.shape[0]
for i in range(n_models):
    row = pareto_models.iloc[i]
    X_train, y_train, X_test, y_test = get_data(row)
    for features in (X_train, X_test):
        print(features.shape)

    m = m_list[i]
    if row.model == "svm":
        print(f"  Best params: {m.get_params()}")
        m.set_params(cache_size=1500)
    print(type(m).__name__)
    print(f"  Best params: {m.get_params()}")
    m.fit(X_train, y_train)

    # Prefer class-1 probabilities; fall back to raw decision scores.
    has_proba = hasattr(m, "predict_proba")
    if has_proba:
        y_prob = m.predict_proba(X_test)[:, 1]
    else:
        y_prob = m.decision_function(X_test)
    cutoff = 0.5 if has_proba else 0
    y_pred = (y_prob >= cutoff).astype(int)

    test_acc.append(balanced_accuracy_score(y_test, y_pred))
    test_auc.append(roc_auc_score(y_test, y_prob))
    print(test_acc, test_auc)


In [None]:
# Attach the test metrics to pareto_models. assign() builds both columns in
# one step, so a failure cannot leave only one of them attached.
try:
    pareto_models = pareto_models.assign(
        test_auc=test_auc,
        test_balanced_accuracy=test_acc,
    )
except (NameError, ValueError) as err:
    # Raised when the evaluation loop has not run (NameError) or did not
    # produce one score per row (ValueError). Report it instead of hiding it.
    print(f"Test metrics not attached to pareto_models: {err}")

In [None]:
pareto_models

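Notice that the balanced accuracy on the test set drops drastically, while the AUROC stays stable. Because AUROC does not depend on a decision threshold, the drop is most likely caused by the fixed cutoffs used above (0.5 on predicted probabilities, 0 on decision scores). We therefore tune the decision threshold of each of these models.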

In [None]:
from sklearn.model_selection import TunedThresholdClassifierCV

In [None]:
# Tune the decision threshold of every Pareto-front model on the training
# split, then score the tuned cutoff on the test split.
tuner_results = []
best_acc = []
best_thresholds = []
ntest_acc = []

n_models = pareto_models.shape[0]
for i in range(n_models):
    row = pareto_models.iloc[i]
    X_train, y_train, X_test, y_test = get_data(row)
    for features in (X_train, X_test):
        print(features.shape)

    m = m_list[i]
    if row.model == "svm":
        m.set_params(cache_size=1500)
    print(type(m).__name__)
    print(f"  Best params: {m.get_params()}")

    tuner_options = dict(
        cv=None,
        refit=True,
        n_jobs=-1,
        random_state=42,
        store_cv_results=True,
    )
    tuner = TunedThresholdClassifierCV(estimator=m, **tuner_options)
    tuner.fit(X_train, y_train)

    threshold = tuner.best_threshold_
    best_acc.append(tuner.best_score_)
    best_thresholds.append(threshold)

    # Score with class-1 probabilities when the model offers them, otherwise
    # with decision scores; the tuned cutoff is applied the same way to both.
    if hasattr(m, "predict_proba"):
        y_prob = tuner.predict_proba(X_test)[:, 1]
    else:
        y_prob = tuner.decision_function(X_test)
    y_pred = (y_prob >= threshold).astype(int)

    ntest_acc.append(balanced_accuracy_score(y_test, y_pred))

In [None]:
# Attach the threshold-tuning results: the tuner's best score, the selected
# threshold, and the test balanced accuracy at that threshold.
pareto_models["tuned_train_acc"] = best_acc
pareto_models["tuned_train_thresh"] = best_thresholds
pareto_models["tuned_test_acc"] = ntest_acc

In [None]:
# Show the Pareto-front table with the added columns.
pareto_models