## Overfitting Exploration

### Useful Preliminaries

In [112]:
import os
import sys
sys.path.append("..")  # add project root

import shutil
import re
from argparse import ArgumentParser
from pickle import dump, load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import zarr
import dask.array as da

from ray import tune

from sklearn.metrics import balanced_accuracy_score, roc_auc_score

from src.data_utils import *
from src.constants import *
from src.tuner import train_cv, RayAdaptiveRepeatedCVSearch

In [113]:
# Render floats in DataFrame output with thousands separators and two decimals,
# right-aligned in a 10-character field.
pd.set_option("display.float_format", "{:10,.2f}".format)

In [114]:
# Seed NumPy's legacy global RNG so that np.random.* draws repeat across runs.
# Only NumPy's global generator is seeded here.
np.random.seed(420)

In [115]:
# Apply seaborn's theme with the "talk" plotting context to subsequent figures.
sns.set_theme(context="talk")

In [116]:
# path constants
# Root directory that contains one sub-directory per training dataset.
# The cluster path below is only the default: set the ACCELERATE_TRAIN_DIR
# environment variable to run this notebook against a different location.
train_dir = os.environ.get(
    "ACCELERATE_TRAIN_DIR",
    "/home/mr2238/project_pi_np442/mr2238/accelerate/data/training/",
)

### Loading Model and Results

In [117]:
# Which dataset / tuning run to inspect.
dataset_name = "downsample_w_60s_rso2r_rso2l_abp"
run_name = "current"
small = False  # True selects the "_debug" variant of the model directory
# Resolves to "models_<run_name>", or "models_debug_<run_name>" when `small` is set.
model_name = "models" + ("_debug" if small else "") + "_" + run_name

In [118]:
# Directory <train_dir>/<dataset_name>/<model_name>; the experiments are loaded from here below.
model_store = os.path.join(train_dir, dataset_name, model_name)
print(model_store)

/home/mr2238/project_pi_np442/mr2238/accelerate/data/training/downsample_w_60s_rso2r_rso2l_abp/models_current


In [119]:
# os.listdir returns entries in arbitrary order; sort so the listing is stable across runs.
print(sorted(os.listdir(model_store)))

['decision_tree_pca', 'decision_tree_pca.pkl', 'decision_tree_raw', 'decision_tree_raw.pkl', 'decision_tree_separate_pca', 'decision_tree_separate_pca.pkl', 'knn_multivar_raw', 'knn_pca', 'knn_raw', 'knn_separate_pca', 'log_reg_pca', 'log_reg_pca.pkl', 'log_reg_raw', 'log_reg_raw.pkl', 'log_reg_separate_pca', 'log_reg_separate_pca.pkl', 'rocket_raw', 'xgb_pca', 'xgb_raw', 'xgb_separate_pca']


In [120]:
# Load one Ray Tune ExperimentAnalysis per entry in the model store, keyed by
# entry name. Entries ending in ".pkl" are skipped.
# Iterating in sorted order keeps the dict (and every table derived from it) in a
# reproducible order, because os.listdir makes no ordering guarantee.
model_states = {
    entry: tune.ExperimentAnalysis(
        experiment_checkpoint_path=os.path.join(model_store, entry)
    )
    for entry in sorted(os.listdir(model_store))
    if not entry.endswith(".pkl")
}

In [121]:
# TODO: load the test metrics here, or move that step to eval.py

### Plot Best Results

In [122]:
# Show the first nine result columns each loaded experiment exposes.
# The loop variable is used so every experiment reports its own frame; relying on
# a global left over from an earlier session would print the same columns for
# every model and raise NameError on a fresh kernel.
for name, analysis in model_states.items():
    print(name)
    print(analysis.results_df.columns[:9])

decision_tree_pca
Index(['mean_val_auc', 'mean_train_auc', 'std_val_auc', 'std_train_auc',
       'mean_val_balanced_accuracy', 'mean_train_balanced_accuracy',
       'std_val_balanced_accuracy', 'std_train_balanced_accuracy', 'fold'],
      dtype='object')
decision_tree_raw
Index(['mean_val_auc', 'mean_train_auc', 'std_val_auc', 'std_train_auc',
       'mean_val_balanced_accuracy', 'mean_train_balanced_accuracy',
       'std_val_balanced_accuracy', 'std_train_balanced_accuracy', 'fold'],
      dtype='object')
decision_tree_separate_pca
Index(['mean_val_auc', 'mean_train_auc', 'std_val_auc', 'std_train_auc',
       'mean_val_balanced_accuracy', 'mean_train_balanced_accuracy',
       'std_val_balanced_accuracy', 'std_train_balanced_accuracy', 'fold'],
      dtype='object')
knn_multivar_raw
Index(['mean_val_auc', 'mean_train_auc', 'std_val_auc', 'std_train_auc',
       'mean_val_balanced_accuracy', 'mean_train_balanced_accuracy',
       'std_val_balanced_accuracy', 'std_train_balanced_ac

In [139]:
# gather results
def gather_results(model_states, metric, others_to_fetch):
    """Collect, for every model, the results row with the highest `metric`.

    Parameters
    ----------
    model_states : dict
        Maps a model name to an object exposing a ``results_df`` DataFrame.
    metric : str
        Column that is maximised to pick each model's row.
    others_to_fetch : iterable of str
        Additional columns to keep for the selected row.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``['model', metric, *others_to_fetch]``
        and a fresh 0..n-1 index. A model is skipped, with a warning naming it
        and the reason, when a requested column is missing or `metric` has no
        usable value. If every model is skipped, an empty frame with the same
        columns is returned.
    """
    import warnings  # local import: the notebook's import cell does not provide it

    of_interest = ['model', metric, *others_to_fetch]
    rows = []
    for name, state in model_states.items():
        df = state.results_df
        try:
            # Drop NaNs first so a column without any usable value raises
            # ValueError here instead of depending on idxmax's NaN handling.
            best_label = df[metric].dropna().idxmax()
            # assign() returns a new frame, so the source results are never modified.
            best = df.loc[[best_label]].assign(model=name)[of_interest]
        except (KeyError, ValueError, TypeError) as err:
            # Best effort: keep going, but say which model was dropped and why.
            warnings.warn(
                f"gather_results: skipping {name!r} ({type(err).__name__}: {err})",
                stacklevel=2,
            )
            continue
        rows.append(best)
    if not rows:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=of_interest)
    return pd.concat(rows, ignore_index=True)

In [140]:
# Extra columns reported next to the selection metric for each model.
others = [
    'mean_train_auc',
    'std_val_auc',
    'std_train_auc',
    'mean_val_balanced_accuracy',
    'mean_train_balanced_accuracy',
    'std_val_balanced_accuracy',
    'std_train_balanced_accuracy',
    'fold',
]

In [141]:
# For each model, the results row with the highest mean_val_auc plus the columns in `others`.
# NOTE(review): the recorded output has 12 rows although 14 experiments were
# listed in the model store; knn_multivar_raw and rocket_raw are absent,
# presumably dropped by the except branch in gather_results. TODO confirm why.
r = gather_results(model_states, 'mean_val_auc', others)
print(r)

                         model  mean_val_auc  mean_train_auc  std_val_auc  \
0            decision_tree_pca          0.53            0.97         0.01   
1            decision_tree_raw          0.57            0.68         0.03   
2   decision_tree_separate_pca          0.55            0.77         0.02   
3                      knn_pca          0.57            1.00         0.03   
4                      knn_raw          0.58            1.00         0.04   
5             knn_separate_pca          0.57            1.00         0.03   
6                  log_reg_pca          0.54            0.53         0.02   
7                  log_reg_raw          0.53            0.54         0.02   
8         log_reg_separate_pca          0.54            0.53         0.02   
9                      xgb_pca          0.61            0.67         0.05   
10                     xgb_raw          0.61            0.67         0.04   
11            xgb_separate_pca          0.61            0.67         0.03   