In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib
import imodelsx.process_results
import sys
# Put the experiments directory on the import path.
sys.path.append('../experiments/')

# Directory holding the saved runs, and the experiment script that is handed
# to the imodelsx helpers below.
results_dir = '../results/04_train_best_model'
experiment_filename = '../experiments/01_train_model.py'

_results_api = imodelsx.process_results

# Load the saved results into a pandas dataframe.
r = _results_api.get_results_df(results_dir)

# Report which experiment keys the helper identifies for these runs.
cols_varied = _results_api.get_experiment_keys(r, experiment_filename)
print('experiment varied these params:', cols_varied)

# Let the helper fill in arguments that are missing from the results.
r = _results_api.fill_missing_args_with_default(r, experiment_filename)

# Row count for every combination of the varied keys.
r.groupby(cols_varied).size()

2024-06-28 17:48:51.664895: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [00:31<00:00, 57.76it/s]


experiment varied these params: ['dataset_name', 'seed', 'model_name', 'max_depth', 'max_features', 'max_trees', 'n_epochs']


dataset_name  seed  model_name  max_depth  max_features  max_trees  n_epochs
abalone       0     figs        4          0.50          20         100         1
                                                         30         100         1
                                           0.75          20         100         1
                                                         30         100         1
                                           1.00          20         100         1
                                                                               ..
transaction   4     rf_plus     4          1.00          30         100         1
                                5          1.00          30         100         1
                    xgboost     4          1.00          30         100         1
                                5          1.00          30         100         1
                                6          1.00          30         100         1
Length: 1823, dtype: 

In [2]:
# Distinct dataset names present in the results, in order of first appearance.
datasets = r.loc[:, 'dataset_name'].unique()

In [3]:
# Random forest: hyperparameters compared are max_depth and max_features.
# Keep only the random-forest rows and drop the columns listed below.
random_forest = (
    r.loc[r['model_name'].eq('random_forest')]
    .drop(columns=[
        'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
        'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
        'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_rules',
        'max_trees', 'pre_interaction', 'post_interaction',
        'pre_max_features', 'post_max_features', 'size_interactions',
    ])
)

# Mean train / validation R^2 over all rows sharing a
# (dataset, model, max_depth, max_features) combination.
random_forest_g = (
    random_forest
    .groupby(['dataset_name', 'model_name', 'max_depth', 'max_features'])
    [['r2_score_train_true', 'r2_score_val_true']]
    .mean()
)

In [4]:
# For each dataset, keep the random-forest setting with the highest mean
# validation R^2 found in random_forest_g.
random_forest_best_hyp = {}
for d in datasets:
    queried = random_forest_g.query(f"dataset_name == '{d}'")
    if queried.empty:
        # `datasets` covers every model; a dataset without random-forest rows
        # would make idxmax() raise on an empty selection, so skip it.
        continue
    val_scores = queried['r2_score_val_true']
    # idxmax() returns the full index tuple:
    # (dataset_name, model_name, max_depth, max_features).
    _, _, best_max_depth, best_max_features = val_scores.idxmax()
    d_best_hyp = {
        'max_depth': best_max_depth,
        'max_features': best_max_features,
        'r2_val': val_scores.max(),
    }
    random_forest_best_hyp[d] = d_best_hyp
random_forest_best_hyp

{'allstate': {'max_depth': 6,
  'max_features': 0.5,
  'r2_val': 0.4622840715830795},
 'parkinsons': {'max_depth': 6,
  'max_features': 1.0,
  'r2_val': 0.7524213096993081},
 'powerplant': {'max_depth': 6,
  'max_features': 0.75,
  'r2_val': 0.9438282121672575},
 'miami_housing': {'max_depth': 6,
  'max_features': 0.75,
  'r2_val': 0.8556671426021527},
 'cpu_act': {'max_depth': 6,
  'max_features': 0.5,
  'r2_val': 0.9730567587088202},
 'transaction': {'max_depth': 6,
  'max_features': 0.5,
  'r2_val': 0.21572899530353756},
 'ca_housing': {'max_depth': 6,
  'max_features': 0.5,
  'r2_val': 0.7030792389320804},
 'insurance': {'max_depth': 4,
  'max_features': 1.0,
  'r2_val': 0.8571740981122209},
 'mercedes': {'max_depth': 4,
  'max_features': 1.0,
  'r2_val': 0.5878040706284025},
 'abalone': {'max_depth': 6,
  'max_features': 1.0,
  'r2_val': 0.5437160079110199},
 'airfoil': {'max_depth': 6, 'max_features': 1.0, 'r2_val': 0.78256972006629},
 'concrete': {'max_depth': 6,
  'max_features

In [5]:
# RF+: hyperparameters compared are max_depth and max_features.
# Keep only the rf_plus rows and drop the columns listed below.
rf_plus = (
    r.loc[r['model_name'].eq('rf_plus')]
    .drop(columns=[
        'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
        'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
        'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_rules',
        'max_trees', 'pre_interaction', 'post_interaction',
        'pre_max_features', 'post_max_features', 'size_interactions',
    ])
)

# Mean train / validation R^2 over all rows sharing a
# (dataset, model, max_depth, max_features) combination.
rf_plus_g = (
    rf_plus
    .groupby(['dataset_name', 'model_name', 'max_depth', 'max_features'])
    [['r2_score_train_true', 'r2_score_val_true']]
    .mean()
)

In [6]:
# For each dataset, keep the RF+ setting with the highest mean validation R^2
# found in rf_plus_g.
rf_plus_best_hyp = {}
for d in datasets:
    queried = rf_plus_g.query(f"dataset_name == '{d}'")
    if queried.empty:
        # `datasets` covers every model; a dataset without rf_plus rows would
        # make idxmax() raise on an empty selection, so skip it.
        continue
    val_scores = queried['r2_score_val_true']
    # idxmax() returns the full index tuple:
    # (dataset_name, model_name, max_depth, max_features).
    _, _, best_max_depth, best_max_features = val_scores.idxmax()
    d_best_hyp = {
        'max_depth': best_max_depth,
        'max_features': best_max_features,
        'r2_val': val_scores.max(),
    }
    rf_plus_best_hyp[d] = d_best_hyp
rf_plus_best_hyp

{'allstate': {'max_depth': 5,
  'max_features': 0.75,
  'r2_val': 0.4857347448456884},
 'parkinsons': {'max_depth': 6,
  'max_features': 1.0,
  'r2_val': 0.7542152631148996},
 'powerplant': {'max_depth': 6,
  'max_features': 0.75,
  'r2_val': 0.9446751731190283},
 'miami_housing': {'max_depth': 6,
  'max_features': 1.0,
  'r2_val': 0.8712473379484337},
 'cpu_act': {'max_depth': 6,
  'max_features': 0.5,
  'r2_val': 0.9783737223170819},
 'transaction': {'max_depth': 5,
  'max_features': 1.0,
  'r2_val': 0.20406457989161803},
 'ca_housing': {'max_depth': 6,
  'max_features': 0.75,
  'r2_val': 0.7221553430142867},
 'insurance': {'max_depth': 4,
  'max_features': 1.0,
  'r2_val': 0.860206251420065},
 'mercedes': {'max_depth': 4,
  'max_features': 1.0,
  'r2_val': 0.5894047942318578},
 'abalone': {'max_depth': 6,
  'max_features': 0.75,
  'r2_val': 0.5668616306295201},
 'airfoil': {'max_depth': 6,
  'max_features': 1.0,
  'r2_val': 0.7887936365206617},
 'concrete': {'max_depth': 6,
  'max_f

In [7]:
# FIGS: hyperparameters compared are max_rules, max_trees and max_features.
# Keep only the figs rows and drop the columns listed below.
figs = (
    r.loc[r['model_name'].eq('figs')]
    .drop(columns=[
        'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
        'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
        'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_depth',
        'pre_interaction', 'post_interaction', 'pre_max_features',
        'post_max_features', 'size_interactions',
    ])
)

# Mean train / validation R^2 over all rows sharing a
# (dataset, model, max_rules, max_trees, max_features) combination.
figs_g = (
    figs
    .groupby(['dataset_name', 'model_name', 'max_rules', 'max_trees', 'max_features'])
    [['r2_score_train_true', 'r2_score_val_true']]
    .mean()
)

In [8]:
# For each dataset, keep the FIGS setting with the highest mean validation R^2
# found in figs_g.
figs_best_hyp = {}
for d in datasets:
    queried = figs_g.query(f"dataset_name == '{d}'")
    if queried.empty:
        # `datasets` covers every model; a dataset without figs rows would
        # make idxmax() raise on an empty selection, so skip it.
        continue
    val_scores = queried['r2_score_val_true']
    # idxmax() returns the full index tuple:
    # (dataset_name, model_name, max_rules, max_trees, max_features).
    _, _, best_max_rules, best_max_trees, best_max_features = val_scores.idxmax()
    d_best_hyp = {
        'max_rules': best_max_rules,
        'max_trees': best_max_trees,
        'max_features': best_max_features,
        'r2_val': val_scores.max(),
    }
    figs_best_hyp[d] = d_best_hyp
figs_best_hyp

{'allstate': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 0.75,
  'r2_val': 0.5069910874821875},
 'parkinsons': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 0.75,
  'r2_val': 0.8825121344810796},
 'powerplant': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 0.5,
  'r2_val': 0.9374676393991448},
 'miami_housing': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 1.0,
  'r2_val': 0.8621672423479977},
 'cpu_act': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 1.0,
  'r2_val': 0.9773946580799491},
 'transaction': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 0.75,
  'r2_val': -0.09348332723359438},
 'ca_housing': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 1.0,
  'r2_val': 0.7815641252184437},
 'insurance': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 0.5,
  'r2_val': 0.7879054740134631},
 'mercedes': {'max_rules': 60,
  'max_trees': 20,
  'max_features': 0.75,
  'r2_val': 0.4982606945129011},
 'abalone': {'max_rules': 

In [9]:
# XGBoost: the only hyperparameter compared is max_depth
# (max_features is among the dropped columns).
# Keep only the xgboost rows and drop the columns listed below.
xgboost = (
    r.loc[r['model_name'].eq('xgboost')]
    .drop(columns=[
        'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
        'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
        'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_rules',
        'max_trees', 'pre_interaction', 'post_interaction',
        'pre_max_features', 'post_max_features', 'size_interactions',
        'max_features',
    ])
)

# Mean train / validation R^2 over all rows sharing a
# (dataset, model, max_depth) combination.
xgboost_g = (
    xgboost
    .groupby(['dataset_name', 'model_name', 'max_depth'])
    [['r2_score_train_true', 'r2_score_val_true']]
    .mean()
)

In [10]:
# For each dataset, keep the XGBoost max_depth with the highest mean
# validation R^2 found in xgboost_g.
xgboost_best_hyp = {}
for d in datasets:
    queried = xgboost_g.query(f"dataset_name == '{d}'")
    if queried.empty:
        # `datasets` covers every model; a dataset without xgboost rows would
        # make idxmax() raise on an empty selection, so skip it.
        continue
    val_scores = queried['r2_score_val_true']
    # idxmax() returns the full index tuple: (dataset_name, model_name, max_depth).
    _, _, best_max_depth = val_scores.idxmax()
    d_best_hyp = {
        'max_depth': best_max_depth,
        'r2_val': val_scores.max(),
    }
    xgboost_best_hyp[d] = d_best_hyp
xgboost_best_hyp

{'allstate': {'max_depth': 5, 'r2_val': 0.56880074550301},
 'parkinsons': {'max_depth': 6, 'r2_val': 0.8943154947685432},
 'powerplant': {'max_depth': 6, 'r2_val': 0.9619338164422656},
 'miami_housing': {'max_depth': 5, 'r2_val': 0.9154602004150789},
 'cpu_act': {'max_depth': 4, 'r2_val': 0.9833557963371277},
 'transaction': {'max_depth': 4, 'r2_val': 0.12749623951140837},
 'ca_housing': {'max_depth': 6, 'r2_val': 0.8258611965361391},
 'insurance': {'max_depth': 4, 'r2_val': 0.814687698692056},
 'mercedes': {'max_depth': 4, 'r2_val': 0.5457945775010881},
 'abalone': {'max_depth': 4, 'r2_val': 0.5052047848701477},
 'airfoil': {'max_depth': 6, 'r2_val': 0.9363278907330399},
 'concrete': {'max_depth': 4, 'r2_val': 0.9152923336616308},
 'qsar': {'max_depth': 4, 'r2_val': 0.40006738901138306}}

In [11]:
# ResNet: the only hyperparameter compared is n_epochs.
# Keep only the resnet rows and drop the columns listed below.
resnet = (
    r.loc[r['model_name'].eq('resnet')]
    .drop(columns=[
        'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
        'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
        'task_type', 'save_dir_unique', 'gpu', 'max_rules', 'max_trees',
        'pre_interaction', 'post_interaction', 'pre_max_features',
        'post_max_features', 'size_interactions', 'max_features', 'max_depth',
    ])
)

# Mean train / validation R^2 over all rows sharing a
# (dataset, model, n_epochs) combination.
resnet_g = (
    resnet
    .groupby(['dataset_name', 'model_name', 'n_epochs'])
    [['r2_score_train_true', 'r2_score_val_true']]
    .mean()
)

In [12]:
# For each dataset that has ResNet rows, keep the n_epochs value with the
# highest mean validation R^2 found in resnet_g.
resnet_best_hyp = {}
for d in resnet['dataset_name'].unique():
    queried = resnet_g.query(f"dataset_name == '{d}'")
    val_scores = queried['r2_score_val_true']
    # Index tuple layout: (dataset_name, model_name, n_epochs).
    best_key = val_scores.idxmax()
    d_best_hyp = {'n_epochs': best_key[2], 'r2_val': val_scores.max()}
    resnet_best_hyp[d] = d_best_hyp
resnet_best_hyp

{'miami_housing': {'n_epochs': 200, 'r2_val': 0.8335230025572364},
 'ca_housing': {'n_epochs': 200, 'r2_val': 0.7108938926826326},
 'concrete': {'n_epochs': 100, 'r2_val': 0.5756088074295108},
 'powerplant': {'n_epochs': 200, 'r2_val': 0.9291184240041794},
 'parkinsons': {'n_epochs': 100, 'r2_val': 0.8453877042791884},
 'insurance': {'n_epochs': 100, 'r2_val': 0.6193256156088197},
 'abalone': {'n_epochs': 200, 'r2_val': 0.39101495742797854},
 'airfoil': {'n_epochs': 100, 'r2_val': 0.35768165935684787},
 'qsar': {'n_epochs': 200, 'r2_val': 0.1023605465888977},
 'cpu_act': {'n_epochs': 100, 'r2_val': 0.9514864563941956}}

In [13]:
# FT Transformer: the only hyperparameter compared is n_epochs.
# Keep only the ft_transformer rows and drop the columns listed below.
ft_transformer = (
    r.loc[r['model_name'].eq('ft_transformer')]
    .drop(columns=[
        'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
        'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
        'task_type', 'save_dir_unique', 'gpu', 'max_rules', 'max_trees',
        'pre_interaction', 'post_interaction', 'pre_max_features',
        'post_max_features', 'size_interactions', 'max_features', 'max_depth',
    ])
)

# Mean train / validation R^2 over all rows sharing a
# (dataset, model, n_epochs) combination.
ft_transformer_g = (
    ft_transformer
    .groupby(['dataset_name', 'model_name', 'n_epochs'])
    [['r2_score_train_true', 'r2_score_val_true']]
    .mean()
)

In [14]:
# For each dataset that has FT Transformer rows, keep the n_epochs value with
# the highest mean validation R^2 found in ft_transformer_g.
ft_transformer_best_hyp = {}
for d in ft_transformer['dataset_name'].unique():
    queried = ft_transformer_g.query(f"dataset_name == '{d}'")
    val_scores = queried['r2_score_val_true']
    # Index tuple layout: (dataset_name, model_name, n_epochs).
    best_key = val_scores.idxmax()
    d_best_hyp = {'n_epochs': best_key[2], 'r2_val': val_scores.max()}
    ft_transformer_best_hyp[d] = d_best_hyp
ft_transformer_best_hyp

{'powerplant': {'n_epochs': 200, 'r2_val': 0.9336203155525509},
 'miami_housing': {'n_epochs': 100, 'r2_val': 0.8532076207192454},
 'insurance': {'n_epochs': 200, 'r2_val': 0.6814996945201638},
 'abalone': {'n_epochs': 100, 'r2_val': 0.44565123319625854},
 'parkinsons': {'n_epochs': 100, 'r2_val': 0.8442809764311544},
 'cpu_act': {'n_epochs': 200, 'r2_val': 0.9749787211418152},
 'ca_housing': {'n_epochs': 200, 'r2_val': 0.7175731868584732},
 'concrete': {'n_epochs': 200, 'r2_val': 0.6545138020010774},
 'airfoil': {'n_epochs': 200, 'r2_val': 0.38982354812794223}}

In [15]:
# Gather the per-model results into one mapping: model name -> {dataset -> best setting}.
# The dictionaries are referenced explicitly rather than looked up by name with
# eval(), so a missing or misspelled variable is visible right here.
best_hyp = {
    'random_forest': random_forest_best_hyp,
    'rf_plus': rf_plus_best_hyp,
    'figs': figs_best_hyp,
    'xgboost': xgboost_best_hyp,
    'resnet': resnet_best_hyp,
    'ft_transformer': ft_transformer_best_hyp,
}

In [23]:
import json 

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to plain Python equivalents.

    Handles NumPy booleans, integers, floating-point scalars and arrays;
    anything else is delegated to ``json.JSONEncoder``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the standard encoder could not serialize on its own.

        Returns:
            ``bool`` for ``np.bool_``, ``int`` for ``np.integer``, ``float``
            for ``np.floating`` and a (nested) list for ``np.ndarray``.

        Raises:
            TypeError: From the base class, for any other unsupported type.
        """
        # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# NOTE(review): absolute, machine-specific location -- confirm it is valid
# before running this cell on another machine.
best_hyp_path = '/home/mattyshen/interpretableDistillation/scripts/best_hyperparams/original_hyperparams.json'

# Write the collected hyperparameters as JSON; NpEncoder converts NumPy values.
with open(best_hyp_path, "w") as json_file:
    json.dump(best_hyp, json_file, cls=NpEncoder)