In [2]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib
import imodelsx.process_results
import sys
# Put the experiments directory on the import path for this kernel.
sys.path.append('../experiments/')

# Directory holding the saved runs, and the script whose arguments are inspected below.
results_dir = '../results/04_train_best_model'
experiment_filename = '../experiments/01_train_model.py'

# Load the saved results into a single pandas dataframe.
r = imodelsx.process_results.get_results_df(results_dir)

# Report which arguments differ across the loaded runs.
cols_varied = imodelsx.process_results.get_experiment_keys(r, experiment_filename)
print('experiment varied these params:', cols_varied)

# NOTE(review): presumably fills arguments absent from a run with the script's
# defaults -- confirm against imodelsx.process_results.
r = imodelsx.process_results.fill_missing_args_with_default(r, experiment_filename)

# Cell output: number of rows for each combination of the varied arguments.
r.groupby(cols_varied).size()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1823/1823 [00:30<00:00, 60.37it/s]


experiment varied these params: ['dataset_name', 'seed', 'model_name', 'max_depth', 'max_features', 'max_trees', 'n_epochs']


dataset_name  seed  model_name  max_depth  max_features  max_trees  n_epochs
abalone       0     figs        4          0.50          20         100         1
                                                         30         100         1
                                           0.75          20         100         1
                                                         30         100         1
                                           1.00          20         100         1
                                                                               ..
transaction   4     rf_plus     4          1.00          30         100         1
                                5          1.00          30         100         1
                    xgboost     4          1.00          30         100         1
                                5          1.00          30         100         1
                                6          1.00          30         100         1
Length: 1823, dtype: 

In [31]:
# Distinct dataset names in the results table, in order of first appearance.
datasets = pd.unique(r['dataset_name'])
datasets

array(['allstate', 'parkinsons', 'powerplant', 'miami_housing', 'cpu_act',
       'transaction', 'ca_housing', 'insurance', 'mercedes', 'abalone',
       'airfoil', 'concrete', 'qsar'], dtype=object)

In [24]:
# Random forest: the hyperparameters compared here are max_depth and max_features.
# Columns stripped from the per-run table before aggregating.
rf_dropped_cols = [
    'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
    'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
    'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_rules',
    'max_trees', 'pre_interaction', 'post_interaction', 'pre_max_features',
    'post_max_features', 'size_interactions',
]
rf = r.loc[r['model_name'] == 'random_forest'].drop(columns=rf_dropped_cols)

# Mean train/val score per configuration; 'seed' is not a grouping key, so
# runs that differ only by seed are averaged together.
rf_config_keys = ['dataset_name', 'model_name', 'max_depth', 'max_features']
rf_metric_cols = ['r2_score_train_true', 'r2_score_val_true']
rf_g = rf.groupby(rf_config_keys)[rf_metric_cols].mean()

In [46]:
# For each dataset, print the random-forest configuration with the highest
# mean validation score, followed by that score on its own line.
for d in datasets:
    in_dataset = rf_g.index.get_level_values('dataset_name') == d
    queried = rf_g[in_dataset]
    val_scores = queried['r2_score_val_true']
    print(val_scores.idxmax())
    print(val_scores.max())

('allstate', 'random_forest', 6, 0.5)
0.4622840715830795
('parkinsons', 'random_forest', 6, 1.0)
0.7524213096993081
('powerplant', 'random_forest', 6, 0.75)
0.9438282121672575
('miami_housing', 'random_forest', 6, 0.75)
0.8556671426021527
('cpu_act', 'random_forest', 6, 0.5)
0.9730567587088202
('transaction', 'random_forest', 6, 0.5)
0.21572899530353756
('ca_housing', 'random_forest', 6, 0.5)
0.7030792389320804
('insurance', 'random_forest', 4, 1.0)
0.8571740981122209
('mercedes', 'random_forest', 4, 1.0)
0.5878040706284025
('abalone', 'random_forest', 6, 1.0)
0.5437160079110199
('airfoil', 'random_forest', 6, 1.0)
0.78256972006629
('concrete', 'random_forest', 6, 1.0)
0.8492022821680688
('qsar', 'random_forest', 6, 1.0)
0.39048867278188115


In [35]:
# RF+ (model_name == 'rf_plus'): hyperparameters compared are max_depth and max_features.
# Columns stripped from the per-run table before aggregating.
rf_p_dropped_cols = [
    'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
    'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
    'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_rules',
    'max_trees', 'pre_interaction', 'post_interaction', 'pre_max_features',
    'post_max_features', 'size_interactions',
]
rf_p = r.loc[r['model_name'] == 'rf_plus'].drop(columns=rf_p_dropped_cols)

# Mean train/val score per configuration; 'seed' is not a grouping key, so
# runs that differ only by seed are averaged together.
rf_p_config_keys = ['dataset_name', 'model_name', 'max_depth', 'max_features']
rf_p_metric_cols = ['r2_score_train_true', 'r2_score_val_true']
rf_p_g = rf_p.groupby(rf_p_config_keys)[rf_p_metric_cols].mean()

In [36]:
# For each dataset, print the RF+ configuration with the highest mean
# validation score and that score, on one line.
for d in datasets:
    in_dataset = rf_p_g.index.get_level_values('dataset_name') == d
    queried = rf_p_g[in_dataset]
    val_scores = queried['r2_score_val_true']
    print(val_scores.idxmax(), val_scores.max())

('allstate', 'rf_plus', 5, 0.75) 0.4857347448456884
('parkinsons', 'rf_plus', 6, 1.0) 0.7542152631148996
('powerplant', 'rf_plus', 6, 0.75) 0.9446751731190283
('miami_housing', 'rf_plus', 6, 1.0) 0.8712473379484337
('cpu_act', 'rf_plus', 6, 0.5) 0.9783737223170819
('transaction', 'rf_plus', 5, 1.0) 0.20406457989161803
('ca_housing', 'rf_plus', 6, 0.75) 0.7221553430142867
('insurance', 'rf_plus', 4, 1.0) 0.860206251420065
('mercedes', 'rf_plus', 4, 1.0) 0.5894047942318578
('abalone', 'rf_plus', 6, 0.75) 0.5668616306295201
('airfoil', 'rf_plus', 6, 1.0) 0.7887936365206617
('concrete', 'rf_plus', 6, 1.0) 0.8784921130490341
('qsar', 'rf_plus', 5, 1.0) 0.3631886371350036


In [38]:
# FIGS: hyperparameters compared are max_rules, max_trees and max_features.
# Columns stripped from the per-run table before aggregating.
figs_dropped_cols = [
    'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
    'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
    'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_depth',
    'pre_interaction', 'post_interaction', 'pre_max_features',
    'post_max_features', 'size_interactions',
]
figs = r.loc[r['model_name'] == 'figs'].drop(columns=figs_dropped_cols)

# Mean train/val score per configuration; 'seed' is not a grouping key, so
# runs that differ only by seed are averaged together.
figs_config_keys = ['dataset_name', 'model_name', 'max_rules', 'max_trees', 'max_features']
figs_metric_cols = ['r2_score_train_true', 'r2_score_val_true']
figs_g = figs.groupby(figs_config_keys)[figs_metric_cols].mean()

In [39]:
# For each dataset, print the FIGS configuration with the highest mean
# validation score and that score, on one line.
for d in datasets:
    in_dataset = figs_g.index.get_level_values('dataset_name') == d
    queried = figs_g[in_dataset]
    val_scores = queried['r2_score_val_true']
    print(val_scores.idxmax(), val_scores.max())

('allstate', 'figs', 60, 20, 0.75) 0.5069910874821875
('parkinsons', 'figs', 60, 20, 0.75) 0.8825121344810796
('powerplant', 'figs', 60, 20, 0.5) 0.9374676393991448
('miami_housing', 'figs', 60, 20, 1.0) 0.8621672423479977
('cpu_act', 'figs', 60, 20, 1.0) 0.9773946580799491
('transaction', 'figs', 60, 20, 0.75) -0.09348332723359438
('ca_housing', 'figs', 60, 20, 1.0) 0.7815641252184437
('insurance', 'figs', 60, 20, 0.5) 0.7879054740134631
('mercedes', 'figs', 60, 20, 0.75) 0.4982606945129011
('abalone', 'figs', 60, 20, 0.75) 0.4310555732638309
('airfoil', 'figs', 60, 20, 1.0) 0.8418877912337767
('concrete', 'figs', 60, 20, 1.0) 0.8820219424565234
('qsar', 'figs', 60, 20, 1.0) 0.21637869968198498


In [41]:
# XGBoost: the only hyperparameter compared here is max_depth
# (max_features is dropped along with the other unused columns).
xgb_dropped_cols = [
    'subsample_frac', 'save_dir', 'featurizer_name', 'featurizer_frac',
    'featurizer_overlap', 'depth', 'bit', 'use_cache', 'cat_mappings',
    'task_type', 'save_dir_unique', 'gpu', 'n_epochs', 'max_rules',
    'max_trees', 'pre_interaction', 'post_interaction', 'pre_max_features',
    'post_max_features', 'size_interactions', 'max_features',
]
xgb = r.loc[r['model_name'] == 'xgboost'].drop(columns=xgb_dropped_cols)

# Mean train/val score per (dataset, model, max_depth); runs that differ in
# any other column, such as seed, are averaged together.
xgb_config_keys = ['dataset_name', 'model_name', 'max_depth']
xgb_metric_cols = ['r2_score_train_true', 'r2_score_val_true']
xgb_g = xgb.groupby(xgb_config_keys)[xgb_metric_cols].mean()

In [42]:
# For each dataset, print the XGBoost configuration with the highest mean
# validation score and that score, on one line.
for d in datasets:
    in_dataset = xgb_g.index.get_level_values('dataset_name') == d
    queried = xgb_g[in_dataset]
    val_scores = queried['r2_score_val_true']
    print(val_scores.idxmax(), val_scores.max())

('allstate', 'xgboost', 5) 0.56880074550301
('parkinsons', 'xgboost', 6) 0.8943154947685432
('powerplant', 'xgboost', 6) 0.9619338164422656
('miami_housing', 'xgboost', 5) 0.9154602004150789
('cpu_act', 'xgboost', 4) 0.9833557963371277
('transaction', 'xgboost', 4) 0.12749623951140837
('ca_housing', 'xgboost', 6) 0.8258611965361391
('insurance', 'xgboost', 4) 0.814687698692056
('mercedes', 'xgboost', 4) 0.5457945775010881
('abalone', 'xgboost', 4) 0.5052047848701477
('airfoil', 'xgboost', 6) 0.9363278907330399
('concrete', 'xgboost', 4) 0.9152923336616308
('qsar', 'xgboost', 4) 0.40006738901138306
