In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib
import imodelsx.process_results
import sys
sys.path.append('../experiments/')
results_dir = '../results/02_distill_featurized_model'
experiment_filename = '../experiments/02_distill_featurized_model.py'

# load the results in to a pandas dataframe
r = imodelsx.process_results.get_results_df(results_dir)
cols_varied = imodelsx.process_results.get_experiment_keys(
    r, experiment_filename)
print('experiment varied these params:', cols_varied)
r = imodelsx.process_results.fill_missing_args_with_default(
    r, experiment_filename)

# get the breakdown of data in these groups
r.groupby(cols_varied).size()

2024-06-23 10:53:34.579934: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 488/488 [00:03<00:00, 125.73it/s]


experiment varied these params: ['dataset_name', 'model_name', 'distiller_name', 'featurizer_frac', 'depth', 'bit', 'pre_interaction', 'gpu']


dataset_name  model_name      distiller_name  featurizer_frac  depth  bit  pre_interaction  gpu
abalone       ft_transformer  figs            0.3              2      0    l0l2             2      1
                                                                           l1l2             2      1
                                                                      1    l0l2             2      1
                                                                           l1l2             2      1
                                                               3      0    l0l2             2      1
                                                                                                  ..
powerplant    resnet          ft_distill      0.7              3      1    l0l2             2      1
                                                                           l1l2             2      1
transaction   resnet          figs            0.7              2      0    l1l2             2   

In [15]:
r.columns

Index(['dataset_name', 'subsample_frac', 'seed', 'save_dir', 'model_name',
       'distiller_name', 'featurizer_name', 'featurizer_frac',
       'featurizer_overlap', 'depth', 'bit', 'max_depth', 'max_rules',
       'max_trees', 'pre_interaction', 'pre_max_features', 'post_interaction',
       'post_max_features', 'n_epochs', 'gpu', 'size_interactions',
       'use_cache', 'cat_mappings', 'task_type', 'save_dir_unique',
       'teacher_r2_score_train_true', 'teacher_r2_score_val_true',
       'distiller_r2_score_train_true', 'distiller_r2_score_val_true',
       'distiller_r2_score_train_teacher', 'distiller_r2_score_val_teacher'],
      dtype='object')

In [29]:
pd.pivot_table(r[r['depth']==3], values='teacher_r2_score_val_true', aggfunc='max', index='dataset_name', columns=['model_name']).round(2).dropna()

model_name,ft_transformer,resnet
dataset_name,Unnamed: 1_level_1,Unnamed: 2_level_1
abalone,0.4,0.36
airfoil,0.49,0.48
ca_housing,0.69,0.66
concrete,0.67,0.64
cpu_act,0.97,0.94
insurance,0.75,0.64
miami_housing,0.8,0.81
parkinsons,0.71,0.65
powerplant,0.93,0.92


In [34]:
pd.pivot_table(r, values='distiller_r2_score_val_true', aggfunc='mean', index='dataset_name', columns=['model_name', 'distiller_name']).round(2).dropna().to_csv('distill.csv')

In [38]:
r.to_csv('distill.csv')

In [31]:
pd.pivot_table(r, values='teacher_r2_score_val_true', aggfunc='mean', index='dataset_name', columns=['model_name']).round(2).dropna()

model_name,ft_transformer,resnet
dataset_name,Unnamed: 1_level_1,Unnamed: 2_level_1
abalone,0.35,0.32
airfoil,0.21,0.31
ca_housing,0.62,0.6
concrete,0.56,0.59
cpu_act,0.96,0.91
insurance,0.66,0.58
miami_housing,0.79,0.78
parkinsons,0.63,0.53
powerplant,0.88,0.89


In [69]:
distill = pd.read_csv('distill.csv', index_col=0)
train = pd.read_csv('train.csv',index_col=0)

In [70]:
distill = pd.pivot_table(distill, values='distiller_r2_score_val_true', aggfunc='max', index='dataset_name', columns=['model_name', 'distiller_name']).round(2).dropna()

In [71]:
train = pd.pivot_table(train, values='r2_score_val_true', aggfunc='max', index='dataset_name', columns=['model_name']).round(2).dropna()

In [72]:
distill.columns = ['+'.join(col) for col in distill.columns]

In [73]:
distill.join(train)

Unnamed: 0_level_0,ft_transformer+figs,ft_transformer+ft_distill,resnet+figs,resnet+ft_distill,figs,ft_transformer,random_forest,resnet,rf_plus,xgboost
dataset_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
abalone,0.39,0.37,0.37,0.4,0.34,0.47,0.51,0.46,0.53,0.5
airfoil,0.45,0.52,0.43,0.42,0.84,0.45,0.71,0.42,0.71,0.92
ca_housing,0.68,0.66,0.68,0.67,0.79,0.73,0.64,0.72,0.64,0.82
concrete,0.65,0.61,0.66,0.65,0.91,0.78,0.87,0.69,0.9,0.92
cpu_act,0.97,0.96,0.95,0.95,0.98,0.98,0.97,0.96,0.98,0.98
insurance,0.74,0.78,0.69,0.75,0.8,0.71,0.86,0.65,0.86,0.82
miami_housing,0.77,0.74,0.78,0.75,,,,,,
parkinsons,0.78,0.76,0.77,0.77,0.88,0.83,0.68,0.83,0.69,0.89
powerplant,0.93,0.92,0.92,0.92,0.94,0.94,0.94,0.93,0.94,0.96


In [55]:
data = {
    ('A', 'X1'): [1, 2, 3],
    ('A', 'X2'): [4, 5, 6],
    ('B', 'Y1'): [7, 8, 9],
    ('B', 'Y2'): [10, 11, 12]
}

df = pd.DataFrame(data, index=['i1', 'i2', 'i3'])
df

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,X1,X2,Y1,Y2
i1,1,4,7,10
i2,2,5,8,11
i3,3,6,9,12


In [57]:
# Combine the column levels
df.columns = ['_'.join(col) for col in df.columns]

# Reset index to convert row index to columns
df_reset = df.reset_index()

In [58]:
df_reset

Unnamed: 0,index,A___X_1,A___X_2,B___Y_1,B___Y_2
0,i1,1,4,7,10
1,i2,2,5,8,11
2,i3,3,6,9,12


In [None]:
# Reshape the DataFrame to long format
df_flattened = df_reset.melt(id_vars=['index'], var_name='CombinedColumn', value_name='Value')

# Rename the index column
df_flattened = df_flattened.rename(columns={'index': 'RowIndex'})

print(df_flattened)

In [51]:
pd.melt(distill, id_vars = ['model_name', 'distiller_name'], var_name = 'combinedcol')

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['model_name', 'distiller_name']"

In [4]:
r_feat = r[r['featurizer_name'] == 'featurizer']

In [5]:
temp = r_feat.drop(columns=['n_epochs', 'n_epochs', 'gpu', 'size_interactions', 'use_cache', 'cat_mappings', 'task_type', 'save_dir', 
                     'seed', 'subsample_frac', 'featurizer_overlap', 'save_dir_unique'])

In [6]:
temp = temp.sort_values('r2_score_val_true', ascending = False)

KeyError: 'r2_score_val_true'

In [None]:
temp['order'] = range(len(temp))

In [None]:
temp

In [None]:
temp.groupby('featurizer_frac')['order'].mean()

In [None]:
temp.groupby('bit')['order'].mean()

In [None]:
temp.groupby('depth')['order'].mean()

In [None]:
pd.pivot_table(r_feat, values='r2_score_val_true', aggfunc='max', index='dataset_name', columns=['model_name']).round(2)

In [None]:
r = r[r['featurizer_name'] == 'featurizer']

In [None]:
distill1 = r[(r['featurizer_frac'] == 0.5) & (r['distiller_name'].isin(['figs', 'ft_distill'])) & (r['featurizer_overlap'])]

In [None]:
distill1

In [None]:
pd.pivot_table(distill1[distill1['depth']==2], values='distiller_f_r2_score_test_true', aggfunc='mean', index='dataset_name', columns=['model_name', 'distiller_name']).round(2)

In [None]:
pd.pivot_table(distill1, values='distiller_r2_score_test_true', aggfunc='max', index='dataset_name', columns=['model_name', 'distiller_name']).round(2)

In [None]:
train1 = r[(r['featurizer_frac'] == 0.5) & (r['model_name'].isin(['figs', 'ft_distill'])) & (r['featurizer_overlap'])]
train0 = r[(r['featurizer_frac'] == 0.5) & (r['model_name'].isin(['figs', 'ft_distill'])) & (r['featurizer_overlap']==0)]

In [None]:
pd.pivot_table(train1[train1['depth'] ==2], values='r2_score_test_true', aggfunc='mean', index='dataset_name', columns=['model_name']).round(2)

In [None]:
train1['bit']==0

In [None]:
pd.pivot_table(train1[(train1['depth'] == 2) & (train1['bit']==0)], values='r2_score_test_true', aggfunc='max', index='dataset_name', columns=['model_name']).round(2)

In [None]:
pd.pivot_table(train1[train1['depth'] == 3], values='r2_score_test_true', aggfunc='max', index='dataset_name', columns=['model_name']).round(2)

In [None]:
train1 = train1.dropna(axis = 1)
train0 = train0.dropna(axis = 1)

In [None]:
pd.pivot_table(train1, values='r2_score_test_true', aggfunc='median', index='dataset_name', columns=['model_name']).round(2)

In [None]:
pd.pivot_table(train0, values='r2_score_test_true', aggfunc='mean', index='dataset_name', columns=['model_name']).round(2)

In [None]:
r_01 =r.dropna(subset=['distiller_f_r2_score_train_true'])

In [None]:
r_01

In [None]:
r_01['model_name'].isin(['figs', 'ft_distill'])

In [None]:
r_01[(r_01['featurizer_frac'] == 0.5) & (r_01['model_name'].isin(['figs', 'ft_distill']))]

In [None]:
r_01['featurizer_frac'] == 0.5 & r_01['model_name'] in ['figs', 'ft_distill']

In [None]:
r_train = r_train.dropna(axis = 1)

In [None]:
r_train['r2_score_test_true'] = r_train['r2_score_test_true'].round(2)

In [None]:
pd.pivot_table(r_train, values='r2_score_test_true', aggfunc='median', index='dataset_name', columns=['model_name'])

In [None]:
figs = r[r['distiller_name'] == 'figs']
pd.pivot_table(figs, values='distiller_r2_score_test_teacher', aggfunc='median', index='dataset_name', columns=['distiller_name', 'model_name'])

In [None]:
pd.pivot_table(figs, values='teacher_r2_score_test_true', aggfunc='median', index='dataset_name', columns=['distiller_name', 'model_name'])

In [None]:
ft_distill = r[r['distiller_name'] == 'ft_distill']
pd.pivot_table(ft_distill, values='distiller_r2_score_test_teacher', aggfunc='median', index='dataset_name', columns=['distiller_name', 'model_name'])

In [None]:
pd.pivot_table(ft_distill, values='teacher_r2_score_test_true', aggfunc='median', index='dataset_name', columns=['distiller_name', 'model_name'])

In [None]:
r.sort_values('teacher_r2_score_test_true', ascending=False)

In [None]:
# group using these experiment hyperparams when averaging over random seeds
ravg = imodelsx.process_results.average_over_seeds(
    r, experiment_filename, key_to_average_over='seed'
)

# apply cross validation
ravg_cv = (
    ravg
    .sort_values(by='distiller_r2_score_test_true', ascending=False)
    .groupby(by=['model_name', 'dataset_name'])
    .max()
    .reset_index()
)
ravg_cv

In [None]:
# , hue='dataset_name')
sns.barplot(data=ravg_cv, x='model_name', y='accuracy_test')
plt.show()

Load an individual model

In [None]:
run_args = r.iloc[0]
model = joblib.load(join(run_args.save_dir_unique, 'model.pkl'))
model