# H2O model interpretation
This notebook needs to be executed with papermill, e.g. on the cluster:
srun --pty -A fc_keenan -p savio -t 00:10:00 papermill 03_h2o_explainability.ipynb 03_h2o_explainability_out.ipynb

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import h2o
from models.h2o import H2o
import os
import glob
from modules.analysis import *
import numpy as np

In [None]:
# Load one saved model per bootstrap replicate and remember where each
# experiment lives on disk.
exp_id = '2022110113578590'
reps = list(range(30))

# h2o.init() is called once up front; every H2o.load below is passed
# init=False.
h2o.init()

models = []
exp_paths = []
for rep in reps:
    # Experiment directories are named "<exp_id>_<rep>".
    exp_path = os.path.join('experiments', f'{exp_id}_{rep}')
    print(exp_path)
    exp_paths.append(exp_path)

    # Only the 'fold_0' sub-directory of each replicate is loaded.
    fold_dir = os.path.join(exp_path, 'fold_0')
    models.append(H2o.load(fold_dir, init=False))

In [None]:
# Build one table listing, for every bootstrap replicate, the base models of
# the loaded model and their relative importance.
#
# Resulting columns of `df_models`:
#   base_model       name of the base model (or of the model itself)
#   importance       metalearner 'percentage' value, or 1 for a single model
#   rep              replicate id taken from `reps`
#   path             first '*_AutoML_*' entry found in the replicate's fold_0
#   base_model_type  text before the first '_' of `base_model`
#   model_type       first two '_'-separated tokens of the path's basename
df_models = []
# loop over the bootstrap models (one per entry in `reps`)
for idx, bootstrap_model in enumerate(models):
    print(idx)

    # Locate the saved AutoML artifact of this replicate. Fail with a clear
    # message instead of an opaque IndexError when nothing matches.
    fold_dir = os.path.join(exp_paths[idx], 'fold_0')
    matches = glob.glob(os.path.join(fold_dir, '*_AutoML_*'))
    if not matches:
        raise FileNotFoundError(f"No '*_AutoML_*' entry found in {fold_dir}")
    path = matches[0]

    if 'base_models' in bootstrap_model.actual_params:
        # The model exposes base models (presumably a stacked ensemble): use
        # the metalearner's variable importance, one row per base model.
        base_model_imp = bootstrap_model.metalearner().varimp(use_pandas=True)
        df_base_models = base_model_imp[['variable', 'percentage']].copy()
        df_base_models.columns = ['base_model', 'importance']
    else:
        # Single model: it is its own only "base model" with full importance.
        df_base_models = pd.DataFrame(
            {'base_model': [os.path.basename(path)], 'importance': [1]}
        )

    df_base_models['rep'] = reps[idx]
    df_base_models['path'] = path
    df_models.append(df_base_models)

df_models = pd.concat(df_models, ignore_index=True)
df_models['base_model_type'] = df_models['base_model'].str.split('_').str[0]
# os.path.basename instead of splitting on '/' keeps this separator-agnostic.
df_models['model_type'] = (
    df_models['path']
    .map(os.path.basename)
    .str.split('_')
    .str[0:2]
    .str.join(' ')
)
df_models

In [None]:
# create saving path
out_path = os.path.join('analysis/benchmark/', exp_id)
# exist_ok=True replaces the separate isdir() check: no error if the
# directory is already there, and no window between check and creation.
os.makedirs(out_path, exist_ok=True)

In [None]:
# Heat map of the number of base models per model family (rows) and
# bootstrap replicate (columns).
family_per_rep = df_models[['rep', 'base_model_type']]
cmat = family_per_rep.groupby('rep').value_counts().unstack(level=0, fill_value=0)
# (disabled) alternative: normalise the counts to percent per bootstrap
#cmat = cmat * 100 / cmat.sum()

# Replace zero counts by NaN (the default fill of `where`), presumably so
# that empty cells are set apart from the colour scale.
cmat = cmat.where(cmat > 0)
n_families, n_reps = cmat.shape

fig = plt.figure(figsize=(10, 5))
ax = plt.axes()

plt.imshow(cmat, cmap=cmap_gpp_1)
plt.yticks(np.arange(n_families), cmat.index)
# Replicate ids are shifted by one for the tick labels.
plt.xticks(np.arange(n_reps), cmat.columns + 1)
plt.xlabel('Bootstrap')
plt.ylabel('Model Family')

# Colour bar size is scaled with the rows/columns ratio of the matrix.
cbar = plt.colorbar(fraction=0.047 * n_families / n_reps)
cbar.set_label('Number of models')

plt.tight_layout()
plt.savefig(os.path.join('analysis/benchmark', exp_id, 'base_model_count.pdf'))

In [None]:
# Stacked bar chart of the summed base-model importance per model family,
# one bar per bootstrap replicate.
model_imp = df_models.assign(rep=df_models['rep'] + 1)  # shift replicate ids by one
model_imp = (
    model_imp
    .groupby(['rep', 'base_model_type'])['importance']
    .sum()
    .reset_index()
    # NOTE(review): the grouped sum has no missing values at this point, so
    # this fillna looks like a no-op; families absent from a bootstrap are
    # simply not present as rows - confirm this is intended.
    .fillna(0)
)

# NOTE(review): `colors` is computed but not passed to the plot below.
colors = plt.cm.Paired(np.linspace(0, 1, 6))

# Wide layout: one row per bootstrap, one column per model family.
importance_wide = (
    model_imp
    .set_index(['rep', 'base_model_type'])
    .unstack('base_model_type')
    .droplevel(0, axis=1)
)
importance_wide.plot.bar(stacked=True, width=1)

plt.legend(bbox_to_anchor=(1, 0.5), loc='center left', ncol=1, title='Model Family')
plt.ylim(0, 1)
plt.xlabel('Bootstrap')
plt.ylabel('Relative Importance')
plt.tight_layout()
plt.savefig(os.path.join('analysis/benchmark', exp_id, 'base_model_importance_bootstraps.pdf'))

In [None]:
# Display the summed importance as a table: one row per bootstrap, one
# column per model family.
(
    model_imp
    .set_index(['rep', 'base_model_type'])
    .unstack('base_model_type')
)

In [None]:
# Box plot of the summed importance of each base-model family across the
# bootstrap replicates.

# create saving path
out_path = os.path.join('analysis/benchmark/', exp_id)
# exist_ok=True replaces the separate isdir() check.
os.makedirs(out_path, exist_ok=True)

# seaborn draws on the current axes; open a fresh figure so the box plot can
# never be painted on top of a figure left over from an earlier cell.
plt.figure()
sns.boxplot(data=model_imp, x='base_model_type', y='importance', color=sns.color_palette()[0])
plt.xlabel('Base Model Type')
plt.ylabel('Relative Importance')
plt.tight_layout()
# Save into the directory created above.
plt.savefig(os.path.join(out_path, 'base_model_importance.pdf'))