[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://colab.research.google.com/github/monte-flora/explain_tutorial/blob/main/src/tutorial_notebooks/Notebook01_Generate_Explanations.ipynb)


## Generate the Explainability Output

In this notebook, we compute the explainability output used in the paper. The methods include:
1. [Accumulated Local Effects (ALE)](https://christophm.github.io/interpretable-ml-book/ale.html)  
2. [SHAP (Shapley Additive Explanations)](https://christophm.github.io/interpretable-ml-book/shap.html)
3. [SAGE (Shapley Additive Global Explanations)](https://iancovert.com/blog/understanding-shap-sage/)


In [None]:
def using_colab():
    """Return True when the notebook runs inside Google Colab, else False.

    Detection relies solely on whether ``google.colab`` can be imported.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True
    
import sys, os 
from glob import glob
if using_colab():
    # When using Google Colab, need to clone the explain_tutorial repo
    # Otherwise, the code assumes you are running these notebooks
    # in their original directory structure. 
    try:
        !git clone https://github.com/monte-flora/explain_tutorial
    except:
        print('explain_tutorial has already been cloned!')
    sys.path.append('explain_tutorial')   
else:
    from os.path import dirname
    path = dirname(dirname(os.getcwd()))
    sys.path.append(path)

from src.io.colab_io import GoogleDriveIO
    
# Download data from Google drive (only relevant when running on Colab)
if using_colab():
    downloader = GoogleDriveIO()

    # Create the local 'datasets' and 'models' directories if they are missing.
    for folder in ('datasets', 'models'):
        if not os.path.exists(folder):
            os.mkdir(folder)

    # These are all the paths to files that were downloaded in
    # Notebook00_Download_Data_and_Models. If you are only interested in a
    # single dataset, you can remove the other entries, which improves
    # download times.
    paths_dict = {
        'lightning dataset': '/content/datasets/lightning_dataset.csv',
        'road_surface dataset': '/content/datasets/road_surface_dataset.csv',
        'severe_wind dataset': '/content/datasets/severe_wind_dataset.csv',
        'lightning model': '/content/models/NN_classification.joblib',
        'road_surface model': '/content/models/JTTI_ProbSR_RandomForest.pkl',
        'severe_wind model': '/content/models/LogisticRegression_wind_severe_0km_None_first_hour_realtime.joblib',
    }

    # Each dictionary key is the title handed to the downloader; the value
    # is the local destination path.
    for title, destination in paths_dict.items():
        downloader.download(title, destination)

#### Install different python packages 

In [1]:
# Necessary packages to load the ML models from pickle
%pip install scikit-explain==0.1.4 sage-importance imblearn daal4py scikit-learn==1.0.2 netCDF4 scikit-learn-intelex bayeshist

#### Import python packages (internal and third party)

In [2]:
import skexplain 
from skexplain.common.importance_utils import to_skexplain_importance
from src.io.io import load_data_and_model
from src.common.util import subsampler, normalize_importance, compute_sage

import pickle
import shap
import itertools
import numpy as np
import multiprocessing as mp

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


#### Setting the user constants (paths, parameters, etc)

In [3]:
# ---- User-tunable constants ----

# Dataset/model pair to explain.
DATASET = 'road_surface'

EVALUATION_FN = 'norm_aupdc'

# ALE settings: number of bootstrap resamples and number of bins.
N_BOOTSTRAP = 10
N_BINS = 30

# Number of parallel jobs. Colab only provides 2 CPUs, so that is the default
# here. If you have access to more CPUs, feel free to increase N_JOBS; the
# closer it is to the number of features, the shorter the runtime.
N_JOBS = 2

# Subsample sizes: GLOBAL_SIZE examples for the global methods and
# LOCAL_SIZE examples (drawn from the global subsample) for the local methods.
GLOBAL_SIZE = 50000
LOCAL_SIZE = 2500

# ---- Paths (all relative to the current working directory) ----
BASE_PATH       = os.getcwd()
DATA_BASE_PATH  = os.path.join(BASE_PATH, 'datasets')
MODEL_BASE_PATH = os.path.join(BASE_PATH, 'models')
RESULTS_PATH    = os.path.join(BASE_PATH, 'results')

# Make sure the output directory exists before any results are written.
if not os.path.exists(RESULTS_PATH):
    os.mkdir(RESULTS_PATH)

In [5]:
# Compute ALE 
def compute_ale(explainer, dataset, est_name, **kwargs): 
    """Compute ALE for all features and save the result as a netCDF file.

    Parameters
    ----------
    explainer : object providing ``ale`` and ``save`` methods.
    dataset : str
        Dataset tag used to build the output file name ``ale_<dataset>.nc``.
    est_name : unused here; accepted so every compute_* function can be
        called with the same positional arguments.
    **kwargs : ignored.
    """
    ale_results = explainer.ale(
        features='all',
        n_bootstrap=N_BOOTSTRAP,
        n_bins=N_BINS,
        n_jobs=N_JOBS,
    )
    out_file = os.path.join(RESULTS_PATH, f'ale_{dataset}.nc')
    explainer.save(out_file, ale_results, encoding=None)


# Compute Shapley Additive Explanations (SHAP)
def compute_shap(explainer, dataset, est_name, **kwargs):
    """Compute SHAP local attributions plus a derived ranking, and save both.

    Parameters
    ----------
    explainer : object providing ``local_attributions`` and ``save`` methods.
    dataset : str
        Dataset tag used in the output file names
        (``shap_<dataset>.nc`` and ``shap_rank_<dataset>.nc``).
    est_name : str
        Estimator name; selects the ``shap_values__<est_name>`` entry of the
        attribution results.
    **kwargs :
        Must contain ``X``, which is passed to the SHAP partition masker and
        supplies the feature names via ``X.columns``.
    """
    background = kwargs['X']
    feature_names = background.columns

    # Partition masker built from the supplied data, combined with the
    # permutation algorithm.
    masker = shap.maskers.Partition(background, max_samples=50,
                                    clustering="correlation")
    shap_kws = {'masker': masker, 'algorithm': 'permutation'}
    results = explainer.local_attributions('shap', shap_kws=shap_kws)

    # Convert the per-example SHAP values into the skexplain importance
    # format using the 'shap_sum' method (no normalization).
    shap_values = results[f'shap_values__{est_name}'].values
    shap_rank = to_skexplain_importance(
        shap_values,
        estimator_name=est_name,
        feature_names=feature_names,
        method='shap_sum',
        normalize=False,
    )

    # Save the raw attributions and the ranking to separate files.
    attributions_file = os.path.join(RESULTS_PATH, f'shap_{dataset}.nc')
    ranking_file = os.path.join(RESULTS_PATH, f'shap_rank_{dataset}.nc')
    explainer.save(attributions_file, results, encoding=None)
    explainer.save(ranking_file, shap_rank, encoding=None)

# Compute SAGE
def compute_sage_(explainer, dataset, est_name, **kwargs):
    """Compute SAGE values for the named estimator and save the ranking.

    Parameters
    ----------
    explainer : object exposing ``estimators``, ``X``, ``y`` and ``save``.
    dataset : str
        Dataset tag used to build the output file name ``sage_<dataset>.nc``.
    est_name : str
        Key into ``explainer.estimators``.
    **kwargs :
        Must contain ``X``; it is forwarded to ``compute_sage`` as the fourth
        positional argument and supplies the feature names via ``X.columns``.
    """
    estimator = explainer.estimators[est_name]
    X_explain, y_explain = explainer.X, explainer.y

    X_orig = kwargs['X']
    feature_names = X_orig.columns

    sage_values = compute_sage(estimator, X_explain.values, y_explain, X_orig,
                               n_jobs=N_JOBS)

    # Convert to the skexplain importance format (no normalization).
    sage_rank = to_skexplain_importance(
        sage_values,
        estimator_name=est_name,
        feature_names=feature_names,
        method='sage',
        normalize=False,
    )

    out_file = os.path.join(RESULTS_PATH, f'sage_{dataset}.nc')
    explainer.save(out_file, sage_rank, encoding=None)


# Compute Grouped SAGE
def compute_group_sage(explainer, dataset,  est_name, **kwargs):
    """Compute SAGE values for groups of features and save the ranking.

    Parameters
    ----------
    explainer : object exposing ``estimators``, ``X``, ``y`` and ``save``.
    dataset : str
        Dataset tag used to build the output file name
        ``grouped_sage_<dataset>.nc``.
    est_name : str
        Key into ``explainer.estimators``.
    **kwargs :
        Must contain ``groups`` (mapping of group name -> iterable of feature
        names found in ``explainer.X.columns``) and ``X`` (forwarded to
        ``compute_sage`` as the fourth positional argument).

    Raises
    ------
    ValueError
        If a grouped feature name is not a column of ``explainer.X``.
    """
    X = explainer.X
    feature_groups = kwargs['groups']

    # Translate each group's feature names into column positions; the group
    # names themselves serve as the "feature" labels of the saved ranking.
    column_order = list(X.columns)
    group_indices = [
        [column_order.index(member) for member in members]
        for _, members in feature_groups.items()
    ]
    group_names = [name for name, _ in feature_groups.items()]

    estimator = explainer.estimators[est_name]
    y = explainer.y
    X_orig = kwargs['X']

    # NOTE(review): unlike compute_sage_, n_jobs is not forwarded here --
    # confirm whether that is intentional.
    sage_values = compute_sage(estimator, X.values, y, X_orig,
                               groups=group_indices)

    # Convert to the skexplain importance format (no normalization).
    sage_rank = to_skexplain_importance(
        sage_values,
        estimator_name=est_name,
        feature_names=group_names,
        method='sage',
        normalize=False,
    )

    out_file = os.path.join(RESULTS_PATH, f'grouped_sage_{dataset}.nc')
    explainer.save(out_file, sage_rank, encoding=None)
    
# Methods grouped by the explainer (and therefore the subsample) they run on.
global_methods = [compute_ale, compute_sage_, compute_group_sage]
local_methods = [compute_shap]

# Load model and data.
model, X, y, groups = load_data_and_model(
    DATASET, DATA_BASE_PATH, MODEL_BASE_PATH, return_groups=True
)
est_name = model[0]

# Global methods use a GLOBAL_SIZE subsample of the full dataset.
X_sub, y_sub = subsampler(X, y, GLOBAL_SIZE)
global_explainer = skexplain.ExplainToolkit(model, X_sub, y_sub)

# Local methods use a LOCAL_SIZE subsample drawn from the global subsample.
X_local, y_local = subsampler(X_sub, y_sub, LOCAL_SIZE)
local_explainer = skexplain.ExplainToolkit(model, X_local, y_local)

# Run the local methods first, then the global ones. Each stage pairs an
# explainer with its methods and the extra keyword arguments they receive.
stages = (
    (local_explainer, local_methods, dict(X=X)),
    (global_explainer, global_methods, dict(X=X, model=model, groups=groups)),
)
for explainer, methods, extra_kwargs in stages:
    for method in methods:
        print(method)
        method(explainer, DATASET, est_name, **extra_kwargs)

<function compute_sage_ at 0x7fe5bcb04b80>
PermutationEstimator will use 4 jobs


  0%|          | 0/1 [00:00<?, ?it/s]

<function compute_group_sage at 0x7fe5c181e310>


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Upload the explainability results to Google drive for Notebook02_Explainability_Tutorial.
if using_colab():
    uploader = GoogleDriveIO()
    results_paths = glob('results/*')

    for path in results_paths:
        # The upload title is the file's base name with '.nc' removed.
        drive_title = os.path.basename(path).replace('.nc', '')
        print(f'Uploading {path} dataset to Google Drive...')
        uploader.upload(path, title=drive_title)
