## Compute the Global Feature Rankings 

In this notebook, we compute the global feature rankings. The methods include:
1. ALE variance (PAPER)
2. Backward Single-pass Permutation Importance 
3. Forward Single-pass Permutation Importance 
4. Backward Multi-pass Permutation Importance  
5. Forward Multi-pass Permutation Importance 
6. Random Forest Gini Impurity 
7. Logistic Regression Coefficients 
8. Summed SHAP values 
9. SAGE values 

In [1]:
import sys, os 
from os.path import dirname
# Put the directory two levels above the current working directory at the
# front of the import path so the project-local `src` package can be imported.
working_dir = os.getcwd()
path = dirname(dirname(working_dir))
sys.path.insert(0, path)

In [2]:
import pickle

import numpy as np
import skexplain
from skexplain.common.importance_utils import to_skexplain_importance

from src.io.io import load_data_and_model
from src.common.util import subsampler

In [3]:
# Constants.
N_BOOTSTRAP = 1                 # n_bootstrap passed to explainer.ale / explainer.pd
N_BINS = 20                     # n_bins passed to explainer.ale / explainer.pd
N_JOBS = 30                     # n_jobs passed to explainer.permutation_importance
N_PERMUTE = 5                   # n_permute passed to explainer.permutation_importance
SIZE = 5000                     # size argument handed to subsampler
evaluation_fn = 'norm_aupdc'    # evaluation_fn passed to explainer.permutation_importance

# Results are written under <path>/results, where `path` is set in the first cell.
RESULTS_PATH = os.path.join(path, 'results')

# Root of the datasets/models tree. The default is the original absolute
# location; set EXPLAINABILITY_BASE_PATH to run the notebook elsewhere.
BASE_PATH = os.environ.get('EXPLAINABILITY_BASE_PATH', '/work/mflora/explainability_work/')
DATA_BASE_PATH = os.path.join(BASE_PATH, 'datasets')
MODEL_BASE_PATH = os.path.join(BASE_PATH, 'models')

In [4]:
# Load the Data and Model
# `dataset` and `option` are reused later to build the output file names.
dataset, option = 'tornado', 'reduced'
model, X, y = load_data_and_model(
    dataset, option, DATA_BASE_PATH, MODEL_BASE_PATH
)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
# Subsample the dataset; SIZE is forwarded to subsampler
# (presumably the number of examples to keep -- confirm against src.common.util).
X_sub, y_sub = subsampler(X, y, SIZE)

### 1. ALE variance 

In [7]:
# Look the method up on the class: no ExplainToolkit instance has been
# created at this point in a top-to-bottom run of the notebook.
help(skexplain.ExplainToolkit.save)

Help on method save in module skexplain.main.explain_toolkit:

save(fname, data) method of skexplain.main.explain_toolkit.ExplainToolkit instance
    Save results of a computation (permutation importance, calc_ale, calc_pd, etc)
    
    Parameters
    ----------
    fname : string
        filename to store the results in (including path)
    data : ExplainToolkit results
        the results of a ExplainToolkit calculation. Can be a dataframe or dataset.
    
    Examples
    -------
    >>> import skexplain
    >>> estimators = skexplain.load_models() # pre-fit estimators within skexplain
    >>> X, y = skexplain.load_data() # training data
    >>> explainer = skexplain.ExplainToolkit(estimators=estimators
    ...                             X=X,
    ...                             y=y,
    ...                            )
    >>> perm_imp_results = explainer.calc_permutation_importance(
    ...                       n_vars=10,
    ...                       evaluation_fn = 'norm_aupdc

In [6]:
# Compute ALE for every feature on the subsampled data.
explainer = skexplain.ExplainToolkit(model, X_sub, y_sub)
ale = explainer.ale(features='all', n_bootstrap=N_BOOTSTRAP, n_bins=N_BINS)

# Compute the variance of the ALE results.
ale_var = explainer.ale_variance(ale)

# Convert to feature rankings; model[0] is used both in the variable key
# and as the second argument of to_skexplain_importance.
variance_scores = ale_var[f'ale_variance_scores__{model[0]}'].values
ale_rank = to_skexplain_importance(
    variance_scores,
    model[0],
    list(X.columns),
    method='ale_variance',
    normalize=True,
)

# Save the raw ALE and ALE-variance rankings results for paper 1.
# ExplainToolkit.save takes the file name first and the data second.
for fname, data in (
    (f'ale_{dataset}_{option}.nc', ale),
    (f'ale_rank_{dataset}_{option}.nc', ale_rank),
):
    explainer.save(os.path.join(RESULTS_PATH, fname), data)

  0%|          | 0/14 [00:00<?, ?it/s]

TypeError: data is not a pandas.DataFrame or xarray.Dataset. The type is <class 'str'>.

In [None]:
# Compute the permutation importance (forward, backward, single-pass, multi-pass)
# NOTE: relies on `explainer` created in the ALE cell above.
DIRECTIONS = ['forward', 'backward']
n_vars = len(X.columns)

for direction in DIRECTIONS:
    results = explainer.permutation_importance(
        n_vars=n_vars,
        evaluation_fn=evaluation_fn,
        n_permute=N_PERMUTE,
        n_jobs=N_JOBS,
        verbose=True,
        random_seed=42,
        direction=direction,
    )
    # Save the results inside the loop: `results` is rebound on every
    # iteration, so each direction needs its own output file.
    explainer.save(
        os.path.join(RESULTS_PATH, f'perm_imp_{direction}_{dataset}_{option}.nc'),
        results,
    )


In [None]:
# Compute PD for every feature on the subsampled data.
explainer = skexplain.ExplainToolkit(model, X_sub, y_sub)
# Named `pd_results` rather than `pd` so the conventional pandas alias is not shadowed.
pd_results = explainer.pd(features='all', n_bootstrap=N_BOOTSTRAP, n_bins=N_BINS)

# Save the results. ExplainToolkit.save expects (fname, data) in that order.
explainer.save(os.path.join(RESULTS_PATH, f'pd_{dataset}_{option}.nc'), pd_results)

# TODO: the follow-up computation on the PD results is not implemented yet.

In [None]:
# Model-specific
# TODO: `rf` and `lr` are not bound in any visible cell; assign the fitted
# random forest and logistic regression estimators before running this cell.
rf.feature_importances_
lr.coef_  # scikit-learn linear models expose `coef_` (there is no `coefs_`)

# TODO: convert both sets of scores with the helper imported at the top.
to_skexplain_importance

In [None]:
# Compute SHAP (Approx. Owen Values)
# TODO: check whether each SHAP example can be run in parallel.
# NOTE(review): `shap` is not imported in any visible cell; it must be
# imported before this cell can run.
partition_masker = shap.maskers.Partition(X, max_samples=100,
                                          clustering="correlation")
results = explainer.shap(shap_kwargs={'masker': partition_masker,
                                      'algorithm': 'permutation'})

# TODO: sum the SHAP values for each feature before saving.

# Write next to the other outputs and tag the file with dataset/option so
# runs for different datasets do not overwrite each other.
explainer.save(os.path.join(RESULTS_PATH, f'shap_values_{dataset}_{option}.nc'),
               data=results)

In [None]:
# Compute SAGE
def compute_sage(model, X, y, background):
    """Estimate SAGE values for ``model`` on the examples ``(X, y)``.

    Up to 100 rows of ``background`` are drawn without replacement and given
    to ``sage.MarginalImputer`` together with ``model.predict_proba``. The
    imputer is wrapped in a ``sage.PermutationEstimator`` with the
    ``'cross entropy'`` loss, which is then called on ``X`` and ``y``.

    Parameters
    ----------
    model : object
        Estimator exposing ``predict_proba``.
    X : array-like
        Examples passed to the SAGE estimator.
    y : array-like
        Targets passed to the SAGE estimator.
    background : pandas.DataFrame or numpy.ndarray
        Two-dimensional data the background rows are drawn from. Objects with
        a ``.values`` attribute are converted through it; anything else is
        indexed directly.

    Returns
    -------
    The object returned by calling the ``sage.PermutationEstimator``.

    Notes
    -----
    ``sage`` is not imported in this block; it must already be bound in the
    notebook namespace.
    """
    # Draw from the seeded generator itself (not the global numpy RNG) so
    # repeated calls select the same background rows.
    random_state = np.random.RandomState(42)
    n_rows = len(background)
    # Cap the sample size: sampling without replacement cannot return more
    # rows than the background holds.
    random_inds = random_state.choice(n_rows, size=min(100, n_rows), replace=False)

    # DataFrames expose their array through `.values`; arrays are used as-is.
    background_values = getattr(background, 'values', background)
    X_rand = background_values[random_inds, :]

    # Set up the imputer that handles missing features.
    imputer = sage.MarginalImputer(model.predict_proba, X_rand)

    # Set up the estimator and evaluate it.
    estimator = sage.PermutationEstimator(imputer, 'cross entropy')
    return estimator(X, y)

# Calculate SAGE values
# These statements sit at cell level (no indentation) so they actually run
# instead of being parsed as part of the function defined above.
# SIZE is passed to subsampler to match the earlier subsampling cell.
X_sage, y_sage = subsampler(X, y, SIZE)
# NOTE(review): other cells index `model[0]` as the estimator name, while
# compute_sage calls `model.predict_proba`; confirm whether `model[1]` should
# be passed here instead.
sage_values = compute_sage(model, X_sage.values, y_sage, X)

# Store the SAGE values next to the other results, tagged by dataset/option.
out_file = os.path.join(RESULTS_PATH, f'sage_{dataset}_{option}.pkl')
with open(out_file, 'wb') as f:
    pickle.dump(sage_values, f)
