# Configuration
Notebook parameters for directory locations and for enabling execution in Google Colab



In [1]:
# Parameters
# Switch between running on Google Colab (True) and on the local machine (False)
ENABLE_COLAB = False

# Name of the project folder that lives underneath the root directory
PROJECT_NAME = 'ML1030'

# Colab layout: Google Drive mount point -> notebooks root -> shared utility files
GOOGLE_DRIVE_MOUNT = '/content/gdrive'
COLAB_ROOT_DIR = f'{GOOGLE_DRIVE_MOUNT}/MyDrive/Colab Notebooks'
COLAB_UTILITY_DIR = f'{COLAB_ROOT_DIR}/utility_files'

# Local layout: root machine-learning directory (projects appear underneath)
# NOTE(review): machine-specific absolute path; adjust when running elsewhere
LOCAL_ROOT_DIR = '/home/magni/ML_Root/project_root'
LOCAL_UTILITY_DIR = f'{LOCAL_ROOT_DIR}/pipeline'

# TODO: Add in more utility directories to include in path

# Bootstrap Environment
<p>Initialize environment with the above configuration.<br>
Mount Google drive for access if needed<br>
Set currently active directory to PROJECT_DIR</p>

Sets variables:<br>
UTILITY_DIR = location of custom files for import<br>
PROJECT_DIR = location of project files (ipynb/py/...)<br>
ROOT_DIR = parent directory of PROJECT_DIR (one level up)<br>


In [2]:
import sys
import os

# Pick the directory layout for the active environment (Colab vs. local machine).
if ENABLE_COLAB:
  # Mount Google Drive so the Colab runtime can reach the project files
  from google.colab import drive
  drive.mount(GOOGLE_DRIVE_MOUNT, force_remount=True)
  UTILITY_DIR = COLAB_UTILITY_DIR
  ROOT_DIR = COLAB_ROOT_DIR
else:
  UTILITY_DIR = LOCAL_UTILITY_DIR
  ROOT_DIR = LOCAL_ROOT_DIR

# Set PROJECT_DIR for easy access
# NOTE(review): the section text says the working directory is switched to
# PROJECT_DIR, but no os.chdir() happens in this cell - confirm which is intended.
PROJECT_DIR = ROOT_DIR + '/' + PROJECT_NAME

# Add the UTILITY_DIR to the path to import files easier.
# Only append when missing so that re-running this cell does not keep
# growing sys.path with duplicate entries.
_utility_path = os.path.abspath(UTILITY_DIR)
if _utility_path not in sys.path:
  sys.path.append(_utility_path)

# Setup Models (Load trained models)

In [3]:
import pickle
import gzip
import importlib

In [4]:
# Load two trained models from previous experiments.
# NOTE: unpickling needs the module that defines the pickled experiment class to
# be importable. The recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'DataExperiment'", so make sure that
# module can be found on sys.path before running this cell.
# SECURITY: pickle.load can execute arbitrary code contained in the file - only
# load experiment archives from a trusted source.
# The `with` blocks guarantee the gzip handles are closed, even if unpickling fails.
with gzip.open(f'{ROOT_DIR}/data/{PROJECT_NAME}/05_experiments/01_ML1030_XGB_TFIDF2.jexp.gz', 'rb') as xgb_file:
    myExpXGB = pickle.load(xgb_file)
with gzip.open(f'{ROOT_DIR}/data/{PROJECT_NAME}/05_experiments/01_ML1030_RF_TFIDF2.jexp.gz', 'rb') as rf_file:
    myExpRF = pickle.load(rf_file)

ModuleNotFoundError: No module named 'DataExperiment'

In [None]:
import numpy as np
# Sanity check: both experiments should have been trained on the same final feature list.
print(f'myExpXGB final feature length: {len(myExpXGB.finalFeatures)}')
print(f'myExpRF final feature length: {len(myExpRF.finalFeatures)}')

a = np.array(myExpXGB.finalFeatures)
b = np.array(myExpRF.finalFeatures)
# np.array_equal returns a plain True/False and also copes with lists of
# different length (an elementwise `a == b` cannot be broadcast in that case).
print(np.array_equal(a, b)) # are the arrays completely identical TRUE/FALSE

# Setup Data (Load existing data)

In [None]:
# Get the training features (X) from the previously trained XGB experiment,
# restricted to that experiment's final feature list
xData = myExpXGB.dataPackage.getXTrainData(finalFeatures=myExpXGB.finalFeatures)
print(type(xData))
# Last expression of the cell: preview of the data via .head()
xData.head()

In [None]:
# Get the training targets (yData) from the previously trained XGB experiment
yData = myExpXGB.dataPackage.getYTrainData()
print(type(yData))
# Last expression of the cell: preview of the data via .head()
yData.head()

# Model Manager (Class)
Allows us to load models for automating access

Functions:<br>
Load Models<br>
List Models<br>
Add Models<br>
Remove Models<br>



In [None]:
import ModelManager

In [None]:
# Re-import the module so edits to its source are picked up without restarting the kernel
importlib.reload(ModelManager)

In [None]:
# Seed the Model Manager with the pre-trained XGBoost model loaded earlier
myMM = ModelManager.ModelManager(
    model=myExpXGB.getFinalModel(),
    description="XGBoost model description",
)

# Register the Random Forest model as a second entry
myMM.add_model(
    model=myExpRF.getFinalModel(),
    description="Random Forest model description",
)

In [None]:
# List the models held by the manager (at this point: the XGBoost and Random Forest models added above)
myMM.list_models()

In [None]:
# Summary of the models loaded in the manager.
# Note the SHAP_VALUE calculated line in the output: SHAP values are expensive to
# calculate, so they are kept and reused for each chart/call.
# Other assets are to be loaded as development continues (e.g. explainers).
myMM.summary()

In [None]:
# Add in a duplicate of the Random Forest model to test the remove function,
# then show the summary again to see the manager state after the addition
myMM.add_model(model=myExpRF.getFinalModel(), 
               description="Random Forest model (duplicate)")
myMM.summary()

In [None]:
# NOTE(review): only three models have been added so far (XGB, RF, RF duplicate).
# Later cells treat the index as 0-based (remove_model(1) is described as removing
# the Random Forest, leaving XGB at model_list[0]), so index 3 looks out of range -
# presumably this exercises the invalid-index path. Confirm against ModelManager.
myMM.remove_model(3)

In [None]:
# Remove the duplicate Random Forest added above (third entry, assuming 0-based indices - TODO confirm)
myMM.remove_model(2)

# Data Manager
Simple wrapper for access and usage of data. Currently only stores xTrain and yTrain data.<br>
Needs to expand to match data storage format of project:<br>
1. test
2. train
3. val

In [None]:
import DataManager

In [None]:
# Re-import the module so edits to its source are picked up without restarting the kernel
importlib.reload(DataManager)

In [None]:
# Wrap the training features/targets loaded above in a DataManager.
# Currently only storing one set of xData/yData. Not full train/val/test...
# May need to alter for data storage/privacy reasons with clinical dataset

myDM = DataManager.DataManager(xData=xData,
                              yData=yData)

In [None]:
# Show a summary of the data held by the DataManager
myDM.summary()

# Analysis Manager (Core)
Wrapper class for ModelManager and DataManager<br>
Provides access to interpretability components (SHAP, LIME, ...)<br>
Provides persistence and loading of assets to save recalculation time<br>

In [None]:
import AnalysisManager

In [None]:
# Re-import the module so edits to its source are picked up without restarting the kernel
importlib.reload(AnalysisManager)

In [None]:
# Combine the DataManager and ModelManager built above into one AnalysisManager.
# AnalysisManager will add a ".gz" extension to filename so that it is zipped by default
myAnalysis = AnalysisManager.AnalysisManager(filename='analysisManagerTest',
                                             data_manager=myDM,
                                             model_manager=myMM)

In [None]:
# Show a summary of the AnalysisManager (wraps the data and model managers passed in above)
myAnalysis.summary()

In [None]:
# Generating SHAP values can be expensive, so the analysis object can be saved and reloaded.
# Later cells load the saved file from PROJECT_DIR + '/analysisManagerTest.gz'.
myAnalysis.save()

In [None]:
# Drop the in-memory object to prove that the next cell really restores it from disk.
del myAnalysis
try:
    myAnalysis.summary()
except NameError as err:
    # Expected: the name no longer exists. Catching the error shows the result
    # while still letting "Restart & Run All" continue past this cell.
    print(f'myAnalysis has been deleted: {err}')

In [None]:
# Restore the saved analysis from disk (filename given at creation plus the ".gz" extension)
recovered_object = AnalysisManager.AnalysisManager.load(PROJECT_DIR + '/analysisManagerTest.gz')
recovered_object.summary()

# Analysis Manager (SHAP)


In [None]:
# Import again so this section can be run on its own, then reload to pick up source edits
import AnalysisManager
importlib.reload(AnalysisManager)

In [None]:
# Reload recovered_object from disk in case it was modified in an earlier scenario
recovered_object = AnalysisManager.AnalysisManager.load(PROJECT_DIR + '/analysisManagerTest.gz')

In [None]:
# Removing the Random Forest (index 1) as it currently crashes when running SHAP charts.
recovered_object.model_manager.remove_model(1)

In [None]:
# Add in a second model for display purposes: the first entry's model is
# registered again under a different description
recovered_object.model_manager.add_model(model=recovered_object.model_manager.model_list[0].model,
                                         description='XGB duplicate model')

### Calc SHAP values

In [None]:
%%time
# Do some prep work to create all the "shap_values" required for the models.
# Calculation intensive, so the values are stored for later usage (%%time reports the cost).
# override=True means recalculate a value even if present; otherwise only missing values are calculated.
# GPU=False keeps the calculation on the CPU path; debug=True enables debug output.
recovered_object.calc_shap_values(GPU=False,
                                  override=False,
                                  debug=True)


In [None]:
# SHAP summary chart, 'bar' plot type
recovered_object.show_shap_summary(plot_type='bar')

In [None]:
# SHAP summary chart, 'dot' plot type
recovered_object.show_shap_summary(plot_type='dot')

In [None]:
# SHAP bar chart with default arguments
recovered_object.show_shap_bar()

In [None]:
# SHAP beeswarm chart with default arguments
recovered_object.show_shap_beeswarm()

In [None]:
# SHAP waterfall chart with default arguments
recovered_object.show_shap_waterfall()

In [None]:
# SHAP waterfall chart with value_index=1
# (presumably selects which sample/row is explained - confirm in AnalysisManager)
recovered_object.show_shap_waterfall(value_index=1)

# Analysis Manager (LIME)

In [None]:
# Reload recovered_object from disk in case it was modified in an earlier scenario
recovered_object = AnalysisManager.AnalysisManager.load(PROJECT_DIR + '/analysisManagerTest.gz')

In [None]:
# LIME-based global view provided by AnalysisManager
recovered_object.show_lime_global()

# Scratchpad - AnalysisManager SHAP (XGB)

In [None]:
# Scratchpad works with shap directly, on a fresh copy of the saved analysis
import shap
recovered_objectXGB = AnalysisManager.AnalysisManager.load(PROJECT_DIR + '/analysisManagerTest.gz')
recovered_objectXGB.summary()

In [None]:
# Remove the Random Forest model (index 1) so only XGB is left, at model_list[0]
recovered_objectXGB.model_manager.remove_model(1)

In [None]:
# Pull the remaining (XGB) model entry, its underlying model and a private copy
# of the feature data out of the recovered analysis object
modelStoreXGB = recovered_objectXGB.model_manager.model_list[0]
modelXGB = modelStoreXGB.model
xDataXGB = recovered_objectXGB.data_manager.xData.copy()

# Quick inspection of what was extracted
print('xDataXGB length = {}'.format(len(xDataXGB)))
print('modelStoreXGB type: {}'.format(type(modelStoreXGB)))
print('modelXGB:', modelXGB, sep='\n')

In [None]:
def local_calc_shap_value(modelStore,
                          xData,
                          GPU=False,
                          debug=False):
    """Compute SHAP values for a single stored model (scratchpad helper).

    Args:
        modelStore: Object exposing ``description`` (used in the progress
            message) and ``model`` (passed to ``shap.Explainer``).
        xData: Feature data handed to the explainer.
        GPU: Reserved for a future GPU explainer; not implemented yet.
        debug: When True, print the explainer/SHAP value types and the model.

    Returns:
        The object returned by calling the explainer on ``xData``.

    Raises:
        NotImplementedError: If ``GPU`` is True.
    """
    print (f'Calculating shap_values for {modelStore.description}')
    if GPU:
        # The GPU path (candidates tried so far: shap.explainers.GPUTree and
        # shap.Explainer with background data) is not wired up yet. Fail with an
        # explicit error: previously this branch only printed a message and the
        # function then crashed with UnboundLocalError, because neither
        # `explainer` nor `shap_values` had been assigned.
        raise NotImplementedError('STOP: Do not use GPU=True yet')

    if debug:
        print(f'DEBUG: non-gpu path')
    explainer = shap.Explainer(modelStore.model)
    shap_values = explainer(xData)

    if debug:
        print(f'DEBUG: shap_value type: {type(shap_values)}')
        print(f'DEBUG: explainer type: {type(explainer)}')
        print(f'DEBUG: modelStore.model:')
        print(modelStore.model)

    # The values are deliberately not stored back on modelStore here
    # (modelStore.set_shap_values is left to the caller).
    return shap_values

In [None]:
# Run the local helper on the XGB model entry (CPU path, with debug output)
localShap = local_calc_shap_value(modelStore=modelStoreXGB,
                                  xData=xDataXGB,
                                  GPU=False,
                                  debug=True)

In [None]:
# Variant 1: generic shap.Explainer built from the model, evaluated on the XGB feature data
baseExplainerXGB = shap.Explainer(modelXGB)
base_shap_valuesXGB = baseExplainerXGB(xDataXGB)

In [None]:
# Variant 2: explicit shap.TreeExplainer, for comparison with the generic explainer
explainerXGB = shap.TreeExplainer(modelXGB)
shap_valuesXGB = explainerXGB(xDataXGB)

In [None]:
# Compare the object types produced by the two explainer variants
print(f'baseExplainerXGB: {type(baseExplainerXGB)}')
print(f'base_shap_valuesXGB: {type(base_shap_valuesXGB)}')

print(f'explainerXGB: {type(explainerXGB)}')
print(f'shap_valuesXGB: {type(shap_valuesXGB)}')
# (optional) dump the raw values: print(shap_valuesXGB)

In [None]:
# Summary plot (bar type) of the TreeExplainer SHAP values for XGB
shap.summary_plot(shap_valuesXGB, xDataXGB, plot_type="bar")

In [None]:
# Print the shape of the XGB SHAP values (compared with the Random Forest shape
# further down), then draw the bar plot
print(base_shap_valuesXGB.shape)
shap.plots.bar(base_shap_valuesXGB) 

In [None]:
# Waterfall plot for a single entry of the XGB SHAP values (index 1)
shap.plots.waterfall(shap_valuesXGB[1])

In [None]:
# force break for using run all/run below
# An explicit raise (instead of `assert`) still stops execution when Python runs
# with assertions disabled (-O / PYTHONOPTIMIZE) and states why it stopped.
raise RuntimeError('Intentional stop: the cells below this point are meant to be run manually')

# Scratchpad - AnalysisManager SHAP (Random Forest)

In [None]:
# Scratchpad works with shap directly, on a fresh copy of the saved analysis
import shap
recovered_objectRF = AnalysisManager.AnalysisManager.load(PROJECT_DIR + '/analysisManagerTest.gz')
recovered_objectRF.summary()

In [None]:
# Remove the XGBoost model (index 0) so only Random Forest is left, at model_list[0]
recovered_objectRF.model_manager.remove_model(0)

In [None]:
# Pull the remaining (Random Forest) model and a private copy of the feature
# data out of the recovered analysis object
modelRF = recovered_objectRF.model_manager.model_list[0].model
xDataRF = recovered_objectRF.data_manager.xData.copy()

# Quick inspection of what was extracted
print('xDataRF length = {}'.format(len(xDataRF)))
print('modelRF:', modelRF, sep='\n')

In [None]:
# Disabled on purpose: the GPU tree explainer crashes on the Random Forest model
# (cause not yet known). Kept for reference:
#explainerGPURF = shap.explainers.GPUTree(modelRF)
#shap_valuesGPURF = explainerGPURF.shap_values(xDataRF)

In [None]:
%%time
# Variant 1: generic shap.Explainer built from the model, evaluated on the RF feature data (timed)
baseExplainerRF = shap.Explainer(modelRF)
base_shap_valuesRF = baseExplainerRF(xDataRF)

In [None]:
%%time
# Variant 2: explicit shap.TreeExplainer, for comparison with the generic explainer (timed)
explainerRF = shap.TreeExplainer(modelRF)
shap_valuesRF = explainerRF(xDataRF)

In [None]:
# Compare the object types produced by the two explainer variants
print(f'baseExplainerRF: {type(baseExplainerRF)}')
print(f'base_shap_valuesRF: {type(base_shap_valuesRF)}')

print(f'explainerRF: {type(explainerRF)}')
print(f'shap_valuesRF: {type(shap_valuesRF)}')
# (optional) dump the raw values: print(shap_valuesRF)

In [None]:
# Shape of the RF SHAP values; the plots below index them with [:, :, 0], i.e. as three-dimensional
print(base_shap_valuesRF.shape)

In [None]:
# Summary plot (bar type) using index 0 of the third axis
# (presumably the model output/class axis - TODO confirm)
shap.summary_plot(base_shap_valuesRF[:, :, 0], xDataRF, plot_type="bar")

In [None]:
# Beeswarm plot for the same [:, :, 0] slice
shap.plots.beeswarm(base_shap_valuesRF[:, :, 0])

In [None]:
# Bar plot for the same [:, :, 0] slice
shap.plots.bar(base_shap_valuesRF[:, :, 0])

In [None]:
# Waterfall plot for a single entry (index 1) of the RF SHAP values.
# shap.plots.waterfall takes one single-row Explanation object, so take the same
# [:, :, 0] slice used by the other RF plots and then pick row 1 (mirrors the
# XGB call shap.plots.waterfall(shap_valuesXGB[1])).
shap.plots.waterfall(base_shap_valuesRF[:, :, 0][1])

# Scratchpad
Discrepancies in SHAP output per model

In [None]:
# SHAP value comparison between Random Forest and XGB models
# Trained on identical data
# Needs base_shap_valuesXGB and base_shap_valuesRF from both scratchpad sections above

print(f'XGBoost SHAP      : {base_shap_valuesXGB.shape}')
print(f'Random Forest SHAP: {base_shap_valuesRF.shape}')

In [None]:
# SHAP function calls differ between models (e.g. RF/XGB): the Random Forest
# values are sliced with [:, :, 0] before plotting
# Also different for Neural Networks
shap.plots.bar(base_shap_valuesRF[:, :, 0])

In [None]:
# The XGB values are passed to the same plot function without slicing
shap.plots.bar(base_shap_valuesXGB)