In [226]:
%pip install pandas numpy scikit-learn ucimlrepo lime

Note: you may need to restart the kernel to use updated packages.




In [458]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.svm import SVC

from ucimlrepo import fetch_ucirepo 
import lime
from lime import lime_tabular

import pickle as pkl

from bayes_opt import BayesianOptimization
from tqdm import tqdm
import matplotlib.pyplot as plt

from scipy.stats import uniform
from scipy.special import softmax
import shap

MODEL_NAME = 'Wine'

MODEL_FUNCTION = DecisionTreeClassifier
model_params = {}
coding = {"Wine": 109}


In [228]:
# fetch dataset 
split_size = 0.3
seed = 1234567

dataset_id = coding[MODEL_NAME]

wine = fetch_ucirepo(id=109) 

In [229]:
# data (as pandas dataframes) 
X = wine.data.features
y = wine.data.targets.to_numpy()
y = y.reshape((len(y), ))
    

n_classes = len(np.unique(y))
n_feats = X.shape[1]
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = split_size, random_state = seed)

normalizer = Normalizer().fit(x_train)
encoder = LabelEncoder().fit(y_train)

x_train = normalizer.transform(x_train)
x_test = normalizer.transform(x_test)
y_train = encoder.transform(y_train)
y_test  = encoder.transform(y_test)

In [230]:
def evaluate_model(model, samples, targets):
    pred = model.predict(samples)
    return accuracy_score(targets, pred)

In [305]:
model = MODEL_FUNCTION(*model_params)
model.fit(x_train, y_train)

acc_train = evaluate_model(model, x_train, y_train)
acc_test  = evaluate_model(model, x_test, y_test)

print(acc_train, acc_test)

1.0 1.0


In [173]:
with open(f'{MODEL_NAME}.pkl', 'wb') as f:
    pkl.dump(model, f)

In [174]:
predicted_labels = model.predict(x_test)

### Interpretability

In [None]:
class Local2GlobalExplainer:
    def __init__(self, x_train, model):
        self.model = model
        self.data = x_train
        
        self.explainer = lime_tabular.LimeTabularExplainer(
            training_data=self.data, 
            mode = 'classification'
            )
        
    def get_optimal_gmm(n_components):
        c = round(n_components)
        gmm = GaussianMixture(n_components = c).fit(x_train)
        return gmm.bic(x_train)
        
    def get_local_interpretation(self, sample, num_features):
    
        exp = self.explainer.explain_instance(sample, model.predict, num_features = len(sample))
        local_exp = list(exp.local_exp.values())[0]
        local_exp = sorted(local_exp)
        
        explanations = [x[1] for x in local_exp]
        return explanations

In [175]:
def get_local_interpretation(explainer, predict_function, sample, num_features):
    
    exp = explainer.explain_instance(sample, predict_function, num_features = len(sample))
    local_exp = list(exp.local_exp.values())[0]
    local_exp = sorted(local_exp)
    
    explanations = [x[1] for x in local_exp]
    return explanations

In [469]:
explainer = lime_tabular.LimeTabularExplainer(
    training_data = x_train,
    mode = 'classification'
)

def get_optimal_gmm(n_components):
    c = round(n_components)
    gmm = GaussianMixture(n_components = c).fit(x_train)
    return gmm.bic(x_train)

parameters = {'n_components': (0, len(x_train)//4)}

gmm = BayesianGaussianMixture(n_components= n_classes*2).fit(x_train)
# gbm_bo = BayesianOptimization(get_optimal_gmm, parameters, random_state=111, allow_duplicate_points=True)
# gbm_bo.maximize(init_points = 30, n_iter = 10)
# params_gbm = gbm_bo.max['params']
# n_comp = round(params_gbm['n_components'])

# gmm = GaussianMixture(n_components= n_comp).fit(x_train)

Initialization 1 did not converge. Try different init parameters, or increase max_iter, tol or check for degenerate data.


In [461]:
def get_scores(ex):
    '''Function to normalize scores using a softmax function multiplied by correlation signs'''
    signs = np.sign(ex)
    abs_softmax = softmax(np.abs(ex))
    return abs_softmax*signs

In [462]:
def run_explainability(fit_dist, train_samples, model, explainer, main_dist, num_samples, mcmc = False):
    if mcmc == True:
        # run monte carlo trials
        samples, gmm_class = fit_dist.sample(num_samples)       # generate samples from the fit gmm
        explanations = []                                       # list to store explanations
        for sample in tqdm(samples):
            interpret = get_local_interpretation(explainer, model.predict_proba, sample, n_feats)
            sigmoid_interpretation = get_scores(interpret)
            explanations.append(sigmoid_interpretation)
            
        agg_explanations = np.mean(np.array(explanations), axis = 0)        # aggregating
        
        return agg_explanations, explanations
    

    else:
        # importance sampling
        for i in train_samples:
            pass
    
        
        
def rank_explanations(explanations):
    return sorted(list(zip(range(len(explanations)), explanations)), key = lambda x: -abs(x[1]))

In [463]:
aggregated, point_explanations = run_explainability(gmm, x_train, model, explainer, None, 1000, mcmc = True)

100%|██████████| 1000/1000 [00:15<00:00, 64.15it/s]


In [464]:
rank_explanations(aggregated)

[(9, -0.04427801177176151),
 (0, 0.010350171452497546),
 (3, 0.0076402752498283535),
 (10, 0.006595678668763183),
 (12, -0.0036595638779221337),
 (1, -0.0030315274890849358),
 (5, -0.0009369094913669249),
 (4, 0.0009242944614868953),
 (7, -0.0008775219894062209),
 (2, -0.0008659531302355162),
 (8, -0.0005055154321299345),
 (11, 0.0003654287180868321),
 (6, -0.00016309288481481266)]

In [465]:
def get_shap_feature_importances(model, x_train):
    se = shap.Explainer(model)
    shap_values = se.shap_values(x_train)
    importance_order = np.argsort(-abs(np.abs(np.array(shap_values)).mean(axis = 1).mean(axis = 0)))
    importances = np.abs(np.array(shap_values)).mean(axis = 1).mean(axis = 0)[importance_order]
    return importance_order, importances

order, importances = get_shap_feature_importances(model, x_train)

## Testing

### Local2Global

In [503]:
vals_ = list(set(np.arange(n_feats)).difference(set([9,0,3,10])))
model_l2g = MODEL_FUNCTION(*model_params)
model_l2g.fit(x_train[:, vals], y_train)

acc_train_l2g = evaluate_model(model_l2g, x_train[:, vals], y_train)
acc_test_l2g  = evaluate_model(model_l2g, x_test[:, vals], y_test)

print(acc_train_l2g, acc_test_l2g)

1.0 0.8333333333333334


### Shap

In [504]:
# shap features are 3, 6, 10, 9
vals = list(set(np.arange(n_feats)).difference(set([3,6,10,9])))

# x_train

In [505]:
model_shap = MODEL_FUNCTION(*model_params)
model_shap.fit(x_train[:, vals], y_train)

acc_train_shap = evaluate_model(model_shap, x_train[:, vals], y_train)
acc_test_shap  = evaluate_model(model_shap, x_test[:, vals], y_test)

print(acc_train_shap, acc_test_shap)

1.0 0.8703703703703703
