# Execution example

Clustering and explanation of a classification model trained on the *Spambase* dataset.

In [1]:
import pandas as pd
# Read the header-less Spambase CSV into a single NumPy array, then split it:
# the last column becomes the target vector, every other column is a feature.
dataset = pd.read_csv('spambase.csv', header=None).to_numpy()
dataset, labels = dataset[:, :-1], dataset[:, -1]

## Model training phase

In [2]:
import numpy as np
import sklearn
import sklearn.ensemble
# Imported explicitly: `import sklearn` on its own does not guarantee that the
# `model_selection` submodule used below has been loaded.
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

# Min-max scaled copy of the features. It is NOT fed to the classifier below:
# the forest is cross-validated and fitted on the unscaled `dataset`.
scaler = sklearn.preprocessing.MinMaxScaler()
scaled_dataset = scaler.fit_transform(dataset)

# Shallow random forest with a fixed seed so the reported scores are repeatable.
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100,
                                              max_depth=4,
                                              random_state=0)

# 10-fold cross-validated accuracy of the (not yet fitted) classifier.
scores = sklearn.model_selection.cross_val_score(clf, dataset, labels, cv=10, scoring='accuracy')

print("accuracy =", np.mean(scores))
print("std_dev = ",np.std(scores))

# Fit on the full dataset and keep the model's own predictions on it; these
# predictions (not the ground-truth labels) are what later cells consume.
clf.fit(dataset, labels)
model_labels = clf.predict(dataset)

accuracy = 0.914561196249592
std_dev =  0.028419917503289226


## Clustering phase

In [11]:
import inspect  # no longer needed for path discovery; retained so the cell's namespace is unchanged
import os
import sys

import scipy

# Workaround to make the packages in the parent folder importable.
# TODO: replace with a proper package setup in case we become famous.
#
# The directory is resolved from `__file__` when this code runs as a script and
# from the working directory when it runs in a notebook kernel. The previous
# approach, inspect.getfile(inspect.currentframe()), depends on the file name
# the kernel assigns to cell code; recent ipykernel versions compile cells
# under a temporary directory, which made `parentdir` point outside the repo.
try:
    currentdir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # `__file__` is not defined inside a notebook kernel.
    currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)

# Keep the parent folder first on the search path, without stacking a new
# duplicate entry every time the cell is re-run.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

from clustering.ADP import ADP

In [12]:
# Scaling and clustering are chained in one pipeline, so the ADP step is fitted
# on min-max scaled features. The pipeline is fitted on the raw features using
# the classifier's predictions (cast to int) as targets.
scaler = sklearn.preprocessing.MinMaxScaler()
adp_clusterer = ADP(distancetype='euclidean', granularity=0)
pipe = Pipeline(steps=[
    ('scaling', scaler),
    ('clustering', adp_clusterer),
])
pipe.fit(dataset, model_labels.astype(int))

print("Number of pivots found: ", len(pipe['clustering'].pivots))

Number of pivots found:  19


## Explanation phase

In [14]:
from explainers.CASTLETabularExplainer import CASTLETabularExplainer 

def proximity_function(x):
    """Return ``np.max(x) - x``.

    The largest entry of ``x`` maps to 0 and smaller entries map to
    proportionally larger values. ``x`` is presumably an array of distances
    supplied by the explainer -- TODO confirm against CASTLETabularExplainer.

    Alternative candidates left by the authors (swap the return expression):
        1/(1+x)
        np.exp(-x)
        -x
        1 - (x - np.min(x))/(np.max(x) - np.min(x))
    """
    return np.max(x) - x

# Build the CASTLE explainer on the raw feature matrix, handing it the fitted
# scaling+clustering pipeline, the pivots stored on its clustering step, and
# the proximity function defined above in this cell.
explainer_castle = CASTLETabularExplainer(
    dataset,
    class_names=['False', 'True'],
    cluster_model=pipe,
    pivots=pipe['clustering'].pivots,
    proximity_function=proximity_function,
    discretize_continuous=False,
    sample_around_instance=True,
    verbose=False,
)

In [15]:
# Instance to explain: the first row of the feature matrix.
test_instance = dataset[0]

# Explanation settings.
num_samples = 1000  # Number of neighbors generated by the algorithm
num_clusters = len(pipe['clustering'].pivots)  # Number of pivots to use for CASTLE explanation
num_features = dataset.shape[1]  # Number of features for LIME explanation
# NOTE(review): num_features is not passed to explain_instance below, yet it is
# used later as the LIME predictor count -- confirm that the explainer's own
# default for the number of features matches this value.

# One call returns both explanations: CASTLE first, LIME second.
exp_castle, exp_lime = explainer_castle.explain_instance(
    test_instance,
    clf.predict_proba,
    top_labels=1,
    num_clusters=num_clusters,
    num_samples=num_samples,
    verbose=False,
)

### Explanation scores: comparison

In [16]:
def _adjusted_r2(r2, n_observations, n_predictors):
    """Adjusted R^2 for a fit with ``n_predictors`` terms on ``n_observations`` points.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.
    """
    return 1 - (1-r2)*(n_observations-1)/(n_observations-n_predictors-1)


# Each score is penalised by the number of terms its surrogate model is assumed
# to use: pivots for CASTLE, input features for LIME.
castle_adjusted_r2 = _adjusted_r2(exp_castle.score, num_samples, num_clusters)
lime_adjusted_r2 = _adjusted_r2(exp_lime.score, num_samples, num_features)

print("LIME adjusted-R2: ", lime_adjusted_r2)
print("CASTLE adjusted-R2: ", castle_adjusted_r2)

LIME adjusted-R2:  0.601108568361618
CASTLE adjusted-R2:  0.2211632387217971
