## Evaluation Update Update

CHECK: dimensionality nan > VERBOSE HAS TO BE 1 OR HIGHER

In [1]:
# required modules (under Anaconda use: > conda install -c conda-forge <package>)
if False: # (skip if already installed)
    !pip install lark-parser
    !pip install linear-tree
    !pip install pydot
    !pip install pydotplus
    # download and install SWI Prolog from https://www.swi-prolog.org/download/stable
    # be sure that the executable is added to the PATH

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# standard imports
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import pydotplus
from IPython.display import Image
from xgboost import XGBClassifier

# local imports
sys.path.append('../src/') # local path
import reasonx
import dautils

from helper_functions import read_adult, read_give_me_some_credit, read_south_german_credit, read_credit_card_default, read_australian_credit
from helper_functions import evaluation_return_array

from neighborhood import naive_neighborhood_instance

### Experiments

In [3]:
safe_results = 0

simplified=False
continuous_only=False
dataset = "adult"

# read dataset
if dataset == "gmsc":
    df, pred_atts, target, df_code = read_give_me_some_credit(continuous_only=continuous_only, simplified=simplified)
if dataset == "sgc":
    df, pred_atts, target, df_code = read_south_german_credit(continuous_only=continuous_only, simplified=simplified)
if dataset == "adult":
    df, pred_atts, target, df_code = read_adult(continuous_only=continuous_only, simplified=simplified)
if dataset == "dccc":
    df, pred_atts, target, df_code = read_credit_card_default(continuous_only=continuous_only, simplified=simplified)
if dataset == "aca":
    df, pred_atts, target, df_code = read_australian_credit(continuous_only=continuous_only, simplified=simplified)

In [4]:
print(df.info())
#print("\n TARGET: \n", df[target].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   race          48842 non-null  object
 1   sex           48842 non-null  object
 2   workclass     48842 non-null  object
 3   education     48842 non-null  object
 4   age           48842 non-null  int64 
 5   capitalgain   48842 non-null  int64 
 6   capitalloss   48842 non-null  int64 
 7   hoursperweek  48842 non-null  int64 
 8   class         48842 non-null  object
dtypes: int64(4), object(5)
memory usage: 3.4+ MB
None


In [5]:
# encode df
df_encoded_onehot = df_code.fit_transform(df)
# encoded atts names
encoded_pred_atts = df_code.encoded_atts(pred_atts)
df_encoded_onehot.head()

Unnamed: 0,race_AmerIndianEskimo,race_AsianPacIslander,race_Black,race_Other,race_White,sex_Female,sex_Male,workclass_Federalgov,workclass_Localgov,workclass_Neverworked,...,workclass_Selfempinc,workclass_Selfempnotinc,workclass_Stategov,workclass_Withoutpay,education,age,capitalgain,capitalloss,hoursperweek,class
0,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,13,39,2174,0,40,0
1,0,0,0,0,1,0,1,0,0,0,...,0,1,0,0,13,50,0,0,13,0
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,9,38,0,0,40,0
3,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,7,53,0,0,40,0
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,13,28,0,0,40,0


In [6]:
# define case (a: ML model = tree = base model; b: global surrogate; c: local surrogate)
# constraints: add them as string, only on F and CF (for now)

case = "c"
# constraints: empty or as string
constraints = []
#constraints = "F.capitalgain = CF.capitalgain"
#constraints = "F.sex = CF.sex, F.race = CF.race"
# minconf value of factual
confidence_f = 0.99
tree_depth = 4

# split predictive and target
X, y = df_encoded_onehot[encoded_pred_atts], df_encoded_onehot[target]
# retain test sets
X1, XT1, y1, yt1 = train_test_split(X, y, test_size=0.3, random_state=42)
X2, XT2, y2, yt2 = train_test_split(X, y, test_size=0.3, random_state=24)

if case == "a":
    clf1 = DecisionTreeClassifier(max_depth=tree_depth)
    clf1.fit(X1, y1)

    print("accuracy             ", clf1.score(XT1, yt1))

    tree_ = clf1

if case == "b":
    # learn ML model
    xgb = XGBClassifier(random_state = 0)
    xgb.fit(X1, y1)
    xgb_label = xgb.predict(XT1)

    print("accuracy XGB         ", xgb.score(XT1, yt1))

    xgb_label_df = pd.Series(data=xgb_label)

    # split the test set (XT1/xgb_labels) in two parts
    XT1_train, XT1_test, xgb_label_train, xgb_label_test = train_test_split(XT1, xgb_label_df, test_size=0.3, random_state=42)

    # train a surrogate decision tree
    clf2 = DecisionTreeClassifier(max_depth=tree_depth)
    clf2.fit(XT1_train, xgb_label_train)

    # fidelity of the surrogate tree
    print("fidelity             ", clf2.score(XT1_test, xgb_label_test))

    tree_ = clf2

if case == "c":
    # learn ML model
    xgb = XGBClassifier(random_state = 0)
    xgb.fit(X1, y1)
    xgb_label = xgb.predict(XT1)

    ml_model_local = xgb

    print("accuracy XGB         ", xgb.score(XT1, yt1))

accuracy XGB          0.8514297413498942


In [7]:
# basic eval DT
if case == "b" or case == "a":
    print("depth                ", tree_.get_depth())
    print("number of leaves     ", tree_.get_n_leaves())

In [8]:
# execute the evaluation

instances = 100
evaluation_list = []

if case == "a":
    r = reasonx.ReasonX(pred_atts, target, df_code, verbose = 1)
    r.model(tree_)
    evaluation_list = evaluation_return_array(r, XT1, yt1, instances, minconf_f=confidence_f, constraints_fce=constraints)

if case == "b":
    r = reasonx.ReasonX(pred_atts, target, df_code, verbose = 1)
    r.model(tree_)
    # r is initialize with global surrogate tree that is learned on 70% of the original test set
    # evaluation should be done over the remaining dataset that are the 30% of the original test set
    evaluation_list = evaluation_return_array(r, XT1_test, xgb_label_test, instances, minconf_f=confidence_f, constraints_fce=constraints)

if case == "c": 
    fidelity = []
    for i in range(instances): 
        # pick data instance
        features=XT1.iloc[i:i+1]
        # relevant label is the predicted label by ML model
        label=xgb_label[i]
        data_numpy = XT1.to_numpy()

        # neighborhood generation
        N = 5000
        C = int(df_encoded_onehot.shape[1] * 2 / 3)
        neigh = naive_neighborhood_instance(features.to_numpy(), C, N, np.transpose(data_numpy), 42)

        # predict labels of neigh
        label_neigh = ml_model_local.predict(neigh)

        # split neigh
        neigh_train, neigh_test, neigh_label_train, neigh_label_test = train_test_split(neigh, label_neigh, test_size=0.3, random_state=42)

        # train surrogate DT
        clf2 = DecisionTreeClassifier(max_depth=tree_depth)
        clf2.fit(neigh_train, neigh_label_train)

        fidelity.append(clf2.score(neigh_test, neigh_label_test))

        # execute the evaluation
        r = reasonx.ReasonX(pred_atts, target, df_code, verbose = 1)
        r.model(clf2)

        # compute evaluation only for data instance in focus
        evaluation_ = evaluation_return_array(r, features, label, n_instances = 1, minconf_f=confidence_f, constraints_fce=constraints)
        if i == 0:
            evaluation_list = evaluation_
        else:
            evaluation_list = np.concatenate((evaluation_list, evaluation_), axis=1)

        #r.reset()

    print("mean fidelity    ", np.mean(fidelity))

data set size (20, 14653)
dimensions upper/lower bound arrays 20 20
F.race = White, F.sex = Male, F.workclass = Private, F.education = HSgrad, F.age = 18, F.capitalgain = 0, F.capitalloss = 0, F.hoursperweek = 20
---
Answer constraint: F.race=White,F.sex=Male,F.workclass=Private,F.education=9.0,F.age=18.0,F.capitalgain=0.0,F.capitalloss=0.0,F.hoursperweek=20.0
---
Answer constraint: F.race=White,F.sex=Male,F.workclass=Private,F.education=9.0,F.age=18.0,F.capitalgain=0.0,F.capitalloss=0.0,F.hoursperweek=20.0,CF.capitalgain>4252.0,CF.capitalgain<=4296.5
---
Answer constraint: F.race=White,F.sex=Male,F.workclass=Private,F.education=9.0,F.age=18.0,F.capitalgain=0.0,F.capitalloss=0.0,F.hoursperweek=20.0,CF.capitalgain>7067.5,CF.capitalgain<=26914.5,CF.age<=20.5,CF.capitalloss>2208.0
---
Answer constraint: F.race=White,F.sex=Male,F.workclass=Private,F.education=9.0,F.age=18.0,F.capitalgain=0.0,F.capitalloss=0.0,F.hoursperweek=20.0,CF.capitalgain>7067.5,CF.capitalgain<=31051.5,CF.age>20.5,CF.

In [9]:
# safe results

if safe_results == 1:

    if len(constraints) > 0:
        np.savetxt('evaluation/evaluation_case_' + case + '_instances_' + str(instances) +  "_" + dataset + '_constraints_' + constraints + '.csv', evaluation_list, delimiter=",")
    else:
        np.savetxt('evaluation/evaluation_case_' + case + '_instances_' + str(instances) +  "_" + dataset + '.csv', evaluation_list, delimiter=",")

    # test whether properely saved
    if len(constraints) > 0:
        load = np.loadtxt('evaluation/evaluation_case_' + case + '_instances_' + str(instances) +  "_" + dataset + '_constraints_' + constraints + '.csv', delimiter=",")
    else:
        load = np.loadtxt('evaluation/evaluation_case_' + case + '_instances_' + str(instances) +  "_" + dataset + '.csv', delimiter=",")
    
    # Verify if the loaded array matches the original
    #print(load_original == evaluation_parameters)
    if np.allclose(load, evaluation_list, equal_nan=True):
        print("Yes, both the arrays are the same")
    else:
        print("No, both the arrays are not the same")

In [10]:
# evaluation list

# 0 - number of solutions factual (must be 1)
# 1 - length of rule factual
# 2 - NEW average length of rule CE
# 3 - NEW average number of admissible solutions
# 4 - number of solutions CE l1
# 5 - average distance CE l1
# 6 - average dimension CE l1
# 7 - number of solutions CE linf
# 8 - average distance CE linf
# 9 - average dimension CE linf

print("mean/std length F                                ", np.nanmean(evaluation_list, axis = 1)[1], np.nanstd(evaluation_list, axis = 1)[1])
print("mean/std length CE                               ", np.nanmean(evaluation_list, axis = 1)[2], np.nanstd(evaluation_list, axis = 1)[2])

print("no admissible paths (CE)                         ", np.nanmean(evaluation_list, axis = 1)[3], np.nanstd(evaluation_list, axis = 1)[3])

print("no of solutions l1                               ", np.nanmean(evaluation_list, axis = 1)[4], np.nanstd(evaluation_list, axis = 1)[4])
print("mean/std distance l1                             ", np.nanmean(evaluation_list, axis = 1)[5], np.nanstd(evaluation_list, axis = 1)[5])
print("mean/std dimension l1                            ", np.nanmean(evaluation_list, axis = 1)[6], np.nanstd(evaluation_list, axis = 1)[6])

print("no of solutions linf                             ", np.nanmean(evaluation_list, axis = 1)[7], np.nanstd(evaluation_list, axis = 1)[7])
print("mean/std distance linf                           ", np.nanmean(evaluation_list, axis = 1)[8], np.nanstd(evaluation_list, axis = 1)[8])
print("mean/std dimension linf                          ", np.nanmean(evaluation_list, axis = 1)[9], np.nanstd(evaluation_list, axis = 1)[9])

#print("actual output over instances (measured via F)    ", (np.count_nonzero(~np.isnan(evaluation_list[0,:])) / instances))
print("actual output over instances (measured via F)    ", (np.sum((evaluation_list[0,:])) / instances))
print("actual output over instances (measured via l1)   ", (np.count_nonzero(np.isnan(evaluation_list[4,:])) / instances))
print("actual output over instances (measured via linf) ", (np.count_nonzero(np.isnan(evaluation_list[7,:])) / instances))

mean/std length F                                 2.0677966101694913 0.40607283222748947
mean/std length CE                                3.193233790691418 0.22320909686646856
no admissible paths (CE)                          7.186440677966102 0.7913639003341527
no of solutions l1                                7.169491525423729 0.8265210400160413
mean/std distance l1                              0.4229827073594859 0.08198958109866097
mean/std dimension l1                             0.0 0.0
no of solutions linf                              7.186440677966102 0.7913639003341527
mean/std distance linf                            0.3020830508644726 0.06074330790224286
mean/std dimension linf                           1.0 0.0
actual output over instances (measured via F)     0.59
actual output over instances (measured via l1)    0.41
actual output over instances (measured via linf)  0.41


In [11]:
evaluation_list

array([[1.        , 1.        , 0.        , 1.        , 1.        ,
        1.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 1.        , 1.        , 0.        , 1.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        , 0.        , 1.        ,
        0.        , 1.        , 0.        , 0.        , 1.        ,
        1.        , 1.        , 1.        , 0.        , 1.        ,
        1.        , 1.        , 0.        , 0.        , 1.        ,
        1.        , 0.        , 1.        , 0.        , 1.        ,
        0.        , 0.        , 1.        , 1.        , 1.        ,
        0.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 0.        , 0.        ,
        1.        , 0.        , 1.        , 1.        , 0.        ,
        0.        , 0.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 0.        , 1.  