# Experiments ECAI24 on the small business administration dataset

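Setting: temporal shift — the data is split by approval fiscal year (loans approved before 2006 vs. from 2006 onwards).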


In [2]:
# Basics
import numpy as np
import pandas as pd
import csv

# sci-kit learn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split,cross_validate
from util_scripts.convert import extract_sklearn_params, custom_nn_model
from joblib import dump, load
import gurobipy

# Pandas display limits used when frames are rendered in the notebook
pd.options.display.max_columns = 100
pd.options.display.max_rows = 150

# Silence every warning for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): the two star imports below bring an unknown set of names into
# scope; later cells use `UtilExp` and `copy` without an explicit import, so
# they presumably come from here — confirm.
from util_scripts.preprocessor import Preprocessor, min_max_scale
from util_scripts.utilexp import *
from interval import *
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
# Fix the TensorFlow and NumPy global seeds
tf.random.set_seed(1)
np.random.seed(1)

# ANSI escape sequences used to colour/format printed text
CYAN_COL = '\033[96m'
BLUE_COL = '\033[94m'
RED_COL = '\033[91m'
GREEN_COL = '\033[92m'
YELLOW_COL = '\033[93m'
RESET_COL = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

In [3]:
# Load the SBA case-study CSV, drop every column that contains a missing value,
# then drop the columns listed below.
df = pd.read_csv("./datasets/sba/SBAcase.11.13.17.csv", delimiter=',')
df = df.dropna(axis=1)
df = df.drop(columns=['ApprovalDate', "LoanNr_ChkDgt", "Name", "Zip", "City", "State", "NAICS", "FranchiseCode", 'BalanceGross', "MIS_Status", "Selected", "UrbanRural", 'Recession', 'New', 'RealEstate', 'Portion'])
# Columns that are min-max scaled and passed to UtilExp as continuous features
continuous_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob',
       'DisbursementGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv',
       'daysterm']

# Temporal split on the approval fiscal year: df1 = before 2006, df2 = 2006 onwards.
# 'ApprovalFY' is only used for the split and is removed from all three frames.
df1 = df[df['ApprovalFY'] < 2006].drop(columns="ApprovalFY")
df2 = df[df['ApprovalFY'] >= 2006].drop(columns="ApprovalFY")
df = df.drop(columns=["ApprovalFY"])
# min max scale
# The per-feature bounds are computed on the full frame (both periods), so df1
# and df2 are scaled with the same min/max values.
min_vals = np.min(df[continuous_features], axis=0)
max_vals = np.max(df[continuous_features], axis=0)
# NOTE(review): min_max_scale is a project helper; presumably it returns a scaled
# copy and leaves its input untouched — confirm.
df1_mm = min_max_scale(df1, continuous_features, min_vals, max_vals)
df2_mm = min_max_scale(df2, continuous_features, min_vals, max_vals)

# get X, y
# The target is 1 - Default, i.e. the 'Default' flag is inverted
# (presumably 1 = loan not defaulted — confirm against the dataset description).
X1, y1 = df1_mm.drop(columns=['Default']), pd.DataFrame(1 - df1_mm['Default'])
X2, y2 = df2_mm.drop(columns=['Default']), pd.DataFrame(1 - df2_mm['Default'])
# Stratified 80/20 train/test split of each period, with fixed seeds
SPLIT = .2
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, stratify=y1, test_size=SPLIT, shuffle=True,
                                                    random_state=5)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, stratify=y2, test_size=SPLIT, shuffle=True,
                                                    random_state=2)

# Feature metadata handed to UtilExp later: no ordinal or discrete features.
ordinal_features = {}
discrete_features = {}
# Column names of the scaled frame; this list still contains the 'Default' column.
columns = list(df1_mm.columns)
# Identity mapping: feature index i -> [i]
feat_var_map = {}
for i in range(len(X1.columns)):
    feat_var_map[i] = [i]


In [4]:
# Inspect the scaled pre-2006 frame (features plus the 'Default' column)
display(df1_mm)

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,DisbursementGross,ChgOffPrinGr,GrAppv,SBA_Appv,daysterm,Default
0,0.117647,0.001538,0.0,0.0,0.012110,0.0,0.010872,0.006035,0.117647,0
1,0.183007,0.001538,0.0,0.0,0.010893,0.0,0.010872,0.006035,0.183007,0
2,0.117647,0.015385,0.0,0.0,0.010893,0.0,0.010872,0.006035,0.117647,0
3,0.117647,0.009231,0.0,0.0,0.019551,0.0,0.019399,0.010768,0.117647,0
5,0.274510,0.001538,0.0,0.0,0.022072,0.0,0.019399,0.010768,0.274510,0
...,...,...,...,...,...,...,...,...,...,...
2089,0.980392,0.001538,0.0,0.0,0.084481,0.0,0.083351,0.069933,0.980392,0
2098,0.980392,0.006154,0.0,0.0,0.040761,0.0,0.040290,0.036422,0.980392,0
2099,0.274510,0.003077,0.0,0.0,0.019551,0.0,0.019399,0.017868,0.274510,0
2100,0.392157,0.004615,0.0,0.0,0.106622,0.0,0.211256,0.176429,0.392157,0


In [5]:
# Inspect the scaled frame for loans approved from 2006 onwards
display(df2_mm)

Unnamed: 0,Term,NoEmp,CreateJob,RetainedJob,DisbursementGross,ChgOffPrinGr,GrAppv,SBA_Appv,daysterm,Default
4,0.784314,0.100000,0.023077,0.121495,0.146381,0.000000,0.144319,0.161283,0.784314,0
6,0.879085,0.003077,0.000000,0.003738,0.126686,0.163674,0.124920,0.104544,0.879085,1
7,0.274510,0.001538,0.015385,0.001869,0.026930,0.000000,0.010872,0.006035,0.274510,0
13,0.248366,0.003077,0.000000,0.003738,0.010893,0.019675,0.010872,0.006035,0.248366,1
15,0.196078,0.020000,0.053846,0.037383,0.008729,0.000000,0.008740,0.004851,0.196078,0
...,...,...,...,...,...,...,...,...,...,...
2093,0.274510,0.004615,0.000000,0.005607,0.043690,0.000000,0.040716,0.022601,0.274510,0
2094,0.274510,0.004615,0.000000,0.005607,0.037948,0.000000,0.010872,0.006035,0.274510,0
2095,0.784314,0.009231,0.030769,0.011215,0.310006,0.000000,0.305479,0.340196,0.784314,0
2096,0.784314,0.043077,0.061538,0.052336,0.443330,0.000000,0.436794,0.485978,0.784314,0


# Train and observe model changes

In [6]:
# Randomised search + 5-fold cross validation (default), scored by macro-F1
nn = MLPClassifier(learning_rate='adaptive', random_state=0)

# Candidate values for each tuned hyper-parameter
max_iter_vals = [2, 3, 4, 5]
hidden_layer_sizes_vals = list(range(3, 20))
batch_size_vals = [8, 16, 32, 64]
learning_rate_init_vals = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05]

# Search space: hyper-parameter name -> list of candidates
distributions = {
    'hidden_layer_sizes': hidden_layer_sizes_vals,
    'batch_size': batch_size_vals,
    'learning_rate_init': learning_rate_init_vals,
    'max_iter': max_iter_vals,
}

# Fit the search on the whole pre-2006 data and report the best configuration
nns = RandomizedSearchCV(nn, distributions, scoring='f1_macro')
search = nns.fit(X1, y1)
print(search.best_params_)


{'max_iter': 5, 'learning_rate_init': 0.02, 'hidden_layer_sizes': 12, 'batch_size': 8}


In [7]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, f1_score, precision_score

# Cross-validated estimate (default 5 folds) for a small, briefly trained MLP on X1/y1
clf = MLPClassifier(learning_rate='adaptive', hidden_layer_sizes=6, learning_rate_init=0.05, batch_size=8,
                    max_iter=5, random_state=0)
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(clf, X1, y1, scoring=scoring)
for name, values in scores.items():
    # Timing entries are not quality metrics; report only the test scores
    if name not in ('fit_time', 'score_time'):
        print("%0.2f %s with a std of %0.2f" % (values.mean(), name, values.std()))

# The classifier used in the rest of the notebook. Note that it uses different
# hyper-parameters from the cross-validated one above and is fitted on the
# training split only.
clf = MLPClassifier(learning_rate='adaptive', hidden_layer_sizes=18, learning_rate_init=0.005, batch_size=8,
                    max_iter=9000, random_state=0)
clf.fit(X1_train, y1_train)

# Classification report on the held-out split first, then on the training split
class_names = ['bad credit (0)', 'good credit (1)']
for features, labels in ((X1_test, y1_test), (X1_train, y1_train)):
    resres = clf.predict(features.values)
    print('\n', classification_report(labels, resres, target_names=class_names, digits=3))


0.94 test_accuracy with a std of 0.01
0.92 test_precision_macro with a std of 0.04
0.87 test_recall_macro with a std of 0.03
0.89 test_f1_macro with a std of 0.02

                  precision    recall  f1-score   support

 bad credit (0)      1.000     0.946     0.972        37
good credit (1)      0.990     1.000     0.995       195

       accuracy                          0.991       232
      macro avg      0.995     0.973     0.984       232
   weighted avg      0.991     0.991     0.991       232


                  precision    recall  f1-score   support

 bad credit (0)      0.965     0.926     0.945       149
good credit (1)      0.986     0.994     0.990       778

       accuracy                          0.983       927
      macro avg      0.976     0.960     0.967       927
   weighted avg      0.983     0.983     0.983       927



In [8]:
# Persist the trained classifier with joblib; it is reloaded from 'sba.joblib' in a later cell
dump(clf, 'sba.joblib')

['sba.joblib']

# Experiments: computing counterfactuals

#### Procedures

These procedures are covered by the `UtilExp` class.

1. Train M on D1
2. Get delta-min, build M+ and M-: incrementally train M 5 times, using different 10% of D2 each time, then get the maximum inf-distance between the incremented models and M. Construct M+ and M- using delta-min
3. Get M2: incrementally train M on D2
4. Select test instances: randomly select 50 D1 instances to explain, clf(x)=0, desired class=1
5. Report metrics using each baseline

#### Metrics
- Proximity: normalised L1: "Scaling Guarantees for Nearest CEs" page 7
- Sparsity: L0
- Validity-delta: percentage of test instances that 1) have counterfactuals valid on m1, 2) counterfactuals valid on M+ and M- under delta_min
- Validity-m2: percentage of test instances that 1) have counterfactual(s), 2) these counterfactual(s) are all valid on both m1 and m2
- LOF: average LOF score

In [9]:
# Reload the classifier persisted above
clf = load("sba.joblib")
# Set Gurobi's feasibility, optimality and integer-feasibility tolerances to 1e-3
gurobipy.setParam("FeasibilityTol", 1e-03)
gurobipy.setParam("OptimalityTol", 1e-03)
gurobipy.setParam("IntFeasTol", 1e-03)


Restricted license - for non-production use only - expires 2025-11-24
Set parameter FeasibilityTol to value 0.001
Set parameter OptimalityTol to value 0.001
Set parameter IntFeasTol to value 0.001


In [10]:
# Build the experiment helper from the classifier, both data periods and the feature metadata.
# NOTE(review): per the "Procedures" notes above, UtilExp presumably derives
# delta_min/delta_max by incrementally training on parts of the second period —
# confirm in util_scripts.utilexp.
util_exp = UtilExp(clf, X1, y1, X2, y2, columns, ordinal_features, discrete_features, continuous_features, feat_var_map, gap=0.25, num_test_instances=200)
print(util_exp.delta_max)
print(util_exp.delta_min)

0.30498064647513345
0.10821238236127151


In [11]:
# Build the retrained model m2: a deep copy of clf updated with one
# partial_fit call on the second-period data (X2, y2), then stored on util_exp as Mmax.
# NOTE(review): `copy` is not imported explicitly in the imports cell; it
# presumably comes from one of the star imports — confirm.
m2 = copy.deepcopy(clf)
m2.partial_fit(X2, y2)
util_exp.Mmax = m2

In [12]:
# Pre-verification of the soundness of the test points; prints how many were returned.
# NOTE(review): verify_soundness is a UtilExp method; its exact soundness criterion is not visible here.
valids = util_exp.verify_soundness()
print(len(valids))

percentage of sound model changes: 0.6348314606741573
113


In [13]:
# Same check, but also asks UtilExp to replace its test instances
# (the printed message reports the updated set of sound (x, Delta) pairs)
valids = util_exp.verify_soundness(update_test_instances=True)

percentage of sound model changes: 0.6348314606741573
test instances updated to sound (x, Delta) pairs, length: 50


In [14]:
# Convert both sklearn MLPs (the original `clf` and the retrained `m2`) into
# Keras models with identical parameters and save them as .h5 files.
# After the loop, `tf_model`, `params`, etc. refer to the retrained model.
for sk_model, h5_path in ((clf, './models/sba.h5'), (m2, './models/sba_retrained.h5')):
    input_size, n_layers, output_size, output_act, h_act, optimizer, params = extract_sklearn_params(sk_model)
    tf_model = custom_nn_model(input_size, n_layers, params, output_size, h_act, output_act, optimizer)

    # Set model weights
    for k, v in params.items():
        tf_model.layers[k].set_weights(v)

    tf_model.save(h5_path, save_format='h5')

# Reload both models and collect the parameters of every layer after layer 0,
# keyed by layer index (1 .. number of layers - 1).
original_model = tf.keras.models.load_model('./models/sba.h5', compile=False)
old_weights = {}
for l in range(1, len(original_model.layers)):
    old_weights[l] = original_model.layers[l].get_weights()

model_retrained = tf.keras.models.load_model('./models/sba_retrained.h5', compile=False)
retrained_weights = {}
for l in range(1, len(model_retrained.layers)):
    retrained_weights[l] = model_retrained.layers[l].get_weights()

# Largest absolute difference between corresponding weight matrices
# (index 0 of each layer's parameter list; biases are not compared).
# Iterate over every stored layer: the dictionaries are keyed 1..len(old_weights),
# so `range(1, len(old_weights))` would silently skip the last (output) layer.
max_diff = -1
for l in old_weights:
    old_layer_weights = old_weights[l][0]
    new_retrained_weights = retrained_weights[l][0]

    difference = np.abs(old_layer_weights - new_retrained_weights)

    if difference.size:
        max_distance = difference.max()
        if max_distance > max_diff:
            max_diff = max_distance

print("\nThe maximum distance between weights is:", max_diff)




The maximum distance between weights is: 0.24813461


### CFX computation: in the following cells we compute the CFXs using both MILP and the proposed probabilistic APΔS approach.

In [15]:
# OURS-ROBUST with approx=True: compute CFXs for the sound test points selected above.
# NOTE(review): approx=True presumably selects the probabilistic APΔS procedure — confirm in UtilExp.
ours_robust_ces_apas = util_exp.run_ours_robust(approx=True)
# Print the evaluation metrics for these counterfactuals
util_exp.evaluate_ces(ours_robust_ces_apas)
# Alias used by the later robustness check
cfxs_robust_apas = ours_robust_ces_apas
print(len(cfxs_robust_apas))

50it [00:21,  2.31it/s]


total computation time in s: 21.692475080490112
found: 1.0
average normalised L1: 0.008920690895365926
average normalised L0: 0.1642800000000001
average lof score: 0.4
counterfactual validity: 1.0
delta validity: 0.0
m2 validity: 1.0
50


In [16]:
# OURS-ROBUST with default arguments (no approx=True): compute CFXs for the same sound test points.
# NOTE(review): this is presumably the exact MILP-based variant — confirm in UtilExp.
ours_robust_ces = util_exp.run_ours_robust()
# Print the evaluation metrics for these counterfactuals
util_exp.evaluate_ces(ours_robust_ces)
# Alias used by the later robustness check and the δ_max experiment
cfxs_robust = ours_robust_ces
print(len(cfxs_robust))

50it [00:23,  2.09it/s]


total computation time in s: 23.9276442527771
found: 1.0
average normalised L1: 0.018392337205926063
average normalised L0: 0.27084
average lof score: -0.88
counterfactual validity: 1.0
delta validity: 1.0
m2 validity: 1.0
50


In [18]:
# Double check: share of the APΔS counterfactuals whose output under the
# retrained Keras model is still >= 0.5
tot_robust_cfx = len(cfxs_robust_apas)
robust_cfx_after_retrain = 0

for cfx in cfxs_robust_apas:
    # The model is called on a single-row batch
    retrained_output = model_retrained(np.array(cfx.reshape(1, -1)))
    if retrained_output >= 0.5:
        robust_cfx_after_retrain += 1

robust_percentage = (robust_cfx_after_retrain/tot_robust_cfx)*100
print(f"Percentage CFXs robust after the retraing: {robust_percentage}%")

Percentage CFXs robust after the retraing: 100.0%


In [19]:
tot_robust_cfx = len(cfxs_robust)
robust_cfx_after_retrain = 0

for cfx in cfxs_robust:
    if model_retrained(np.array(cfx.reshape(1,-1))) >= 0.5:
        robust_cfx_after_retrain += 1

print(f"Percentage CFXs robust after the retraing: {(robust_cfx_after_retrain/tot_robust_cfx)*100}%") 

Percentage CFXs robust after the retraing: 100.0%


In [20]:
def estimate_robustness(model, delta, cfx, concretizations, use_biases=True, robustness=True):
    """
    Sampling-based check of the Δ-robustness of a counterfactual (CFX).

    For each of `concretizations` samples, every weight (and, optionally, every
    bias) of the layers after layer 0 is perturbed independently by a value
    drawn uniformly from [-delta, delta]. The perturbed network is evaluated
    on `cfx` with ReLU on all layers but the last and a sigmoid on the last.

    Parameters:
    -----------
        model:
            object exposing `model.layers`; each layer after index 0 must
            provide `get_weights()` returning `[weights, biases]`
        delta: float
            half-width of the uniform perturbation interval
        cfx: array-like
            the counterfactual input (a single point, e.g. of shape (1, d))
        concretizations: int
            number of perturbed networks to sample
        use_biases: bool
            whether biases are used (and perturbed) in the forward pass
        robustness: bool
            unused; kept for backward compatibility of the signature

    Returns:
    --------
        int
            1 if the output is >= 0.5 for every sampled concretization,
            0 as soon as one sample yields an output < 0.5
    """
    # Local generator seeded with 1: draws the same stream as seeding the
    # global NumPy RNG with 1, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(1)

    # Store initial weights of every layer after layer 0, keyed by layer index
    old_weights = {}
    for l in range(1, len(model.layers)):
        old_weights[l] = model.layers[l].get_weights()
    last_layer = len(old_weights)

    for _ in range(concretizations):
        activations = np.array(cfx)

        for l in range(1, last_layer + 1):
            layer_weights = old_weights[l][0]
            # Weights are perturbed before biases so the random stream is consumed in a fixed order
            perturbed_weights = layer_weights + rng.uniform(-delta, delta, layer_weights.shape)
            preactivated_res = np.dot(activations, perturbed_weights)

            if use_biases:
                layer_biases = old_weights[l][1]
                preactivated_res = preactivated_res + (layer_biases + rng.uniform(-delta, delta, layer_biases.shape))

            if l != last_layer:
                # relu on hidden layers
                activations = np.maximum(0.0, preactivated_res)
            else:
                # sigmoid on the output layer
                activations = 1/(1 + np.exp(-preactivated_res))

        # One sample classified below the 0.5 threshold is enough to reject
        if np.any(activations < 0.5):
            return 0

    return 1

def compute_delta_max_MILP(cfx, delta_init, verbose=False):
    """
    Search for the largest δ at which `cfx` is still certified robust.

    The certificate is the sigmoid of `util_exp.is_robust_custom_delta_new(cfx, δ)`
    (read from the notebook-level `util_exp`); `cfx` counts as robust at δ when
    that value is >= 0.5. δ is first doubled from `delta_init` until the check
    fails, then the last bracket is bisected until it is narrower than `delta_init`.

    Returns 0 if the check already fails at `delta_init`, otherwise the lower
    end of the final bracket (the largest δ found to pass).
    """
    def lower_bound(candidate):
        # Sigmoid of the value returned by the UtilExp robustness check
        return 1/(1 + np.exp(-util_exp.is_robust_custom_delta_new(cfx, candidate)))

    lower = lower_bound(delta_init)
    if lower < 0.5:
        return 0  # CFX not robust

    # Exponential phase: double δ until the check fails
    upper_delta = delta_init
    while lower >= 0.5:
        upper_delta = 2*upper_delta
        lower = lower_bound(upper_delta)
        if verbose:
            print(f'Testing δ={upper_delta}')
            print(f'Lower is: {lower}')

    # The previous (halved) value was the last one that passed
    passing_delta = upper_delta/2

    # Bisection phase on [passing_delta, upper_delta]
    while not abs(upper_delta-passing_delta) < delta_init:
        if verbose:
            print(f"\nInterval to test is: [{passing_delta}, {upper_delta}]")

        midpoint = (passing_delta+upper_delta)/2
        lower = lower_bound(midpoint)
        if verbose:
            print(f'Testing δ={midpoint}')
            print(f'Rate is: {lower}')

        if lower >= 0.5:
            passing_delta = midpoint
        else:
            upper_delta = midpoint

    return passing_delta



def compute_delta_max(model, cfx, delta_init, concretizations, use_biases=True, verbose=False):
    """
    Search for the largest δ at which `cfx` passes the sampling-based check.

    `cfx` counts as robust at δ when `estimate_robustness` returns 1 for it.
    δ is first doubled from `delta_init` until the check fails, then the last
    bracket is bisected until it is narrower than `delta_init`.

    Returns 0 if the check already fails at `delta_init`, otherwise the lower
    end of the final bracket (the largest δ found to pass).
    """
    def sampled_rate(candidate):
        # 1 when every sampled concretization keeps the CFX valid, 0 otherwise
        return estimate_robustness(model, candidate, cfx, concretizations, use_biases)

    rate = sampled_rate(delta_init)
    if rate != 1:
        return 0  # CFX not robust

    # Exponential phase: double δ until the check fails
    upper_delta = delta_init
    while rate == 1:
        upper_delta = 2*upper_delta
        rate = sampled_rate(upper_delta)
        if verbose:
            print(f'Testing δ={upper_delta}')
            print(f'Rate is: {rate}')

    # The previous (halved) value was the last one that passed
    passing_delta = upper_delta/2

    # Bisection phase on [passing_delta, upper_delta]
    while not abs(upper_delta-passing_delta) < delta_init:
        if verbose:
            print(f"\nInterval to test is: [{passing_delta}, {upper_delta}]")

        midpoint = (passing_delta+upper_delta)/2
        rate = sampled_rate(midpoint)
        if verbose:
            print(f'Testing δ={midpoint}')
            print(f'Rate is: {rate}')

        if rate == 1:
            passing_delta = midpoint
        else:
            upper_delta = midpoint

    return passing_delta
        

In [21]:
# Compare, for every counterfactual in `cfxs_robust`, the δ_max found by the
# sampling-based search with the δ_max found by the MILP-based search, and
# export the results to 'full_results_sba.csv'.
model = tf.keras.models.load_model('./models/sba.h5', compile=False)
alpha = 0.999
R = 0.995
# Number of sampled concretizations n = log_R(1 - alpha), truncated to an int,
# so that 1 - R^n is (approximately) alpha; the value is printed below.
concretizations = int(np.emath.logn(R, (1-alpha)))
# Starting δ of both searches; also the width at which the bisection stops
delta_init = 0.0001
# Hard-coded copy of the util_exp.delta_min value printed earlier in the notebook
delta_AAAI = 0.10821238236127151

with open("full_results_sba.csv", mode='w', newline='') as file:

    csv_writer = csv.writer(file)
    csv_writer.writerow(["CFX", "AAAI δ_max", "Wilks δ_max", "MILP δ_max", "Difference"])

    wilks = []
    milp = []
    print( f"{CYAN_COL}Condifence α =(1-R^n)={(1-R**concretizations)*100}%, R={R*100}%, Concretizations(n)={concretizations}{RESET_COL}")

    # start computing the new deltas with the approximation
    for cfx in cfxs_robust:

        delta_max_sampling = compute_delta_max(model, cfx.reshape(1,-1), delta_init, concretizations,verbose=False)
        delta_max_MILP = compute_delta_max_MILP(cfx, delta_init,verbose=False) 
        # Absolute gap between the sampling-based and the MILP-based δ_max
        difference = abs(delta_max_sampling-delta_max_MILP)
        wilks.append(delta_max_sampling)
        milp.append(delta_max_MILP)
        csv_writer.writerow([cfx,  delta_AAAI, delta_max_sampling, delta_max_MILP, difference])

        print('δ_max =', delta_max_sampling)
        print('δ improvement w.r.t original MILP\'s δ is',difference)
        print("______________________________________________________________________________________")



    csv_writer.writerow([" "])
    csv_writer.writerow(["Mean Wilks", "Mean MILP"])
    # Values must follow the header order: sampling (Wilks) mean first, MILP mean second
    csv_writer.writerow([np.mean(np.array(wilks)),np.mean(np.array(milp))])



[96mCondifence α =(1-R^n)=99.89995272421471%, R=99.5%, Concretizations(n)=1378[0m
δ_max = 0.28790000000000004
δ improvement w.r.t original MILP's δ is 0.17810000000000004
______________________________________________________________________________________
δ_max = 0.29594999999999994
δ improvement w.r.t original MILP's δ is 0.18674999999999992
______________________________________________________________________________________
δ_max = 0.3016
δ improvement w.r.t original MILP's δ is 0.19229999999999997
______________________________________________________________________________________
δ_max = 0.29699999999999993
δ improvement w.r.t original MILP's δ is 0.18749999999999994
______________________________________________________________________________________
δ_max = 0.28800000000000003
δ improvement w.r.t original MILP's δ is 0.17790000000000003
______________________________________________________________________________________
δ_max = 0.3068
δ improvement w.r.t original MILP'