JELIA CONFERENCE 2023 paper

In [None]:
# required modules (under Anaconda use: > conda install -c conda-forge <package>)
if False: # (skip if already installed)
    !pip install lark-parser
    !pip install linear-tree
    !pip install pydot
    !pip install pydotplus
    # download and install SWI Prolog from https://www.swi-prolog.org/download/stable
    # be sure that the executable is added to the PATH

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# standard imports
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
import matplotlib.pyplot as plt
import pydotplus
from IPython.display import Image

# imported packages
from lineartree import LinearTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# local imports
sys.path.append('../src/') # local path
import reasonx
import dautils

### Example setup: continuous-only attributes (optional)

In [None]:
# switch read when the attribute lists are built:
# True keeps only (a subset of) the continuous attributes, False also uses the nominal and ordinal ones
continuous_only = False

### Read dataframe

In [None]:
# columns removed up front: the unused ones, plus those dropped to simplify the dataframe
dropped_columns = ['fnlwgt', 'education-num',
                   'marital-status', 'native-country', 'occupation', 'relationship']
# special characters stripped from both column names and values
special_chars = "[-&()]"

# read dataset ('?' is parsed as a missing value) and drop the columns listed above
df = pd.read_csv('../data/adult_continuous.csv', na_values='?').drop(columns=dropped_columns)
df.columns = df.columns.str.replace(special_chars, "", regex=True)
df = df.replace(special_chars, '', regex=True)
# missing values imputation with mode (needed for Decision Trees)
df = df.apply(lambda col: col.fillna(col.value_counts().index[0]))
df.info()

In [None]:
# class attribute
target = 'class'
# nominal-ordinal-continuous partition of predictive attributes,
# selected by the continuous_only switch
if continuous_only:
    nominal_atts, ordinal_atts = [], []
    continuous_atts = ['age', 'capitalgain', 'hoursperweek']
else:
    nominal_atts = ['race', 'sex', 'workclass']
    ordinal_atts = ['education']
    continuous_atts = ['age', 'capitalgain', 'capitalloss', 'hoursperweek']
# predictive atts (nominal first, then ordinal, then continuous)
pred_atts = nominal_atts + ordinal_atts + continuous_atts

In [None]:
# education levels listed from lowest to highest; their 1-based position is the ordinal code
education_levels = [
    'Preschool', '1st4th', '5th6th', '7th8th', '9th', '10th', '11th', '12th',
    'HSgrad', 'Somecollege', 'Assocvoc', 'Assocacdm', 'Bachelors',
    'Masters', 'Profschool', 'Doctorate',
]
# forced encoding of the ordinal attribute (consistent with the order)
# and of the class attribute (0=negatives, 1=positives)
decode = {
    'education': dict(enumerate(education_levels, start=1)),
    'class': {0: '<=50K', 1: '>50K'},
}
# separator for one-hot encoding
prefix_sep = "_"
# encode nominal (as categories), ordinal+target (as int), passing the encoding of ordinal+target
df_code = dautils.Encode(
    nominal=nominal_atts,
    ordinal=ordinal_atts + [target],
    decode=decode,
    onehot=True,
    prefix_sep=prefix_sep,
)
df_encoded_onehot = df_code.fit_transform(df)
df_encoded_onehot.head()

In [None]:
# encode-decoding dictionaries
# displayed as a (encode, decode) tuple to inspect the mappings held by the encoder
df_code.encode, df_code.decode

In [None]:
# names of the predictive attributes after encoding
encoded_pred_atts = df_code.encoded_atts(pred_atts)
# predictive part and target part of the encoded dataframe
X = df_encoded_onehot[encoded_pred_atts]
y = df_encoded_onehot[target]
# two 70% training samples drawn with different seeds; the held-out 30% is discarded
X1, _, y1, _ = train_test_split(X, y, random_state=42, test_size=0.3)
X2, _, y2, _ = train_test_split(X, y, random_state=24, test_size=0.3)

In [None]:
# ADDED

# pick data instance: first row of the first training sample,
# kept as a one-row DataFrame, together with its label
features = X1.iloc[:1]
label = y1.iloc[0]

# whole training sample as a plain array
data_numpy = X1.to_numpy()
print(data_numpy)

In [None]:
# ADDED

# generate neighborhood
from neighborhood import naive_neighborhood_instance

# parameters
# NOTE(review): presumably N is the number of generated neighbours and 42 a random seed;
# the meaning of C is not visible here -- confirm against neighborhood.py
N = 5000
C = 15

# the helper receives the instance as an array and the training data transposed
instance_array = features.to_numpy()
neigh = naive_neighborhood_instance(instance_array, C, N, data_numpy.T, 42)

In [None]:
# train a decision tree (ML classifier)
# random_state is fixed: scikit-learn permutes the features at every split, so ties between
# equally good splits may otherwise be resolved differently from run to run
clf1 = DecisionTreeClassifier(max_depth=3, random_state=0)
clf1.fit(X1, y1)
#clf2 = DecisionTreeClassifier(max_depth=3)
#clf2.fit(X2, y2)

# train an XGBoost classifier (only this model is trained in this cell;
# RandomForestClassifier and MLPClassifier are imported but not used here)
xgb = XGBClassifier(random_state = 0)
xgb.fit(X1, y1)

In [None]:
# ADDED

# label the neighborhood with both predictors
neigh_label = clf1.predict(neigh)
neigh_label_xgb = xgb.predict(neigh)

# split neigh (labels of the XGB classifier) into a train and a test part
neigh_train, neigh_test, neigh_label_train, neigh_label_test = train_test_split(neigh, neigh_label_xgb, test_size=0.3, random_state=42)

# ratio to check class balance: number of positive labels vs. number of labelled points
print("orig predictor", np.sum(neigh_label), len(neigh_label))
print(np.sum(neigh_label_xgb), len(neigh_label_xgb))

# learn base model (DT) on full neighborhood
# random_state is fixed on every surrogate so that the trees, the fidelity score and the
# rules extracted from them are reproducible across runs
neigh_clf = DecisionTreeClassifier(max_depth=3, random_state=0)
neigh_clf.fit(neigh, neigh_label)
neigh_clf_xgb = DecisionTreeClassifier(max_depth=3, random_state=0)
neigh_clf_xgb.fit(neigh, neigh_label_xgb)

# surrogate learnt on the train part only, so that it can be scored on unseen neighbours
neigh_clf_train_xgb = DecisionTreeClassifier(max_depth=3, random_state=0)
neigh_clf_train_xgb.fit(neigh_train, neigh_label_train)

# train = test QUICK AND DIRTY : print(neigh_clf.score(neigh, neigh_label), neigh_clf_xgb.score(neigh, neigh_label_xgb))

# FIDELITY: accuracy of the surrogate w.r.t. the XGB labels on the held-out neighbours
print(neigh_clf_train_xgb.score(neigh_test, neigh_label_test))

In [None]:
# visualize the decision tree
# export_graphviz documents class_names as a list of names in ascending order of the classes.
# Building it from the classes the tree was actually fitted on (instead of passing the decoding
# dictionary itself) keeps the names right even if the neighborhood contains a single class.
class_decode = df_code.decode[target]
class_names = [class_decode[c] for c in neigh_clf_xgb.classes_]
# NOTE(review): the tree drawn here was fitted on the full neighborhood, whereas the model handed
# to ReasonX afterwards is neigh_clf_train_xgb -- confirm which of the two should be shown
dot_data = tree.export_graphviz(neigh_clf_xgb, out_file=None,
                                feature_names=encoded_pred_atts, # attributes names
                                class_names=class_names, # class labels
                                filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

### Decision tree paths to CLP rules 

In [None]:
# Model2CLP
# build a ReasonX object from the predictive attributes, the target and the encoder,
# then register the surrogate tree trained on the train part of the neighborhood
r = reasonx.ReasonX(pred_atts, target, df_code)
r.model(neigh_clf_train_xgb)

In [None]:
# Why was my credit application rejected?
# ANSWER 1

# factual rule on an instance
# the instance named 'F' is the first row of X1, the same row picked earlier as `features`
# NOTE(review): the label passed is the true label y1.iloc[0], not the prediction of the
# registered tree for this row -- confirm that this is what ReasonX expects
r.instance('F', features=X1.iloc[0:1], label=y1.iloc[0])
r.solveopt(verbose=2)

In [None]:
# Why was my credit application rejected?
# ANSWER 2

# counter-factual rules with min conf
# the requested label is the opposite of the label of the factual instance (classes are 0/1)
ce_label = 1 - y1.iloc[0]
# same query repeated with a decreasing minimum confidence
for iteration, minconf in enumerate((0.9, 0.8, 0.7, 0.6)):
    print(f"iter {iteration}")
    r.instance('CE', label=ce_label, minconf=minconf)
    ce_output = r.solveopt(verbose=2)
# keep the return value of the last call as the cell output, as when it was the last statement
ce_output

In [None]:
# ADD BACKGROUND KNOWLEDGE (BGK)
# on the age: the constraint ties the age of CE to the age of F (CE.age = F.age)

r.constraint("CE.age = F.age")
r.solveopt(verbose = 2)

In [None]:
# CLOSEST CE
# solve again, this time minimizing the expression 'l1norm(F, CE)' and projecting on CE only

r.solveopt(minimize='l1norm(F, CE)', project=['CE'], verbose = 2)

In [None]:
# UNDER-SPECIFIED INFORMATION
# the equality on the age of F is retracted and replaced by an upper bound,
# then the closest CE is computed again, projecting on CE and on F.age
# NOTE(review): the literal 19.0 is hard-coded; presumably it is the age of the explained
# row X1.iloc[0:1] -- confirm, and update it if the split or the chosen row changes

r.retract("F.age=19.0")
r.constraint("F.age<=19.0")
r.solveopt(minimize='l1norm(F, CE)', project=["CE", "F.age"], verbose = 2)

In [None]:
# CHECK data point
# display the explained row (first row of X1) to compare it with the rules above

X1.iloc[0:1]