## Reasoning over time

In [1]:
# Required third-party modules. Under Anaconda, install each of them with:
#   conda install -c conda-forge <package>
if False: # installation is disabled (skip if already installed); set the guard to True to run the commands below
    !pip install lark-parser
    !pip install linear-tree
    !pip install pydot
    !pip install pydotplus
    # additionally, download and install SWI Prolog from https://www.swi-prolog.org/download/stable
    # and make sure that its executable is added to the PATH

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# standard imports
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
import matplotlib.pyplot as plt
import pydotplus
from IPython.display import Image

# imported packages
from lineartree import LinearTreeClassifier

# local imports
sys.path.append('../src/') # local path
import reasonx
import dautils

### Example setup: optionally restrict to continuous-only attributes

In [3]:
# when True, the nominal and ordinal attribute lists are left empty and
# only a subset of the continuous attributes is used as predictors
continuous_only = False

### Read dataframe

In [4]:
# load the dataset ('?' denotes a missing value) and drop the columns that are not used
df = (
    pd.read_csv('../data/adult_continuous.csv', na_values='?')
    .drop(columns=[
        # unused columns
        'fnlwgt', 'education-num',
        # columns removed to simplify the dataframe
        'marital-status', 'native-country', 'occupation', 'relationship',
    ])
)
# strip the special characters - & ( ) from column names and from values
df.columns = df.columns.str.replace("[-&()]", "", regex=True)
df = df.replace('[-&()]', '', regex=True)
# impute missing values with the most frequent value of each column (needed for Decision Trees)
df = df.apply(lambda column: column.fillna(column.value_counts().index[0]))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   age           48842 non-null  int64 
 1   workclass     48842 non-null  object
 2   education     48842 non-null  object
 3   race          48842 non-null  object
 4   sex           48842 non-null  object
 5   capitalgain   48842 non-null  int64 
 6   capitalloss   48842 non-null  int64 
 7   hoursperweek  48842 non-null  int64 
 8   class         48842 non-null  object
dtypes: int64(4), object(5)
memory usage: 3.4+ MB


In [5]:
# preview the dataframe after column removal, character clean-up and imputation
df

Unnamed: 0,age,workclass,education,race,sex,capitalgain,capitalloss,hoursperweek,class
0,39,Stategov,Bachelors,White,Male,2174,0,40,<=50K
1,50,Selfempnotinc,Bachelors,White,Male,0,0,13,<=50K
2,38,Private,HSgrad,White,Male,0,0,40,<=50K
3,53,Private,11th,Black,Male,0,0,40,<=50K
4,28,Private,Bachelors,Black,Female,0,0,40,<=50K
...,...,...,...,...,...,...,...,...,...
48837,39,Private,Bachelors,White,Female,0,0,36,<=50K
48838,64,Private,HSgrad,Black,Male,0,0,40,<=50K
48839,38,Private,Bachelors,White,Male,0,0,50,<=50K
48840,44,Private,Bachelors,AsianPacIslander,Male,5455,0,40,<=50K


In [6]:
# partition of the predictive attributes into nominal, ordinal and continuous ones
if continuous_only:
    # continuous-only setting: no nominal/ordinal attributes
    nominal_atts, ordinal_atts = [], []
    continuous_atts = ['age', 'capitalgain', 'hoursperweek']
else:
    nominal_atts = ['race', 'sex', 'workclass']
    ordinal_atts = ['education']
    continuous_atts = ['age', 'capitalgain', 'capitalloss', 'hoursperweek']
# name of the class attribute
target = 'class'
# all predictive attributes, in nominal-ordinal-continuous order
pred_atts = [*nominal_atts, *ordinal_atts, *continuous_atts]

In [7]:
# explicit integer codes for the ordinal attribute and for the class attribute:
# - education levels are listed in increasing order and numbered from 1
# - class: 0=negatives, 1=positives
decode = {
    'education': dict(enumerate(
        [
            'Preschool', '1st4th', '5th6th', '7th8th', '9th', '10th', '11th', '12th',
            'HSgrad', 'Somecollege', 'Assocvoc', 'Assocacdm', 'Bachelors',
            'Masters', 'Profschool', 'Doctorate',
        ],
        start=1,
    )),
    'class': {0: '<=50K', 1: '>50K'},
}
# separator used in the names of the one-hot encoded columns
prefix_sep = "_"
# nominal attributes are encoded as categories (one-hot), ordinal attributes
# and the target as integers according to the codes given above
df_code = dautils.Encode(
    nominal=nominal_atts,
    ordinal=ordinal_atts + [target],
    decode=decode,
    onehot=True,
    prefix_sep=prefix_sep,
)
df_encoded_onehot = df_code.fit_transform(df)
df_encoded_onehot.head()

Unnamed: 0,age,workclass_Federalgov,workclass_Localgov,workclass_Neverworked,workclass_Private,workclass_Selfempinc,workclass_Selfempnotinc,workclass_Stategov,workclass_Withoutpay,education,...,race_AsianPacIslander,race_Black,race_Other,race_White,sex_Female,sex_Male,capitalgain,capitalloss,hoursperweek,class
0,39,0,0,0,0,0,0,1,0,13,...,0,0,0,1,0,1,2174,0,40,0
1,50,0,0,0,0,0,1,0,0,13,...,0,0,0,1,0,1,0,0,13,0
2,38,0,0,0,1,0,0,0,0,9,...,0,0,0,1,0,1,0,0,40,0
3,53,0,0,0,1,0,0,0,0,7,...,0,1,0,0,0,1,0,0,40,0
4,28,0,0,0,1,0,0,0,0,13,...,0,1,0,0,1,0,0,0,40,0


In [8]:
# names of the predictive attributes after encoding
encoded_pred_atts = df_code.encoded_atts(pred_atts)
# split predictive attributes and target
X, y = df_encoded_onehot[encoded_pred_atts], df_encoded_onehot[target]
#X1, _, y1, _ = train_test_split(X, y, test_size=0.3, random_state=42)
#X2, _, y2, _ = train_test_split(X, y, test_size=0.3, random_state=24)

# split the data set into two halves to simulate different data distributions
X_sub1, X_sub2, y_sub1, y_sub2 = train_test_split(X, y, test_size=0.5, random_state=42)
# within each half, retain 30% of the rows as a test set
X1, XT1, y1, yt1 = train_test_split(X_sub1, y_sub1, test_size=0.3, random_state=42)
X2, XT2, y2, yt2 = train_test_split(X_sub2, y_sub2, test_size=0.3, random_state=42)
# train one decision tree per half; random_state is fixed because scikit-learn
# permutes the features at each split, so ties between equally good splits
# could otherwise produce different trees on different runs
clf1 = DecisionTreeClassifier(max_depth=3, random_state=42)
clf1.fit(X1, y1)
clf2 = DecisionTreeClassifier(max_depth=3, random_state=42)
clf2.fit(X2, y2)

In [9]:
# visualize the decision tree
#dot_data = tree.export_graphviz(clf1, out_file=None, 
#                                feature_names=encoded_pred_atts, # attributes names
#                                class_names=df_code.decode[target], # class labels
#                                filled=True, rounded=True)  
#graph = pydotplus.graph_from_dot_data(dot_data)  
#Image(graph.create_png())

In [10]:
# visualize the decision tree
#dot_data = tree.export_graphviz(clf2, out_file=None, 
#                                feature_names=encoded_pred_atts, # attributes names
#                                class_names=df_code.decode[target], # class labels
#                                filled=True, rounded=True)  
#graph = pydotplus.graph_from_dot_data(dot_data)  
#Image(graph.create_png())

In [11]:
# equality constraints on all features
def constraints(att_, pred_atts, first="F0", second="F1"):
    """Build equality constraints between the attributes of two instances.

    For every attribute in `pred_atts` that is not excluded, the result
    contains one term of the form ``<first>.<attribute> = <second>.<attribute>``.

    Parameters
    ----------
    att_ : iterable of str
        Names to exclude. An attribute is skipped when its name occurs as a
        substring of any element of `att_` (e.g. 'sex_Female' excludes 'sex').
    pred_atts : sequence of str
        Names of the attributes to constrain, in output order.
    first, second : str, optional
        Names of the two instances to relate; default to "F0" and "F1".

    Returns
    -------
    str
        The terms joined by ", ", or the empty string when every attribute
        is excluded or `pred_atts` is empty.
    """
    # keep the attributes whose name is not contained in any excluded name
    kept = [name for name in pred_atts
            if not any(name in excluded for excluded in att_)]
    return ", ".join(f"{first}.{name} = {second}.{name}" for name in kept)

# no attribute is excluded: every predictive attribute of F0 must equal that of F1
con = constraints([], pred_atts)
print(con)

F0.race = F1.race, F0.sex = F1.sex, F0.workclass = F1.workclass, F0.education = F1.education, F0.age = F1.age, F0.capitalgain = F1.capitalgain, F0.capitalloss = F1.capitalloss, F0.hoursperweek = F1.hoursperweek


### Experiment

In [12]:
# Model2CLP
r = reasonx.ReasonX(pred_atts, target, df_code)
# register both decision trees
# NOTE(review): presumably clf1 becomes model 0 and clf2 model 1 (registration order) - confirm against reasonx
r.model(clf1)
r.model(clf2)

# different models, but compared over the SAME data instance (row 1 of the test set XT1)
# factual decision rule
r.instance('F0', features=XT1.iloc[1:2], label=yt1.iloc[1], model = 0)
r.instance('F1', features=XT1.iloc[1:2], label=yt1.iloc[1], model = 1)
r.solveopt()

print("\ncombined \n")
# additionally require F0 and F1 to agree on every predictive attribute, then solve again
r.constraint(con)
r.solveopt()

F0.race = AsianPacIslander, F0.sex = Male, F0.workclass = Private, F0.education = HSgrad, F0.age = 40, F0.capitalgain = 0, F0.capitalloss = 0, F0.hoursperweek = 40
F1.race = AsianPacIslander, F1.sex = Male, F1.workclass = Private, F1.education = HSgrad, F1.age = 40, F1.capitalgain = 0, F1.capitalloss = 0, F1.hoursperweek = 40
---
Answer constraint: F0.race=AsianPacIslander,F0.sex=Male,F0.workclass=Private,F0.education=9.0,F0.age=40.0,F0.capitalgain=0.0,F0.capitalloss=0.0,F0.hoursperweek=40.0,F1.race=AsianPacIslander,F1.sex=Male,F1.workclass=Private,F1.education=9.0,F1.age=40.0,F1.capitalgain=0.0,F1.capitalloss=0.0,F1.hoursperweek=40.0
Rule satisfied by F0: IF F0.capitalgain<=5095.5,F0.education<=12.5,F0.age>29.5 THEN <=50K [0.8095]
Rule satisfied by F1: IF F1.capitalgain<=5119.0,F1.education<=12.5,F1.age>33.5 THEN <=50K [0.7907]

combined 

---
Answer constraint: F0.race=AsianPacIslander,F0.sex=Male,F0.workclass=Private,F0.education=9.0,F0.age=40.0,F0.capitalgain=0.0,F0.capitalloss=0.0

In [13]:
# ask for CE (presumably: contrastive explanations) one after the other to avoid combinatorial effect
# including numbers
# NOTE(review): the first part does not reset r, so it builds on the instances F0/F1
# and on the equality constraint declared in the previous cell

print("contrastive rules instance 0 \n")

# contrastive instance for model 0: same binary class encoding, opposite label
r.instance('CF0', label=1-yt1.iloc[1], model = 0)
# contrastive decision rules (the first returned value is printed as the number of rules)
a, b, c, d, e = r.solveopt(evaluation = 1)
print("\n NUMBER OF RULES ", a)

# contrastive instances: minimize the l1norm expression between F0 and CF0
a, b, c, d, e = r.solveopt(minimize='l1norm(F0, CF0)', evaluation=1, eps =0.01)
print("\n", a, b, c, d, e )

print("\n contrastive rules instance 1 \n")

# start again (keeping the registered models) and declare the factual instances anew
r.reset(keep_model=True)
r.instance('F0', features=XT1.iloc[1:1+1], label=yt1.iloc[1], model = 0)
r.instance('F1', features=XT1.iloc[1:1+1], label=yt1.iloc[1], model = 1)
# contrastive instance for model 1, with the opposite label
r.instance('CF1', label=1-yt1.iloc[1], model = 1)

# contrastive decision rules
a, b, c, d, e = r.solveopt(evaluation = 1)
print("\n NUMBER OF RULES ", a)

# contrastive instances: minimize the l1norm expression between F1 and CF1
a, b, c, d, e = r.solveopt(minimize='l1norm(F1, CF1)', evaluation=1, eps = 0.01)
print("\n", a, b, c, d, e )

contrastive rules instance 0 

---
Answer constraint: F0.race=AsianPacIslander,F0.sex=Male,F0.workclass=Private,F0.education=9.0,F0.age=40.0,F0.capitalgain=0.0,F0.capitalloss=0.0,F0.hoursperweek=40.0,F1.race=AsianPacIslander,F1.sex=Male,F1.workclass=Private,F1.education=9.0,F1.age=40.0,F1.capitalgain=0.0,F1.capitalloss=0.0,F1.hoursperweek=40.0,F1.race_Black=F0.race_Black,F1.race_Other=F0.race_Other,F1.race_White=F0.race_White,F1.race_AmerIndianEskimo=F0.race_AmerIndianEskimo,F1.sex_Female=F0.sex_Female,F1.workclass_Selfempinc=F0.workclass_Selfempinc,F1.workclass_Federalgov=F0.workclass_Federalgov,F1.workclass_Selfempnotinc=F0.workclass_Selfempnotinc,F1.workclass_Stategov=F0.workclass_Stategov,F1.workclass_Neverworked=F0.workclass_Neverworked,F1.workclass_Localgov=F0.workclass_Localgov,F1.workclass_Withoutpay=F0.workclass_Withoutpay,CF0.capitalgain<=5095.5,CF0.education>12.5,CF0.age>30.5
Rule satisfied by F0: IF F0.capitalgain<=5095.5,F0.education<=12.5,F0.age>29.5 THEN <=50K [0.8095]
R

In [15]:
# COMBINATORIAL
# contrastive rules are first computed for model 0 (F0/CF0), then each of them
# is imposed on the contrastive instance of model 1 (CF1)

# values returned by solveopt:
# a - number of solutions
# b - distance (only for opt)
# c - number of premises in rule
# d - number of constraints in "answer constraints"
# e - dimensionality check (only for opt)

r.reset(keep_model=True)
r.instance('F0', features=XT1.iloc[1:1+1], label=yt1.iloc[1], model = 0)
r.instance('CF0', label=1-yt1.iloc[1], model = 0)

print("optimization")
# with return_results=1 the results themselves are returned as an additional first value
answer_cons, a, b, c, d, e = r.solveopt(evaluation=1, return_results=1, eps=0.01)
#print("\n", answer_cons, a, b, c, d, e )

# push output into system and query for second instance

# F1 is declared without features: its values are tied to those of F0 by the constraint below
r.instance('F1', label=yt1.iloc[1], model = 1)
r.instance('CF1', label=1-yt1.iloc[1], model = 1)
# instance 0 should have the same values as instance 1
r.constraint(con)

# NOTE(review): range(3) assumes that answer_cons holds at least 3 results - confirm
for i in range(3):
    print("CONTRASTIVE RULE ", i)
    # constraints found for the contrastive instance of model 0 (CF0) must also hold for that of model 1 (CF1)
    constraints_ = answer_cons[i][1].replace("CF0", "CF1")
    r.constraint(constraints_)
    print(constraints_)
    # query: solutions restricted (projected) to CF1
    answer_cons_, a_, b_, c_, d_, e_ = r.solveopt(evaluation=1, return_results=1, project=["CF1"])
    print(a_)

    # same query, minimizing the l1norm expression between F1 and CF1
    answer_cons_, a_, b_, c_, d_, e_ = r.solveopt(minimize='l1norm(F1, CF1)', evaluation=1, return_results=1, project=["CF1"], eps=0.01)
    print(a_,b_)

    # presumably removes the constraint added for this rule before the next iteration
    r.retract(constraints_)

F0.race = AsianPacIslander, F0.sex = Male, F0.workclass = Private, F0.education = HSgrad, F0.age = 40, F0.capitalgain = 0, F0.capitalloss = 0, F0.hoursperweek = 40
optimization
---
Answer constraint: F0.race=AsianPacIslander,F0.sex=Male,F0.workclass=Private,F0.education=9.0,F0.age=40.0,F0.capitalgain=0.0,F0.capitalloss=0.0,F0.hoursperweek=40.0,CF0.capitalgain<=5095.5,CF0.education>12.5,CF0.age>30.5
Rule satisfied by F0: IF F0.capitalgain<=5095.5,F0.education<=12.5,F0.age>29.5 THEN <=50K [0.8095]
Rule satisfied by CF0: IF CF0.capitalgain<=5095.5,CF0.education>12.5,CF0.age>30.5 THEN >50K [0.5087]
---
Answer constraint: F0.race=AsianPacIslander,F0.sex=Male,F0.workclass=Private,F0.education=9.0,F0.age=40.0,F0.capitalgain=0.0,F0.capitalloss=0.0,F0.hoursperweek=40.0,CF0.capitalgain>5095.5,CF0.capitalgain<=6457.5
Rule satisfied by F0: IF F0.capitalgain<=5095.5,F0.education<=12.5,F0.age>29.5 THEN <=50K [0.8095]
Rule satisfied by CF0: IF CF0.capitalgain>5095.5,CF0.capitalgain<=6457.5 THEN >50K 

In [18]:
# COMBINATORIAL
# contrastive rules are first computed for model 1 (F1/CF1), then each of them
# is imposed on the contrastive instance of model 0 (CF0)

# values returned by solveopt:
# a - number of solutions
# b - distance (only for opt)
# c - number of premises in rule
# d - number of constraints in "answer constraints"
# e - dimensionality check (only for opt)

r.reset(keep_model=True)
r.instance('F1', features=XT1.iloc[1:1+1], label=yt1.iloc[1], model = 1)
r.instance('CF1', label=1-yt1.iloc[1], model = 1)

print("optimization")
# with return_results=1 the results themselves are returned as an additional first value
answer_cons, a, b, c, d, e = r.solveopt(evaluation=1, return_results=1)
#print("\n", answer_cons, a, b, c, d, e )

# push output into system and query for second instance

# F0 is declared without features: its values are tied to those of F1 by the constraint below
r.instance('F0', label=yt1.iloc[1], model = 0)
r.instance('CF0', label=1-yt1.iloc[1], model = 0)
# instance 0 should have the same values as instance 1
r.constraint(con)

# NOTE(review): range(3) assumes that answer_cons holds at least 3 results - confirm
for i in range(3):
    print("CONTRASTIVE RULE ", i)
    # constraints found for the contrastive instance of model 1 (CF1) must also hold for that of model 0 (CF0)
    constraints_ = answer_cons[i][1].replace("CF1", "CF0")
    r.constraint(constraints_)
    print(constraints_)
    # query: solutions restricted (projected) to CF0
    answer_cons_, a_, b_, c_, d_, e_ = r.solveopt(evaluation=1, return_results=1,project=["CF0"])
    print(a_)

    # same query, minimizing the l1norm expression between F0 and CF0
    answer_cons_, a_, b_, c_, d_, e_ = r.solveopt(minimize='l1norm(F0, CF0)', evaluation=1, return_results=1, project=["CF0"], eps = 0.01)
    print(a_,b_)

    # presumably removes the constraint added for this rule before the next iteration
    r.retract(constraints_)

F1.race = AsianPacIslander, F1.sex = Male, F1.workclass = Private, F1.education = HSgrad, F1.age = 40, F1.capitalgain = 0, F1.capitalloss = 0, F1.hoursperweek = 40
optimization
---
Answer constraint: F1.race=AsianPacIslander,F1.sex=Male,F1.workclass=Private,F1.education=9.0,F1.age=40.0,F1.capitalgain=0.0,F1.capitalloss=0.0,F1.hoursperweek=40.0,CF1.capitalgain<=5119.0,CF1.education>12.5,CF1.age>29.5
Rule satisfied by F1: IF F1.capitalgain<=5119.0,F1.education<=12.5,F1.age>33.5 THEN <=50K [0.7907]
Rule satisfied by CF1: IF CF1.capitalgain<=5119.0,CF1.education>12.5,CF1.age>29.5 THEN >50K [0.5125]
---
Answer constraint: F1.race=AsianPacIslander,F1.sex=Male,F1.workclass=Private,F1.education=9.0,F1.age=40.0,F1.capitalgain=0.0,F1.capitalloss=0.0,F1.hoursperweek=40.0,CF1.capitalgain>5119.0,CF1.capitalgain<=5316.5
Rule satisfied by F1: IF F1.capitalgain<=5119.0,F1.education<=12.5,F1.age>33.5 THEN <=50K [0.7907]
Rule satisfied by CF1: IF CF1.capitalgain>5119.0,CF1.capitalgain<=5316.5 THEN >50K 