# Imports

In [1]:
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier
from sys import path
import pandas as pd
path.insert(1, "rule_induction/") # Just so imports work
from diagnoser import Settings, discover, Target
seed = 42

# Dataset setup

In [2]:
# Load dataset
dataset = load_dataset("inria-soda/tabular-benchmark", data_files = "clf_cat/default-of-credit-card-clients.csv")
dataset = dataset['train'].to_pandas()

# Create a "test" dataset for evaluation - just 10%
test = dataset.sample(
    n=round(dataset.shape[0]*0.1),
    random_state=seed
)
train = dataset.drop(index=test.index)
print("Original dataset size: %s" % (dataset.shape[0]))
print("Train set size: %s" % (train.shape[0]))
print("Test set size: %s" % (test.shape[0]))

Original dataset size: 13272
Train set size: 11945
Test set size: 1327


# Model fitting/training

In [3]:
# Extract independent / dependent variables
y_col = 'y'
X_cols = [col for col in dataset.columns if col[0] == "x" ]

In [4]:
%%time
# Fit model on train set
model = RandomForestClassifier(
    random_state = seed
)
model.fit(train[X_cols], train[y_col])

CPU times: total: 1.42 s
Wall time: 2.41 s


# Evaluation

In [5]:
# Predict on test
test['pred_label'] = model.predict(test[X_cols])
# Check confusion matrix
print("Confusion matrix: ")
pd.crosstab(
    test[y_col],
    test['pred_label'],
    rownames = ['Actual'], 
    colnames = ['Predicted'],
    margins=True
)
    

Confusion matrix: 


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,528,142,670
1,239,418,657
All,767,560,1327


# Rule Induction Analysis

## Setup

In [6]:
# Create TP/FP/TN/FN cols for analysis
test['tp'] = test['fp'] = test['tn'] = test['fn'] = 0
test.loc[(test[y_col] == 1) & (test['pred_label']) == 1, 'tp'] = 1
test.loc[(test[y_col] == 0) & (test['pred_label']) == 1, 'fp'] = 1
test.loc[(test[y_col] == 0) & (test['pred_label']) == 0, 'tn'] = 1
test.loc[(test[y_col] == 1) & (test['pred_label']) == 0, 'fn'] = 1

## Default config

In [7]:
# Create the "relevant_attributes" mapping dictionary.
# Mapping from relevant attributes to consider to their type (D (discrete), I (Int), C (Continuous))
# According to the dataset description https://www.openml.org/search?type=data&sort=runs&id=45036&status=active, only x2 is nominal, the rest are numeric
relevant_attributes = {
    feature: "C" for feature in X_cols if feature != "x2"
}
relevant_attributes.update({"x2": "I"})

In [8]:
%%time
# Check FP cases - in what conditions does our model commonly fail to predict false?
fp_result = discover(
    df=test, # The dataframe with predictions
    target=Target("fp", 1), # The name of the prediction result column we want to analyze
    relevant_attributes=relevant_attributes
)
print("fp_result: ")
fp_result

fp_result: 
CPU times: total: 10.1 s
Wall time: 16 s


x14>478965.0
x13>487049.0
x20<=1010.0

In [9]:
%%time
# Check FP cases - in what conditions does our model commonly fail to predict true?
fn_result = discover(
    df=test,
    target=Target("fn", 1), 
    relevant_attributes=relevant_attributes,
)
print("fn_result: ")
fn_result

fn_result: 
CPU times: total: 15.2 s
Wall time: 24.9 s


x6<=0.0
x22>105015.0
x23>182778.0

In [10]:
%%time
# Check TP cases - in what conditions does our model commonly excel?
tp_result = discover(
    df=test,
    target=Target("tp", 1), 
    relevant_attributes=relevant_attributes,
)
print("tp_result: ")
tp_result

tp_result: 
CPU times: total: 12.2 s
Wall time: 20.6 s


x10>2.0
x11>3.0
x9>3.0

In [11]:
%%time
# Check TN cases - in what conditions does our model commonly excel?
tn_result = discover(
    df=test,
    target=Target("tn", 1), 
    relevant_attributes=relevant_attributes,
)
print("tn_result: ")
tn_result

tn_result: 
CPU times: total: 9.17 s
Wall time: 13.9 s


x12>4834.0
x14>4748.0
x10>2.0

## Manual config

In [12]:
# Change defaults
config = Settings(
    beam_width=50, # 10 # Beam width controls the width of the beam during beam search. A large beam leads to more precision, but is slower
    num_rules=5, # 3 # Number of final rules to display
    num_bins=10, # 5 # Number of final rules to display
    # binning_method # Binning method, currently only support EQFreq and EQWidth
    minimum_relative_coverage=1, # 0.1 # Determines the minimum relative amount of coverage (percentage of rows) each subgroup should have. minimum_relative_coverage=1.5 means that each inferred subgroups has to have cover at least 1.5% of the total rows in the dataset
    disjunctions=False, # Indicates whether the algorithm should also find disjunctions
    # target_coverage # 
    all_rules=False, # The percentage of target that we want to cover (only applicable if Settings.disjunctions=True)
)

In [13]:
%%time
fp_result = discover(
    df=test,
    target=Target("fp", 1), 
    relevant_attributes=relevant_attributes,
    config=config
)
print("fp_result w config 1: ")
fp_result

fp_result w config 1: 
CPU times: total: 56.9 s
Wall time: 1min 43s


x20<=1413.0
x20<=1007.0
x1<=240000.0
x20<=2547.0
x20<=3358.0

In [14]:
%%time
# Use disjunctions and output all rules.
config = Settings(
    beam_width=50,
    num_bins=10,
    minimum_relative_coverage=1, 
    disjunctions=True, 
    all_rules=True,
)

fp_result = discover(
    df=test,
    target=Target("fp", 1), 
    relevant_attributes=relevant_attributes,
    config=config
)

print("fp_result w config 2: ")
fp_result.print()

fp_result w config 2: 
########################################
(x19<=702.0) | (x20<=1413.0)
########################################
Subgroup Discovery Result

Found [1m1[0m subgroups
[1mDataset[0m
Target: fp=1
# Rows:	1327
# Cols:	27
% Target in dataset 10.7%
[1mSubgroup: x19<=702.0[0m
% of subgroup in population (Full Dataset):	28.11% (373 rows)
Precision: P(fp=1 | x19<=702.0) = 16.62%
Recall: P(x19<=702.0 | fp=1) = 43.66%
Subgroup Discovery Result

Found [1m1[0m subgroups
[1mDataset[0m
Target: fp=1
# Rows:	1327
# Cols:	27
% Target in dataset 10.7%
[1mSubgroup: x20<=1413.0[0m
% of subgroup in population (Full Dataset):	45.06% (598 rows)
Precision: P(fp=1 | x20<=1413.0) = 15.89%
Recall: P(x20<=1413.0 | fp=1) = 66.9%
########################################
(x20<=1007.0) | (x19<=702.0)
########################################
Subgroup Discovery Result

Found [1m1[0m subgroups
[1mDataset[0m
Target: fp=1
# Rows:	1327
# Cols:	27
% Target in dataset 10.7%
[1mSubgroup: x20