In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score
from aix360.algorithms.rule_induction.rbm.boolean_rule_cg import BooleanRuleCG as BRCG
from aix360.algorithms.rbm import FeatureBinarizer
import time

# Rule Induction using BRCG

## Binary classification with a random 20% test set

We read the adult dataset from the UCI repository. The goal is to learn a rule describing people who earn more than 50K.

In [28]:
df = pd.read_csv('../../../rule_injection_embed/data/full_features/low.csv')

### Comlum names shall not contain whitespace or arithmetic operators (+, -, *, /)
We eventually output the rule set in TRXF format, where compound features are supported by parsing an expression string. So simple features like column names of a data frame must not contain these so that they are parsed as a single variable rather than an expression.

In [29]:
df.columns = df.columns.str.replace('-', '_')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24287 entries, 0 to 24286
Columns: 133 entries, glucose_t0_24_t0_22hours to label
dtypes: float64(133)
memory usage: 24.6 MB


In [30]:
TARGET_COLUMN = 'label'
print(df.head())

   glucose_t0_24_t0_22hours  glucose_t0_22_t0_20hours  \
0                      10.8                       8.8   
1                       NaN                      10.8   
2                       NaN                       NaN   
3                       NaN                      10.5   
4                       NaN                       NaN   

   glucose_t0_20_t0_18hours  glucose_t0_18_t0_16hours  \
0                       NaN                       NaN   
1                       NaN                       NaN   
2                       6.3                       NaN   
3                       9.4                       NaN   
4                      10.4                       NaN   

   glucose_t0_16_t0_14hours  glucose_t0_14_t0_12hours  \
0                       7.7                       NaN   
1                       7.2                       NaN   
2                       6.7                       NaN   
3                      12.1                       NaN   
4                       8.2  

### The rule induction trains for specific 'foreground' aka 'positive' value of the target label, which we set to '>50K' below. This means that the rule set will characterize the set of adults who earn more than 50K).

In [31]:
POS_VALUE = 1.0 # Setting positive value of the label for which we train
values_dist = df[TARGET_COLUMN].value_counts()
print('Positive value {} occurs {} times.'.format(POS_VALUE,values_dist[POS_VALUE]))
print(values_dist)
# This is distribution of the two values of the target label

Positive value 1.0 occurs 2378 times.
0.0    21909
1.0     2378
Name: label, dtype: int64


Balance the data

In [17]:
# choolse all positive data 
df_positive = df[df[TARGET_COLUMN] == POS_VALUE]
df_negative = df[df[TARGET_COLUMN] != POS_VALUE]
# random choose negative data
df_negative = df_negative.sample(n = len(df_positive))
df = pd.concat([df_positive, df_negative])
df = df.sample(frac=1).reset_index(drop=True)
print(df[TARGET_COLUMN].value_counts())
df.head()

0.0    2378
1.0    2378
Name: label, dtype: int64


Unnamed: 0,glucose_t0_24_t0_22hours,glucose_t0_22_t0_20hours,glucose_t0_20_t0_18hours,glucose_t0_18_t0_16hours,glucose_t0_16_t0_14hours,glucose_t0_14_t0_12hours,glucose_t0_12_t0_10hours,glucose_t0_10_t0_8hours,glucose_t0_8_t0_6hours,glucose_t0_6_t0_4hours,...,future_insulin_group5_t0+6_t0+8hours,future_insulin_group5_t0+8_t0+10hours,future_insulin_group5_t0+10_t0+12hours,future_insulin_group5_t0+12_t0+14hours,future_insulin_group5_t0+14_t0+16hours,future_insulin_group5_t0+16_t0+18hours,future_insulin_group5_t0+18_t0+20hours,future_insulin_group5_t0+20_t0+22hours,future_insulin_group5_t0+22_t0+24hours,label
0,,11.0,10.7,,,11.4,,12.9,14.7,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,5.0,,,7.9,,,18.8,13.3,13.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,,5.7,,8.3,10.3,,,11.4,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,13.6,,,8.7,,7.6,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,9.9,11.3,,6.5,,5.1,,,9.8,,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0


### Train-test split and encode labels as integers

In [32]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
# Split the data set into 80% training and 20% test set
print('Training set:')
print(train[TARGET_COLUMN].value_counts())
print('Test set:')
print(test[TARGET_COLUMN].value_counts())

y_train = train[TARGET_COLUMN].apply(lambda x: 1 if x == POS_VALUE else 0)
x_train = train.drop(columns=[TARGET_COLUMN])

y_test = test[TARGET_COLUMN].apply(lambda x: 1 if x == POS_VALUE else 0)
x_test = test.drop(columns=[TARGET_COLUMN])
# Split data frames into features and label

Training set:
0.0    17538
1.0     1891
Name: label, dtype: int64
Test set:
0.0    4371
1.0     487
Name: label, dtype: int64


### Instantiate the BRCG explainer and train it using default parameters

In [33]:
fb = FeatureBinarizer(negations=True)
X_train_fb = fb.fit_transform(x_train)
x_test_fb = fb.transform(x_test)

explainer = BRCG(silent=True)
start_time = time.time()
explainer.fit(X_train_fb, y_train)
end_time = time.time()
print('Training time (sec): ' + str(end_time - start_time))

# compute performance metrics on test set
y_pred = explainer.predict(x_test_fb)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, pos_label=1))
print('Recall:', recall_score(y_test, y_pred, pos_label=1))

  A[colName] = data[c].map(maps[c]).astype(int)
  A[(str(c), '==', str(maps[c].index[0]))] = 1 - A[colName]
  A[colName] = data[c].map(maps[c]).astype(int)
  A[(str(c), '==', str(maps[c].index[0]))] = 1 - A[colName]


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

### Extract the rule set

In [11]:
trxf_ruleset = explainer.explain()
print(str(trxf_ruleset))

if
false
then
1


## Export the resulting ruleset to a PMML file
### Construct a RuleSetClassifier object
A rule set by itself is merely a description of the given concept/target. Therefore, to use rule sets for a binary classification task, we must specify how to deal with potential overlaps between rule sets. For example, we could have learned 2 rule sets: one for >50K and another for <=50K. For instances where both rule sets are triggered, how do we classify that instance? There are 3 rule selection methods supported in PMML: First Hit, Weighted Sum, and Weighted Max. See here for more info: https://dmg.org/pmml/v4-4/RuleSet.html#xsdElement_RuleSelectionMethod. If we only learn a rule set for a single label, we can set a default label to which instances will be classified when the learned rule set does not trigger. 

In our case, since we only learn a rule set for a single label and use the default label for the rest, all 3 rule selection methods will have the same effect. However, if a rule selection method other than FirstHit is chosen, we need to compute the weights and confidence values for each rule.

In [11]:
import aix360.algorithms.rule_induction.trxf.classifier.ruleset_classifier as trxf_classifier
import aix360.algorithms.rule_induction.trxf.pmml_export as pmml
classifier = trxf_classifier.RuleSetClassifier([trxf_ruleset],
                                               rule_selection_method=trxf_classifier.RuleSelectionMethod.WEIGHTED_MAX,
                                               confidence_metric=trxf_classifier.ConfidenceMetric.LAPLACE,
                                               weight_metric=trxf_classifier.WeightMetric.CONFIDENCE,
                                               default_label='<=50K')
classifier.update_rules_with_metrics(x_test, y_test)

### Export the TRXF classifier to a PMML document

In [12]:
reader = pmml.TrxfReader()
reader.load_data_dictionary(x_test)
serializer = pmml.NyokaSerializer()
exporter = pmml.PmmlExporter(reader, serializer)
with open("adult_weighted_max_brcg.pmml", "w") as text_file:
    text_file.write(exporter.export(classifier))

In [41]:
# first row of x_test
ele = x_test.iloc[100:102]
ele

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
28197,40.0,Private,287008.0,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024.0,0.0,55.0,Germany
13925,33.0,Private,93056.0,7th-8th,4.0,Divorced,Handlers-cleaners,Own-child,White,Male,0.0,0.0,40.0,United-States


In [42]:
classifier.predict(ele)

1