In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score
from aix360.algorithms.rule_induction.ripper import RipperExplainer
import time

# Rule Induction using RIPPER

## Binary classification with a random 20% test set

We read the adult dataset from the UCI repository. The goal is to learn a rule describing people who earn more than 50K.

In [18]:
df = pd.read_csv('../../../rule_injection_embed/data/full_features/low.csv')


### Comlum names shall not contain whitespace or arithmetic operators (+, -, *, /)
We eventually output the rule set in TRXF format, where compound features are supported by parsing an expression string. So simple features like column names of a data frame must not contain these so that they are parsed as a single variable rather than an expression.

In [19]:
df.columns = df.columns.str.replace('-', '_')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24287 entries, 0 to 24286
Columns: 133 entries, glucose_t0_24_t0_22hours to label
dtypes: float64(133)
memory usage: 24.6 MB


In [20]:
TARGET_COLUMN = 'label'
print(df.head())

   glucose_t0_24_t0_22hours  glucose_t0_22_t0_20hours  \
0                      10.8                       8.8   
1                       NaN                      10.8   
2                       NaN                       NaN   
3                       NaN                      10.5   
4                       NaN                       NaN   

   glucose_t0_20_t0_18hours  glucose_t0_18_t0_16hours  \
0                       NaN                       NaN   
1                       NaN                       NaN   
2                       6.3                       NaN   
3                       9.4                       NaN   
4                      10.4                       NaN   

   glucose_t0_16_t0_14hours  glucose_t0_14_t0_12hours  \
0                       7.7                       NaN   
1                       7.2                       NaN   
2                       6.7                       NaN   
3                      12.1                       NaN   
4                       8.2  

In [21]:
positive = df[df[TARGET_COLUMN] == 1]
negative = df[df[TARGET_COLUMN] == 0]
sample_negative = negative.sample(n=positive.shape[0], random_state=42)
df = pd.concat([positive, sample_negative])
df = df.sample(frac=1, random_state=42)


### The rule induction trains for specific 'foreground' aka 'positive' value of the target label, which we set to '>50K' below. This means that the rule set will characterize the set of adults who earn more than 50K).

In [22]:
POS_VALUE = 1.0 # Setting positive value of the label for which we train
values_dist = df[TARGET_COLUMN].value_counts()
print('Positive value {} occurs {} times.'.format(POS_VALUE,values_dist[POS_VALUE]))
print(values_dist)
# This is distribution of the two values of the target label

Positive value 1.0 occurs 2378 times.
1.0    2378
0.0    2378
Name: label, dtype: int64


In [23]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
# Split the data set into 80% training and 20% test set
print('Training set:')
print(train[TARGET_COLUMN].value_counts())
print('Test set:')
print(test[TARGET_COLUMN].value_counts())

y_train = train[TARGET_COLUMN]
x_train = train.drop(columns=[TARGET_COLUMN])

y_test = test[TARGET_COLUMN]
x_test = test.drop(columns=[TARGET_COLUMN])
# Split data frames into features and label

Training set:
1.0    1908
0.0    1896
Name: label, dtype: int64
Test set:
0.0    482
1.0    470
Name: label, dtype: int64


### Instantiate the Ripper estimator and train it using default parameters

In [24]:
estimator = RipperExplainer()

start_time = time.time()
estimator.fit(x_train, y_train, target_label=POS_VALUE) # Run RIPPER rule induction
end_time = time.time()
print('Training time (sec): ' + str(end_time - start_time))

# compute performance metrics on test set
y_pred = estimator.predict(x_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, pos_label=POS_VALUE))
print('Recall:', recall_score(y_test, y_pred, pos_label=POS_VALUE))

Training time (sec): 7.487473249435425
Accuracy: 0.7006302521008403
Balanced accuracy: 0.7006224066390041
Precision: 0.6955602536997886
Recall: 0.7


### Extract the rule set

In [25]:
trxf_ruleset = estimator.explain()
print(str(trxf_ruleset))
print(type(trxf_ruleset))

if
([glucose_t0_2_t00hours <= 4.949999999999999]) v
([future_insulin_group2_t0+2_t0+4hours <= 0.0] ^ [glucose_t0_2_t00hours >= 6.6] ^ [past_insulin_group1_t0_16_t0_14hours >= 1.0] ^ [glucose_t0_2_t00hours <= 9.7]) v
([future_insulin_group2_t0+8_t0+10hours <= 0.0] ^ [glucose_t0_12_t0_10hours >= 9.05] ^ [future_insulin_group2_t0+14_t0+16hours <= 0.0] ^ [past_insulin_group4_t0_10_t0_8hours >= 2.0] ^ [glucose_t0_14_t0_12hours >= 2.52]) v
([future_insulin_group5_t0+0_t0+2hours >= 14.0]) v
([past_insulin_group4_t0_2_t00hours >= 4.0] ^ [glucose_t0_14_t0_12hours <= 9.9] ^ [glucose_t0_14_t0_12hours >= 6.8] ^ [past_insulin_group1_t0_22_t0_20hours <= 0.0]) v
([future_insulin_group3_t0+0_t0+2hours >= 4.0] ^ [glucose_t0_4_t0_2hours <= 9.5] ^ [past_insulin_group3_t0_2_t00hours >= 9.0]) v
([glucose_t0_22_t0_20hours <= 7.6] ^ [glucose_t0_10_t0_8hours <= 5.15]) v
([future_insulin_group2_t0+14_t0+16hours <= 0.0] ^ [glucose_t0_12_t0_10hours <= 5.1]) v
([glucose_t0_16_t0_14hours <= 7.366666666666667] ^ [g

## Export the resulting ruleset to a PMML file
### Construct a RuleSetClassifier object
A rule set by itself is merely a description of the given concept/target. Therefore, to use rule sets for a binary classification task, we must specify how to deal with potential overlaps between rule sets. For example, we could have learned 2 rule sets: one for >50K and another for <=50K. For instances where both rule sets are triggered, how do we classify that instance? There are 3 rule selection methods supported in PMML: First Hit, Weighted Sum, and Weighted Max. See here for more info: https://dmg.org/pmml/v4-4/RuleSet.html#xsdElement_RuleSelectionMethod. If we only learn a rule set for a single label, we can set a default label to which instances will be classified when the learned rule set does not trigger. 

In our case, since we only learn a rule set for a single label and use the default label for the rest, all 3 rule selection methods will have the same effect. However, if a rule selection method other than FirstHit is chosen, we need to compute the weights and confidence values for each rule.

In [28]:
import aix360.algorithms.rule_induction.trxf.classifier.ruleset_classifier as trxf_classifier
import aix360.algorithms.rule_induction.trxf.pmml_export as pmml
classifier = trxf_classifier.RuleSetClassifier([trxf_ruleset],
                                               rule_selection_method=trxf_classifier.RuleSelectionMethod.WEIGHTED_MAX,
                                               confidence_metric=trxf_classifier.ConfidenceMetric.LAPLACE,
                                               weight_metric=trxf_classifier.WeightMetric.CONFIDENCE,
                                               default_label='1.0')
classifier.update_rules_with_metrics(x_test, y_test)

ValueError: could not convert string to float: 'future_insulin_group2_t0'

### support the computations of precision and recall for each rule
A revision to support the model return precision and recall for each rule:

In [None]:
default_label = 1.0
classifier = trxf_classifier.RuleSetClassifier([trxf_ruleset],rule_selection_method=trxf_classifier.RuleSelectionMethod.WEIGHTED_MAX,confidence_metric=trxf_classifier.ConfidenceMetric.LAPLACE,weight_metric=trxf_classifier.WeightMetric.CONFIDENCE,default_label=default_label)
classifier.update_rules_with_metrics(x_test, y_test)

precision = []
recall = []
for rule in classifier.rules:
    precision.append(rule.confidence)
    recall.append(rule.recall)
    

### Export the TRXF classifier to a PMML document

In [10]:
reader = pmml.TrxfReader()
reader.load_data_dictionary(x_test)
serializer = pmml.NyokaSerializer()
exporter = pmml.PmmlExporter(reader, serializer)
with open("adult_weighted_max.pmml", "w") as text_file:
    text_file.write(exporter.export(classifier))