# Demo of the combination and simplification algorithm

In [13]:
import pandas as pd


## To use the library, just import the _RuleCOSIClassifier_ class from the **rulecosi** package

The algorithm works with several types of tree ensembles, using the **sklearn** implementations:
- Bagging Trees
- RandomForests
- Gradient Boosting Trees (original implementation)
- XGBoost
- LightGBM
- CatBoost

In [14]:
#from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

#from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

import numpy as np
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer

### Load a sample dataset and split the data

We use the cryotherapy dataset (loaded from `criotherapy.csv`). Its features are encoded as boolean indicator columns, and the target has two classes, `False` and `True`.

In [15]:

from notebooks.IPRules.read_datasets import read_dataset

# Fraction of the samples held out for testing (consumed by the train/test split cell).
test_size = 0.1

# Dataset file stem; the CSV is read from <path>/data/<name>.csv.
name = "criotherapy"
path = '../../..'
dataset_path_name = '{}/data/{}.csv'.format(path, name)

X, y, dataset, target_value_name, pandas_dataset = read_dataset(
    name,
    dataset_path_name,
)

# Preview the first rows of the loaded frame.
pandas_dataset.head()

Unnamed: 0,sex_1,sex_2,age_High,age_Low,age_Medium,age_VeryHigh,age_VeryLow,Time_High,Time_Low,Time_Medium,...,Number_of_Warts_10,Number_of_Warts_11,Number_of_Warts_12,Type_1,Type_2,Type_3,Area_Low,Area_VeryHigh,Area_VeryLow,target_value
0,True,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
1,True,False,False,True,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,True,True
2,True,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,False
3,True,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
4,True,False,False,False,False,True,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False


In [16]:
# Define dataset: hold out `test_size` of the samples, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=test_size,
    random_state=1,
)

# Both encoded frames share one column layout: the feature names followed by the target.
_encoded_columns = list(dataset['feature_names']) + [target_value_name]
encoded_train_pandas_dataset = pd.DataFrame(
    data=np.c_[X_train, y_train],
    columns=_encoded_columns,
)
encoded_test_pandas_dataset = pd.DataFrame(
    data=np.c_[X_test, y_test],
    columns=_encoded_columns,
)

# Report the shape of every array/frame produced above, one line each.
print('Sizes (without target):')
for _label, _shape in (
    ('Original size', dataset.data.shape),
    ('Train size', X_train.shape),
    ('Test size', X_test.shape),
    ('encoded_train_pandas_dataset size', encoded_train_pandas_dataset.shape),
    ('encoded_test_pandas_dataset size', encoded_test_pandas_dataset.shape),
):
    print(f'{_label} {_shape}')

Sizes (without target):
Original size (90, 30)
Train size (81, 30)
Test size (9, 30)
encoded_train_pandas_dataset size (81, 31)
encoded_test_pandas_dataset size (9, 31)


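### Simplifying a tree ensemble classifier

We create a tree ensemble instance (an XGBClassifier in the commented-out line below; the cells that follow use a RandomForestClassifier). The ensemble can be fitted beforehand, or it can be just instantiated and RuleCOSI will fit the ensemble first and then simplify it.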

In [17]:
#ens = XGBClassifier(random_state=1212)

This is done by instantiating a **RuleCOSIClassifier** class with the desired parameters, such as _n\_estimators_, _tree\_max\_depth_, _conf\_threshold_ and _cov\_threshold_.

In [18]:


from sklearn.ensemble import RandomForestClassifier
from rulecosi import RuleCOSIClassifier

# Base tree ensemble handed to RuleCOSI as `base_ensemble`.
ensemble = RandomForestClassifier(criterion="gini", n_estimators=100)

# One keyword per line so each setting is easy to tweak.
rc = RuleCOSIClassifier(
    base_ensemble=ensemble,
    metric='f1',
    n_estimators=100,
    tree_max_depth=3,
    conf_threshold=0.9,
    cov_threshold=0.0,
    random_state=1212,
    column_names=dataset.feature_names,  # feature names of the loaded dataset
)

In [19]:
%%time
import time
start_time = time.time()
rc.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print(f"Elapsed TOTAL TIME: {elapsed_time:.3f} seconds")


Elapsed TOTAL TIME: 1.610 seconds
CPU times: user 866 ms, sys: 22.7 ms, total: 889 ms
Wall time: 1.61 s


## Examining the simplified rules

The rules will be stored in the _simplified\_ruleset\__ attribute of the RuleCOSI object. The function _print\_rules_ prints the rules and their heuristics on the console. It can also return a string object or a pandas DataFrame object to be used for further analysis. Additionally, the number of decimal digits displayed for the heuristic values and the condition thresholds can be modified with the _heuristics\_digits_ and _condition\_digits_ parameters.

In [20]:
# Print the simplified rules: 4 decimal digits for heuristics, 1 for condition thresholds.
rc.simplified_ruleset_.print_rules(
    heuristics_digits=4,
    condition_digits=1,
)

cov 	conf 	supp 	samples 		rule
0.3704	1.0000	0.3704	[0,30]		r_1: (Time_High ≤ 0.5) ˄ (Time_VeryHigh ≤ 0.5) ˄ (age_VeryHigh ≤ 0.5) → [ True]
0.2346	1.0000	0.2346	[19,0]		r_2: (Number_of_Warts_2 ≤ 0.5) ˄ (Time_VeryHigh > 0.5) ˄ (age_VeryLow ≤ 0.5) → [False]
0.0741	1.0000	0.0741	[0,6]		r_3: (Number_of_Warts_6 ≤ 0.5) ˄ (Time_VeryHigh ≤ 0.5) ˄ (Type_3 ≤ 0.5) ˄ (age_Low ≤ 0.5) ˄ (age_VeryHigh ≤ 0.5) → [ True]
0.0247	1.0000	0.0247	[2,0]		r_4: (Time_VeryHigh > 0.5) ˄ (Type_3 > 0.5) → [False]
0.0123	1.0000	0.0123	[0,1]		r_5: (Time_VeryHigh ≤ 0.5) ˄ (Type_3 ≤ 0.5) ˄ (age_VeryLow > 0.5) → [ True]
0.0123	1.0000	0.0123	[1,0]		r_6: (Time_VeryHigh > 0.5) ˄ (Type_2 ≤ 0.5) ˄ (age_VeryLow ≤ 0.5) → [False]
0.2716	0.7273	0.1975	[16,6]		r_7: ( ) → [False]



In [21]:
# Same rules, returned as a pandas DataFrame so the notebook renders them as a table.
rc.simplified_ruleset_.print_rules(
    return_object='dataframe',
    heuristics_digits=4,
    condition_digits=1,
)

Unnamed: 0,cov,conf,supp,samples,#,A,y
0,0.3704,1.0,0.3704,"[0,30]",r_1,(Time_High ≤ 0.5) ˄ (Time_VeryHigh ≤ 0.5) ˄ (a...,[True]
1,0.2346,1.0,0.2346,"[19,0]",r_2,(Number_of_Warts_2 ≤ 0.5) ˄ (Time_VeryHigh > 0...,[False]
2,0.0741,1.0,0.0741,"[0,6]",r_3,(Number_of_Warts_6 ≤ 0.5) ˄ (Time_VeryHigh ≤ 0...,[True]
3,0.0247,1.0,0.0247,"[2,0]",r_4,(Time_VeryHigh > 0.5) ˄ (Type_3 > 0.5),[False]
4,0.0123,1.0,0.0123,"[0,1]",r_5,(Time_VeryHigh ≤ 0.5) ˄ (Type_3 ≤ 0.5) ˄ (age_...,[True]
5,0.0123,1.0,0.0123,"[1,0]",r_6,(Time_VeryHigh > 0.5) ˄ (Type_2 ≤ 0.5) ˄ (age_...,[False]
6,0.2716,0.7273,0.1975,"[16,6]",r_7,(),[False]


In [22]:
#print(len(rc.simplified_ruleset_))

## Checking the classification performance of the simplified rule-based classifier

In [23]:
# Helper for counting the rules extracted from the tree ensemble (the original rulesets).
def get_n_rules(rulesets):
    """Return the total number of rules across all rulesets.

    Parameters
    ----------
    rulesets : iterable of iterables
        Each element is iterated once; every item it yields counts as one rule.

    Returns
    -------
    int
        The number of items summed over every ruleset (0 when there are none).
    """
    return sum(1 for ruleset in rulesets for _ in ruleset)

In [24]:
# Size of the original ensemble: number of trees and of extracted rules.
print('== Original ensemble ==')
print(f'Number of trees: {rc.base_ensemble_.n_estimators} trees')
print(f'Number of rules: {get_n_rules(rc.original_rulesets_)} rules\n')

# The simplified ruleset, printed with the default digit settings.
print('== Simplified rules ==')
rc.simplified_ruleset_.print_rules()

# Predict with the simplified rules first, then with the fitted base ensemble.
y_pred = rc.predict(X_test)
# XGBoost ensembles get validate_features=False; every other ensemble uses the
# plain predict call. NOTE(review): presumably needed because X_test carries no
# feature names -- confirm.
_ensemble_predict_kwargs = (
    {'validate_features': False}
    if isinstance(rc.base_ensemble, XGBClassifier)
    else {}
)
y_pred_ens = rc.base_ensemble_.predict(X_test, **_ensemble_predict_kwargs)

# Combination statistics recorded by RuleCOSI during fitting.
print(f"Combinations: {rc.n_combinations_}")
print(f"Time: {rc.combination_time_}\n")

# Side-by-side classification reports on the held-out test set.
print('====== Classification performance ENSEMBLE ======')
print(classification_report(y_test, y_pred_ens, digits=4))
print('\n====== Classification performance of simplified rules ======')
print(classification_report(y_test, y_pred, digits=4))
print('\n')


== Original ensemble ==
Number of trees: 100 trees
Number of rules: 627 rules

== Simplified rules ==
cov 	conf 	supp 	samples 		rule
0.3704	1.0000	0.3704	[0,30]		r_1: (Time_High ≤ 0.500) ˄ (Time_VeryHigh ≤ 0.500) ˄ (age_VeryHigh ≤ 0.500) → [ True]
0.2346	1.0000	0.2346	[19,0]		r_2: (Number_of_Warts_2 ≤ 0.500) ˄ (Time_VeryHigh > 0.500) ˄ (age_VeryLow ≤ 0.500) → [False]
0.0741	1.0000	0.0741	[0,6]		r_3: (Number_of_Warts_6 ≤ 0.500) ˄ (Time_VeryHigh ≤ 0.500) ˄ (Type_3 ≤ 0.500) ˄ (age_Low ≤ 0.500) ˄ (age_VeryHigh ≤ 0.500) → [ True]
0.0247	1.0000	0.0247	[2,0]		r_4: (Time_VeryHigh > 0.500) ˄ (Type_3 > 0.500) → [False]
0.0123	1.0000	0.0123	[0,1]		r_5: (Time_VeryHigh ≤ 0.500) ˄ (Type_3 ≤ 0.500) ˄ (age_VeryLow > 0.500) → [ True]
0.0123	1.0000	0.0123	[1,0]		r_6: (Time_VeryHigh > 0.500) ˄ (Type_2 ≤ 0.500) ˄ (age_VeryLow ≤ 0.500) → [False]
0.2716	0.7273	0.1975	[16,6]		r_7: ( ) → [False]

Combinations: 1457
Time: 1.4322178363800049

              precision    recall  f1-score   support

       False 