# Demo of the combination and simplification algorithm

In [385]:
import pandas as pd


## To use the library, just import the _RuleCOSIClassifier_ class from the **rulecosi** package

The algorithm works with several types of tree ensembles and it uses the **sklearn** implementations.
- Bagging Trees
- RandomForests
- Gradient Boosting Trees (original implementation)
- XGBoost
- Light GBM
- CatBoost

In [386]:
#from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

#from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

import numpy as np
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer

### Load a sample dataset and split the data

We use the connect-4 dataset, read from a CSV file. Its features are one-hot encoded boolean columns (`F1_b`, `F1_o`, `F1_x`, ...). There are two classes, `False` and `True`, stored in the `Class` column.

In [387]:

from notebooks.IPRules.read_datasets import read_dataset
# Experiment configuration: dataset identifier and the fraction of samples
# held out for testing.
name = "connect-4"
test_size = 0.3

# The CSV path is relative to the notebook's working directory.
path = '../../..'
dataset_path_name = '/'.join((path, 'data', name + '.csv'))

# read_dataset returns five values; `dataset` and `target_value_name` are
# consumed by the train/test split cell.
(X,
 y,
 dataset,
 target_value_name,
 pandas_dataset) = read_dataset(name, dataset_path_name)

# Preview the first rows of the loaded frame.
pandas_dataset.head()

Unnamed: 0,F1_b,F1_o,F1_x,F2_b,F2_o,F2_x,F3_b,F3_o,F3_x,F4_b,...,F40_b,F40_o,F40_x,F41_b,F41_o,F41_x,F42_b,F42_o,F42_x,Class
0,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
1,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
2,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
3,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
4,False,True,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True


In [388]:
# Split the data, holding out `test_size` of the samples (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=test_size,
    random_state=1,
)

# Rebuild labelled DataFrames (features + target as the last column) for
# both partitions; they share the same column labels.
encoded_columns = list(dataset['feature_names']) + [target_value_name]
encoded_train_pandas_dataset = pd.DataFrame(data=np.c_[X_train, y_train], columns=encoded_columns)
encoded_test_pandas_dataset = pd.DataFrame(data=np.c_[X_test, y_test], columns=encoded_columns)

# Report the shape of every array/frame produced above.
print('Sizes (without target):')
for label, shape in (
    ('Original', dataset.data.shape),
    ('Train', X_train.shape),
    ('Test', X_test.shape),
    ('encoded_train_pandas_dataset', encoded_train_pandas_dataset.shape),
    ('encoded_test_pandas_dataset', encoded_test_pandas_dataset.shape),
):
    print(f'{label} size {shape}')

Sizes (without target):
Original size (67557, 126)
Train size (47289, 126)
Test size (20268, 126)
encoded_train_pandas_dataset size (47289, 127)
encoded_test_pandas_dataset size (20268, 127)


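### Simplifying a tree ensemble classifier

We create the tree ensemble instance (a RandomForestClassifier in this demo; the XGBClassifier alternative is left commented out). The ensemble can be fitted beforehand, or it can be just instantiated and RuleCOSI will fit the ensemble first and then simplify it.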

In [389]:
#ens = XGBClassifier(random_state=1212)

This is done by instantiating a **RuleCOSIClassifier** class with the desired parameters, e.g. _n\_estimators_, _tree\_max\_depth_, _conf\_threshold_ and _cov\_threshold_.

In [390]:


from sklearn.ensemble import RandomForestClassifier
from rulecosi import RuleCOSIClassifier

# Base ensemble to be simplified. The forest gets an explicit seed so that
# its randomised training (and therefore the rules extracted from it) is
# repeatable after a kernel restart.
ensemble = RandomForestClassifier(n_estimators=100, criterion="gini",
                                  random_state=1212)

# RuleCOSI classifier wrapping the ensemble. `column_names` is given the
# dataset's feature names (presumably so printed rules show them instead of
# column indices).
rc = RuleCOSIClassifier(base_ensemble=ensemble,
                        metric='f1', n_estimators=100, tree_max_depth=3,
                        conf_threshold=0.9, cov_threshold=0.0,
                        random_state=1212, column_names=dataset.feature_names)

In [391]:
%%time
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the system clock and can jump if
# that clock is adjusted while the fit is running.
start_time = time.perf_counter()
rc.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed TOTAL TIME: {elapsed_time:.3f} seconds")


Elapsed TOTAL TIME: 4.438 seconds
CPU times: user 4.16 s, sys: 81.7 ms, total: 4.24 s
Wall time: 4.44 s


## Examining the simplified rules

The rules will be stored in the _simplified\_ruleset\__ attribute of the RuleCOSI object. The function _print\_rules_ prints the rules and their heuristics on the console. It can also return a string object or a pandas DataFrame object to be used for further analysis. Additionally, the decimal digits displayed on the heuristics values and the condition thresholds can be modified with the _heuristics\_digits_ and the _condition\_digits_ parameters.

In [392]:
# Print the simplified rules to the console, showing heuristics with 4 decimal
# digits and condition thresholds with 1 decimal digit.
rc.simplified_ruleset_.print_rules(heuristics_digits=4, condition_digits=1)

cov 	conf 	supp 	samples 		rule
0.0035	0.9939	0.0035	[1,164]		r_1: (F13_o > 0.5) ˄ (F21_x > 0.5) ˄ (F7_o > 0.5) → [ True]
0.9965	0.6571	0.6548	[16158,30966]	r_2: ( ) → [ True]



In [393]:
# Same rules, requested as a DataFrame (return_object='dataframe') so they can
# be used for further analysis; the cell displays the returned object.
rc.simplified_ruleset_.print_rules(return_object='dataframe',heuristics_digits=4, condition_digits=1)

Unnamed: 0,cov,conf,supp,samples,#,A,y
0,0.0035,0.9939,0.0035,"[1,164]",r_1,(F13_o > 0.5) ˄ (F21_x > 0.5) ˄ (F7_o > 0.5),[True]
1,0.9965,0.6571,0.6548,"[16158,30966]",r_2,(),[True]


In [394]:
#print(len(rc.simplified_ruleset_))

## Checking the classification performance of the simplified rule-based classifier

In [395]:
def get_n_rules(rulesets):
    """Count the rules contained in a collection of rulesets.

    Used to report how many rules were extracted from the tree ensemble
    (the original rulesets).

    Parameters
    ----------
    rulesets : iterable of iterables
        Every element is iterated, and each item it yields counts as one rule.

    Returns
    -------
    int
        Total number of items over all rulesets (0 when `rulesets` is empty).
    """
    return sum(1 for single_ruleset in rulesets for _ in single_ruleset)

In [396]:
# Size of the original (fitted) ensemble.
print('== Original ensemble ==')
print(f'Number of trees: {rc.base_ensemble_.n_estimators} trees')
print(f'Number of rules: {get_n_rules(rc.original_rulesets_)} rules\n')

print('== Simplified rules ==')
rc.simplified_ruleset_.print_rules()

# Predictions of the simplified rule-based classifier.
y_pred = rc.predict(X_test)

# Predictions of the fitted base ensemble. The type check is made on the
# fitted estimator (base_ensemble_), i.e. the same object predict() is
# called on, rather than on the constructor argument.
if isinstance(rc.base_ensemble_, XGBClassifier):
    y_pred_ens = rc.base_ensemble_.predict(X_test, validate_features=False)
else:
    y_pred_ens = rc.base_ensemble_.predict(X_test)

print("Combinations: {}".format(rc.n_combinations_))
print("Time: {}\n".format(rc.combination_time_))

# zero_division=0 states explicitly that precision/F-score are reported as 0
# for a class that is never predicted, instead of relying on the default that
# yields the same value but emits a warning for every affected average.
print('====== Classification performance ENSEMBLE ======')
print(classification_report(y_test, y_pred_ens, digits=4, zero_division=0))
print('\n====== Classification performance of simplified rules ======')
print(classification_report(y_test, y_pred, digits=4, zero_division=0))
print('\n')


== Original ensemble ==
Number of trees: 100 trees
Number of rules: 800 rules

== Simplified rules ==
cov 	conf 	supp 	samples 		rule
0.0035	0.9939	0.0035	[1,164]		r_1: (F13_o > 0.500) ˄ (F21_x > 0.500) ˄ (F7_o > 0.500) → [ True]
0.9965	0.6571	0.6548	[16158,30966]	r_2: ( ) → [ True]

Combinations: 380
Time: 2.200525999069214

              precision    recall  f1-score   support

       False     1.0000    0.0004    0.0009      6925
        True     0.6584    1.0000    0.7940     13343

    accuracy                         0.6585     20268
   macro avg     0.8292    0.5002    0.3975     20268
weighted avg     0.7751    0.6585    0.5230     20268


              precision    recall  f1-score   support

       False     0.0000    0.0000    0.0000      6925
        True     0.6583    1.0000    0.7940     13343

    accuracy                         0.6583     20268
   macro avg     0.3292    0.5000    0.3970     20268
weighted avg     0.4334    0.6583    0.5227     20268





  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
