# Demo of the combination and simplification algorithm

In [173]:
import pandas as pd


## To use the library, just import the _RuleCOSIClassifier_ class from the **rulecosi** package

The algorithm works with several types of tree ensembles, and it uses the **sklearn** implementations.
- Bagging Trees
- RandomForests
- Gradient Boosting Trees (original implementation)
- XGBoost
- Light GBM
- CatBoost

In [174]:
#from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

#from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

import numpy as np
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer

### Load a sample dataset and split the data

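We use the connect-4 dataset, loaded from a CSV file. Each of the 42 original features (F1–F42) is one-hot encoded into three boolean columns (suffixes `_b`, `_o` and `_x`), giving 126 feature columns. There are two classes, stored in the boolean `Class` column (False and True).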

In [175]:

from notebooks.IPRules.read_datasets import read_dataset

# Settings for this run: which dataset to load and the fraction of samples
# held out for testing by the split cell further down.
name = "connect-4"
test_size = 0.1

# The CSV file is looked up at <path>/data/<name>.csv.
path = '../../..'
dataset_path_name = f'{path}/data/{name}.csv'

# read_dataset hands back the features, the target, a dataset object exposing
# .data / .target / .feature_names (presumably an sklearn Bunch — confirm in
# read_datasets), the name of the target column and the raw pandas frame.
X, y, dataset, target_value_name, pandas_dataset = read_dataset(name, dataset_path_name)

# Preview of the first rows of the loaded frame.
pandas_dataset.head()

Unnamed: 0,F1_b,F1_o,F1_x,F2_b,F2_o,F2_x,F3_b,F3_o,F3_x,F4_b,...,F40_b,F40_o,F40_x,F41_b,F41_o,F41_x,F42_b,F42_o,F42_x,Class
0,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
1,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
2,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
3,True,False,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True
4,False,True,False,True,False,False,True,False,False,True,...,True,False,False,True,False,False,True,False,False,True


In [176]:
# Split the encoded data into train and test partitions; the fixed
# random_state keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=test_size,
    random_state=1,
)


def _with_target_column(features, target):
    """Return a DataFrame holding the feature columns followed by the target column."""
    column_labels = list(dataset['feature_names']) + [target_value_name]
    return pd.DataFrame(data=np.c_[features, target], columns=column_labels)


encoded_train_pandas_dataset = _with_target_column(X_train, y_train)
encoded_test_pandas_dataset = _with_target_column(X_test, y_test)

# Report the shape of every partition; the two DataFrames carry one extra
# column because the target is appended to the features.
print('Sizes (without target):')
print('\n'.join(
    f'{label} size {shape}'
    for label, shape in (
        ('Original', dataset.data.shape),
        ('Train', X_train.shape),
        ('Test', X_test.shape),
        ('encoded_train_pandas_dataset', encoded_train_pandas_dataset.shape),
        ('encoded_test_pandas_dataset', encoded_test_pandas_dataset.shape),
    )
))

Sizes (without target):
Original size (67557, 126)
Train size (60801, 126)
Test size (6756, 126)
encoded_train_pandas_dataset size (60801, 127)
encoded_test_pandas_dataset size (6756, 127)


### Simplifying an XGBoost classifier

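We create an XGBClassifier instance. The ensemble can be fitted beforehand, or it can be just instantiated, in which case RuleCOSI will fit the ensemble first and then simplify it. (In this run the XGBClassifier line is commented out and a RandomForestClassifier is used as the base ensemble instead.)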

In [177]:
#ens = XGBClassifier(random_state=1212)

This is done by instantiating a **RuleCOSIClassifier** object with the desired parameters, _n\_estimator_, _tree\_max\_depth_, _conf\_threshold_ and _min\_samples_.

In [178]:


from sklearn.ensemble import RandomForestClassifier
from rulecosi import RuleCOSIClassifier

# A random forest is stochastic (bootstrap samples, feature sub-sampling).
# Without a fixed seed every run grows different trees, so the extracted rules
# and the scores reported below could not be reproduced, even though the
# train/test split itself is seeded.
RANDOM_STATE = 1212

# Base tree ensemble that RuleCOSI will fit and then simplify.
ensemble = RandomForestClassifier(
    n_estimators=500,
    max_depth=6,
    criterion="gini",
    random_state=RANDOM_STATE,
)

# column_names makes the printed rules use the dataset's feature names.
# The same seed is passed to RuleCOSI so the whole pipeline is repeatable.
rc = RuleCOSIClassifier(
    base_ensemble=ensemble,
    column_names=dataset.feature_names,
    random_state=RANDOM_STATE,
)

In [179]:
%%time
import time
start_time = time.time()
rc.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print(f"Elapsed TOTAL TIME: {elapsed_time:.3f} seconds")


Elapsed TOTAL TIME: 1.641 seconds
CPU times: user 1.52 s, sys: 36.9 ms, total: 1.55 s
Wall time: 1.64 s


## Examining the simplified rules

The rules are stored in the _simplified\_ruleset\__ attribute of the RuleCOSI object. The function _print\_rules_ prints the rules and their heuristics on the console. It can also return a string object or a pandas DataFrame object to be used for further analysis. Additionally, the number of decimal digits displayed for the heuristics values and the condition thresholds can be modified with the _heuristics\_digits_ and _condition\_digits_ parameters.

In [180]:
# Print every simplified rule together with its heuristics (cov, conf, supp,
# samples); heuristics are shown with 4 decimals, condition thresholds with 1.
rc.simplified_ruleset_.print_rules(heuristics_digits=4, condition_digits=1)

cov 	conf 	supp 	samples 		rule
0.2947	0.8162	0.2406	[3293,14626]	r_1: (F15_o ≤ 0.5) ˄ (F19_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F37_x ≤ 0.5) ˄ (F8_o ≤ 0.5) → [ True]
0.1088	0.6896	0.0751	[2054,4564]	r_2: (F15_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F37_x ≤ 0.5) → [ True]
0.1061	0.7022	0.0745	[1922,4532]	r_3: (F14_o ≤ 0.5) ˄ (F19_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_o ≤ 0.5) ˄ (F8_o ≤ 0.5) → [ True]
0.0898	0.7224	0.0649	[1515,3943]	r_4: (F15_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F37_x ≤ 0.5) ˄ (F8_o ≤ 0.5) → [ True]
0.0412	0.7764	0.0320	[560,1944]	r_5: (F19_o ≤ 0.5) ˄ (F1_x ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_o ≤ 0.5) ˄ (F37_x ≤ 0.5) ˄ (F8_o ≤ 0.5) → [ True]
0.0299	0.5642	0.0168	[791,1024]	r_6: (F1_x ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_o ≤ 0.5) ˄ (F37_x ≤ 0.5) ˄ (F8_o ≤ 0.5) → [ True]
0.0177	0.8386	0.0149	[174,904]	r_7: (F20_x > 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_x ≤ 0.5) → [ True]
0.0170	0.6854	0.0116	[325,708]	r_8: (F19_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_o ≤ 0.5) ˄ (F38_o > 0.5

In [181]:
# Same rule listing, but returned as a pandas DataFrame (one row per rule) so
# the notebook renders it as a table and it can be reused for further analysis.
rc.simplified_ruleset_.print_rules(return_object='dataframe',heuristics_digits=4, condition_digits=1)

Unnamed: 0,cov,conf,supp,samples,#,A,y
0,0.2947,0.8162,0.2406,"[3293,14626]",r_1,(F15_o ≤ 0.5) ˄ (F19_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ...,[True]
1,0.1088,0.6896,0.0751,"[2054,4564]",r_2,(F15_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ...,[True]
2,0.1061,0.7022,0.0745,"[1922,4532]",r_3,(F14_o ≤ 0.5) ˄ (F19_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ...,[True]
3,0.0898,0.7224,0.0649,"[1515,3943]",r_4,(F15_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ...,[True]
4,0.0412,0.7764,0.032,"[560,1944]",r_5,(F19_o ≤ 0.5) ˄ (F1_x ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄...,[True]
5,0.0299,0.5642,0.0168,"[791,1024]",r_6,(F1_x ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_o ≤ 0.5) ˄...,[True]
6,0.0177,0.8386,0.0149,"[174,904]",r_7,(F20_x > 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_x ≤ 0.5),[True]
7,0.017,0.6854,0.0116,"[325,708]",r_8,(F19_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_o ≤ 0.5) ...,[True]
8,0.012,0.6676,0.008,"[242,486]",r_9,(F19_o ≤ 0.5) ˄ (F21_o ≤ 0.5) ˄ (F26_o ≤ 0.5) ...,[True]
9,0.0128,0.6018,0.0077,"[311,470]",r_10,(F14_o ≤ 0.5) ˄ (F19_o ≤ 0.5) ˄ (F20_o ≤ 0.5) ...,[True]


In [182]:
# Number of rules in the simplified rule set.
len(rc.simplified_ruleset_.rules)

40

## Checking the classification performance of the simplified rule-based classifier

In [183]:

print('== Simplified rules ==')

# Predictions of the simplified rule-based classifier on the held-out data.
y_pred = rc.predict(X_test)

# Predictions of the fitted base ensemble, used as the reference model.
# XGBoost ensembles are called with validate_features=False; every other
# ensemble type uses the plain predict call.
if not isinstance(rc.base_ensemble, XGBClassifier):
    y_pred_ens = rc.base_ensemble_.predict(X_test)
else:
    y_pred_ens = rc.base_ensemble_.predict(X_test, validate_features=False)

# Statistics recorded by RuleCOSI while fitting.
print(f"Combinations: {rc.n_combinations_}")
print(f"Time: {rc.combination_time_}\n")

# Per-class precision/recall/F1 for the original ensemble...
print('====== Classification performance ENSEMBLE ======')
print(classification_report(y_test, y_pred_ens, digits=4))

# ...and for the simplified rules, so the two can be compared side by side.
print('\n====== Classification performance of simplified rules ======')
print(classification_report(y_test, y_pred, digits=4))
print('\n')


== Simplified rules ==
Combinations: 376
Time: 1.4246468544006348

              precision    recall  f1-score   support

       False     0.9333    0.0061    0.0122      2281
        True     0.6637    0.9998    0.7978      4475

    accuracy                         0.6643      6756
   macro avg     0.7985    0.5030    0.4050      6756
weighted avg     0.7547    0.6643    0.5326      6756


              precision    recall  f1-score   support

       False     0.6600    0.4169    0.5110      2281
        True     0.7498    0.8905    0.8141      4475

    accuracy                         0.7306      6756
   macro avg     0.7049    0.6537    0.6626      6756
weighted avg     0.7194    0.7306    0.7118      6756



