In [1]:
from IPython.display import Image

# Demo of the combination and simplification algorithm

In [2]:
import pandas as pd

## To use the library, import the _RuleCOSIClassifier_ class from the **rulecosi** package

In [3]:
from rulecosi import RuleCOSIClassifier

The algorithm works with several types of tree ensembles, using their **sklearn**-compatible implementations:
- Bagging Trees
- RandomForests
- Gradient Boosting Trees (original implementation)
- XGBoost
- LightGBM
- CatBoost

In [4]:
#from catboost import CatBoostClassifier
#from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

#from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer

### Load a sample dataset and split the data

We use the Wisconsin diagnostic breast cancer dataset. There are two classes, malignant (0) and benign (1).

In [5]:
# Read the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data/wisconsin.csv')

In [6]:
# Preview the first rows to check the columns and the label encoding.
data.head()

Unnamed: 0,ClumpThickness,CellSize,CellShape,MarginalAdhesion,EpithelialSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,1
1,5,4,4,5,7,10,3,2,1,1
2,3,1,1,1,2,2,3,1,1,1
3,6,8,8,1,3,4,3,7,1,1
4,4,1,1,3,2,1,3,1,1,1


In [7]:
# Features: every column except the label.
X = data.drop(columns=['Class'])
# Target: the label column on its own.
y = data['Class']

In [8]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1212)

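### Simplifying a tree ensemble classifier

We create the base ensemble, here a RandomForestClassifier instance (an XGBClassifier or any other supported ensemble can be used in the same way). The ensemble can be fitted beforehand, or it can be just instantiated, in which case RuleCOSI will fit the ensemble first and then simplify it.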

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Base ensemble for this demo, created with default hyperparameters.
# It is not fitted here: it is handed to RuleCOSI unfitted, and RuleCOSI is
# expected to fit it before simplifying it.
ens = RandomForestClassifier()

This is done by instantiating a **RuleCOSIClassifier** object with the desired parameters, such as _n\_estimators_, _tree\_max\_depth_, _conf\_threshold_ and _cov\_threshold_.

In [10]:
# Wrap the base ensemble in a RuleCOSI classifier.
# NOTE(review): the parameter meanings below are inferred from their names --
# confirm against the rulecosi documentation.
rc = RuleCOSIClassifier(
    base_ensemble=ens,
    metric='f1',                   # presumably the score optimised while combining rules
    n_estimators=100,              # presumably the number of trees used from the ensemble
    tree_max_depth=3,              # presumably the depth limit of each tree
    conf_threshold=0.9,            # presumably the minimum confidence for a rule to be kept
    cov_threshold=0.0,             # presumably the minimum coverage for a rule to be kept
    random_state=1212,             # fixed seed
    column_names=X_train.columns,  # feature names taken from the training frame
)

In [11]:
%%time
# Fit RuleCOSI on the training split; %%time reports how long the whole cell takes.
rc.fit(X_train, y_train)

CPU times: user 2.7 s, sys: 61.2 ms, total: 2.76 s
Wall time: 3.02 s


## Examining the simplified rules

The rules are stored in the _simplified\_ruleset\__ attribute of the RuleCOSI object. The function _print\_rules_ prints the rules and their heuristics on the console. It can also return a string object or a pandas DataFrame object to be used for further analysis. Additionally, the number of decimal digits displayed for the heuristics values and the condition thresholds can be modified with the _heuristics\_digits_ and _condition\_digits_ parameters.

In [12]:
# Print the simplified rules: 4 decimal digits for the heuristics, 1 for the condition thresholds.
rc.simplified_ruleset_.print_rules(heuristics_digits=4, condition_digits=1)

cov 	conf 	supp 	samples 		rule
0.5928	1.0000	0.5928	[0,364]		r_1: (BareNuclei ≤ 2.5) ˄ (CellSize ≤ 3.5) ˄ (EpithelialSize ≤ 4.5) → [1]
0.1889	1.0000	0.1889	[116,0]		r_2: (CellSize > 4.5) ˄ (MarginalAdhesion > 1.5) ˄ (NormalNucleoli > 2.5) → [0]
0.0586	0.9722	0.0570	[35,1]		r_3: (BareNuclei > 3.5) ˄ (CellShape > 4.5) ˄ (MarginalAdhesion > 1.5) → [0]
0.0358	1.0000	0.0358	[22,0]		r_4: (BareNuclei > 8.5) ˄ (BlandChromatin > 3.5) → [0]
0.0293	1.0000	0.0293	[0,18]		r_5: (BareNuclei ≤ 5.5) ˄ (BlandChromatin ≤ 3.5) ˄ (CellSize ≤ 4.5) ˄ (MarginalAdhesion ≤ 2.5) ˄ (NormalNucleoli ≤ 2.5) → [1]
0.0228	0.9286	0.0212	[13,1]		r_6: (BareNuclei > 1.5) ˄ (CellSize > 3.5) ˄ (ClumpThickness > 6.5) → [0]
0.0098	1.0000	0.0098	[6,0]		r_7: (BareNuclei > 2.5) ˄ (ClumpThickness > 6.5) → [0]
0.0065	1.0000	0.0065	[0,4]		r_8: (BareNuclei ≤ 4.5) ˄ (BlandChromatin ≤ 3.5) ˄ (CellSize ≤ 2.5) ˄ (NormalNucleoli ≤ 2.5) → [1]
0.0033	1.0000	0.0033	[2,0]		r_9: (CellSize > 4.5) ˄ (ClumpThickness > 6.5) → [0]
0.0033	1.0000	0

In [13]:
# Same call with return_object='dataframe', which yields the rules as a pandas DataFrame for further analysis.
rc.simplified_ruleset_.print_rules(return_object='dataframe',heuristics_digits=4, condition_digits=1)

Unnamed: 0,cov,conf,supp,samples,#,A,y
0,0.5928,1.0,0.5928,"[0,364]",r_1,(BareNuclei ≤ 2.5) ˄ (CellSize ≤ 3.5) ˄ (Epith...,[1]
1,0.1889,1.0,0.1889,"[116,0]",r_2,(CellSize > 4.5) ˄ (MarginalAdhesion > 1.5) ˄ ...,[0]
2,0.0586,0.9722,0.057,"[35,1]",r_3,(BareNuclei > 3.5) ˄ (CellShape > 4.5) ˄ (Marg...,[0]
3,0.0358,1.0,0.0358,"[22,0]",r_4,(BareNuclei > 8.5) ˄ (BlandChromatin > 3.5),[0]
4,0.0293,1.0,0.0293,"[0,18]",r_5,(BareNuclei ≤ 5.5) ˄ (BlandChromatin ≤ 3.5) ˄ ...,[1]
5,0.0228,0.9286,0.0212,"[13,1]",r_6,(BareNuclei > 1.5) ˄ (CellSize > 3.5) ˄ (Clump...,[0]
6,0.0098,1.0,0.0098,"[6,0]",r_7,(BareNuclei > 2.5) ˄ (ClumpThickness > 6.5),[0]
7,0.0065,1.0,0.0065,"[0,4]",r_8,(BareNuclei ≤ 4.5) ˄ (BlandChromatin ≤ 3.5) ˄ ...,[1]
8,0.0033,1.0,0.0033,"[2,0]",r_9,(CellSize > 4.5) ˄ (ClumpThickness > 6.5),[0]
9,0.0033,1.0,0.0033,"[2,0]",r_10,(CellSize > 3.5) ˄ (MarginalAdhesion > 5.5),[0]


## Checking the classification performance of the simplified rule-based classifier

In [14]:
# Counts the rules extracted from the tree ensemble (the original rulesets).
def get_n_rules(rulesets):
    """Return the total number of rules across all the given rulesets.

    Each item produced by iterating a ruleset counts as one rule, so any
    iterable of iterables is accepted.
    """
    return sum(1 for ruleset in rulesets for _ in ruleset)

In [15]:
# Compare the fitted base ensemble with the simplified ruleset on the test split.
# The labels are derived from the ensemble that was actually fitted, so the
# report stays correct whichever base ensemble is plugged into RuleCOSI.
fitted_ensemble = rc.base_ensemble_
ensemble_name = type(fitted_ensemble).__name__

print(f'== Original {ensemble_name} ensemble ==')
print(f'Number of trees: {fitted_ensemble.n_estimators} trees')
print(f'Number of rules: {get_n_rules(rc.original_rulesets_)} rules\n')

print('== Simplified rules ==')
rc.simplified_ruleset_.print_rules()

# Predictions of the simplified rule-based classifier.
y_pred = rc.predict(X_test)

# Predictions of the original ensemble. The check is made on the same fitted
# object that is used for prediction.
if isinstance(fitted_ensemble, XGBClassifier):
    # XGBoost is called with its feature validation switched off, as in the original cell.
    y_pred_ens = fitted_ensemble.predict(X_test, validate_features=False)
else:
    y_pred_ens = fitted_ensemble.predict(X_test)

print(f'Combinations: {rc.n_combinations_}')
print(f'Time: {rc.combination_time_}\n')
print(f'====== Classification performance of {ensemble_name} ======')
print(classification_report(y_test, y_pred_ens, digits=4))
print('\n====== Classification performance of simplified rules ======')
print(classification_report(y_test, y_pred, digits=4))
print('\n')


== Original XGBoost ensemble ==
Number of trees: 100 trees
Number of rules: 766 rules

== Simplified rules ==
cov 	conf 	supp 	samples 		rule
0.5928	1.0000	0.5928	[0,364]		r_1: (BareNuclei ≤ 2.500) ˄ (CellSize ≤ 3.500) ˄ (EpithelialSize ≤ 4.500) → [1]
0.1889	1.0000	0.1889	[116,0]		r_2: (CellSize > 4.500) ˄ (MarginalAdhesion > 1.500) ˄ (NormalNucleoli > 2.500) → [0]
0.0586	0.9722	0.0570	[35,1]		r_3: (BareNuclei > 3.500) ˄ (CellShape > 4.500) ˄ (MarginalAdhesion > 1.500) → [0]
0.0358	1.0000	0.0358	[22,0]		r_4: (BareNuclei > 8.500) ˄ (BlandChromatin > 3.500) → [0]
0.0293	1.0000	0.0293	[0,18]		r_5: (BareNuclei ≤ 5.500) ˄ (BlandChromatin ≤ 3.500) ˄ (CellSize ≤ 4.500) ˄ (MarginalAdhesion ≤ 2.500) ˄ (NormalNucleoli ≤ 2.500) → [1]
0.0228	0.9286	0.0212	[13,1]		r_6: (BareNuclei > 1.500) ˄ (CellSize > 3.500) ˄ (ClumpThickness > 6.500) → [0]
0.0098	1.0000	0.0098	[6,0]		r_7: (BareNuclei > 2.500) ˄ (ClumpThickness > 6.500) → [0]
0.0065	1.0000	0.0065	[0,4]		r_8: (BareNuclei ≤ 4.500) ˄ (BlandChromatin

