In [1]:
import sys

sys.path.append(r'../src')
import uncertainpy

import numpy as np
import pandas as pd

# Read heart failure data

In [2]:
data = pd.read_csv('../data/heart_failure_clinical_records_dataset.csv')
data.head(8)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1
5,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8,1
6,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10,1
7,60.0,1,315,1,60,0,454000.0,1.1,131,1,1,10,1


In [3]:
data.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


# Turn features into boolean features

In [4]:
from uncertainpy.util.dataTransformation import booleanizeNumericalColumnUsingQuartiles
from uncertainpy.util.dataTransformation import booleanizeBinaryColumn

#assuming that all features are numerical or binary (categorical features should be treated  by binarizing them)
numerical_features = ["age","creatinine_phosphokinase","ejection_fraction","platelets","serum_creatinine","serum_sodium","time"]
binary_features = ["anaemia","diabetes","high_blood_pressure","sex","smoking","DEATH_EVENT"]

for f in numerical_features:
    booleanizeNumericalColumnUsingQuartiles(data, f)
    
for f in binary_features:
    booleanizeBinaryColumn(data, f)

data.head()

Unnamed: 0,anaemia,diabetes,high_blood_pressure,sex,smoking,DEATH_EVENT,age_small,age_med,age_large,creatinine_phosphokinase_small,...,platelets_large,serum_creatinine_small,serum_creatinine_med,serum_creatinine_large,serum_sodium_small,serum_sodium_med,serum_sodium_large,time_small,time_med,time_large
0,False,False,True,True,False,True,False,False,True,False,...,False,False,False,True,True,False,False,True,False,False
1,False,False,False,True,False,True,False,True,False,False,...,False,False,True,False,False,True,False,True,False,False
2,False,False,False,True,True,True,False,True,False,False,...,False,False,True,False,True,False,False,True,False,False
3,True,False,False,True,False,True,True,False,False,True,...,False,False,False,True,False,True,False,True,False,False
4,True,True,False,False,False,True,False,True,False,False,...,True,False,False,True,True,False,False,True,False,False


# Build small trees to find short rules that generalize well

In [5]:
target_variable_index = 0
bmask = np.full((1, data.shape[1]), True).flatten()
bmask[target_variable_index] = False

features = data.columns[bmask].to_list()
print(f'predict {data.columns[target_variable_index]} using the following features:')
print(features)

predict anaemia using the following features:
['diabetes', 'high_blood_pressure', 'sex', 'smoking', 'DEATH_EVENT', 'age_small', 'age_med', 'age_large', 'creatinine_phosphokinase_small', 'creatinine_phosphokinase_med', 'creatinine_phosphokinase_large', 'ejection_fraction_small', 'ejection_fraction_med', 'ejection_fraction_large', 'platelets_small', 'platelets_med', 'platelets_large', 'serum_creatinine_small', 'serum_creatinine_med', 'serum_creatinine_large', 'serum_sodium_small', 'serum_sodium_med', 'serum_sodium_large', 'time_small', 'time_med', 'time_large']


In [7]:
from sklearn import tree
from sklearn.model_selection import train_test_split

X = data.loc[:, bmask]
Y = data.loc[:, bmask==False]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.7, random_state = 1)
clf = tree.DecisionTreeClassifier(max_leaf_nodes=8) #min_samples_split=40, min_samples_leaf=40)
clf = clf.fit(X, Y)

rules = tree.export_text(clf, feature_names=features,show_weights=True)
print(rules)

|--- creatinine_phosphokinase_large <= 0.50
|   |--- smoking <= 0.50
|   |   |--- time_large <= 0.50
|   |   |   |--- serum_creatinine_med <= 0.50
|   |   |   |   |--- platelets_small <= 0.50
|   |   |   |   |   |--- weights: [23.00, 21.00] class: False
|   |   |   |   |--- platelets_small >  0.50
|   |   |   |   |   |--- weights: [2.00, 11.00] class: True
|   |   |   |--- serum_creatinine_med >  0.50
|   |   |   |   |--- weights: [7.00, 31.00] class: True
|   |   |--- time_large >  0.50
|   |   |   |--- serum_sodium_small <= 0.50
|   |   |   |   |--- weights: [11.00, 10.00] class: False
|   |   |   |--- serum_sodium_small >  0.50
|   |   |   |   |--- weights: [8.00, 1.00] class: False
|   |--- smoking >  0.50
|   |   |--- weights: [37.00, 26.00] class: False
|--- creatinine_phosphokinase_large >  0.50
|   |--- age_med <= 0.50
|   |   |--- weights: [52.00, 12.00] class: False
|   |--- age_med >  0.50
|   |   |--- weights: [30.00, 17.00] class: False

