In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import _tree
from sklearn import tree
import copy
from graphviz import Source

In [35]:
# Dataset selection: exactly one `filename` assignment should be active.
# The file is read from the local data/ directory.
#filename = 'iris.csv'
#filename = 'glass.csv'
#filename = 'heloc_dataset_v1.csv'
#filename = 'vehicle.csv'
#filename = 'wine.csv'
#filename = 'letter.csv'
filename = 'diabetes.csv'

df = pd.read_csv('data/' + filename)

# The label is read from a column literally named 'class'; fail early with a
# readable message if the chosen dataset does not have one.
if 'class' not in df.columns:
    raise KeyError("Expected a 'class' column in data/" + filename)

# Features are every column except the label.
X = df.drop('class', axis=1)
y = df['class']

# 80/20 train/test split; the fixed random_state keeps the split identical
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

# Column names in training order; later cells use them to label tree splits.
feature_names = list(X_train.columns.values)

In [36]:
# Number of trees in the forest.
NR_TREES = 100
# Seed for the forest itself (same value the train/test split uses). Without it
# every run grows different trees, so the accuracy printed below and the
# exported tree file would not be reproducible.
RF_RANDOM_STATE = 1337

rf = RandomForestClassifier(n_estimators=NR_TREES, random_state=RF_RANDOM_STATE)
rf.fit(X_train, y_train)

# Ad-hoc attribute (not part of the scikit-learn API): the export cell reads
# the feature names back from the model object.
rf.feature_names = feature_names

# Hold-out accuracy on the 20% test split.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Feature names:", feature_names)
print("Accuracy:", accuracy)

Feature names: ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age']
Accuracy: 0.7532467532467533


In [37]:
# Export trees
#
# Writes every tree of the fitted forest `rf` to "RF_<dataset>.csv", one row
# per node. Columns (see the header line written below):
#   tree_id, node_id, is_leave, samples, impurity,
#   split_feature, threshold, min_feature, max_feature, children
# Leaf rows carry empty quoted strings in the last five columns.
class_names = copy.deepcopy(rf.classes_)
nClasses    = len(class_names)
features    = rf.feature_names


def _preorder_with_children(dtree):
    """Traverse one fitted tree depth-first, left subtree before right.

    Parameters
    ----------
    dtree : object exposing ``children_left`` and ``children_right`` arrays
        (here: the ``tree_`` attribute of a fitted estimator).

    Returns
    -------
    order : list
        Node ids in visiting order, starting with the root (node 0).
    children : dict
        Maps each split node id to ``[left_child_id, right_child_id]``.
        Leaves have no entry.
    """
    left = dtree.children_left
    right = dtree.children_right

    order = []
    children = {}
    pending = [0]  # stack seeded with the root node id
    while pending:
        node_id = pending.pop()
        order.append(node_id)
        # A node whose two child ids are equal is treated as a leaf.
        if left[node_id] != right[node_id]:
            children[node_id] = [left[node_id], right[node_id]]
            # Push right first so the left child is popped (visited) next.
            pending.append(right[node_id])
            pending.append(left[node_id])
    return order, children


def _format_node_row(tree_id, node_id, dtree, feature_names, children, feature_ranges):
    """Build the CSV line (without trailing newline) for a single node.

    Parameters
    ----------
    tree_id : index of the tree inside the forest.
    node_id : id of the node inside ``dtree``.
    dtree : the tree's ``tree_`` structure (``n_node_samples``, ``impurity``,
        ``feature`` and ``threshold`` are read).
    feature_names : sequence of feature names, indexed by feature index.
    children : mapping from split node id to its child ids; a node without
        an entry is written as a leaf.
    feature_ranges : mapping from feature name to ``(min_str, max_str)``.
    """
    is_leaf = node_id not in children
    fields = [
        str(tree_id),
        str(node_id),
        str(is_leaf),
        str(dtree.n_node_samples[node_id]),
        "{:.3f}".format(dtree.impurity[node_id]),
    ]
    if is_leaf:
        # Leaves have no split: five empty quoted placeholders.
        fields.extend(["\"\""] * 5)
    else:
        split_feature = feature_names[dtree.feature[node_id]]
        range_min, range_max = feature_ranges[split_feature]
        fields.extend([
            "\"" + split_feature + "\"",
            "{:.2f}".format(dtree.threshold[node_id]),  # threshold rounded to 2 decimals
            range_min,
            range_max,
            # Child ids are packed into one quoted, ';'-separated field.
            "\"" + ";".join(str(child) for child in children[node_id]) + "\"",
        ])
    return ",".join(fields)


# Min/max of each feature over the full dataset `df` (train and test rows),
# computed once here instead of once per split node.
feature_ranges = {
    name: (str(df[name].min()), str(df[name].max())) for name in features
}

# The context manager guarantees the file is closed even if the export fails
# part-way through.
with open("RF_" + filename.replace('.csv', '') + ".csv", "w") as f:
    f.write("tree_id, node_id, is_leave, samples, impurity, split_feature, threshold, min_feature, max_feature, children\n")

    first_row = True
    for i, model in enumerate(rf.estimators_):
        DTree = model.tree_
        nodes, edges = _preorder_with_children(DTree)

        # Sanity check (optional): render the tree with graphviz and compare.
        #graph = Source(tree.export_graphviz(model, out_file=None, feature_names=features))
        #graph.format = "png"
        #graph.render("dtree_render_" + str(i), view=False)

        for node in nodes:
            # Rows are newline-separated; the file does not end with a newline.
            if not first_row:
                f.write('\n')
            f.write(_format_node_row(i, node, DTree, features, edges, feature_ranges))
            first_row = False

print("Done.")

Done.
