In [29]:
import Data.data_provider as dp
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the "spambase" data through the project's data provider; X and y are
# used below as the feature matrix and target for splitting and fitting.
X, y = dp.load_data("spambase", "./")

# Shuffled, seeded split. test_size=0.005 holds out only 0.5% of the rows,
# so almost all data is used for training.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.005,
    shuffle=True,
    random_state=0,
)

# A small forest (10 trees, depth capped at 4) with a fixed seed so that the
# fitted trees are the same on every run.
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    random_state=0,
)
clf.fit(x_train, y_train)

In [30]:
def collect_leaf_nodes(fitted_tree):
    """Return the ids of all leaf nodes of a fitted decision tree.

    The tree is walked depth-first from the root (node 0), visiting the left
    subtree before the right one, so the ids come back in left-to-right order.

    Parameters
    ----------
    fitted_tree : object
        A fitted estimator exposing ``tree_.children_left`` and
        ``tree_.children_right`` (e.g. an element of
        ``RandomForestClassifier.estimators_``).

    Returns
    -------
    list of int
        Leaf node ids as plain Python integers.
    """
    children_left = fitted_tree.tree_.children_left
    children_right = fitted_tree.tree_.children_right

    leaf_nodes = []
    pending = [0]  # explicit stack instead of recursion; start at the root
    while pending:
        node_id = pending.pop()
        left = children_left[node_id]
        right = children_right[node_id]

        # A node whose two child pointers are equal has no children: it is a leaf.
        if left == right:
            # Store a plain int so the printed list does not depend on how the
            # installed NumPy version renders scalar values.
            leaf_nodes.append(int(node_id))
            continue

        # Push the right child first so the left child is popped (visited) first.
        # -1 is the "no child" sentinel and is never followed.
        if right != -1:
            pending.append(right)
        if left != -1:
            pending.append(left)

    return leaf_nodes


trees = clf.estimators_

# One list of leaf ids per tree in the forest, in the same order as `trees`.
leaf_nodes_per_tree = [collect_leaf_nodes(tree) for tree in trees]

# Print leaf nodes for each tree
for i, leaf_nodes in enumerate(leaf_nodes_per_tree):
    print(f"Leaf nodes for tree {i + 1}: {leaf_nodes}")

print("---------------------------------")


Leaf nodes for tree 1: [4, 5, 7, 8, 11, 12, 14, 15, 19, 20, 22, 23, 26, 27, 29, 30]
Leaf nodes for tree 2: [4, 5, 7, 8, 11, 12, 13, 17, 18, 20, 21, 24, 25, 26]
Leaf nodes for tree 3: [4, 5, 7, 8, 11, 12, 14, 15, 19, 20, 22, 23, 26, 27, 29, 30]
Leaf nodes for tree 4: [4, 5, 7, 8, 11, 12, 14, 15, 19, 20, 22, 23, 26, 27, 28]
Leaf nodes for tree 5: [4, 5, 7, 8, 11, 12, 14, 15, 19, 20, 22, 23, 26, 27, 29, 30]
Leaf nodes for tree 6: [4, 5, 7, 8, 11, 12, 14, 15, 19, 20, 22, 23, 24]
Leaf nodes for tree 7: [4, 5, 7, 8, 11, 12, 13, 17, 18, 20, 21, 24, 25, 26]
Leaf nodes for tree 8: [4, 5, 7, 8, 11, 12, 14, 15, 19, 20, 21, 22]
Leaf nodes for tree 9: [4, 5, 7, 8, 11, 12, 14, 15, 19, 20, 22, 23, 25, 27, 28]
Leaf nodes for tree 10: [4, 5, 7, 8, 11, 12, 14, 15, 18, 20, 21, 24, 25, 27, 28]
---------------------------------


In [31]:
# Inspect how the first tree of the forest treats the first five held-out rows.
first_tree = trees[0]
x_data = x_test[:5]

# Class probabilities predicted by this single tree.
tree_prob = first_tree.predict_proba(x_data)
# Id of the leaf node each row ends up in.
leaf_index_array = first_tree.apply(x_data)
# Sparse (rows x nodes) matrix marking the nodes each row passes through.
tree_path = first_tree.decision_path(x_data)
tree_path

<5x31 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [32]:
# Recover, for every row of x_data, the node ids it visits in the tree.
# In the CSR matrix, row r's stored column indices live in
# indices[indptr[r] : indptr[r + 1]]; here those column indices are node ids.
row_ptr = tree_path.indptr
path = [
    tree_path.indices[row_ptr[row] : row_ptr[row + 1]]
    for row in range(len(x_data))
]

path

[array([ 0,  1,  9, 13, 14], dtype=int32),
 array([0, 1, 2, 3, 4], dtype=int32),
 array([0, 1, 2, 3, 4], dtype=int32),
 array([ 0,  1,  9, 10, 11], dtype=int32),
 array([0, 1, 2, 3, 4], dtype=int32)]