In [1]:
import pandas as pd
import numpy as np

class Node:
    """One node of a binary decision tree grown with the Gini impurity.

    Every split is an equality test ``feature == value``: rows that match go
    to the left child, all remaining rows go to the right child.  A node whose
    ``label`` is not ``None`` is a leaf.
    """

    def __init__(self, data_frame, target, is_left=True, feature=None, value=None, depth=0):
        """Store the rows that reached this node and how they got here.

        Args:
            data_frame: pandas DataFrame holding the rows of this node; every
                column except ``target`` is treated as a candidate feature.
            target: name of the label column inside ``data_frame``.
            is_left: True when this node is the ``feature == value`` branch of
                its parent, False when it is the ``feature != value`` branch.
            feature: feature column the parent split on (None for the root).
            value: feature value the parent split on (None for the root).
            depth: distance from the root node.
        """
        self.data_frame = data_frame
        self.target = target
        self.is_left = is_left
        self.feature = feature
        self.value = value
        self.depth = depth
        # Filled in by build_tree(): a class label for leaves, children otherwise.
        self.label = None
        self.left = None
        self.right = None

    def calculate_gini(self, data_frame):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the target column.

        ``p_i`` are the relative frequencies of the target values found in
        ``data_frame`` (which may be any subset of this node's rows).
        """
        frequency = data_frame[self.target].value_counts(normalize=True)
        gini = 1 - (frequency ** 2).sum()
        return gini

    def calculate_split_gini(self, feature_column, value):
        """Evaluate the split ``feature_column == value`` on this node's rows.

        Returns:
            A tuple ``(weighted_gini, left, right)`` where ``left`` holds the
            rows matching the value, ``right`` holds every other row and
            ``weighted_gini`` is the size-weighted impurity of both sides.
        """
        # Partition by boolean mask rather than by dropping index labels:
        # label-based dropping would also remove non-matching rows whenever
        # the index contains duplicates.
        matches = self.data_frame[feature_column] == value
        left = self.data_frame.loc[matches]
        right = self.data_frame.loc[~matches]
        p_left = len(left) / len(self.data_frame)
        p_right = 1 - p_left
        left_gini = self.calculate_gini(left)
        right_gini = self.calculate_gini(right)
        return p_left * left_gini + p_right * right_gini, left, right

    def calculate_best_split(self):
        """Find the equality split with the lowest weighted Gini impurity.

        Returns:
            ``(feature, value, left, right)`` for the best split, or four
            ``None`` values when the node is already pure or when no split
            separates the rows into two non-empty groups.
        """
        best_gini = 1
        best_feature = None
        best_value = None
        best_left = None
        best_right = None
        # A pure (or empty) node cannot be improved by splitting.
        if self.data_frame[self.target].nunique(dropna=False) <= 1:
            return best_feature, best_value, best_left, best_right
        features = self.data_frame.drop(columns=self.target).columns
        for feature in features:
            for value in self.data_frame[feature].unique():
                split_gini, left, right = self.calculate_split_gini(feature, value)
                # Skip degenerate splits: if one side is empty the other side
                # is this very node, so accepting it would make build_tree()
                # recurse forever on rows that share all feature values.
                if len(left) == 0 or len(right) == 0:
                    continue
                # Strict comparison keeps the first candidate on ties.
                if split_gini < best_gini:
                    best_gini = split_gini
                    best_feature = feature
                    best_value = value
                    best_left = left
                    best_right = right
        return best_feature, best_value, best_left, best_right

    def build_tree(self):
        """Recursively grow the subtree rooted at this node.

        When no useful split exists the node becomes a leaf labelled with the
        most frequent target value; otherwise two children are created and
        grown in turn.

        Raises:
            ValueError: if this node holds no rows.
        """
        if len(self.data_frame) == 0:
            raise ValueError("cannot build a decision tree node from an empty data frame")
        best_feature, best_value, best_left, best_right = self.calculate_best_split()
        if best_feature is None:
            self.label = self.data_frame[self.target].value_counts().idxmax()
            return
        # Both children remember the parent's test; is_left tells which
        # outcome of that test they represent.
        self.left = Node(data_frame=best_left, target=self.target, is_left=True, feature=best_feature, value=best_value, depth=self.depth+1)
        self.left.build_tree()
        self.right = Node(data_frame=best_right, target=self.target, is_left=False, feature=best_feature, value=best_value, depth=self.depth+1)
        self.right.build_tree()

# Name of the label column in the toy data set.
target = "species"

# Feature grid: every combination of an "A" value with a "B" value,
# enumerated with "A" as the slow-moving axis.
column_A_value_list = [1, 2, 3, 4, 5]
column_B_value_list = [1, 2, 3]
data = [
    [a_value, b_value]
    for a_value in column_A_value_list
    for b_value in column_B_value_list
]

# One label per grid row: the first eight rows are class 1, the last seven class 0.
label_column_value_list = [1] * 8 + [0] * 7

data_frame = pd.DataFrame(data, columns=["A", "B"])
data_frame[target] = label_column_value_list

# Fit the decision tree on the complete toy data set.
root_node = Node(data_frame=data_frame, target=target)
root_node.build_tree()


In [9]:
def print_branch(node, n_tabs=0):
    """Print one line per descendant of ``node``, depth-first, left child first.

    Each line shows the child's label (a blank for nodes without a label),
    dot padding, and the test that leads to the child: ``feature=value`` for
    a left child and ``feature!=value`` for a right child.  Every level of
    recursion indents by two more tab characters.
    """
    for child in (node.left, node.right):
        if child is None:
            continue
        text = " " if child.label is None else str(child.label)
        # The "!=" sign is one character wider than "=", so the right branch
        # gets one dot less of padding.
        if child.is_left:
            operator, reserved = "=", 4
        else:
            operator, reserved = "!=", 5
        filler = " " + "." * (8 - len(text) - reserved)
        pieces = [
            "\t" * n_tabs * 2,
            "....... ",
            text,
            filler,
            str(child.feature),
            operator,
            str(child.value),
        ]
        print("".join(pieces))
        print_branch(child, n_tabs=n_tabs + 1)

def print_tree(root_node):
    """Print a ``ROOT`` header line followed by every branch below ``root_node``."""
    header = "ROOT............"
    print(header)
    # Branches start one indentation level in, below the header.
    print_branch(root_node, 1)

# Render the tree that was built from the toy data set.
print_tree(root_node)

ROOT............
		....... 0 ...A=4
		.......   ..A!=4
				....... 0 ...A=5
				.......   ..A!=5
						.......   ...A=3
								....... 0 ...B=3
								....... 1 ..B!=3
						....... 1 ..A!=3
