In [141]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import math
from collections import Counter
import numpy as np

# Load the Iris dataset bundled with scikit-learn and pull out the feature
# matrix (x) and the class labels (y).
iris = load_iris()
x, y = iris.data, iris.target

# Keep 10% of the samples aside for evaluation; the fixed random_state makes
# the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=123,
)

In [142]:
def entropy_func(class_count, num_samples):
    """Return the Shannon entropy (natural log) of a class distribution.

    Args:
        class_count: Mapping from class label to the number of samples that
            carry that label. Every count must be positive.
        num_samples: Total number of samples, used as the denominator when
            turning counts into proportions.

    Returns:
        ``-sum(p * ln(p))`` over all classes, where ``p`` is
        ``count / num_samples``. An empty mapping yields ``0``.
    """
    weighted_log_sum = 0
    for occurrences in class_count.values():
        proportion = occurrences / num_samples
        weighted_log_sum += proportion * math.log(proportion)
    return -weighted_log_sum


class Group:
    """A set of class labels together with its precomputed entropy.

    Attributes are plain instance fields:
        group_classes: the labels handed to the constructor (``__len__``
            reads its ``.size`` attribute, so an array-like exposing
            ``.size`` is expected).
        entropy: entropy of the labels, computed once at construction.
    """

    def __init__(self, group_classes):
        self.group_classes = group_classes
        # Cache the entropy so split evaluation does not recompute it.
        self.entropy = self.group_entropy()

    def __len__(self):
        """Return the number of labels, taken from ``group_classes.size``."""
        return self.group_classes.size

    def group_entropy(self):
        """Return the entropy of the label distribution in this group."""
        labels = self.group_classes
        label_counts = Counter(labels)
        return entropy_func(label_counts, len(labels))


class Node:
    """One node of a binary decision tree.

    A node whose ``val`` is not ``None`` acts as a leaf and predicts ``val``.
    Any other node routes a sample to ``child_node_a`` when the sample's
    ``split_feature`` entry is ``<= split_val`` and to ``child_node_b``
    otherwise.
    """

    def __init__(self, split_feature, split_val, depth=None, child_node_a=None, child_node_b=None, val=None):
        self.split_feature, self.split_val = split_feature, split_val
        self.depth = depth
        self.child_node_a, self.child_node_b = child_node_a, child_node_b
        self.val = val

    def predict(self, data):
        """Return the value of the leaf reached by the sample ``data``.

        Args:
            data: Indexable sample; ``data[split_feature]`` is compared with
                the threshold at every internal node on the way down.
        """
        current = self
        # Walk down until a node carrying a prediction value is reached.
        while current.val is None:
            if data[current.split_feature] <= current.split_val:
                current = current.child_node_a
            else:
                current = current.child_node_b
        return current.val


class DecisionTreeClassifier(object):
    """Binary decision tree classifier that splits on information gain.

    Every internal node tests ``sample[feature] <= threshold``. The tree is
    grown recursively by ``build_tree`` and stored in ``self.tree`` as a
    hierarchy of ``Node`` objects; split quality is measured with ``Group``
    entropies.

    Attributes (plain instance fields):
        max_depth: maximum number of splits allowed on any root-to-leaf path.
        depth: deepest level reached while growing the most recent tree.
        tree: root ``Node`` of the fitted tree, or ``None`` before fitting.
    """

    def __init__(self, max_depth):
        self.depth = 0
        self.max_depth = max_depth
        self.tree = None

    @staticmethod
    def get_split_entropy(group_a, group_b):
        """Return the size-weighted average entropy of two groups.

        Both arguments must support ``len()`` and expose an ``entropy``
        attribute. Two empty groups yield ``0.0`` instead of dividing by zero.
        """
        num_samples = len(group_a) + len(group_b)
        if num_samples == 0:
            return 0.0
        return group_a.entropy * (len(group_a) / num_samples) + group_b.entropy * (len(group_b) / num_samples)

    def get_information_gain(self, parent_group, child_group_a, child_group_b):
        """Return the entropy reduction obtained by splitting the parent."""
        return parent_group.entropy - self.get_split_entropy(child_group_a, child_group_b)

    def get_best_feature_split(self, feature_values, classes):
        """Find the (feature, threshold) pair with the highest information gain.

        Args:
            feature_values: 2-D array-like, one row per sample and one column
                per feature.
            classes: 1-D array-like of class labels aligned with the rows.

        Returns:
            ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when no candidate has strictly positive gain.
        """
        feature_values = np.asarray(feature_values)
        classes = np.asarray(classes)
        parent_group = Group(classes)
        best_gain = 0
        best_feature = None
        best_split_val = None

        for feature in range(feature_values.shape[1]):
            chosen_feature = feature_values[:, feature]
            # The largest unique value would send every sample to group a,
            # which always gives zero gain, so it is not tried as a threshold.
            for val in np.unique(chosen_feature)[:-1]:
                group_a_idxs, group_b_idxs = self.separate_groups_by_val(chosen_feature, feature, val, 1)
                group_a = Group(classes[group_a_idxs])
                group_b = Group(classes[group_b_idxs])

                gain = self.get_information_gain(parent_group, group_a, group_b)
                # Strict comparison keeps the first candidate on ties.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_split_val = val

        return best_feature, best_split_val

    def get_best_split(self, data, classes, depth=0):
        """Recursively grow the subtree for the given samples.

        Args:
            data: 2-D array-like of samples reaching this node.
            classes: 1-D array-like of labels aligned with ``data``.
            depth: Depth of the node being built (the root is 0). It is
                tracked per branch, so reaching ``max_depth`` on one branch
                does not stop sibling branches from growing.

        Returns:
            A ``Node``: a leaf holding the majority class when the depth limit
            is reached, the labels are pure, or no split improves entropy;
            otherwise an internal node with two recursively built children.
        """
        data = np.asarray(data)
        classes = np.asarray(classes)
        self.depth = max(self.depth, depth)

        if depth >= self.max_depth or len(np.unique(classes)) == 1:
            return self._make_leaf(classes, depth)

        best_feature, best_split_value = self.get_best_feature_split(data, classes)
        if best_feature is None:
            # No threshold separates the labels (e.g. identical feature rows
            # with different classes); fall back to a majority-class leaf.
            return self._make_leaf(classes, depth)

        group_a_idxs, group_b_idxs = self.separate_groups_by_val(data, best_feature, best_split_value, 2)
        child_group_a = self.get_best_split(data[group_a_idxs], classes[group_a_idxs], depth + 1)
        child_group_b = self.get_best_split(data[group_b_idxs], classes[group_b_idxs], depth + 1)

        return Node(best_feature, best_split_value, depth, child_group_a, child_group_b)

    @staticmethod
    def _make_leaf(classes, depth):
        """Build a leaf node predicting the most common label in ``classes``."""
        return Node(split_feature=None, split_val=None, depth=depth, val=Counter(classes).most_common(1)[0][0])

    def build_tree(self, data, classes, depth=0):
        """Fit the tree to the training data and store it in ``self.tree``.

        Args:
            data: 2-D array-like of training samples.
            classes: 1-D array-like of training labels.
            depth: Unused; kept so existing callers keep working.
        """
        # Start from a clean depth record so refitting the same instance
        # produces a full tree again.
        self.depth = 0
        self.tree = self.get_best_split(data, classes)

    def predict(self, data):
        """Return the predicted class for a single sample.

        ``build_tree`` must have been called first; ``self.tree`` is ``None``
        until then.
        """
        return self.tree.predict(data)

    @staticmethod
    def separate_groups_by_val(data, feature, value, dim):
        """Partition row indices by comparing values against a threshold.

        Args:
            data: 1-D array-like of values (``dim == 1``) or 2-D array-like of
                samples (``dim == 2``).
            feature: Column to compare when ``dim == 2``; ignored otherwise.
            value: Threshold; entries ``<= value`` go to the first group.
            dim: ``1`` or ``2``, selecting how ``data`` is interpreted.

        Returns:
            Two integer index arrays: positions with value ``<= value`` and
            the remaining positions. Any other ``dim`` gives two empty arrays.
        """
        values = np.asarray(data)
        if dim == 2:
            values = values[:, feature]
        elif dim != 1:
            return np.asarray([], dtype=int), np.asarray([], dtype=int)

        goes_to_a = values <= value
        return np.flatnonzero(goes_to_a).astype(int), np.flatnonzero(~goes_to_a).astype(int)

In [143]:
# Fit a tree limited to depth 3 on the training split.
dc = DecisionTreeClassifier(3)
dc.build_tree(x_train, y_train)

# Classify each held-out sample, then report it next to its true label.
predictions = [dc.predict(sample) for sample in x_test]
for predicted, gt in zip(predictions, y_test):
    print(f'Prediction: {predicted}; Real class: {gt}')

# Fraction of held-out samples whose prediction matches the ground truth.
accuracy = (predictions == y_test).mean()
print(f"Accuracy: {accuracy}")

Prediction: 2; Real class: 1
Prediction: 2; Real class: 2
Prediction: 2; Real class: 2
Prediction: 1; Real class: 1
Prediction: 0; Real class: 0
Prediction: 2; Real class: 2
Prediction: 1; Real class: 1
Prediction: 0; Real class: 0
Prediction: 0; Real class: 0
Prediction: 1; Real class: 1
Prediction: 2; Real class: 2
Prediction: 0; Real class: 0
Prediction: 1; Real class: 1
Prediction: 2; Real class: 2
Prediction: 2; Real class: 2
Accuracy: 0.9333333333333333
