In [42]:
from sklearn.datasets import load_iris
import numpy as np
import copy
import sys

In [17]:
def get_splits(X, i):
    unique_vals = np.unique(X[:, i])
    splits = np.zeros(unique_vals.shape[0] - 1)
    
    for i in range(len(splits)):
        # Choose a value in between the two unique vals
        splits[i] = (unique_vals[i] + unique_vals[i + 1]) / 2
        
    return splits

get_splits(np.array([[1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4.75]]).T, 0)

array([1.5  , 2.5  , 3.5  , 4.375])

In [18]:
def get_entropy(target):
    _, counts = np.unique(target, return_counts=True)
    
    probs = counts / target.shape[0]
    
    return -np.sum(probs * np.log2(probs), axis=0)

In [19]:
get_entropy(np.array([1, 1, 1, 1, 1, 1, 1, 1]))

-0.0

In [20]:
def get_information_gain(y, left_y, right_y):
    return get_entropy(y) - get_entropy(left_y) - get_entropy(right_y)

In [21]:
def get_best_split(X, y):
    
    max_info_gain = -sys.maxsize
    max_left_y = None
    max_right_y = None
    max_left_X = None
    max_right_X = None
    
    for i in range(X.shape[1]):
        splits = get_splits(X, i)
        
        for split in splits:
            left_indices = X[:, i] < split
            right_indices = X[:, i] > split

            left_y = y[left_indices]
            right_y = y[right_indices]

            info_gain = get_information_gain(y, left_y, right_y)

            if info_gain > max_info_gain:
                max_info_gain = info_gain
                max_left_y = left_y
                max_right_y = right_y
                max_left_X = X[left_indices, :]
                max_right_X = X[right_indices, :]
                max_split = split
                max_feature = i
    
    return max_left_X, max_left_y, max_right_X, max_right_y, max_split, max_feature, max_info_gain

In [22]:
X = np.array([[0, 0, 1, 1, 1, 2, 2, 2]]).T
y = np.array([0, 0, 1, 1, 1, 2, 2, 1])
splits = np.array([.5, 1.5])
get_best_split(X, y)

(array([[0],
        [0]]), array([0, 0]), array([[1],
        [1],
        [1],
        [2],
        [2],
        [2]]), array([1, 1, 1, 2, 2, 1]), 0.5, 0, 0.5817041659455104)

In [23]:
def get_probs(y, n_classes):
    
    return np.bincount(y, minlength=n_classes) / len(y)

In [24]:
def construct_tree(X, y, min_info_gain, max_depth, min_sample_split, tree, n_classes):
    
    if X.shape[0] < min_sample_split or max_depth < 0:
        tree['probs'] = get_probs(y, n_classes)
        return
    
    left_X,\
    left_y,\
    right_X,\
    right_y,\
    split,\
    feature,\
    info_gain = get_best_split(X, y)

    if info_gain < min_info_gain:
        tree['probs'] = get_probs(y, n_classes)
        return
    
    tree['feature'] = feature
    tree['split'] = split
    tree['info_gain'] = info_gain
    tree['left'] = {}
    tree['right'] = {}
    
    construct_tree(left_X, left_y, min_info_gain, max_depth - 1, min_sample_split, tree['left'], n_classes)
    construct_tree(right_X, right_y, min_info_gain, max_depth - 1, min_sample_split, tree['right'], n_classes)

In [25]:
def predict_row(x_row, tree):
    
    # If this is not a leaf node
    if 'split' not in tree:
        return copy.deepcopy(tree['probs'])
    
    if x_row[tree['feature']] < tree['split']:
        return predict_row(x_row, tree['left'])
    
    return predict_row(x_row, tree['right'])

In [35]:
class DecisionTree:
    def __init__(self, max_depth=3, min_sample_split=10, min_info_gain=1e-7):
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        self.min_info_gain = min_info_gain
        self.tree = {}
        
    def fit(self, X, y):
        self.n_classes = np.unique(y).shape[0]
        construct_tree(X, y, self.min_info_gain, self.max_depth, self.min_sample_split, self.tree, self.n_classes)
    
    def predict_proba(self, X):
        
        res = np.zeros((X.shape[0], self.n_classes))
        
        for i in range(X.shape[0]):
            x_row = X[i, :]
            
            probs = predict_row(x_row, self.tree)
        
            res[i, :] = probs
        
        return res
    
    def predict(self, X):
        proba = self.predict_proba(X)
        
        return np.argmax(proba, axis=1)

In [27]:
iris_data = load_iris()
X = iris_data['data']
y = iris_data['target']

In [37]:
model = DecisionTree()

In [38]:
model.fit(X, y)

In [40]:
y_pred = model.predict(X)

In [41]:
from sklearn.metrics import accuracy_score
accuracy_score(y, y_pred)

0.9733333333333334