In [1]:
import numpy as np

# Toy binary-classification dataset.
# Each record pairs a sample's two features (feature1, feature2) with its
# class label (0 or 1); X and y are derived from the same records so rows and
# labels cannot drift out of alignment.
_TOY_SAMPLES = [
    ((2.7, 2.5), 0),
    ((1.3, 1.5), 0),
    ((3.1, 3.0), 1),
    ((2.0, 2.2), 0),
    ((1.0, 1.1), 0),
    ((1.2, 0.9), 0),
    ((3.2, 2.8), 1),
    ((2.8, 3.0), 1),
]
X = np.array([features for features, _ in _TOY_SAMPLES])
y = np.array([label for _, label in _TOY_SAMPLES])

# Calculate Gini impurity
def gini(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` that carry label ``k``. It is 0 when every entry shares
    a single label.

    Args:
        y: Array-like of class labels.

    Returns:
        The impurity as a float. An empty ``y`` yields ``0.0``.
    """
    _, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    if total == 0:
        # No labels at all: without this guard the sum over zero classes is 0
        # and an empty node would be reported as maximally impure (1.0).
        return 0.0
    probabilities = counts / total
    return 1 - np.sum(probabilities ** 2)

# Split dataset by feature index and threshold
def split_dataset(X, y, feature_index, threshold):
    """Partition samples and their labels on a single feature.

    Rows whose value in column ``feature_index`` is ``<= threshold`` form the
    left partition; rows whose value is ``> threshold`` form the right one.

    Args:
        X: 2-D array of samples (one row per sample).
        y: Array of labels, indexed in step with the rows of ``X``.
        feature_index: Column of ``X`` to compare against ``threshold``.
        threshold: Cut-off value for the comparison.

    Returns:
        The tuple ``(X_left, y_left, X_right, y_right)``.
    """
    column = X[:, feature_index]
    pieces = []
    # Left mask first, then right, so the output order is
    # (X_left, y_left, X_right, y_right).
    for mask in (column <= threshold, column > threshold):
        pieces += [X[mask], y[mask]]
    return tuple(pieces)

# Find the best split
def best_split(X, y):
    """Find the (feature, threshold) pair with the lowest weighted Gini impurity.

    Every distinct value observed in each feature column is tried as a
    threshold. A candidate is scored by the size-weighted average of the
    Gini impurity of its two sides; candidates that leave one side empty are
    ignored. On equal scores the first candidate encountered is kept
    (features in column order, thresholds in the order ``np.unique`` yields
    them).

    Args:
        X: 2-D array of samples (one row per sample).
        y: Array of labels, indexed in step with the rows of ``X``.

    Returns:
        ``(feature_index, threshold)`` of the winning split, or
        ``(None, None)`` when no candidate produces two non-empty sides.
    """

    def weighted_impurity(feature_index, threshold):
        # Score one candidate; None marks a split with an empty side.
        _, below, _, above = split_dataset(X, y, feature_index, threshold)
        if len(below) == 0 or len(above) == 0:
            return None
        return (len(below) * gini(below) + len(above) * gini(above)) / len(y)

    lowest = float('inf')
    winner = (None, None)
    for feature_index in range(X.shape[1]):
        for threshold in np.unique(X[:, feature_index]):
            score = weighted_impurity(feature_index, threshold)
            # Strict comparison keeps the earliest candidate on ties.
            if score is not None and score < lowest:
                lowest = score
                winner = (feature_index, threshold)
    return winner

# Tree node: the building block of the recursively built decision tree
class Node:
    """A single node of a binary decision tree.

    An internal node stores the feature column and threshold it tests plus
    its two children; a leaf stores only ``value``. Every field defaults to
    ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        # The test applied at this node.
        self.feature_index, self.threshold = feature_index, threshold
        # Subtrees on either side of the test.
        self.left, self.right = left, right
        # Predicted label; only set on leaf nodes.
        self.value = value

def _majority_label(y):
    """Return the most frequent label in ``y`` (the smallest label wins ties)."""
    # np.unique accepts any sortable label type, whereas np.bincount is
    # limited to non-negative integers (so -1/+1 or string labels would fail).
    labels, counts = np.unique(y, return_counts=True)
    return labels[counts.argmax()]


def build_tree(X, y, depth=0, max_depth=3):
    """Recursively grow a decision tree from samples ``X`` and labels ``y``.

    Growth stops, and a leaf holding the majority label is returned, when
    the labels are all identical, when ``depth`` has reached ``max_depth``,
    or when ``best_split`` finds no usable split.

    Args:
        X: 2-D array of samples (one row per sample).
        y: Array of labels, indexed in step with the rows of ``X``.
        depth: Depth of the node being built; callers normally leave it at 0.
        max_depth: Depth at which nodes are forced to become leaves.

    Returns:
        The root ``Node`` of the (sub)tree.

    Raises:
        ValueError: If ``y`` is empty, since no majority label exists.
    """
    if len(y) == 0:
        raise ValueError("build_tree requires at least one training sample")

    # Stop if pure or max depth reached
    if len(np.unique(y)) == 1 or depth >= max_depth:
        return Node(value=_majority_label(y))

    feature_index, threshold = best_split(X, y)
    if feature_index is None:
        # No threshold separates the samples into two non-empty groups.
        return Node(value=_majority_label(y))

    X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)

    left_child = build_tree(X_left, y_left, depth + 1, max_depth)
    right_child = build_tree(X_right, y_right, depth + 1, max_depth)

    return Node(feature_index, threshold, left_child, right_child)

# Prediction
def predict(sample, tree):
    if tree.value is not None:
        return tree.value
    if sample[tree.feature_index] <= tree.threshold:
        return predict(sample, tree.left)
    else:
        return predict(sample, tree.right)

# Fit the tree on the full toy dataset
tree = build_tree(X, y)

# Compare the tree's prediction with the true label for every training sample
for sample, actual in zip(X, y):
    predicted = predict(sample, tree)
    print(f"Sample {sample}, Predicted: {predicted}, Actual: {actual}")


Sample [2.7 2.5], Predicted: 0, Actual: 0
Sample [1.3 1.5], Predicted: 0, Actual: 0
Sample [3.1 3. ], Predicted: 1, Actual: 1
Sample [2.  2.2], Predicted: 0, Actual: 0
Sample [1.  1.1], Predicted: 0, Actual: 0
Sample [1.2 0.9], Predicted: 0, Actual: 0
Sample [3.2 2.8], Predicted: 1, Actual: 1
Sample [2.8 3. ], Predicted: 1, Actual: 1
