In [13]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [14]:
class Node:
    """One node of a binary decision tree.

    A decision (internal) node is described by ``feature_index``, ``threshold``,
    ``left``, ``right`` and ``info_gain``. A leaf node is described by ``value``
    alone. Every field defaults to ``None``.
    """

    def __init__(
        self,
        feature_index=None,
        threshold=None,
        left=None,
        right=None,
        info_gain=None,
        value=None,
    ):
        # Split test stored on decision nodes.
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes of a decision node.
        self.left, self.right = left, right
        # Information gain recorded for the split at this node.
        self.info_gain = info_gain

        # Payload of a leaf node.
        self.value = value

In [None]:
class DecisionTreeClassifier():
    """Binary decision tree classifier grown greedily by entropy-based information gain.

    Training data is a pandas DataFrame whose LAST column holds the class label;
    every other column is a feature compared against thresholds with ``<=``.

    Labels must be non-negative integers (or values that cast cleanly to them):
    leaf predictions are chosen with ``np.bincount`` after ``astype(int)``.
    """

    def __init__(self, min_samples_split=2, max_depth=2):
        """
        Parameters
        ----------
        min_samples_split : int
            A node with fewer rows than this is turned into a leaf.
        max_depth : int
            Nodes at this depth (root is depth 0) are turned into leaves.
        """
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.feature_names = None

    def fit(self, data):
        """Grow the tree from ``data`` (features first, label in the last column).

        Stores the feature column names for ``print_tree`` and returns ``self``.
        """
        self.feature_names = data.columns[:-1].tolist()

        self.root = self.build_tree(data, curr_depth=0)
        return self

    def build_tree(self, dataset, curr_depth=0):
        """Recursively build and return the subtree for ``dataset``.

        Returns a decision ``Node`` when a split with strictly positive
        information gain exists and the stopping criteria allow it; otherwise
        returns a leaf ``Node`` holding the most common label.
        """
        y = dataset.iloc[:, -1].values
        num_samples = dataset.shape[0]
        num_features = dataset.shape[1] - 1

        if (num_samples >= self.min_samples_split) and (curr_depth < self.max_depth):
            best_split = self.get_best_split(dataset, num_features)

            # get_best_split returns an empty dict when no threshold separates
            # the rows into two non-empty sides (e.g. all feature values are
            # identical). Treat that as "no gain" instead of raising KeyError.
            if best_split.get('info_gain', 0) > 0:
                left_subtree = self.build_tree(best_split['left'], curr_depth + 1)
                right_subtree = self.build_tree(best_split['right'], curr_depth + 1)

                return Node(
                    feature_index=best_split['feature_index'],
                    threshold=best_split['threshold'],
                    left=left_subtree,
                    right=right_subtree,
                    info_gain=best_split['info_gain']
                )

        leaf_value = self.most_common_label(y)
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_features):
        """Search every (feature, threshold) pair for the highest information gain.

        Candidate thresholds are the unique values of each feature. Splits that
        leave one side empty are ignored. Ties keep the first candidate found.

        Returns
        -------
        dict
            Keys ``feature_index``, ``threshold``, ``info_gain``, ``left``,
            ``right`` for the best split, or an empty dict when no candidate
            produced two non-empty sides.
        """
        X = dataset.iloc[:, :-1].values
        y = dataset.iloc[:, -1].values

        best_split = {}
        max_info_gain = -float("inf")

        for feature_index in range(num_features):
            possible_thresholds = np.unique(X[:, feature_index])

            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)

                # A one-sided split carries no information; skip it.
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue

                y_left = dataset_left.iloc[:, -1].values
                y_right = dataset_right.iloc[:, -1].values

                curr_info_gain = self.information_gain(y, y_left, y_right)

                # Strict '>' keeps the earliest candidate on ties.
                if curr_info_gain > max_info_gain:
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'info_gain': curr_info_gain,
                        'left': dataset_left,
                        'right': dataset_right,
                    }
                    max_info_gain = curr_info_gain

        return best_split

    def split(self, dataset, feature_index, threshold):
        """Partition ``dataset`` rows on feature ``feature_index``.

        Returns ``(left, right)`` where ``left`` has rows with value
        ``<= threshold`` and ``right`` has rows with value ``> threshold``.
        """
        # Select the column by position so duplicated column names cannot turn
        # the lookup into a multi-column frame.
        column = dataset.iloc[:, feature_index]
        dataset_left = dataset[column <= threshold]
        dataset_right = dataset[column > threshold]

        return dataset_left, dataset_right

    def information_gain(self, parent, left_child, right_child):
        """Return parent entropy minus the size-weighted entropy of the children.

        Weights are each child's length divided by the parent's length.
        """
        weight_left = len(left_child) / len(parent)
        weight_right = len(right_child) / len(parent)

        children_entropy = (
            weight_left * self.entropy(left_child)
            + weight_right * self.entropy(right_child)
        )
        return self.entropy(parent) - children_entropy

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``.

        Returns 0 for an empty array.
        """
        # One pass to count every class instead of rescanning y per label.
        _, class_counts = np.unique(y, return_counts=True)
        entropy = 0

        for count in class_counts:
            p_label = count / len(y)
            entropy -= p_label * np.log2(p_label)

        return entropy

    def most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties).

        Labels are cast with ``astype(int)`` and counted with ``np.bincount``,
        so they must be non-negative integers.
        """
        return np.bincount(y.astype(int)).argmax()

    def predict(self, X):
        """Predict a label for every row of ``X``.

        ``X`` may be a DataFrame (converted with ``.values``) or any iterable of
        rows indexable by feature position. Returns a NumPy array of labels.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values

        return np.array([self._make_prediction(x, self.root) for x in X])

    def _make_prediction(self, x, tree):
        """Walk ``tree`` for a single sample ``x`` and return the leaf value reached."""
        # A node is a leaf exactly when it carries a value.
        if tree.value is not None:
            return tree.value

        feature_val = x[tree.feature_index]

        if feature_val <= tree.threshold:
            return self._make_prediction(x, tree.left)
        return self._make_prediction(x, tree.right)

    def print_tree(self, tree=None, indent="  "):
        """Print the (sub)tree rooted at ``tree``; defaults to the fitted root.

        ``indent`` is the prefix printed before the ``Left:`` / ``Right:``
        labels and grows by two spaces per level.
        """
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(f"Prediction: {tree.value}")
            return

        feature_name = self.feature_names[tree.feature_index]
        print(f"Feature '{feature_name}' <= {tree.threshold:.4f} ? (gain: {tree.info_gain:.4f})")

        print(f"{indent}Left: ", end="")
        self.print_tree(tree.left, indent + "  ")

        print(f"{indent}Right: ", end="")
        self.print_tree(tree.right, indent + "  ")

In [21]:

# Build one frame with the iris measurements followed by the class label;
# the label must be the last column because DecisionTreeClassifier.fit reads it from there.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

dt = DecisionTreeClassifier(max_depth=2)
dt.fit(train_data)

# Show the learned decision rules.
dt.print_tree()

# Separate held-out features from labels and score the tree on them.
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]
predictions = dt.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = np.mean(predictions == y_test.values)
print(f"Accuracy: {accuracy:.4f}")

Feature 'petal length (cm)' <= 1.9000 ? (gain: 0.9183)
  Left: Prediction: 0
  Right: Feature 'petal length (cm)' <= 4.7000 ? (gain: 0.6379)
    Left: Prediction: 1
    Right: Prediction: 2
Accuracy: 0.9667
