In [None]:
import numpy as np
from sklearn.datasets import load_iris
from collections import Counter

In [None]:
# Load the Iris dataset and pull out the pieces used by the rest of the notebook.
iris = load_iris()
X = iris.data  # feature matrix
y = iris.target  # class labels
feature_names = iris.feature_names

In [None]:
# Shannon entropy of a collection of class labels
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Labels are counted with ``np.unique`` so any label type that NumPy can
    sort works (non-negative or negative integers, floats, strings), not only
    the non-negative integers that ``np.bincount`` accepts.

    Parameters
    ----------
    y : array-like
        Class labels. Multi-dimensional input is treated as a flat collection.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the observed label frequencies ``p``.
        An empty input has entropy 0.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples means no uncertainty; also avoids dividing by zero below.
        return np.float64(0.0)

    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 is always defined.
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    return -np.sum(probabilities * np.log2(probabilities))


# Partition samples and labels around a threshold on a single feature
def split_dataset(X, y, feature_index, threshold):
    """Split ``X`` and ``y`` by comparing one feature column to ``threshold``.

    Parameters
    ----------
    X : array
        Sample matrix, indexed as ``X[:, feature_index]``.
    y : array
        Labels, indexed with the same boolean masks as the rows of ``X``.
    feature_index : int
        Column of ``X`` to compare.
    threshold : scalar
        Rows with ``value <= threshold`` go left, rows with
        ``value > threshold`` go right.

    Returns
    -------
    tuple
        ``(X_left, y_left, X_right, y_right)``.
    """
    column = X[:, feature_index]
    # Element-wise comparisons produce one boolean mask per side. The masks
    # are computed independently (not as complements of each other).
    goes_left = column <= threshold
    goes_right = column > threshold
    left_part = (X[goes_left], y[goes_left])
    right_part = (X[goes_right], y[goes_right])
    return (*left_part, *right_part)


# Search for the feature/threshold pair with the highest information gain
def best_split(X, y):
    """Find the split of ``(X, y)`` that maximises information gain.

    Every distinct value of every feature column is tried as a threshold.
    Candidates that leave either side empty are ignored.

    Parameters
    ----------
    X : array
        Sample matrix with features along axis 1 (``X.shape[1]`` columns).
    y : array
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no candidate yields a gain strictly greater than zero.
    """
    parent_entropy = entropy(y)
    top_gain, top_feature, top_threshold = 0, None, None

    for column in range(X.shape[1]):
        for candidate in np.unique(X[:, column]):
            _, labels_left, _, labels_right = split_dataset(X, y, column, candidate)
            # A split that sends everything to one side carries no information.
            if not (len(labels_left) != 0 and len(labels_right) != 0):
                continue

            weight_left = len(labels_left) / len(y)
            weight_right = 1 - weight_left
            children_entropy = (
                weight_left * entropy(labels_left)
                + weight_right * entropy(labels_right)
            )
            info_gain = parent_entropy - children_entropy

            # Strict comparison: on ties the earliest candidate is kept.
            if info_gain > top_gain:
                top_gain, top_feature, top_threshold = info_gain, column, candidate

    return top_feature, top_threshold

In [None]:
# A single node of the decision tree
class DecisionNode:
    """Container for one tree node.

    A node is either an internal split (``feature``, ``threshold``, ``left``,
    ``right``) or a leaf carrying a label in ``value``. All fields default to
    ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: which feature to test and the value to compare against.
        self.feature, self.threshold = feature, threshold
        # Child nodes on either side of the split.
        self.left, self.right = left, right
        # Label if this is a leaf node.
        self.value = value


# Recursively grow the decision tree
def build_tree(X, y, depth=0, max_depth=5):
    """Build a decision tree for ``(X, y)`` and return its root node.

    Growth stops, and a leaf holding the most common label is returned, when
    all labels are identical, when ``depth`` reaches ``max_depth``, or when
    ``best_split`` finds no usable split.

    Parameters
    ----------
    X : array
        Sample matrix.
    y : array
        Labels aligned with the rows of ``X``.
    depth : int, optional
        Depth of the node being built; incremented for each level of recursion.
    max_depth : int, optional
        Depth at which nodes are forced to be leaves.

    Returns
    -------
    DecisionNode
        Root of the (sub)tree.
    """
    feature = threshold = None
    must_stop = len(set(y)) == 1 or depth >= max_depth
    if not must_stop:
        feature, threshold = best_split(X, y)

    # Either a stopping rule fired or no informative split exists: make a leaf.
    if feature is None:
        majority_label, _ = Counter(y).most_common(1)[0]
        return DecisionNode(value=majority_label)

    X_left, y_left, X_right, y_right = split_dataset(X, y, feature, threshold)
    next_depth = depth + 1
    left_child = build_tree(X_left, y_left, next_depth, max_depth)
    right_child = build_tree(X_right, y_right, next_depth, max_depth)
    return DecisionNode(feature, threshold, left_child, right_child)


# Predict the label of a single sample
def predict_one(x, node):
    """Return the label the tree rooted at ``node`` assigns to sample ``x``.

    Starting at ``node``, follow the left child when
    ``x[feature] <= threshold`` and the right child otherwise, until a node
    whose ``value`` is not ``None`` is reached.

    Parameters
    ----------
    x : sequence
        Feature values of one sample, indexed by ``node.feature``.
    node : DecisionNode
        Root of the tree (or subtree) to evaluate.

    Returns
    -------
    The ``value`` stored on the leaf that ``x`` ends up in.
    """
    current = node
    # Iterative descent: a leaf is any node carrying a non-None value.
    while current.value is None:
        goes_left = x[current.feature] <= current.threshold
        current = current.left if goes_left else current.right
    return current.value


def predict(X, tree):
    """Return a list with the predicted label for each sample in ``X``.

    Each element of ``X`` is passed to ``predict_one`` together with ``tree``;
    the results are collected in iteration order.
    """
    labels = []
    for sample in X:
        labels.append(predict_one(sample, tree))
    return labels

In [None]:
# Train the tree, then score it on the same data it was trained on.
tree = build_tree(X, y, depth=0, max_depth=3)
y_pred = predict(X, tree)
# Compare element-wise against the true labels; the mean of the boolean
# matches is the fraction of correct predictions.
accuracy = np.mean(np.asarray(y_pred) == y)
print(f"Accuracy on training set: {accuracy:.2f}")