In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [12]:
class DecisionTree:
    """Decision-tree classifier that splits on information gain (entropy).

    The fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``. Any value that
    is not a tuple is a leaf holding the predicted class label. Samples whose
    feature value is ``<= threshold`` go to the left subtree, all others to the
    right subtree.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum depth of the tree; ``None`` means no limit.
    min_samples_leaf : int, default 1
        Minimum number of samples each child of a split must receive. Values
        below 1 are treated as 1, because an empty child cannot be labelled.
    min_samples_split : int, default 2
        Minimum number of samples a node must hold to be considered for a split.
    """

    def __init__(self, max_depth=None, min_samples_leaf=1, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; any dtype ``np.unique`` can sort (ints, strings, ...).

        Returns
        -------
        DecisionTree
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional, is empty, or disagrees with ``y``
            on the number of samples.
        """
        # Accept lists / DataFrames / Series as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y disagree on the number of samples: {X.shape[0]} != {y.shape[0]}"
            )

        self.tree = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples in ``X`` / ``y``.

        Returns either a class label (leaf) or a tuple
        ``(feature_index, threshold, left_subtree, right_subtree)``.
        """
        if X.shape[0] == 0:
            raise ValueError("cannot build a tree node from zero samples")

        # Depth limit reached: stop and predict the majority class.
        if self.max_depth is not None and depth >= self.max_depth:
            return self._most_common_class(y)

        # Every sample has the same class: return it as a leaf.
        unique_classes = np.unique(y)
        if len(unique_classes) == 1:
            return unique_classes[0]

        # Too few samples to attempt a split.
        if len(y) < self.min_samples_split:
            return self._most_common_class(y)

        # Find the best split among those that respect min_samples_leaf.
        best_feature, best_threshold = self._best_split(X, y)

        # No admissible split exists: fall back to the majority class.
        if best_feature is None:
            return self._most_common_class(y)

        # Partition the data. The right side is the complement of the left so
        # that training routes samples exactly like _predict does.
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask

        # Defensive guard: _best_split already enforces this, but keeping the
        # check guarantees both children are non-empty (and recursion
        # terminates) even if _best_split is overridden.
        min_leaf = max(1, self.min_samples_leaf)
        if np.count_nonzero(left_mask) < min_leaf or np.count_nonzero(right_mask) < min_leaf:
            return self._most_common_class(y)

        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return (best_feature, best_threshold, left_tree, right_tree)

    def _best_split(self, X, y):
        """Return the ``(feature_index, threshold)`` pair with the highest gain.

        Only splits that leave at least ``min_samples_leaf`` samples (and never
        fewer than one) on each side are considered. Returns ``(None, None)``
        when no such split exists. Ties keep the first candidate found.
        """
        num_samples, num_features = X.shape
        min_leaf = max(1, self.min_samples_leaf)
        # The parent entropy is the same for every candidate; compute it once.
        parent_entropy = self._entropy(y)

        best_gain = -np.inf
        best_feature = None
        best_threshold = None

        for feature in range(num_features):
            column = X[:, feature]
            # The largest value would send every sample to the left, so it can
            # never be a usable threshold.
            for threshold in np.unique(column)[:-1]:
                n_left = np.count_nonzero(column <= threshold)
                n_right = num_samples - n_left
                if n_left < min_leaf or n_right < min_leaf:
                    continue

                gain = self._information_gain(X, y, feature, threshold, parent_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold, parent_entropy=None):
        """Entropy reduction obtained by splitting on ``X[:, feature] <= threshold``.

        ``parent_entropy`` may be supplied by the caller to avoid recomputing
        it; when omitted it is computed from ``y``. Returns 0 when one side of
        the split would be empty.
        """
        left_mask = X[:, feature] <= threshold
        n = len(y)
        n_left = np.count_nonzero(left_mask)
        n_right = n - n_left

        if n_left == 0 or n_right == 0:
            return 0

        if parent_entropy is None:
            parent_entropy = self._entropy(y)

        # Children entropies weighted by the share of samples they receive.
        child_entropy = (
            (n_left / n) * self._entropy(y[left_mask])
            + (n_right / n) * self._entropy(y[~left_mask])
        )
        return parent_entropy - child_entropy

    def _entropy(self, y):
        """Shannon entropy, in bits, of the label distribution in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 needs no epsilon. Adding 0.0 turns the
        # -0.0 of a pure node into 0.0.
        return -np.sum(probabilities * np.log2(probabilities)) + 0.0

    def _most_common_class(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties).

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        # np.unique works for any sortable label dtype, unlike np.bincount,
        # which only accepts non-negative integers.
        labels, counts = np.unique(y, return_counts=True)
        if labels.size == 0:
            raise ValueError("cannot determine the most common class of an empty label set")
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("this DecisionTree is not fitted yet; call fit() first")
        X = np.asarray(X)
        return np.array([self._predict(sample, self.tree) for sample in X])

    def _predict(self, sample, tree):
        """Route one sample from ``tree`` down to a leaf and return its label."""
        node = tree
        # Internal nodes are tuples; anything else is a leaf label.
        while isinstance(node, tuple):
            feature, threshold, left_tree, right_tree = node
            node = left_tree if sample[feature] <= threshold else right_tree
        return node

In [13]:
# Usage example on the Iris dataset.
# NOTE: the tree is scored on the same samples it was fitted on, so the figures
# printed below describe the fit to the training data, not held-out performance.
if __name__ == "__main__":
    iris = load_iris()
    X, y = iris.data, iris.target

    # Hyper-parameters for this run
    tree = DecisionTree(max_depth=3, min_samples_leaf=2, min_samples_split=2)
    tree.fit(X, y)

    # Predictions for every training sample
    predictions = tree.predict(X)

    # Evaluation metrics; precision, recall and F1 are macro-averaged over classes
    cm = confusion_matrix(y, predictions)
    accuracy = accuracy_score(y, predictions)
    precision, recall, f1 = (
        metric(y, predictions, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )

    # Report, one result per line
    print(
        f"Matriz de Confusão:\n{cm}",
        f"Acurácia: {accuracy:.2f}",
        f"Precisão: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1-Score: {f1:.2f}",
        sep="\n",
    )


Matriz de Confusão:
[[50  0  0]
 [ 0 47  3]
 [ 0  1 49]]
Acurácia: 0.97
Precisão: 0.97
Recall: 0.97
F1-Score: 0.97
