In [2]:
pip install graphviz


Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

class DecisionTreeID3:
    """ID3-style decision tree classifier that branches on exact feature values.

    Every distinct value of the chosen feature becomes its own branch, so each
    feature is treated as categorical. A sample whose feature value was not
    present in the training data at some node cannot be routed any further and
    is predicted as ``UNKNOWN_LABEL``.

    The fitted tree is stored in ``self.tree`` as either a bare class label
    (leaf) or a nested dict ``{feature_index: {feature_value: subtree}}``.
    """

    # Sentinel returned by predict() for samples that fall off the tree.
    # predict() casts its output to int, so class labels should be integers
    # other than this value.
    UNKNOWN_LABEL = -1

    def __init__(self):
        """Create an unfitted classifier; ``tree`` stays ``None`` until fit()."""
        self.tree = None

    def fit(self, X, y):
        """Build the decision tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: if X is not 2-D, y is not 1-D, their lengths differ,
                or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty data set")
        self.tree = self._build_tree(X, y)

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        if len(y) == 0:
            return 0.0
        # np.unique only reports labels that actually occur, so every
        # probability is > 0 and log2 needs no epsilon fudge; it also accepts
        # label arrays that np.bincount would reject (e.g. floats).
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def _information_gain(self, X_col, y):
        """Return the entropy reduction from partitioning ``y`` by each distinct value of ``X_col``."""
        parent_entropy = self._entropy(y)
        values, counts = np.unique(X_col, return_counts=True)
        weights = counts / len(X_col)
        child_entropies = np.array(
            [self._entropy(y[X_col == value]) for value in values]
        )
        return parent_entropy - np.sum(weights * child_entropies)

    def _best_split(self, X, y):
        """Return the index of the feature with the highest positive gain, or None.

        Ties are resolved in favour of the lowest feature index because only a
        strictly larger gain replaces the current best.
        """
        best_gain = 0
        best_split = None
        for feature_index in range(X.shape[1]):
            gain = self._information_gain(X[:, feature_index], y)
            if gain > best_gain:
                best_gain = gain
                best_split = feature_index
        return best_split

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (first encountered wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def _build_tree(self, X, y):
        """Recursively build the subtree for the samples ``X`` with labels ``y``.

        Returns a class label when the node is pure or no feature yields a
        positive information gain, otherwise ``{feature_index: {value: subtree}}``.
        """
        if len(np.unique(y)) == 1:
            return y[0]

        if X.shape[1] == 0:
            return self._majority_class(y)

        split = self._best_split(X, y)
        if split is None:
            return self._majority_class(y)

        column = X[:, split]
        branches = {}
        for value in np.unique(column):
            # Compute the row mask once and reuse it for both X and y.
            mask = column == value
            branches[value] = self._build_tree(X[mask], y[mask])
        return {split: branches}

    def _predict_sample(self, tree, sample):
        """Walk ``tree`` for one sample; return its label, or None if a value is unseen."""
        node = tree
        while isinstance(node, dict):
            split = next(iter(node))
            node = node[split].get(sample[split])
            if node is None:
                # The sample's value for this feature has no branch.
                return None
        return node

    def predict(self, X):
        """Predict class labels for every row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Integer ndarray of predicted labels; rows that cannot be routed to
            a leaf get ``UNKNOWN_LABEL`` (-1).

        Raises:
            RuntimeError: if called before fit().
        """
        if self.tree is None:
            raise RuntimeError(
                "DecisionTreeID3 is not fitted yet; call fit(X, y) before predict()."
            )
        predictions = []
        for sample in np.asarray(X):
            prediction = self._predict_sample(self.tree, sample)
            if prediction is None:
                prediction = self.UNKNOWN_LABEL
            predictions.append(prediction)
        return np.array(predictions, dtype=int)

# Load the Iris data set and keep the human-readable names for reporting.
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the tree and score it on the held-out samples.
model = DecisionTreeID3()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classify a single hand-written sample (reshaped to a 1-row 2-D array).
new_sample = np.array([5.1, 3.5, 1.4, 0.2])
prediction = model.predict(new_sample.reshape(1, -1))
# predict() returns -1 when a sample has a feature value with no matching
# branch. Indexing target_names with -1 would silently select the LAST class
# name, so that case is reported explicitly instead.
if prediction[0] == -1:
    predicted_name = "unknown (feature value not seen during training)"
else:
    predicted_name = target_names[prediction[0]]
print(f"Predicted class for the new sample: {predicted_name}")

Accuracy: 0.69
Predicted class for the new sample: setosa
