In [1]:
import numpy as np
import pandas as pd

In [2]:
class DescissionTree:
    """Binary decision-tree classifier grown by recursive threshold splits.

    Every internal node tests ``x[feature_index] <= threshold`` and every leaf
    stores the majority class of the training rows that reached it. The fitted
    tree is a nested ``dict`` (see :meth:`build_Tree` for its layout).

    Parameters
    ----------
    depth : int
        Maximum depth of the tree. A node reached at this depth becomes a leaf.
    criterion : {'entropy', 'gini'}, default 'entropy'
        Impurity measure minimised when choosing a split. The default keeps
        the behaviour this class has always had (entropy).

    Attributes
    ----------
    Tree, tree : dict or None
        The fitted tree, or ``None`` before :meth:`fit` is called. Both names
        refer to the same object; ``Tree`` is the name ``fit`` has always
        written to and ``tree`` is the name ``__init__`` has always created.
    """

    def __init__(self, depth, criterion='entropy'):
        if criterion not in ('entropy', 'gini'):
            raise ValueError(
                "criterion must be 'entropy' or 'gini', got %r" % (criterion,)
            )
        self.max_depth = depth
        self.criterion = criterion
        # Keep both spellings defined and in sync so neither one is ever a
        # stale None or a missing attribute.
        self.tree = None
        self.Tree = None

    def _gini(self, arr):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``arr``."""
        _, counts = np.unique(arr, return_counts=True)
        probs = counts / counts.sum()
        return 1 - np.sum(probs ** 2)

    def _entropy(self, arr):
        """Return the Shannon entropy (in bits) of the labels in ``arr``."""
        _, counts = np.unique(arr, return_counts=True)
        # Every count comes from np.unique, so each probability is > 0 and
        # log2 never sees zero.
        probs = counts / counts.sum()
        # "0.0 - ..." instead of unary minus so a pure node yields 0.0, not -0.0.
        return 0.0 - np.sum(probs * np.log2(probs))

    def _impurity(self, arr):
        """Return the impurity of ``arr`` under the configured criterion."""
        if self.criterion == 'gini':
            return self._gini(arr)
        return self._entropy(arr)

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y`` (smallest label wins ties).

        Gives the same answer as ``np.bincount(y).argmax()`` for non-negative
        integer labels, but also works for negative, float or string labels.
        """
        classes, counts = np.unique(y, return_counts=True)
        return classes[counts.argmax()]

    def _fitted_tree(self):
        """Return the fitted tree, raising a clear error if ``fit`` was not called."""
        tree = getattr(self, 'Tree', None)
        if tree is None:
            # AttributeError is what an unfitted instance raised before (the
            # attribute did not exist), so callers catching it keep working.
            raise AttributeError(
                "DescissionTree is not fitted yet; call fit(X, y) before predicting."
            )
        return tree

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted impurity.

        Every distinct value of every feature is tried as a threshold, sending
        rows with ``value <= threshold`` left and the rest right. Splits that
        leave one side empty are ignored.

        Returns
        -------
        (best_index, best_thresh)
            Column index and threshold of the best split, or ``(None, None)``
            when no threshold separates the rows into two non-empty groups.
        """
        best_impurity = float('inf')
        best_index = None
        best_thresh = None
        n_samples = len(y)

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                right_mask = column > threshold
                y_left = y[left_mask]
                y_right = y[right_mask]

                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                # Child impurities weighted by the share of rows in each child.
                weighted = (
                    len(y_left) * self._impurity(y_left)
                    + len(y_right) * self._impurity(y_right)
                ) / n_samples

                # Strict "<" keeps the first of several equally good splits.
                if weighted < best_impurity:
                    best_impurity = weighted
                    best_index = feature_index
                    best_thresh = threshold

        return best_index, best_thresh

    def build_Tree(self, X, y, depth=0):
        """Recursively grow the tree for rows ``X`` with labels ``y``.

        Parameters
        ----------
        X : numpy.ndarray
            2-D feature matrix (rows are samples).
        y : numpy.ndarray
            1-D label array aligned with the rows of ``X``.
        depth : int, default 0
            Depth of the node being built.

        Returns
        -------
        dict
            A leaf ``{'leaf': True, 'class': label}`` or an internal node
            ``{'leaf': False, 'feature_index', 'threshold', 'left_subtree',
            'right_subtree'}``.
        """
        # Stop when the node is pure or the depth budget is used up.
        if len(np.unique(y)) == 1 or depth == self.max_depth:
            return {"leaf": True, 'class': self._majority_class(y)}

        feature_index, threshold = self._best_split(X, y)

        # No usable split (e.g. all rows identical): fall back to a leaf.
        if feature_index is None:
            return {"leaf": True, 'class': self._majority_class(y)}

        left_mask = X[:, feature_index] <= threshold
        right_mask = X[:, feature_index] > threshold

        left_subtree = self.build_Tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self.build_Tree(X[right_mask], y[right_mask], depth + 1)

        return {
                'leaf': False,
                'feature_index': feature_index,
                'threshold': threshold,
                'left_subtree': left_subtree,
                'right_subtree': right_subtree
               }

    def fit(self, X, y):
        """Fit the tree on feature matrix ``X`` and labels ``y``.

        ``X`` and ``y`` may be anything ``np.asarray`` accepts (arrays, nested
        lists, DataFrames/Series).

        Returns
        -------
        DescissionTree
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, their lengths differ, or
            there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D (n_samples, n_features), got ndim=%d" % X.ndim)
        if y.ndim != 1:
            raise ValueError("y must be 1-D, got ndim=%d" % y.ndim)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have different numbers of samples: %d vs %d"
                % (X.shape[0], y.shape[0])
            )
        if y.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")

        self.Tree = self.tree = self.build_Tree(X, y)
        return self

    def predict_one(self, x):
        """Return the predicted class for a single sample ``x`` (1-D, indexable).

        Raises
        ------
        AttributeError
            If the tree has not been fitted.
        """
        node = self._fitted_tree()
        # Walk down from the root until a leaf is reached.
        while not node['leaf']:
            if x[node['feature_index']] <= node['threshold']:
                node = node['left_subtree']
            else:
                node = node['right_subtree']
        return node['class']

    def predict(self, X):
        """Return a list with the predicted class of every row of ``X``."""
        # np.asarray makes row iteration well-defined for DataFrames as well
        # (iterating a DataFrame directly yields column names, not rows).
        return [self.predict_one(row) for row in np.asarray(X)]

In [19]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris data and hold out 20% of the rows for evaluation; the fixed
# random_state keeps the split (and therefore the score) reproducible.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Grow a tree of depth at most 3 on the training rows, then classify the
# held-out rows.
tree = DescissionTree(3)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

# Accuracy = share of held-out rows whose predicted label matches the true one.
print("Accuracy:", np.mean(y_pred == y_test))


Accuracy: 1.0


In [20]:
from sklearn.metrics import confusion_matrix

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted ones. Passing the arguments the
# other way round silently transposes the matrix.
confusion_matrix(y_test, y_pred)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 11]])