## Import tools

In [1]:
import numpy as np
import pandas as pd

## Get the data

In [2]:
from google.colab import files
data = files.upload()

Saving iris.csv to iris.csv


In [3]:
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
data = pd.read_csv(r"/content/iris.csv", skiprows=1, header=None, names=col_names)
data.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


## Node class

In [4]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        ''' constructor '''

        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain

        # for leaf node
        self.value = value

## Tree class

In [5]:
import numpy as np

class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        # initialize the root of the tree
        self.root = None
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)
        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain ratio is positive
            if best_split["info_gain_ratio"] > 0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain_ratio"])
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        # dictionary to store the best split
        best_split = {}
        max_info_gain_ratio = -float("inf")
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain ratio
                    curr_info_gain_ratio = self.information_gain_ratio(y, left_y, right_y, "entropy")
                    # update the best split if needed
                    if curr_info_gain_ratio > max_info_gain_ratio:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain_ratio"] = curr_info_gain_ratio
                        max_info_gain_ratio = curr_info_gain_ratio
        # return best split
        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        dataset_left = np.array([row for row in dataset if row[feature_index] <= threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index] > threshold])
        return dataset_left, dataset_right

    def information_gain_ratio(self, parent, l_child, r_child, mode="entropy"):
        ''' function to compute information gain ratio '''
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        gain = self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))
        split_info = - (weight_l * np.log2(weight_l) + weight_r * np.log2(weight_r))
        if split_info == 0:  # Avoid division by zero
            return 0
        gain_ratio = gain / split_info
        return gain_ratio

    def entropy(self, y):
        ''' function to compute entropy '''
        class_labels, counts = np.unique(y, return_counts=True)
        entropy = 0
        total_samples = len(y)
        for count in counts:
            p_cls = count / total_samples
            entropy += -p_cls * np.log2(p_cls)
        return entropy

    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        unique, counts = np.unique(Y, return_counts=True)
        return unique[np.argmax(counts)]

    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        if not tree:
            tree = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X_" + str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain_ratio)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        ''' function to train the tree '''
        dataset = np.column_stack((X, Y))
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict new dataset '''
        predictions = [self.make_prediction(x, self.root) for x in X]
        return predictions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, info_gain_ratio=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        self.info_gain_ratio = info_gain_ratio


## Train-Test split

In [6]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=41)

## Fit the model

In [7]:
classifier = DecisionTreeClassifier(min_samples_split=3, max_depth=8)
classifier.fit(X_train,Y_train)
classifier.print_tree()

1.0


## Test the model

In [8]:
def custom_evaluation(y_true, y_pred):
    correct_predictions = np.sum(y_true == y_pred)
    total_samples = len(y_true)
    accuracy = correct_predictions / total_samples

    unique_classes = np.unique(y_true)
    precision = 0
    recall = 0
    f1 = 0
    for cls in unique_classes:
        true_positive = np.sum((y_true == cls) & (y_pred == cls))
        false_positive = np.sum((y_true != cls) & (y_pred == cls))
        false_negative = np.sum((y_true == cls) & (y_pred != cls))

        cls_precision = true_positive / (true_positive + false_positive) if true_positive + false_positive != 0 else 0
        cls_recall = true_positive / (true_positive + false_negative) if true_positive + false_negative != 0 else 0
        cls_f1 = 2 * cls_precision * cls_recall / (cls_precision + cls_recall) if cls_precision + cls_recall != 0 else 0

        precision += cls_precision
        recall += cls_recall
        f1 += cls_f1

    precision /= len(unique_classes)
    recall /= len(unique_classes)
    f1 /= len(unique_classes)

    return accuracy, round(precision, 3), recall, f1

In [9]:
classifier = DecisionTreeClassifier(max_depth=3)
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_test)

In [10]:
Y_pred = classifier.predict(X_test)

accuracy, precision, recall, f1 = custom_evaluation(Y_test, Y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 4.0
Precision: 0.089
Recall: 0.3333333333333333
F1-score: 0.14035087719298248
