In [10]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Location of the Iris CSV. Set the IRIS_CSV environment variable to point at
# another copy of the file; the fallback is the path this notebook was
# originally written against, so existing runs behave exactly as before.
IRIS_CSV_PATH = os.environ.get('IRIS_CSV', '/Users/inakiewodo/Desktop/Machine-learning/Iris.csv')
data = pd.read_csv(IRIS_CSV_PATH)

# Preview the first rows (rendered as the cell output).
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
data.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [4]:
# Features are every column except the row identifier and the class label;
# the label is the Species column.
# to_numpy() is used instead of .values: .values may hand back a pandas
# ExtensionArray (not an ndarray) for extension dtypes, and the tree's fit()
# calls ndarray methods such as reshape on the labels.
X = data.drop(columns=['Id', 'Species']).to_numpy()
y = data['Species'].to_numpy()

# Every feature row must have exactly one label.
assert X.shape[0] == y.shape[0], "features and labels must have the same number of rows"

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Inspect the training features: container type, then a five-row preview
X_preview = X_train[:5]
print("Type of x_train:", type(X_train))
print("First five elements of x_train are:\n", X_preview)

Type of x_train: <class 'numpy.ndarray'>
First five elements of x_train are:
 [[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]]


In [8]:
# Inspect the training labels: container type, then the first five labels
y_preview = y_train[:5]
print("Type of y_train:", type(y_train))
print("First five elements of y_train are:\n", y_preview)

Type of y_train: <class 'numpy.ndarray'>
First five elements of y_train are:
 ['Iris-setosa' 'Iris-setosa' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa']


In [9]:
# Report the dimensions of the training split and the number of examples in it
n_train_examples = len(X_train)
print('The shape of x_train is:', X_train.shape)
print('The shape of y_train is: ', y_train.shape)
print('Number of training examples (m):', n_train_examples)

The shape of x_train is: (120, 4)
The shape of y_train is:  (120,)
Number of training examples (m): 120


In [18]:
class Node:
    """One vertex of a decision tree.

    A decision node carries the split description (`feature_idx`,
    `threshold`, `info_gain`) together with its `left` and `right` children.
    A leaf node carries only `value`. Every field defaults to None, so the
    caller fills in just the fields that apply.
    """

    def __init__(self, feature_idx=None, threshold=None, info_gain=None, left=None, right=None, value=None):
        # --- decision-node fields ---
        self.feature_idx, self.threshold = feature_idx, threshold  # which column to test, and against what
        self.info_gain = info_gain                                 # gain recorded for this split
        self.left, self.right = left, right                        # child nodes

        # --- leaf-node field ---
        self.value = value

In [30]:
class DecisionTree:
    """Binary classification tree grown greedily by entropy-based information gain.

    Each decision node tests ``row[feature_idx] <= threshold``; rows that pass
    go to the left child, all others to the right child. Leaves predict the
    most common label among the training rows that reached them.

    The tree is built from ``Node`` objects, so a ``Node`` class accepting
    ``(feature_idx, threshold, info_gain, left, right)`` positionally and a
    ``value=`` keyword must be defined before ``fit`` is called.

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer rows than this is never split.
    max_depth : int
        Maximum number of split levels between the root and any leaf.
        ``max_depth=0`` produces a single leaf.
    """

    def __init__(self, min_samples_split=2, max_depth=2):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

        # Set by fit(); None marks a tree that has not been trained yet.
        self.root = None

    def _check_fitted(self):
        """Raise RuntimeError if fit() has not produced a tree yet."""
        if getattr(self, "root", None) is None:
            raise RuntimeError("This DecisionTree is not fitted yet; call fit(X, y) first.")

    def build_tree(self, dataset, curr_depth=0):
        """Recursively grow the subtree for ``dataset`` and return its root Node.

        ``dataset`` is a 2-D array whose last column holds the labels and whose
        other columns hold the features. ``curr_depth`` is the depth of the
        node being built (the root is at depth 0).

        Raises ValueError if ``dataset`` has no rows.
        """
        X, y = dataset[:, :-1], dataset[:, -1]
        n_samples, n_features = X.shape

        if n_samples == 0:
            raise ValueError("cannot build a tree from an empty dataset")

        # A node at depth d may split only while d < max_depth, so no leaf ends
        # up deeper than max_depth (the previous `<=` allowed one extra level).
        if n_samples >= self.min_samples_split and curr_depth < self.max_depth:
            best_split = self.best_split(dataset, n_features)

            # A gain of 0 (or the -1 sentinel) means no split separates the classes.
            if best_split["info_gain"] > 0:
                left_node = self.build_tree(best_split["left_dataset"], curr_depth + 1)
                right_node = self.build_tree(best_split["right_dataset"], curr_depth + 1)

                return Node(best_split["feature_idx"], best_split["threshold"], best_split["info_gain"], left_node, right_node)

        # Leaf: predict the majority label of the rows that reached this node.
        leaf_value = Counter(y).most_common(1)[0][0]
        return Node(value=leaf_value)

    def best_split(self, dataset, n_features):
        """Search every feature and every observed value for the best split.

        Returns a dict with keys ``feature_idx``, ``threshold``, ``info_gain``,
        ``left_dataset`` and ``right_dataset``. When no threshold separates the
        rows into two non-empty parts, ``info_gain`` stays at -1 and the other
        entries stay None.
        """
        best_split = {'feature_idx': None, 'threshold': None, 'info_gain': -1, 'left_dataset': None, 'right_dataset': None}

        # The parent labels do not depend on the candidate split.
        parent_y = dataset[:, -1]

        for feature_idx in range(n_features):
            thresholds = np.unique(dataset[:, feature_idx])

            for threshold in thresholds:
                left_dataset, right_dataset = self.split(dataset, feature_idx, threshold)

                # A split that leaves one side empty separates nothing.
                if len(left_dataset) == 0 or len(right_dataset) == 0:
                    continue

                info_gain = self.information_gain(parent_y, left_dataset[:, -1], right_dataset[:, -1])

                # Strict '>' keeps the first candidate found among equal gains.
                if info_gain > best_split['info_gain']:
                    best_split.update(
                        feature_idx=feature_idx,
                        threshold=threshold,
                        info_gain=info_gain,
                        left_dataset=left_dataset,
                        right_dataset=right_dataset,
                    )

        return best_split

    def split(self, dataset, feature_idx, threshold):
        """Partition the rows of a 2-D array by ``row[feature_idx] <= threshold``.

        Returns ``(left_dataset, right_dataset)``: rows whose feature value is
        ``<= threshold`` and rows whose value is ``> threshold``. Row order is
        preserved within each part.
        """
        column = dataset[:, feature_idx]

        # Two independent masks (rather than a mask and its negation) so a value
        # that satisfies neither comparison, such as NaN, lands in neither part.
        goes_left = np.asarray(column <= threshold, dtype=bool)
        goes_right = np.asarray(column > threshold, dtype=bool)

        return dataset[goes_left], dataset[goes_right]

    def information_gain(self, parent_y, left_y, right_y):
        """Return the entropy of the parent minus the size-weighted entropy of the children."""
        weight_left = len(left_y) / len(parent_y)
        weight_right = len(right_y) / len(parent_y)

        children_entropy = weight_left * self.entropy(left_y) + weight_right * self.entropy(right_y)
        return self.entropy(parent_y) - children_entropy

    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``; 0 for an empty array."""
        y = np.asarray(y)
        if y.size == 0:
            return 0

        # One pass over the labels instead of one scan per class.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        return -np.sum(probabilities * np.log2(probabilities))

    def fit(self, X, y):
        """Grow the tree from features ``X`` (n_samples, n_features) and labels ``y`` (n_samples,).

        Raises ValueError when ``X`` is not 2-D, when there are no rows, or
        when ``X`` and ``y`` disagree on the number of rows.
        """
        features = np.asarray(X)
        labels = np.asarray(y).reshape(-1, 1)

        if features.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {features.ndim} dimension(s)")
        if features.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X and y disagree on the number of samples: {features.shape[0]} vs {labels.shape[0]}"
            )

        # Features and labels share one array. Unless both are numeric, store
        # them as Python objects: concatenating floats with a fixed-width string
        # array would coerce the features to strings and thresholds would then
        # be compared lexicographically.
        numeric_kinds = "biuf"
        if features.dtype.kind not in numeric_kinds or labels.dtype.kind not in numeric_kinds:
            features = features.astype(object)
            labels = labels.astype(object)

        dataset = np.concatenate([features, labels], axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        """Return a list with the predicted label for each row of ``X``.

        Raises RuntimeError if the tree has not been fitted.
        """
        self._check_fitted()
        return [self.predict_class(row, self.root) for row in X]

    def predict_class(self, row, node):
        """Walk from ``node`` down to a leaf following ``row`` and return that leaf's label."""
        # Only leaves carry a value; identity comparison is the reliable None test.
        if node.value is not None:
            return node.value

        if row[node.feature_idx] <= node.threshold:
            return self.predict_class(row, node.left)
        return self.predict_class(row, node.right)

    def print_tree(self, node=None, depth=0, indent="|   "):
        """Print the subtree under ``node`` (the fitted root when omitted) as indented text.

        Raises RuntimeError if ``node`` is omitted and the tree has not been fitted.
        """
        if node is None:
            self._check_fitted()
            node = self.root

        prefix = indent * depth

        if node.value is not None:
            print(f"{prefix}|--- class: {node.value}")
            return

        feature_label = f"Feature {node.feature_idx}"

        print(f"{prefix}|--- {feature_label} <= {node.threshold}")
        self.print_tree(node.left, depth + 1, indent)

        print(f"{prefix}|--- {feature_label} > {node.threshold}")
        self.print_tree(node.right, depth + 1, indent)

In [None]:
# Train a tree on the training split and label the held-out rows
dt = DecisionTree(max_depth=2, min_samples_split=2)
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

# Share of held-out rows whose predicted label equals the true label, as a percentage
accuracy = 100 * np.mean(predictions == y_test)
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 100.00%


In [54]:
# Print the fitted tree as indented text, starting from its root.
# NOTE(review): the saved output below shows more split levels than a tree
# created with max_depth=2 can have, so it appears to come from an earlier
# run with a larger max_depth - re-run this cell to refresh it.
dt.print_tree(node=dt.root)

|--- Feature 2 <= 1.9
|   |--- class: Iris-setosa
|--- Feature 2 > 1.9
|   |--- Feature 2 <= 4.7
|   |   |--- Feature 3 <= 1.6
|   |   |   |--- class: Iris-versicolor
|   |   |--- Feature 3 > 1.6
|   |   |   |--- class: Iris-virginica
|   |--- Feature 2 > 4.7
|   |   |--- Feature 3 <= 1.7
|   |   |   |--- Feature 2 <= 4.9
|   |   |   |   |--- class: Iris-versicolor
|   |   |   |--- Feature 2 > 4.9
|   |   |   |   |--- Feature 3 <= 1.5
|   |   |   |   |   |--- class: Iris-virginica
|   |   |   |   |--- Feature 3 > 1.5
|   |   |   |   |   |--- class: Iris-versicolor
|   |   |--- Feature 3 > 1.7
|   |   |   |--- Feature 2 <= 4.8
|   |   |   |   |--- class: Iris-virginica
|   |   |   |--- Feature 2 > 4.8
|   |   |   |   |--- class: Iris-virginica


In [50]:
# Classify one hand-entered flower. Values follow the feature column order:
# sepal length, sepal width, petal length, petal width (cm).
new_sample = np.array([6.1, 3.7, 4.5, 3.9], ndmin=2)  # one row, four features
pred = dt.predict(new_sample)
print(pred[0])

Iris-virginica
