In [149]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [150]:
# Path of the Iris CSV file inside the Kaggle input directory; read by the next cell.
dataSet = "/kaggle/input/irdatadec/Iris (1).csv"

In [151]:
# Load the CSV at `dataSet` into a DataFrame. Later cells read its 'Species'
# column and the four sepal/petal measurement columns.
iris = pd.read_csv(dataSet)

In [152]:
# Integer code assigned to each species name.
species_to_code = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
}

# Encode only while the column is not already integer-typed: mapping the
# already-encoded integers a second time would turn every value into NaN,
# so this guard keeps the cell safe to re-run.
if not pd.api.types.is_integer_dtype(iris['Species']):
    encoded_species = iris['Species'].map(species_to_code)
    # .map() yields NaN for any label missing from the dict; fail here with a
    # clear message instead of letting NaN reach the tree-building code.
    if encoded_species.isna().any():
        unexpected = iris.loc[encoded_species.isna(), 'Species'].unique()
        raise ValueError(f"Unmapped Species values: {list(unexpected)}")
    iris['Species'] = encoded_species.astype('int64')

In [153]:
# Feature matrix: the four measurements, in this fixed column order.
x = iris.loc[
    :,
    [
        'SepalLengthCm',
        'SepalWidthCm',
        'PetalLengthCm',
        'PetalWidthCm',
    ],
].values
# Target vector: the encoded species column.
y = iris.loc[:, 'Species'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [154]:
def entropy(y):
    """Return the Shannon entropy (base 2) of the labels in ``y``."""
    # Only the per-class counts matter; the class values themselves are discarded.
    _, class_counts = np.unique(y, return_counts=True)
    class_probs = class_counts / len(y)
    return -(class_probs * np.log2(class_probs)).sum()

def information_gain(X_column, y, threshold):
    """
    Information gain obtained by splitting the labels at a threshold on one feature.
    X_column: The feature values, one per sample
    y: The target labels, aligned with X_column
    threshold: Values <= threshold go to the left side, values > threshold to the right
    """
    goes_left = X_column <= threshold
    goes_right = X_column > threshold
    y_left, y_right = y[goes_left], y[goes_right]

    total = len(y)
    entropy_left = entropy(y_left)
    entropy_right = entropy(y_right)

    # Entropy of the two sides, each weighted by its share of the samples
    entropy_after_split = (
        (len(y_left) / total) * entropy_left
        + (len(y_right) / total) * entropy_right
    )

    # Gain = entropy before the split minus weighted entropy after it
    return entropy(y) - entropy_after_split

def find_best_split(X, y):
    """
    Find the best feature and threshold to split on
    X: Features dataset (2-D array: one row per sample, one column per feature)
    y: Target column

    Returns a (feature_index, threshold) tuple for the candidate with the
    highest information gain, or (None, None) when no candidate threshold
    exists (for example when every feature holds a single distinct value).
    """
    best_feature = None
    best_threshold = None
    best_info_gain = -1
    for feature in range(X.shape[1]):  # Loop over each feature
        X_column = X[:, feature]
        # np.unique returns the values sorted. The largest one is dropped as a
        # candidate: `X_column <= max` sends every sample left and leaves the
        # right side empty, which is not a real split and would make the tree
        # builder recurse on an empty subset. For NaN-free data every remaining
        # threshold leaves at least one sample on each side.
        thresholds = np.unique(X_column)[:-1]

        for threshold in thresholds:
            gain = information_gain(X_column, y, threshold)
            # Strict comparison: on ties the first candidate found is kept
            if gain > best_info_gain:
                best_feature = feature
                best_threshold = threshold
                best_info_gain = gain

    return best_feature, best_threshold


In [155]:
class DecisionTree:
    """
    Decision-tree classifier grown by recursive information-gain splits.

    After fit(), ``self.tree`` holds the root of the tree: either a dict with
    the keys "feature", "threshold", "left" and "right", or a bare class label
    when the training data was not split at all.
    """

    def __init__(self, max_depth=None):
        # Depth at which splitting stops; None means no depth limit.
        self.max_depth = max_depth
        # Root of the fitted tree; stays None until fit() has been called.
        self.tree = None

    def fit(self, X, y, depth=0):
        """
        Fit the training data to build the decision tree
        X: Features dataset (2-D, one row per sample)
        y: Target column
        depth: Depth assigned to the root (compared against max_depth)

        Returns the fitted tree, which is also stored in ``self.tree``.
        Raises ValueError when X contains no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        # Store the root exactly once, whatever its type. This also covers the
        # case where the root is a leaf (pure labels, max_depth reached), which
        # would otherwise leave self.tree unset or stale from an earlier fit.
        self.tree = self._build(X, y, depth)
        return self.tree

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in y (smallest label wins ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _build(self, X, y, depth):
        """Recursively grow the subtree for the samples (X, y) sitting at `depth`."""
        n_samples = X.shape[0]
        n_classes = len(np.unique(y))
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth

        # Stopping criteria
        if n_classes == 1 or n_samples <= 1 or depth_exhausted:
            return self._majority_class(y)

        # Find the best feature and threshold to split on
        best_feature, best_threshold = find_best_split(X, y)

        if best_feature is None:  # If no split is found, return the majority class
            return self._majority_class(y)

        # Split the data into left and right subsets
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        # A split with an empty side makes no progress: one child would receive
        # the same samples again and the other would be empty. Stop with a leaf.
        if not left_indices.any() or not right_indices.any():
            return self._majority_class(y)

        # Dictionary representation of this node; children are built left first
        return {
            "feature": best_feature,
            "threshold": best_threshold,
            "left": self._build(X[left_indices], y[left_indices], depth + 1),
            "right": self._build(X[right_indices], y[right_indices], depth + 1),
        }

    def predict_one(self, x, tree):
        """Predict a single sample by walking `tree` from its root down to a leaf"""
        # Anything that is not a dict is a leaf holding the class label
        if not isinstance(tree, dict):
            return tree

        feature = tree["feature"]
        threshold = tree["threshold"]

        if x[feature] <= threshold:
            return self.predict_one(x, tree["left"])
        else:
            return self.predict_one(x, tree["right"])

    def predict(self, X):
        """
        Predict multiple samples

        Returns a list with one predicted label per row of X.
        Raises RuntimeError when called before fit().
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return [self.predict_one(x, self.tree) for x in X]

In [156]:
# Build a tree limited to depth 3 and train it on the training split.
tree = DecisionTree(max_depth=3)
# fit() returns the tree structure; as the cell's last expression it is displayed below.
tree.fit(X_train, y_train)

{'feature': 2,
 'threshold': 1.9,
 'left': 0,
 'right': {'feature': 2,
  'threshold': 4.7,
  'left': {'feature': 3, 'threshold': 1.5, 'left': 1, 'right': 2},
  'right': {'feature': 2, 'threshold': 5.1, 'left': 2, 'right': 2}}}

In [157]:
# Predict a class label for every test sample; predict() returns a plain Python list.
y_pred = tree.predict(X_test)

In [158]:
# Fraction of test samples whose predicted label equals the actual label
accuracy = accuracy_score(y_test, y_pred)

# One print call, one line per item (sep="\n" keeps the three-line layout)
print(
    f"Predictions: {y_pred}",
    f"Actual Labels: {y_test}",
    f"Accuracy: {accuracy * 100:.2f}%",
    sep="\n",
)

Predictions: [1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 2, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2, 1, 0, 0]
Actual Labels: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
Accuracy: 93.33%


In [159]:
# Print a header, then the per-class precision / recall / f1-score / support table
# comparing the actual test labels with the predictions.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.77      0.87        13
           2       0.81      1.00      0.90        13

    accuracy                           0.93        45
   macro avg       0.94      0.92      0.92        45
weighted avg       0.95      0.93      0.93        45



In [160]:
# Print a header, then the confusion matrix of actual vs. predicted test labels
# (scikit-learn's convention: rows are actual classes, columns are predicted classes).
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[19  0  0]
 [ 0 10  3]
 [ 0  0 13]]
