# Decision Tree Classifier

### Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Prepare Data

In [None]:
def prepare_data():
    """
    Build the student training table as a pandas DataFrame.

    Each row holds a student's name, average score, hours studied, and
    whether they passed. The 'Passed' column is returned numerically
    encoded: 1 for 'Yes' and 0 for 'No'.
    """
    column_names = ['Student', 'Average Score', 'Hours Studied', 'Passed']
    records = [
        ('Jeremy', 65, 3, 'No'),
        ('Noah', 70, 1, 'Yes'),
        ('Alyssa', 82, 3, 'Yes'),
        ('Daniel', 67, 2, 'No'),
        ('Sofia', 92, 4, 'Yes'),
        ('Colin', 23, 1, 'No'),
        ('Gracie', 96, 5, 'Yes'),
        ('Anne', 47, 4, 'Yes'),
        ('Jamal', 75, 2, 'Yes'),
        ('Trisha', 46, 1, 'No'),
    ]
    students = pd.DataFrame(records, columns=column_names)

    # Turn the textual outcome into the 0/1 target used for training.
    outcome_codes = {'Yes': 1, 'No': 0}
    students['Passed'] = students['Passed'].map(outcome_codes)

    return students

### Calculate Entropy

In [None]:
def entropy(y):
    """
    Calculate the Shannon entropy (in bits) of a target variable 'y'.

    Uses the formula: H(y) = −∑ pᵢ log₂(pᵢ), where pᵢ is the relative
    frequency of the i-th distinct value in 'y'.

    Parameters:
        y: array-like of labels (e.g. a NumPy array or pandas Series).

    Returns:
        The entropy as a non-negative NumPy float; 0.0 when every label
        is the same.
    """
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)

    # np.unique only reports values that actually occur, so every
    # probability is strictly positive and log2 is always defined.
    # Adding 0.0 turns the "-0.0" produced for a pure label set into 0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

### Calculate Information Gain

In [None]:
def information_gain(X_column, y, threshold):
    """
    Calculate information gain for a split on a feature with a given threshold.

    Samples with a feature value <= threshold go to the left child and
    samples with a value > threshold go to the right child.

    Parameters:
        X_column: feature values, one per sample (NumPy array or pandas Series).
        y: target labels aligned with X_column.
        threshold: cut point applied to X_column.

    Returns:
        Parent entropy minus the size-weighted average of the two child
        entropies, as a float. Returns 0.0 when the threshold leaves one
        side empty, because such a split separates nothing.
    """
    # Split data into two parts based on the threshold
    left_mask = X_column <= threshold
    right_mask = X_column > threshold

    n = len(y)
    n_left, n_right = np.sum(left_mask), np.sum(right_mask)

    # Check for a degenerate split first so no entropy is computed for it,
    # and return a float so every code path yields the same type.
    if n_left == 0 or n_right == 0:
        return 0.0

    # Entropy before the split
    parent_entropy = entropy(y)

    # Entropy after the split (weighted sum of child entropies)
    left_entropy = entropy(y[left_mask])
    right_entropy = entropy(y[right_mask])
    weighted_avg_entropy = (n_left / n) * left_entropy + (n_right / n) * right_entropy

    # Information gain is the reduction in entropy
    return parent_entropy - weighted_avg_entropy

### Find Best Split

In [None]:
def find_best_split(X, y):
    """
    Search every feature column of X for the threshold with the highest
    information gain.

    Candidate thresholds are the midpoints between neighbouring distinct
    values of a column. Returns a (feature, threshold) pair, or
    (None, None) when no column offers a candidate threshold.
    """
    top_gain, top_feature, top_threshold = -1, None, None

    for name in X.columns:
        column = X[name]

        # np.unique already returns its values in ascending order.
        distinct = np.unique(column)
        lower, upper = distinct[:-1], distinct[1:]

        for cut in (lower + upper) / 2:
            score = information_gain(column, y, cut)

            # Strict comparison: on ties the earliest candidate is kept.
            if score > top_gain:
                top_gain, top_feature, top_threshold = score, name, cut

    return top_feature, top_threshold

### Decision Tree Classifier

In [None]:
class DecisionTreeClassifierCustom:
    """
    Minimal decision tree classifier that splits on information gain.

    The fitted tree is stored in ``self.tree`` as nested tuples of the form
    ``(feature, threshold, left_branch, right_branch)``; a leaf is a bare
    class label. Labels must be non-negative integers because the majority
    class is computed with ``np.bincount``.
    """

    def __init__(self, max_depth=2):
        """
        Parameters:
            max_depth: depth at which the tree stops splitting and emits a leaf.
        """
        self.max_depth = max_depth
        self.tree = None  # set by fit(); None means "not fitted yet"

    def fit(self, X, y):
        """
        Train the decision tree by recursively finding the best splits.

        Parameters:
            X: pandas DataFrame of features.
            y: labels aligned with the rows of X (non-negative integers).

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if y is empty (there is no majority class to predict).
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on empty data.")
        self.tree = self._build_tree(X, y, depth=0)
        return self

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in y (lowest label wins ties)."""
        return np.bincount(y).argmax()

    def _build_tree(self, X, y, depth):
        """
        Recursively build the decision tree.

        Returns a leaf label when the depth limit is reached, the labels are
        pure, or no split is available; otherwise returns a
        (feature, threshold, left_branch, right_branch) tuple.
        """
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return self._majority_class(y)

        feature, threshold = find_best_split(X, y)
        if feature is None:
            # No column has two distinct values left to split between.
            return self._majority_class(y)

        # Split data
        left_mask = X[feature] <= threshold
        right_mask = X[feature] > threshold

        # Recursively build the left and right branches
        left_branch = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_branch = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return (feature, threshold, left_branch, right_branch)

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
            X: pandas DataFrame containing the feature columns used by the tree.

        Returns:
            NumPy array with one predicted label per row of X.

        Raises:
            RuntimeError: if the classifier has not been fitted.
        """
        if self.tree is None:
            # Without this check every prediction would silently be None.
            raise RuntimeError(
                "This DecisionTreeClassifierCustom instance is not fitted yet; "
                "call fit() before predict()."
            )
        return np.array([self._predict_sample(sample) for sample in X.to_dict(orient='records')])

    def _predict_sample(self, sample):
        """
        Predict the class of a single sample by traversing the tree.

        'sample' is a mapping from feature name to value.
        """
        node = self.tree

        # Internal nodes are tuples; anything else is a leaf label.
        while isinstance(node, tuple):
            feature, threshold, left_branch, right_branch = node

            if sample[feature] <= threshold:
                node = left_branch
            else:
                node = right_branch

        return node

### Visualize Decision Tree and Data Points

In [None]:
def plot_tree_and_data(tree, X, y):
    """
    Show the samples as a scatter plot with the tree's split thresholds
    drawn on top.

    'X' must contain the 'Average Score' and 'Hours Studied' columns;
    'y' holds the 0/1 outcome used to colour the points.
    """
    # Human-readable legend entries instead of 0 / 1
    outcome_names = np.where(y == 1, 'Passed', 'Failed')

    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x=X['Average Score'],
        y=X['Hours Studied'],
        hue=outcome_names,
        palette='coolwarm',
        s=100,
    )

    # Overlay one dashed line per decision node
    _plot_tree_recursive(tree, X, y)

    # Axis labels and title, then render
    plt.xlabel("Average Score")
    plt.ylabel("Hours Studied")
    plt.title("Decision Tree Thresholds and Data Points")
    plt.show()

def _plot_tree_recursive(tree, X, y):
    """
    Recursively plot the decision boundaries for the decision tree.
    """
    if isinstance(tree, tuple):
        feature, threshold, left_branch, right_branch = tree
        
        # Plot decision boundary based on feature
        if feature == 'Average Score':
            plt.axvline(x=threshold, color='k', linestyle='--', lw=2)
        elif feature == 'Hours Studied':
            plt.axhline(y=threshold, color='k', linestyle='--', lw=2)
        
        # Recursively plot the left and right branches
        _plot_tree_recursive(left_branch, X[X[feature] <= threshold], y[X[feature] <= threshold])
        _plot_tree_recursive(right_branch, X[X[feature] > threshold], y[X[feature] > threshold])

### Print Tree and Main Function

In [None]:
def print_tree(node, depth=0):
    """
    Print the tree structure, one line per node, indented by depth.

    Decision nodes show their feature and threshold; leaves show
    'Passed' for label 1 and 'Failed' for anything else.
    """
    if not isinstance(node, tuple):
        # Leaf: translate the numeric label into text
        if node == 1:
            class_label = 'Passed'
        else:
            class_label = 'Failed'
        print(f"{'|   ' * depth}Depth {depth}: [Leaf Node] Class: {class_label}")
        return

    feature, threshold, left_branch, right_branch = node
    print(f"{'|   ' * depth}Depth {depth}: [Decision Node] Feature: {feature}, Threshold: {threshold}")
    for child in (left_branch, right_branch):
        print_tree(child, depth + 1)

def main():
    """
    Run the demo end to end: load the student data, train a depth-2
    tree, print its structure, and plot the thresholds over the data.
    """
    students = prepare_data()

    # Features (X) and target (y)
    feature_columns = ['Average Score', 'Hours Studied']
    X, y = students[feature_columns], students['Passed']

    model = DecisionTreeClassifierCustom(max_depth=2)
    model.fit(X, y)

    print("Decision Tree Structure:")
    print_tree(model.tree)

    plot_tree_and_data(model.tree, X, y)

# Run the demo only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()