<a href="https://colab.research.google.com/github/mlan18/ML-AND-DS-ASSIGNMENT1/blob/main/C4_5_Algo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd


class C45DecisionTreeClassifier:
    """Decision tree for categorical features that splits on gain ratio.

    Every distinct value of the chosen feature becomes its own branch
    (multi-way split). Numeric thresholds and pruning are not implemented.

    The fitted tree is stored in ``self.tree`` as either a bare class label
    (leaf) or a nested dict of the form
    ``{feature_index: {value: subtree, ...}, "_majority": label}``.

    Args:
        max_depth: Maximum number of splits along any path, or ``None`` for
            no limit.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    # --------------------------
    # 1. Calculate Entropy
    # --------------------------
    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        An empty ``y`` has entropy 0.
        """
        _, counts = np.unique(y, return_counts=True)
        if counts.size == 0:
            return 0.0
        probs = counts / counts.sum()
        # np.unique only reports values that occur, so every probability is
        # strictly positive and log2 is finite without any epsilon.
        # Adding 0.0 turns the -0.0 produced for a pure node into +0.0.
        return -np.sum(probs * np.log2(probs)) + 0.0

    # --------------------------
    # 2. Calculate Gain Ratio
    # --------------------------
    def gain_ratio(self, X_column, y):
        """Return the gain ratio obtained by splitting ``y`` on ``X_column``.

        Gain ratio is information gain divided by split information. A
        column that cannot split the data (no samples, or a single distinct
        value) has zero split information and yields 0.0.
        """
        X_column = np.asarray(X_column)
        y = np.asarray(y)
        n_samples = len(y)
        if n_samples == 0:
            return 0.0

        values, counts = np.unique(X_column, return_counts=True)
        proportions = counts / n_samples

        weighted_entropy = 0.0
        for value, proportion in zip(values, proportions):
            weighted_entropy += proportion * self.entropy(y[X_column == value])

        split_info = -np.sum(proportions * np.log2(proportions))
        if split_info <= 0:
            # Single-valued column: nothing to divide by, nothing gained.
            return 0.0

        info_gain = self.entropy(y) - weighted_entropy
        return info_gain / split_info

    # --------------------------
    # 3. Select Best Attribute
    # --------------------------
    def best_attribute(self, X, y, features):
        """Return the index in ``features`` with the highest gain ratio.

        Ties go to the feature listed first. Returns ``None`` when
        ``features`` is empty.
        """
        # Gain ratios are non-negative up to rounding error, so starting at
        # -1 means a feature is chosen even when no split has positive gain.
        best_gain_ratio = -1
        best_feature = None
        for feature in features:
            ratio = self.gain_ratio(X[:, feature], y)
            if ratio > best_gain_ratio:
                best_gain_ratio = ratio
                best_feature = feature
        return best_feature

    # --------------------------
    # 4. Build the Decision Tree
    # --------------------------
    def build_tree(self, X, y, features, depth=0):
        """Recursively build the tree for samples ``X`` with labels ``y``.

        Args:
            X: 2-D NumPy array of feature values, one row per sample.
            y: 1-D NumPy array of labels aligned with the rows of ``X``.
            features: Column indices still available for splitting.
            depth: Depth of the node being built (root is 0).

        Returns:
            A class label for a leaf, otherwise a dict
            ``{feature: {value: subtree}, "_majority": label}``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        classes, counts = np.unique(y, return_counts=True)
        if classes.size == 0:
            raise ValueError("cannot build a decision tree from an empty label array")
        majority_class = classes[np.argmax(counts)]

        # Stopping conditions
        if len(classes) == 1:
            return classes[0]
        if len(features) == 0 or (self.max_depth is not None and depth >= self.max_depth):
            return majority_class

        best_feat = self.best_attribute(X, y, features)
        if best_feat is None:
            return majority_class

        remaining_features = [f for f in features if f != best_feat]
        column = X[:, best_feat]

        branches = {}
        for value in np.unique(column):
            mask = column == value
            y_subset = y[mask]
            if len(y_subset) == 0:
                # Happens for values that do not compare equal to themselves
                # (e.g. NaN); fall back to this node's majority class.
                branches[value] = majority_class
            else:
                branches[value] = self.build_tree(
                    X[mask], y_subset, remaining_features, depth + 1
                )

        return {best_feat: branches, "_majority": majority_class}

    # --------------------------
    # 5. Fit the Model
    # --------------------------
    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.tree``.

        Args:
            X: 2-D array-like (ndarray, DataFrame, list of rows) of features.
            y: Array-like of labels, one per row of ``X``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-D, the lengths differ, or the
                training set is empty.
        """
        # Coerce once here so the recursive code can rely on ndarray indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )

        features = list(range(X.shape[1]))
        self.tree = self.build_tree(X, y, features)
        return self

    # --------------------------
    # 6. Predict
    # --------------------------
    def _predict_single(self, x, tree):
        """Walk ``tree`` for one sample ``x`` and return the predicted label."""
        if not isinstance(tree, dict):
            return tree
        # A node holds exactly one feature key next to the "_majority" entry.
        feature = next(key for key in tree if key != "_majority")
        branches = tree[feature]
        value = x[feature]
        if value in branches:
            return self._predict_single(x, branches[value])
        # Value never seen at this node during training.
        return tree["_majority"]

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("This classifier is not fitted yet; call fit() first.")
        X = np.asarray(X)
        return np.array([self._predict_single(row, self.tree) for row in X])




# --------------------------
# Example Dataset: "Buy Computer" (Play Tennis Style)
# --------------------------
# Fourteen labelled samples with four categorical features; 'Buy' is the target.
data = {
    'Age': ['Youth', 'Youth', 'Middle', 'Senior', 'Senior', 'Senior', 'Middle', 'Youth', 'Youth', 'Senior', 'Youth', 'Middle', 'Middle', 'Senior'],
    'Income': ['High', 'High', 'High', 'Medium', 'Low', 'Low', 'Low', 'Medium', 'Low', 'Medium', 'Medium', 'Medium', 'High', 'Medium'],
    'Student': ['No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No'],
    'Credit': ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', 'Excellent', 'Excellent', 'Fair', 'Fair', 'Fair', 'Excellent', 'Excellent', 'Fair', 'Excellent'],
    'Buy': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}


df = pd.DataFrame(data)


# Split into a feature matrix and a label vector, both as NumPy arrays.
X = df.drop('Buy', axis=1).values
y = df['Buy'].values


# --------------------------
# Train C4.5 Classifier
# --------------------------
clf = C45DecisionTreeClassifier()
clf.fit(X, y)


# --------------------------
# Display Decision Tree
# --------------------------
# Tree keys are column positions in X (0=Age, 1=Income, 2=Student, 3=Credit).
print("\nC4.5 Decision Tree:\n", clf.tree)


# --------------------------
# Test Predictions
# --------------------------
# Predictions are made on the training data itself, so this is a sanity
# check rather than an estimate of generalisation.
y_pred = clf.predict(X)
print("\nPredictions:", y_pred)
# y is already a NumPy array (taken from .values above), so print it directly;
# calling .values on it again raises AttributeError.
print("Actual:", y)
