In [64]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Implement CART (Classification and Regression Trees)
class Node:
    def __init__(self, depth, max_depth):
        self.depth = depth
        self.max_depth = max_depth
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
        self.value = None

class CART:
    def __init__(self, max_depth=None, min_samples_split=2, min_impurity=1e-7):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity

    def fit(self, X, y):
        self.n_classes = len(np.unique(y))
        self.n_features = X.shape[1]
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        unique_classes, class_counts = np.unique(y, return_counts=True)
        node = Node(depth, self.max_depth)

        if (len(unique_classes) == 1) or (depth == self.max_depth) or (n_samples < self.min_samples_split):
            node.value = unique_classes[np.argmax(class_counts)]
            return node

        best_gini = 1.0
        for feature_index in range(n_features):
            unique_values = np.unique(X[:, feature_index])
            for threshold in unique_values:
                left_mask = X[:, feature_index] <= threshold
                right_mask = X[:, feature_index] > threshold
                if len(y[left_mask]) > 0 and len(y[right_mask]) > 0:
                    gini_left = self._gini_impurity(y[left_mask])
                    gini_right = self._gini_impurity(y[right_mask])
                    gini = (len(y[left_mask]) / n_samples) * gini_left + (len(y[right_mask]) / n_samples) * gini_right
                    if gini < best_gini:
                        best_gini = gini
                        node.feature_index = feature_index
                        node.threshold = threshold
                        left_mask_best = left_mask
                        right_mask_best = right_mask

        if best_gini < self.min_impurity:
            node.value = unique_classes[np.argmax(class_counts)]
            return node

        node.left = self._build_tree(X[left_mask_best, :], y[left_mask_best], depth + 1)
        node.right = self._build_tree(X[right_mask_best, :], y[right_mask_best], depth + 1)
        return node

    def _gini_impurity(self, y):
        n_samples = len(y)
        if n_samples == 0:
            return 0.0
        unique_classes, class_counts = np.unique(y, return_counts=True)
        p = class_counts / n_samples
        return 1 - np.sum(p ** 2)

    def predict(self, X):
        predictions = [self._predict_tree(x) for x in X]
        return np.array(predictions)

    def _predict_tree(self, x, node=None):
        if node is None:
            node = self.tree
        if node.value is not None:
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self._predict_tree(x, node.left)
        else:
            return self._predict_tree(x, node.right)

# Load the Iris dataset
iris = load_breast_cancer()
X, y = iris.data, iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate CART
cart = CART(max_depth=5)
cart.fit(X_train, y_train)
y_pred_cart = cart.predict(X_test)

# Implement ID3 using scikit-learn
id3 = DecisionTreeClassifier(criterion="entropy", max_depth=5)
id3.fit(X_train, y_train)
y_pred_id3 = id3.predict(X_test)

# Implement C4.5 (using Gini index) using scikit-learn
c45 = DecisionTreeClassifier(criterion="gini", max_depth=5)
c45.fit(X_train, y_train)
y_pred_c45 = c45.predict(X_test)

# Evaluation metrics
def evaluate_model(model_name, y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    confusion = confusion_matrix(y_true, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Confusion Matrix:")
    print(confusion)
    print("\n")

evaluate_model("CART", y_test, y_pred_cart)
evaluate_model("ID3", y_test, y_pred_id3)
evaluate_model("C4.5 (Gini)", y_test, y_pred_c45)


Model: CART
Accuracy: 0.93
Precision: 0.93
Recall: 0.93
F1 Score: 0.93
Confusion Matrix:
[[39  4]
 [ 4 67]]


Model: ID3
Accuracy: 0.95
Precision: 0.95
Recall: 0.95
F1 Score: 0.95
Confusion Matrix:
[[39  4]
 [ 2 69]]


Model: C4.5 (Gini)
Accuracy: 0.94
Precision: 0.94
Recall: 0.94
F1 Score: 0.94
Confusion Matrix:
[[39  4]
 [ 3 68]]




**Steps**

*1-Data Loading and Splitting:*

-The code loads the Breast Cancer dataset using scikit-learn's load_breast_cancer.
-It splits the dataset into training and testing sets using train_test_split.

*2-Decision Tree Implementations:*

-The code implements a custom CART (Classification and Regression Trees) classifier, including a Node class and a CART class, following the Gini impurity for node splitting.
-It also implements decision trees using scikit-learn for both ID3 (entropy criterion) and C4.5 (Gini criterion) algorithms.

*3-Model Training:*

-The code trains each of the three decision tree models (CART, ID3, and C4.5) on the training data.

*4-Model Evaluation:*

The code defines an evaluate_model function to calculate various evaluation metrics for each model, including accuracy, precision, recall, F1-score, and the confusion matrix.
It applies this function to the testing data for each model.

*5-Results and Interpretation:*
-Results

-CART:The CART model achieved an accuracy of 93%, indicating that it correctly predicted the class labels for 93% of the test samples. The precision, recall, and F1 score are also approximately 93%, suggesting that the model provides a good balance between precision and recall. The confusion matrix shows that there were 39 true negatives, 67 true positives, 4 false positives, and 4 false negatives.

-ID3:The ID3 model achieved a higher accuracy of 95% compared to CART. It also has higher precision, recall, and F1 score. The confusion matrix shows that there were 39 true negatives, 69 true positives, 4 false positives, and 2 false negatives.

-C4.5:The C4.5 model achieved an accuracy of 94% and demonstrated good precision, recall, and F1 score. The confusion matrix indicates that there were 39 true negatives, 68 true positives, 4 false positives, and 3 false negatives.

-Interpretation

-Based on the results provided, the best model among CART, ID3, and C4.5 for the Breast Cancer classification task is ID3. It achieved the highest accuracy of 95%, making it the top-performing model in terms of overall classification accuracy.