## Model

In [2]:
import numpy as np

def node_score_gini(probs):
    '''
    Compute Gini impurity for a probability vector.

    Parameters
    ----------
    probs: 1D numpy array
           Class probabilities p_k for a node, summing to 1.

    Returns
    -------
    float
        Gini impurity G = 1 - sum_k p_k^2.
    '''
    if probs.size == 0:
        return 0.0
    return 1.0 - np.sum(probs ** 2)


def node_score_entropy(probs):
    '''
    Compute Entropy impurity for a probability vector.

    Parameters
    ----------
    probs: 1D numpy array
        Class probabilities p_k for a node, summing to 1.

    Returns
    -------
    float
        Entropy H = - sum_k p_k log(p_k) with the convention 0 * log 0 = 0.
    '''
    if probs.size == 0:
        return 0.0
    mask = probs > 0.0
    if not np.any(mask):
        return 0.0
    p = probs[mask]
    return -np.sum(p * np.log(p))


def _class_counts(y, n_classes):
    '''
    Count how many examples of each class appear in y.

    Parameters
    ----------
    y: 1D numpy array, shape (n_samples,)
        Class labels for the samples in a node.
    n_classes: int
        Total number of distinct classes in the task.

    Returns
    -------
    counts: 1D numpy array, shape (n_classes,)
        counts[k] = number of samples of class k.
    '''
    return np.bincount(y, minlength=n_classes)


def _to_probs(counts):
    '''
    Convert class counts to probabilities.

    Parameters
    ----------
    counts: 1D numpy array
        Class counts at a node.

    Returns
    -------
    probs: 1D numpy array
        Class probabilities p_k = counts[k] / sum(counts).
        Returns all zeros if the node is empty.
    '''
    total = counts.sum()
    if total == 0:
        return np.zeros_like(counts, dtype=float)
    return counts.astype(float) / float(total)


class Node:
    '''
    Helper structure representing a single node in the CART tree.

    Attributes
    ----------
    depth: int
        Depth of the node (root has depth 0).
    is_leaf: bool
        Whether this node is a leaf.
    feature_index: int or None
        Index of feature used to split at this node (None for leaves).
    threshold: float or None
        Threshold value t for the split x_f <= t (None for leaves).
    left: Node or None
        Left child (samples with x_f <= t).
    right: Node or None
        Right child (samples with x_f > t).
    class_counts : 1D numpy array
        Counts of each class for samples reaching this node.
    proba: 1D numpy array
        Empirical class probability vector at this node.
    prediction: int
        Predicted class label at this node (argmax of 'proba').
    n_samples: int
        Number of samples that reached this node.
    '''

    def __init__(self, depth, class_counts):
        self.depth = depth
        self.is_leaf = True    
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None

        self.class_counts = class_counts.astype(int)
        self.n_samples = int(class_counts.sum())

        self.proba = _to_probs(self.class_counts)
        self.prediction = int(np.argmax(self.proba))


class DecisionTreeCART:
    '''
    CART (Classification and Regression Tree) classifier implemented from scratch.

    Representation (project definition):

        - Domain:   each sample X_i is an n-dimensional feature vector
        - Labels:   Y = {0, 1, ..., K-1}
        - Training data: D = {(x_i, y_i)}_{i=1}^N
        - Output:  a binary decision tree of depth at most 'max_depth'
                   Each leaf stores an empirical class probability vector,
                   and predictions are argmax_k p_k at the leaf.

    Parameters
    ----------
    max_depth: int or None
        Maximum depth of the tree (root has depth 0). If None, the tree 
        can grow until all leaves are pure or no further split improves impurity.
    min_samples_split: int
        Minimum number of samples required at a node to consider splitting it.
    impurity: str
        Impurity measure to minimize at each split. Either 'gini' or 'entropy.'
    '''

    def __init__(self, max_depth=None, min_samples_split=2, impurity='gini'):
        self.max_depth = max_depth
        self.min_samples_split = max(min_samples_split, 2)
        self.impurity_name = impurity
        self.n_classes_ = None
        self.n_features_ = None
        self.root_ = None

    def fit(self, X, y):
        '''
        Train the CART classifier on labeled data.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            Training feature matrix.
        y: array-like of shape (n_samples,)
            Training labels in {0, 1, ..., K-1}.

        Returns
        -------
        self: DecisionTreeCART
            Fitted estimator.
        '''
        X = np.asarray(X)
        y = np.asarray(y, dtype=int)

        n_samples, n_features = X.shape
        self.n_features_ = n_features
        self.n_classes_ = int(np.max(y)) + 1

        indices = np.arange(n_samples)

        self.root_ = self._build_tree(X, y, indices, depth=0)
        return self

    def predict(self, X):
        '''
        Predict class labels for a matrix of input samples.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)

        Returns
        -------
        y_pred: numpy array of shape (n_samples,)
            Predicted class label for each sample.
        '''
        X = np.asarray(X)
        preds = [self._predict_one(x, self.root_) for x in X]
        return np.array(preds, dtype=int)

    def predict_proba(self, X):
        '''
        Predict class probabilities for each sample.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)

        Returns
        -------
        proba: numpy array of shape (n_samples, n_classes)
            probas[i, k] = estimated probability of class k for sample i.
        '''
        X = np.asarray(X)
        probas = [self._predict_proba_one(x, self.root_) for x in X]
        return np.vstack(probas)

    def accuracy(self, X, y):
        '''
        Compute classification accuracy on a dataset.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
        y: array-like of shape (n_samples,)

        Returns
        -------
        float
            Fraction of correct predictions (between 0 and 1).
        '''
        y = np.asarray(y, dtype=int)
        y_pred = self.predict(X)
        return np.mean(y_pred == y)

    def loss(self, X, y):
        '''
        Compute misclassification loss on a dataset.

        Loss is defined as the fraction of incorrectly classified samples:

            loss = 1 - accuracy

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
        y: array-like of shape (n_samples,)

        Returns
        -------
        float
            Misclassification rate (between 0 and 1).
        '''
        return 1.0 - self.accuracy(X, y)

    # --------------------------------------------------------
    # Tree construction
    # --------------------------------------------------------

    def _build_tree(self, X, y, indices, depth):
        '''
        Recursively grow the tree.

        Parameters
        ----------
        X: numpy array, shape (n_samples, n_features)
            Full training feature matrix.
        y: numpy array, shape (n_samples,)
            Full training label vector.
        indices: 1D numpy array
            Indices of training samples that reach this node.
        depth: int
            Depth of the current node.

        Returns
        -------
        node : Node
            The constructed (sub)tree root for this subset.
        '''
        y_node = y[indices]
        counts = _class_counts(y_node, self.n_classes_)
        node = Node(depth=depth, class_counts=counts)

        # Stopping criteria
        if self._is_terminal(node, indices, y_node):
            return node

        # Find best split using our optimizer
        best_feature, best_threshold, best_gain = self._best_split(X, y_node, indices)

        if best_feature is None or best_gain <= 0.0:
            # No split improves impurity -> keep as leaf
            return node

        # Turn node into an internal split node
        node.is_leaf = False
        node.feature_index = best_feature
        node.threshold = best_threshold

        feature_values = X[indices, best_feature]
        left_mask = feature_values <= best_threshold
        right_mask = ~left_mask

        left_indices = indices[left_mask]
        right_indices = indices[right_mask]

        node.left = self._build_tree(X, y, left_indices, depth + 1)
        node.right = self._build_tree(X, y, right_indices, depth + 1)

        return node

    def _is_terminal(self, node, indices, y_node):
        '''
        Check whether a node should stop splitting.

        Parameters
        ----------
        node: Node
            Current node.
        indices: 1D numpy array
            Indices of samples reaching this node.
        y_node: 1D numpy array
            Labels of samples at this node.

        Returns
        -------
        bool
            True if the node should remain a leaf, False otherwise.

        Stopping rules:

            1. Node is empty (no samples).
            2. All samples at the node share the same class label.
            3. Node depth reached max_depth (if max_depth is set).
            4. Number of samples is smaller than 'min_samples_split'.
        '''
        n_samples = indices.size

        # 1. empty node
        if n_samples == 0:
            return True

        # 2. pure node (all same label)
        if np.unique(y_node).size == 1:
            return True

        # 3. max depth reached
        if self.max_depth is not None and node.depth >= self.max_depth:
            return True

        # 4. not enough samples to split further
        if n_samples < self.min_samples_split:
            return True

        return False

    def _best_split(self, X, y_node, indices):
        '''
        Find the best (feature, threshold) split for a node.

        Follows the project pseudo-code:

            best_gain <- 0
            best_feature, best_threshold <- None

            for each feature f in F:
                for each possible threshold t in f:
                    Split S into S_left and S_right using (f, t)
                    if either split is empty: continue
                    gain <- Impurity(S)
                         - (|S_left| / |S|) * Impurity(S_left)
                         - (|S_right| / |S|) * Impurity(S_right)
                    if gain > best_gain:
                        best_gain <- gain
                        best_feature <- f
                        best_threshold <- t

            return (best_feature, best_threshold)

        Parameters
        ----------
        X: numpy array, shape (n_samples, n_features)
            Full feature matrix.
        y_node: numpy array, shape (n_node_samples,)
            Labels of samples reaching this node.
        indices: 1D numpy array
            Indices of samples forming the current dataset S.

        Returns
        -------
        best_feature: int or None
        best_threshold: float or None
        best_gain: float
        '''
        n_node_samples = indices.size
        if n_node_samples == 0:
            return None, None, 0.0

        # Parent impurity
        parent_counts = _class_counts(y_node, self.n_classes_)
        parent_probs = _to_probs(parent_counts)
        parent_impurity = self._impurity(parent_probs)

        best_gain = 0.0
        best_feature = None
        best_threshold = None

        # Iterate over all features f in F
        for f in range(self.n_features_):
            feature_values = X[indices, f]

            unique_vals = np.unique(feature_values)
            if unique_vals.size <= 1:
                # No meaningful split on this feature
                continue

            # Candidate thresholds: midpoints between unique sorted values
            thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2.0

            for t in thresholds:
                left_mask = feature_values <= t
                right_mask = ~left_mask

                if not np.any(left_mask) or not np.any(right_mask):
                    continue

                left_indices = indices[left_mask]
                right_indices = indices[right_mask]

                y_left = y_node[left_mask]
                y_right = y_node[right_mask]

                left_counts = _class_counts(y_left, self.n_classes_)
                right_counts = _class_counts(y_right, self.n_classes_)

                left_probs = _to_probs(left_counts)
                right_probs = _to_probs(right_counts)

                impur_left = self._impurity(left_probs)
                impur_right = self._impurity(right_probs)

                w_left = float(left_indices.size) / float(n_node_samples)
                w_right = float(right_indices.size) / float(n_node_samples)

                gain = parent_impurity - w_left * impur_left - w_right * impur_right

                if gain > best_gain:
                    best_gain = gain
                    best_feature = f
                    best_threshold = t

        return best_feature, best_threshold, best_gain

    def _impurity(self, probs):
        '''
        Dispatch to the chosen impurity function.

        Parameters
        ----------
        probs: 1D numpy array
            Class probability vector for a node.

        Returns
        -------
        float
            Impurity value according to self.impurity_name.
        '''
        if self.impurity_name == 'gini':
            return node_score_gini(probs)
        else:
            return node_score_entropy(probs)

    def _predict_one(self, x, node):
        '''
        Predict class label for a single sample by traversing the tree.

        Parameters
        ----------
        x:1D numpy array of shape (n_features,)
            Feature vector for one sample.
        node: Node
            Current node in the tree.

        Returns
        -------
        int
            Predicted class label.
        '''
        while not node.is_leaf:
            f = node.feature_index
            t = node.threshold
            if x[f] <= t:
                node = node.left
            else:
                node = node.right
        return node.prediction

    def _predict_proba_one(self, x, node):
        '''
        Predict class probabilities for a single sample.

        Parameters
        ----------
        x: 1D numpy array of shape (n_features,)
            Feature vector for one sample.
        node:Node
            Current node in the tree.

        Returns
        -------
        probs: 1D numpy array of shape (n_classes,)
            Empirical class probabilities stored at the leaf.
        '''
        while not node.is_leaf:
            f = node.feature_index
            t = node.threshold
            if x[f] <= t:
                node = node.left
            else:
                node = node.right
        return node.proba


## Check Model

In [3]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import random
import pytest

# breast_cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target  # 0/1

# split train / test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
    stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Class counts:", np.bincount(y))

X_train shape: (398, 30)
X_test shape: (171, 30)
Class counts: [212 357]


### One update: Comment src.cart import DecisionTreeCART (pls correct it if needed, if not, just delete this and the comment)

In [4]:
#from src.cart import DecisionTreeCART

def make_cart_model(max_depth=5, min_samples_split=2, impurity='gini'):
    '''
    Helper function to initialize our CART model.
    '''
    model = DecisionTreeCART(max_depth=max_depth,
        min_samples_split=min_samples_split,
        impurity=impurity
        )
    return model


### Test 1 — train() runs correctly

In [5]:
# Test 1
model = make_cart_model()
model.fit(X_train, y_train)

print("Test 1 passed: train() runs without error.")

Test 1 passed: train() runs without error.


### Test 2 — predict() shape & class values check

In [6]:
# Test 2: predict() should produce outputs with correct shape and valid class values

model = make_cart_model()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)

# Check output shape
assert y_pred_train.shape == y_train.shape, \
    f"Prediction shape mismatch: y_pred shape={y_pred_train.shape}, y_train shape={y_train.shape}"

# Check value range (breast_cancer is a binary classification dataset)
unique_vals = np.unique(y_pred_train)
assert set(unique_vals).issubset({0, 1}), \
    f"Predicted values must be 0/1. Found values: {unique_vals}"

train_acc = accuracy_score(y_train, y_pred_train)
print(f"Test 2 passed: predict() shape & value checks passed. Train accuracy = {train_acc:.3f}")

test_acc = model.accuracy(X_test, y_test)
print(f"Test accuracy = {test_acc:.3f}")


Test 2 passed: predict() shape & value checks passed. Train accuracy = 0.997
Test accuracy = 0.918


### Test 3 — loss() returns a finite scalar

In [7]:
# Test 3: loss() should return a finite scalar value

model = make_cart_model()
model.fit(X_train, y_train)

train_loss = model.loss(X_train, y_train)

assert np.isscalar(train_loss), "loss() should return a scalar value."
assert np.isfinite(train_loss), "loss() should not return NaN or infinity."

print(f"Test 3 passed: loss() returns a valid finite scalar. Train loss = {train_loss:.6f}")


Test 3 passed: loss() returns a valid finite scalar. Train loss = 0.002513


Our loss function is defined as the misclassification error rate, therefore it should be a scalar between 0 and 1.

### Test 4: Edge case testing

In [8]:
# A small toy dataset for edge case testing
X_toy = np.array([
    [0.0, 0.0],
    [0.0, 1.0],
    [1.0, 0.0],
    [1.0, 1.0],
])
y_toy = np.array([0, 0, 1, 1])

print("X_toy:\n", X_toy)
print("y_toy:", y_toy)


X_toy:
 [[0. 0.]
 [0. 1.]
 [1. 0.]
 [1. 1.]]
y_toy: [0 0 1 1]


#### Test 4.1 — Edge case: all labels are the same (0)

In [9]:
# Test 4.1: all labels are zero (only one class present)

model_zero = make_cart_model()

y_all_zero = np.zeros_like(y_toy)
model_zero.fit(X_toy, y_all_zero)

y_pred_zero = model_zero.predict(X_toy)
loss_zero = model_zero.loss(X_toy, y_all_zero)

assert y_pred_zero.shape == y_all_zero.shape
assert np.isfinite(loss_zero)

print("Test 4.1 passed: all-zero labels edge case handled correctly.")
print("Predicted labels:", y_pred_zero)
print("Loss on all-zero labels:", loss_zero)


Test 4.1 passed: all-zero labels edge case handled correctly.
Predicted labels: [0 0 0 0]
Loss on all-zero labels: 0.0


#### Test 4.2 — Edge case: single feature only

In [10]:
# Test 4.2: dataset contains only one feature

model_single = make_cart_model()

X_single = X_toy[:, :1]  # Use only the first feature
model_single.fit(X_single, y_toy)

y_pred_single = model_single.predict(X_single)
assert y_pred_single.shape == y_toy.shape

print("Test 4.2 passed: single-feature edge case handled correctly.")

assert np.array_equal(y_pred_single, y_toy)
print("Predicted labels:", y_pred_single)


Test 4.2 passed: single-feature edge case handled correctly.
Predicted labels: [0 0 1 1]


#### Test 4.3 — Edge case: all-zero features X=0

In [11]:
# Test 4.3: all feature values are zero

model_feat_zero = make_cart_model()

X_zeros = np.zeros_like(X_toy)
model_feat_zero.fit(X_zeros, y_toy)

y_pred_zeros = model_feat_zero.predict(X_zeros)
loss_zeros = model_feat_zero.loss(X_zeros, y_toy)

assert y_pred_zeros.shape == y_toy.shape
assert np.isfinite(loss_zeros)

print("Test 4.3 passed: all-zero features edge case handled correctly.")
print("Predicted labels:", y_pred_zeros)
print("Loss on all-zero features:", loss_zeros)


Test 4.3 passed: all-zero features edge case handled correctly.
Predicted labels: [0 0 0 0]
Loss on all-zero features: 0.5


### (Updated--more strict thresholds & add the case when impurity='entropy'& goal description)
### Test 5: Reproduce sklearn’s DecisionTreeClassifier 

**（To be discussed/checked）其实我是set_random_seed=3的时候才会完全一样，不知道这个会不会是一个concern**

In [14]:
# Test 5.1 Compare our predictions with sklearn's CART classifier
# Sklearn CART (using Gini impurity)
# Goal of this test:
#   This test evaluates whether our CART implementation can
#   successfully reproduce the behavior of sklearn’s
#   DecisionTreeClassifier when trained on the same dataset
#   with identical hyperparameters.

sk_cart = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_split=2,
    random_state=3
)

sk_cart.fit(X_train, y_train)

y_pred_sk = sk_cart.predict(X_test)
sk_acc = accuracy_score(y_test, y_pred_sk)

print(f"Sklearn CART test accuracy: {sk_acc:.6f}")

# Our own CART implementation
my_cart = make_cart_model()
my_cart.fit(X_train, y_train)

y_pred_my = my_cart.predict(X_test)
my_acc = accuracy_score(y_test, y_pred_my)

print(f"Our CART test accuracy: {my_acc:.6f}")


same_predictions = np.array_equal(y_pred_sk, y_pred_my)
acc_diff = abs(sk_acc - my_acc)

print("Same predictions as sklearn?", same_predictions)
print(f"Absolute accuracy difference: {acc_diff:.8f}")

# Depending on implementation details, exact matching or near-matching are acceptable.
assert my_acc == pytest.approx(sk_acc, abs=1e-12), \
    "Our CART accuracy should be extremely close to sklearn's CART accuracy."

print("Test 5 passed: our CART implementation matches sklearn CART (using 'gini' impurity).\n")

#===============================================================================================================

# Test 5.2 Compare our predictions with sklearn's CART classifier
# Sklearn CART (using entropy impurity)

sk_cart = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_split=2,
    random_state=3
)

sk_cart.fit(X_train, y_train)

y_pred_sk = sk_cart.predict(X_test)
sk_acc = accuracy_score(y_test, y_pred_sk)

print(f"Sklearn CART test accuracy: {sk_acc:.6f}")

# Our own CART implementation
my_cart = make_cart_model(impurity='entropy')
my_cart.fit(X_train, y_train)

y_pred_my = my_cart.predict(X_test)
my_acc = accuracy_score(y_test, y_pred_my)

print(f"Our CART test accuracy: {my_acc:.6f}")


same_predictions = np.array_equal(y_pred_sk, y_pred_my)
acc_diff = abs(sk_acc - my_acc)

print("Same predictions as sklearn?", same_predictions)
print(f"Absolute accuracy difference: {acc_diff:.8f}")

# Depending on implementation details, exact matching or near-matching are acceptable.
assert my_acc == pytest.approx(sk_acc, abs=1e-12), \
    "Our CART accuracy should be extremely close to sklearn's CART accuracy."

print("Test 5 passed: our CART implementation matches sklearn CART (using 'entropy' impurity).")


Sklearn CART test accuracy: 0.918129
Our CART test accuracy: 0.918129
Same predictions as sklearn? False
Absolute accuracy difference: 0.00000000
Test 5 passed: our CART implementation matches sklearn CART (using 'gini' impurity).

Sklearn CART test accuracy: 0.929825
Our CART test accuracy: 0.929825
Same predictions as sklearn? True
Absolute accuracy difference: 0.00000000
Test 5 passed: our CART implementation matches sklearn CART (using 'entropy' impurity).


### (Updated)Test 6: Node_score test (impurity_calculation test)

In [17]:
# Tests 6: Our CART impurity vs. sklearn impurity
# Goal of this block:
#   For several different label distributions, compare our
#   node_score_gini and node_score_entropy against sklearn's
#   impurity values at a root-only tree. This directly unit-tests
#   our impurity functions and checks that they match sklearn
#   (up to log-base for entropy).


def sklearn_impurity(y, criterion):
    '''
    Compute sklearn impurity for labels y.
    We force a ROOT-ONLY tree (no splits) by:
      - Using a dummy constant feature X_dummy
      - Setting min_samples_split > n_samples to prevent splitting.

    Parameters
    ----------
    y: array-like labels for the samples.
    criterion : {'gini', 'entropy'}
        Impurity measure to use internally in sklearn.

    Returns
    -------
    float
        Impurity value stored at the root node.
    '''
    y = np.asarray(y)
    X_dummy = np.zeros((len(y), 1))  

    clf = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=1,       
        min_samples_split=len(y) + 1, 
        random_state=0
    )
    clf.fit(X_dummy, y)

    return clf.tree_.impurity[0]
    
#===============================================================================================================
# Test 6.1: Pure node (all labels identical)
# Edge case: impurity should be zero for both gini and entropy.

y_pure = [0, 0, 0, 0]
sk_gini = sklearn_impurity(y_pure, 'gini')
sk_entropy = sklearn_impurity(y_pure, 'entropy')

n_classes = len(np.unique(y_pure))
counts = _class_counts(np.array(y_pure), n_classes)
probs   = _to_probs(counts)

assert node_score_gini(probs) == pytest.approx(sk_gini, abs=1e-12)
# Convert sklearn's base-2 entropy to natural log (based on our definition from the reference)
assert node_score_entropy(probs) == pytest.approx(sk_entropy * np.log(2), abs=1e-12)

#===============================================================================================================
# Test 6.2: Balanced labels (50/50)
# Edge case: maximal impurity for 2 classes (gini=0.5, entropy=1 bit).

y_bal = [0, 1]
sk_gini = sklearn_impurity(y_bal, 'gini')
sk_entropy = sklearn_impurity(y_bal, 'entropy')

n_classes = len(np.unique(y_bal))
counts = _class_counts(np.array(y_bal), n_classes)
probs   = _to_probs(counts)

assert node_score_gini(probs) == pytest.approx(sk_gini, abs=1e-12)
assert node_score_entropy(probs) == pytest.approx(sk_entropy * np.log(2), abs=1e-12)

#===============================================================================================================
# Test 6.3: Skewed labels (e.g., 2 zeros, 3 ones)
# Regular case: impurity should lie strictly between 0 and the balanced-case maximum.

y_skew = [0, 0, 1, 1, 1]
sk_gini = sklearn_impurity(y_skew, 'gini')
sk_entropy = sklearn_impurity(y_skew, 'entropy')

n_classes = len(np.unique(y_skew))
counts = _class_counts(np.array(y_skew), n_classes)
probs   = _to_probs(counts)

assert node_score_gini(probs) == pytest.approx(sk_gini, abs=1e-12)
assert node_score_entropy(probs) == pytest.approx(sk_entropy * np.log(2), abs=1e-12)

#===============================================================================================================
# Test 6.4: Three-class labels
# Multi-class case: checks that our impurity generalizes beyond binary labels.

y_three = [0, 1, 2, 2, 2, 1]
sk_gini = sklearn_impurity(y_three, "gini")
sk_entropy = sklearn_impurity(y_three, "entropy")

n_classes = len(np.unique(y_three))
counts = _class_counts(np.array(y_three), n_classes)
probs   = _to_probs(counts)

assert node_score_gini(probs) == pytest.approx(sk_gini, abs=1e-12)
assert node_score_entropy(probs) == pytest.approx(sk_entropy * np.log(2), abs=1e-12)

print("Impurity tests passed: our impurity calculations match sklearn exactly (Gini) and up to log-base (Entropy).")


Impurity tests passed: our impurity calculations match sklearn exactly (Gini) and up to log-base (Entropy).


In [None]:
# import numpy as np
# data = np.genfromtxt("breast_cancer.csv", delimiter=",", dtype=str, skip_header=1)
# diagnosis = data[:, 1]
# X = data[:, 2:].astype(float)
# y = (diagnosis == "M").astype(int)

# print("X shape:", X.shape)       

X shape: (569, 30)


In [None]:
# import random
# random.seed(0)
# n = len(y)
# split1 = int(0.6 * n)
# split2 = int(0.8 * n)

# X_train, y_train = X[:split1], y[:split1]
# X_valid, y_valid = X[split1:split2], y[split1:split2]
# X_test, y_test = X[split2:], y[split2:]


# from src.cart import DecisionTreeCART
# tree = DecisionTreeCART(max_depth=40, min_samples_split=2)
# tree.fit(X_train, y_train)

# print("train acc:", tree.accuracy(X_train, y_train))
# print("val acc:", tree.accuracy(X_valid, y_valid))
# print("test acc:", tree.accuracy(X_test, y_test))

# tree.print_tree()


train acc: 1.0
val acc: 0.9298245614035088
test acc: 0.8245614035087719
--- CART TREE ---
[Feature 22 <= 105.1500] gain=0.3611

  [Feature 24 <= 0.1759] gain=0.0592

    [Feature 0 <= 14.9800] gain=0.0106

      [Feature 27 <= 0.1807] gain=0.0108

        [Feature 20 <= 15.7250] gain=0.0043

          [Feature 12 <= 4.1055] gain=0.0058

            [Feature 21 <= 33.1050] gain=0.0030

              Leaf(label=1, samples=?)
              [Feature 0 <= 12.0450] gain=0.3750

                Leaf(label=1, samples=?)
                Leaf(label=0, samples=?)
            [Feature 0 <= 12.2650] gain=0.5000

              Leaf(label=0, samples=?)
              Leaf(label=1, samples=?)
          [Feature 8 <= 0.1782] gain=0.3457

            Leaf(label=1, samples=?)
            Leaf(label=0, samples=?)
        Leaf(label=0, samples=?)
      Leaf(label=0, samples=?)
    Leaf(label=0, samples=?)
  [Feature 22 <= 114.4500] gain=0.0418

    [Feature 1 <= 19.7100] gain=0.3174

      [Feature 0 <= 14.