## Model

## Check Model

In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()
X, y = data.data, data.target  # y holds the binary labels (0/1)

# Hold out 30% of the samples for testing. The split is stratified on y and
# uses a fixed seed so that it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Quick sanity check of partition sizes and class balance.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Class counts:", np.bincount(y))

X_train shape: (398, 30)
X_test shape: (171, 30)
Class counts: [212 357]


In [None]:
from src.cart import DecisionTreeCART

def make_cart_model(max_depth=5, min_samples_split=2):
    """
    Build a fresh instance of our CART model.

    Parameters
    ----------
    max_depth : int, default 5
        Forwarded unchanged to ``DecisionTreeCART(max_depth=...)``.
    min_samples_split : int, default 2
        Forwarded unchanged to ``DecisionTreeCART(min_samples_split=...)``.

    Returns
    -------
    DecisionTreeCART
        A newly constructed model on which ``fit`` has not been called.
        Calling this helper with no arguments yields the configuration
        (depth 5, split threshold 2) used by the tests in this notebook.
    """
    return DecisionTreeCART(max_depth=max_depth, min_samples_split=min_samples_split)


### Test 1 — train() runs correctly

In [None]:
# Test 1: smoke test. Fitting on the training split must finish without raising.
model = make_cart_model()
model.fit(X_train, y_train)

# Reached only if fit() did not raise.
print("Test 1 passed: train() runs without error.")

Test 1 passed: train() runs without error.


### Test 2 — predict() shape & class values check

In [None]:
# Test 2: predict() must return one label per sample, and every label must be 0 or 1

model = make_cart_model()
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)

# One prediction for every training label
shapes_match = y_pred_train.shape == y_train.shape
assert shapes_match, \
    f"Prediction shape mismatch: y_pred shape={y_pred_train.shape}, y_train shape={y_train.shape}"

# breast_cancer is a binary problem, so only the labels 0 and 1 are legal
unique_vals = np.unique(y_pred_train)
assert set(unique_vals) <= {0, 1}, \
    f"Predicted values must be 0/1. Found values: {unique_vals}"

# Report accuracy on the data the model was fitted on ...
train_acc = accuracy_score(y_train, y_pred_train)
print(f"Test 2 passed: predict() shape & value checks passed. Train accuracy = {train_acc:.3f}")

# ... and on the held-out split, using the model's own accuracy() method
test_acc = model.accuracy(X_test, y_test)
print(f"Test accuracy = {test_acc:.3f}")


Test 2 passed: predict() shape & value checks passed. Train accuracy = 0.997
Test accuracy = 0.918


### Test 3 — loss() returns a finite scalar

In [None]:
# Test 3: loss() should return a finite scalar value that is a valid error rate

model = make_cart_model()
model.fit(X_train, y_train)

train_loss = model.loss(X_train, y_train)

assert np.isscalar(train_loss), "loss() should return a scalar value."
assert np.isfinite(train_loss), "loss() should not return NaN or infinity."
# This notebook treats the loss as a misclassification error rate, i.e. a
# fraction of samples, so a value outside [0, 1] indicates a bug in loss().
assert 0.0 <= train_loss <= 1.0, \
    f"loss() should lie in [0, 1] for an error rate. Found: {train_loss}"

print(f"Test 3 passed: loss() returns a valid finite scalar. Train loss = {train_loss:.6f}")


Test 3 passed: loss() returns a valid finite scalar. Train loss = 0.002513


Our loss function is defined as the misclassification error rate, so it should be a scalar between 0 and 1 (inclusive).

### Test 4 — Edge case testing

In [None]:
# Tiny two-feature dataset shared by the edge-case tests: the four corners of
# the unit square, where the label equals the first coordinate.
X_toy = np.array([[first, second] for first in (0.0, 1.0) for second in (0.0, 1.0)])
y_toy = np.array([0, 0, 1, 1])

# Show the data so the edge-case outputs below can be read against it.
print("X_toy:\n", X_toy)
print("y_toy:", y_toy)


X_toy:
 [[0. 0.]
 [0. 1.]
 [1. 0.]
 [1. 1.]]
y_toy: [0 0 1 1]


#### Test 4.1 — Edge case: all labels are the same (0)

In [None]:
# Test 4.1: all labels are zero (only one class present)

model_zero = make_cart_model()

y_all_zero = np.zeros_like(y_toy)
model_zero.fit(X_toy, y_all_zero)

y_pred_zero = model_zero.predict(X_toy)
loss_zero = model_zero.loss(X_toy, y_all_zero)

assert y_pred_zero.shape == y_all_zero.shape
assert np.isfinite(loss_zero)
# Only one class was ever seen during fitting, so every prediction must be that
# class and no sample can be misclassified. Shape/finiteness alone would let a
# model that returns wrong labels slip through.
assert np.array_equal(y_pred_zero, y_all_zero), \
    f"Expected all-zero predictions. Found: {y_pred_zero}"
assert loss_zero == 0, \
    f"Expected zero loss on a single-class dataset. Found: {loss_zero}"

print("Test 4.1 passed: all-zero labels edge case handled correctly.")
print("Predicted labels:", y_pred_zero)
print("Loss on all-zero labels:", loss_zero)


Test 4.1 passed: all-zero labels edge case handled correctly.
Predicted labels: [0 0 0 0]
Loss on all-zero labels: 0.0


#### Test 4.2 — Edge case: single feature only

In [None]:
# Test 4.2: dataset contains only one feature

model_single = make_cart_model()

X_single = X_toy[:, :1]  # Use only the first feature; the :1 slice keeps the array 2-D
model_single.fit(X_single, y_toy)

y_pred_single = model_single.predict(X_single)
assert y_pred_single.shape == y_toy.shape
# The first feature alone separates the two classes, so the fit must be exact.
# This check runs BEFORE the success message so that a failing test can never
# print "passed".
assert np.array_equal(y_pred_single, y_toy), \
    f"Expected predictions {y_toy}. Found: {y_pred_single}"

print("Test 4.2 passed: single-feature edge case handled correctly.")
print("Predicted labels:", y_pred_single)


Test 4.2 passed: single-feature edge case handled correctly.
Predicted labels: [0 0 1 1]


#### Test 4.3 — Edge case: all-zero features X=0

In [None]:
# Test 4.3: all feature values are zero

model_feat_zero = make_cart_model()

X_zeros = np.zeros_like(X_toy)
model_feat_zero.fit(X_zeros, y_toy)

y_pred_zeros = model_feat_zero.predict(X_zeros)
loss_zeros = model_feat_zero.loss(X_zeros, y_toy)

assert y_pred_zeros.shape == y_toy.shape
assert np.isfinite(loss_zeros)
# Every row of X_zeros is identical, so the model has nothing to split on and
# must give all rows the same label.
assert np.unique(y_pred_zeros).size == 1, \
    f"Identical inputs must receive identical predictions. Found: {y_pred_zeros}"
# y_toy is balanced (two 0s, two 1s), so a constant 0/1 prediction misclassifies
# exactly half of the samples. Assumes loss() is the misclassification rate, as
# this notebook states.
assert np.isclose(loss_zeros, 0.5), \
    f"Expected a loss of 0.5 for a constant prediction on balanced labels. Found: {loss_zeros}"

print("Test 4.3 passed: all-zero features edge case handled correctly.")
print("Predicted labels:", y_pred_zeros)
print("Loss on all-zero features:", loss_zeros)


Test 4.3 passed: all-zero features edge case handled correctly.
Predicted labels: [0 0 0 0]
Loss on all-zero features: 0.5


### Test 5 — Reproduce sklearn’s DecisionTreeClassifier

[Note: this part probably isn't needed before Thanksgiving!]

In [None]:
# Sklearn CART (using Gini impurity), configured with the same depth and
# split threshold as make_cart_model() so the two trees are comparable.
sk_cart = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_split=2,
    # sklearn randomly permutes the features at every split, so ties between
    # equally good splits are broken differently from run to run unless the
    # seed is fixed. Pin it so this baseline is reproducible.
    random_state=0,
)

sk_cart.fit(X_train, y_train)

# Evaluate the baseline on the held-out split
y_pred_sk = sk_cart.predict(X_test)
sk_acc = accuracy_score(y_test, y_pred_sk)

print(f"Sklearn CART test accuracy: {sk_acc:.3f}")


Sklearn CART test accuracy: 0.912


In [None]:
# Fit our CART implementation on the same training split as the sklearn baseline
my_cart = make_cart_model()
my_cart.fit(X_train, y_train)

# Score it on the held-out split with the same metric used for sklearn
y_pred_my = my_cart.predict(X_test)
my_acc = accuracy_score(y_test, y_pred_my)

print(f"Our CART test accuracy: {my_acc:.3f}")


Our CART test accuracy: 0.918


In [None]:
# Test 5: Compare our predictions with sklearn's CART classifier

# Largest accuracy gap still counted as "reproducing" sklearn. Two correct CART
# implementations can break ties between equally good splits differently, so a
# few test points may legitimately be classified differently. The previous
# threshold of 1e-6 is smaller than the effect of a single test sample, so it
# amounted to demanding identical accuracy and contradicted the intent below.
ACC_TOLERANCE = 0.02

same_predictions = np.array_equal(y_pred_sk, y_pred_my)
acc_diff = abs(sk_acc - my_acc)
# Number of test samples on which the two models disagree (diagnostic only)
n_disagree = int(np.sum(np.asarray(y_pred_sk) != np.asarray(y_pred_my)))

print("Same predictions as sklearn?", same_predictions)
print(f"Absolute accuracy difference: {acc_diff:.6f}")
print(f"Test samples with different predictions: {n_disagree} / {len(y_pred_sk)}")

# Depending on implementation details, exact matching or near-matching are acceptable.
assert same_predictions or acc_diff <= ACC_TOLERANCE, \
    "Our CART implementation should match sklearn's behavior (up to small numerical or tie-breaking differences)."

print("Test 5 passed: our CART implementation successfully reproduces sklearn performance.")


Same predictions as sklearn? False
Absolute accuracy difference: 0.005848


AssertionError: Our CART implementation should match sklearn's behavior (up to small numerical or tie-breaking differences).

In [None]:
# NOTE(review): this cell is disabled. Every line is commented out, so running
# it does nothing and any output shown beneath it comes from an earlier run.
# It is an alternative loader that reads "breast_cancer.csv" directly: column 1
# is taken as the diagnosis string, columns 2 onward are parsed as float
# features, and a diagnosis of "M" is encoded as label 1 (anything else as 0).
# import numpy as np
# data = np.genfromtxt("breast_cancer.csv", delimiter=",", dtype=str, skip_header=1)
# diagnosis = data[:, 1]
# X = data[:, 2:].astype(float)
# y = (diagnosis == "M").astype(int)

# print("X shape:", X.shape)

X shape: (569, 30)


In [None]:
# NOTE(review): this cell is disabled. Every line is commented out, so running
# it does nothing and any output shown beneath it comes from an earlier run.
# It splits X/y positionally into the first 60% (train), the next 20%
# (validation) and the last 20% (test), then fits a depth-40 tree and reports
# accuracy on each part. The `random` seed set below is never used by the
# visible code: the split is by index, not shuffled.
# import random
# random.seed(0)
# n = len(y)
# split1 = int(0.6 * n)
# split2 = int(0.8 * n)

# X_train, y_train = X[:split1], y[:split1]
# X_valid, y_valid = X[split1:split2], y[split1:split2]
# X_test, y_test = X[split2:], y[split2:]


# from src.cart import DecisionTreeCART
# tree = DecisionTreeCART(max_depth=40, min_samples_split=2)
# tree.fit(X_train, y_train)

# print("train acc:", tree.accuracy(X_train, y_train))
# print("val acc:", tree.accuracy(X_valid, y_valid))
# print("test acc:", tree.accuracy(X_test, y_test))

# tree.print_tree()


train acc: 1.0
val acc: 0.9298245614035088
test acc: 0.8245614035087719
--- CART TREE ---
[Feature 22 <= 105.1500] gain=0.3611

  [Feature 24 <= 0.1759] gain=0.0592

    [Feature 0 <= 14.9800] gain=0.0106

      [Feature 27 <= 0.1807] gain=0.0108

        [Feature 20 <= 15.7250] gain=0.0043

          [Feature 12 <= 4.1055] gain=0.0058

            [Feature 21 <= 33.1050] gain=0.0030

              Leaf(label=1, samples=?)
              [Feature 0 <= 12.0450] gain=0.3750

                Leaf(label=1, samples=?)
                Leaf(label=0, samples=?)
            [Feature 0 <= 12.2650] gain=0.5000

              Leaf(label=0, samples=?)
              Leaf(label=1, samples=?)
          [Feature 8 <= 0.1782] gain=0.3457

            Leaf(label=1, samples=?)
            Leaf(label=0, samples=?)
        Leaf(label=0, samples=?)
      Leaf(label=0, samples=?)
    Leaf(label=0, samples=?)
  [Feature 22 <= 114.4500] gain=0.0418

    [Feature 1 <= 19.7100] gain=0.3174

      [Feature 0 <= 14.