In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Load data
# The raw file has no header row, so the column names are supplied explicitly.
# Missing "Bare Nuclei" entries are written as "?" in the file; declaring that
# marker for this column makes pandas parse it as numeric with NaN instead of
# as a column of strings.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
columns = [
    "id", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
    "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
    "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"
]
df = pd.read_csv(url, names=columns, na_values={"Bare Nuclei": ["?"]})

# Handle missing values
# Impute with the most frequent *valid* value. `mode()` ignores NaN by default,
# so the "?" placeholder can never be chosen as the fill value.
bare_nuclei_mode = df["Bare Nuclei"].mode()[0]
df["Bare Nuclei"] = df["Bare Nuclei"].fillna(bare_nuclei_mode).astype(int)

# Convert labels to binary (benign=0, malignant=1)
df["Class"] = df["Class"].replace({2: 0, 4: 1})

# Separate features and labels
# "id" is dropped together with the label so it is not used as a feature.
x = df.drop(["id", "Class"], axis=1)
y = df["Class"]

# Train a decision tree
# The tree is kept shallow (depth 2) and `random_state` is fixed so that
# re-running the cell yields the same fitted tree.
clf = DecisionTreeClassifier(
    max_depth=2,
    min_samples_leaf=2,
    min_samples_split=5,
    criterion="gini",
    random_state=42
)
clf.fit(x, y)

# Extract the feature and threshold for the first split
# Node 0 of `tree_` is the root. scikit-learn stores a negative sentinel in
# `tree_.feature` for leaf nodes; indexing `x.columns` with it would silently
# return an unrelated column, so fail loudly if the root was never split.
feature_idx = clf.tree_.feature[0]
if feature_idx < 0:
    raise ValueError("The fitted tree has no split: the root node is a leaf.")
feature_name = x.columns[feature_idx]
threshold = clf.tree_.threshold[0]

# Manually calculate parent node purity metrics
# p_parent is the fraction of samples labelled 1 among all samples in `y`.
total_samples = len(y)
class_counts = y.value_counts()
# `.get(1, 0)` yields a count of 0 when no sample is labelled 1; plain
# `class_counts[1]` would raise KeyError before the p_parent == 0 case below
# could ever be reached.
p_parent = class_counts.get(1, 0) / total_samples

# Parent node entropy
# A pure node has zero entropy; the guard also avoids evaluating log2(0).
if p_parent == 0 or p_parent == 1:
    entropy_parent = 0.0
else:
    entropy_parent = - (p_parent * np.log2(p_parent) + (1 - p_parent) * np.log2(1 - p_parent))

# Parent node Gini index: 1 - sum of squared class proportions
gini_parent = 1 - (p_parent**2 + (1 - p_parent)**2)

# Parent node misclassification error: 1 - proportion of the majority class
misclass_parent = 1 - max(p_parent, 1 - p_parent)

# Partition the labels using the root split: rows whose split-feature value is
# at or below the threshold form the left child, all remaining rows the right.
left_mask = x[feature_name].le(threshold)
y_left, y_right = y[left_mask], y[~left_mask]

n_left, n_right = len(y_left), len(y_right)

# Fraction of label-1 samples in each child; an empty child is reported as 0.0
if n_left > 0:
    p_left = y_left.mean()
else:
    p_left = 0.0

if n_right > 0:
    p_right = y_right.mean()
else:
    p_right = 0.0

# Function to calculate entropy
def calculate_entropy(p):
    """Return the binary entropy, in bits, for a class proportion `p`.

    `p` is the proportion of one of two classes; the other class has
    proportion `1 - p`. A pure node (`p` equal to 0 or 1) returns 0.0
    directly, which also avoids evaluating `log2(0)`.
    """
    if p in (0, 1):
        return 0.0
    q = 1 - p
    return -(p * np.log2(p) + q * np.log2(q))

# Entropy of each child node produced by the first split
entropy_left, entropy_right = (calculate_entropy(p) for p in (p_left, p_right))

# Each child's entropy is weighted by its share of the parent's samples
weight_left = n_left / total_samples
weight_right = n_right / total_samples
weighted_entropy = weight_left * entropy_left + weight_right * entropy_right

# Information gain = parent entropy minus the weighted child entropy
information_gain = entropy_parent - weighted_entropy

# Print results
report_lines = [
    "Metrics for the Parent Node of the First Split:",
    f"——Entropy: {entropy_parent:.3f}",
    f"——Gini Index: {gini_parent:.3f}",
    f"——Misclassification Error: {misclass_parent:.3f}",
    f"\nInformation Gain: {information_gain:.3f}",
    f"\nFirst Split Feature: {feature_name}, Threshold: {threshold:.1f}",
]
for line in report_lines:
    print(line)

Metrics for the Parent Node of the First Split:
——Entropy: 0.929
——Gini Index: 0.452
——Misclassification Error: 0.345

Information Gain: 0.579

First Split Feature: Uniformity of Cell Size, Threshold: 2.5
