In [2]:
import numpy as np
import matplotlib
# Select an interactive GUI backend: try Qt first, and fall back to Tk if
# selecting Qt raises.
# NOTE(review): this runs before matplotlib.pyplot is imported below; confirm
# that an unavailable Qt backend actually raises here rather than later (at
# the pyplot import or first figure), otherwise the Tk fallback never runs.
try:
    matplotlib.use("QtAgg")
except Exception:
    matplotlib.use("TkAgg")

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Helpers

def one_hot(y: np.ndarray, k: int) -> np.ndarray:
    """Encode integer class labels as one-hot rows.

    Args:
        y: Array of integer class indices; one output row per element.
        k: Number of classes, i.e. the width of each encoded row.

    Returns:
        A float32 array of shape ``(y.size, k)`` holding 1.0 in the column
        selected by each label and 0.0 everywhere else.
    """
    n_samples = y.size
    encoded = np.zeros((n_samples, k), dtype=np.float32)
    row_ids = np.arange(n_samples)
    # Fancy indexing sets exactly one entry per row.
    encoded[row_ids, y] = 1.0
    return encoded

def relu(x):
    """Apply the rectified linear unit element-wise, i.e. ``max(0, x)``."""
    floor = 0.0
    return np.maximum(floor, x)
def relu_grad(x):
    """Return the ReLU derivative: 1.0 where ``x > 0`` and 0.0 elsewhere (float32)."""
    is_positive = x > 0
    return is_positive.astype(np.float32)

def softmax(logits: np.ndarray) -> np.ndarray:
    """Compute a row-wise softmax over a 2-D array of logits.

    Each row's maximum is subtracted before exponentiating so that large
    logits do not overflow; this does not change the resulting probabilities.

    Args:
        logits: Array whose axis 1 indexes the classes.

    Returns:
        Array of the same shape whose rows sum to 1.
    """
    row_max = np.max(logits, axis=1, keepdims=True)
    shifted = logits - row_max
    exps = np.exp(shifted)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals

def cross_entropy(probs: np.ndarray, y_onehot: np.ndarray, eps=1e-12) -> float:
    """Return the mean cross-entropy between predictions and one-hot targets.

    Args:
        probs: Predicted probabilities, one row per sample.
        y_onehot: One-hot targets with the same shape as ``probs``.
        eps: Lower clipping bound applied to ``probs`` so ``log`` never sees 0.

    Returns:
        The per-sample cross-entropy averaged over all rows, as a Python float.
    """
    clipped = np.clip(probs, eps, 1.0)
    log_likelihood = np.sum(y_onehot * np.log(clipped), axis=1)
    return float(-np.mean(log_likelihood))

def accuracy(probs: np.ndarray, y_true: np.ndarray) -> float:
    """Return the fraction of rows whose arg-max class equals the true label.

    Args:
        probs: Per-class scores, one row per sample.
        y_true: Integer class labels, one per row of ``probs``.

    Returns:
        Classification accuracy in ``[0, 1]`` as a Python float.
    """
    predicted = np.argmax(probs, axis=1)
    hits = predicted == y_true
    return float(np.mean(hits))


# 2-hidden-layer MLP + AdamW-ish (decoupled weight decay)

class MLP2HiddenAdamW:
    """Two-hidden-layer ReLU MLP with softmax output and an AdamW-style optimizer.

    The network computes ``softmax(relu(relu(X W1 + b1) W2 + b2) W3 + b3)``.
    All parameters and Adam moment buffers are float32 arrays, and ``step``
    updates them in place.
    """

    def __init__(self, input_dim: int, h1: int, h2: int, out_dim: int, seed: int = 42):
        """Create the parameters and zeroed optimizer state.

        Args:
            input_dim: Number of input features.
            h1: Width of the first hidden layer.
            h2: Width of the second hidden layer.
            out_dim: Number of output classes.
            seed: Seed for the NumPy generator used to draw the weights.
        """
        rng = np.random.default_rng(seed)

        def he_normal(fan_in: int, fan_out: int) -> np.ndarray:
            # np.sqrt returns a float64 scalar; under NumPy 2 promotion rules
            # multiplying a float32 array by it would silently yield float64
            # weights. Casting the scale keeps every parameter float32.
            scale = np.float32(np.sqrt(2.0 / fan_in))
            return rng.standard_normal((fan_in, fan_out)).astype(np.float32) * scale

        # Weights use He-style scaling sqrt(2 / fan_in); biases start at zero.
        # The draw order (W1, W2, W3) determines the values for a given seed.
        self.W1 = he_normal(input_dim, h1)
        self.b1 = np.zeros((1, h1), dtype=np.float32)
        self.W2 = he_normal(h1, h2)
        self.b2 = np.zeros((1, h2), dtype=np.float32)
        self.W3 = he_normal(h2, out_dim)
        self.b3 = np.zeros((1, out_dim), dtype=np.float32)

        # Adam first (m*) and second (v*) moment estimates, one per parameter.
        self.mW1 = np.zeros_like(self.W1)
        self.vW1 = np.zeros_like(self.W1)
        self.mb1 = np.zeros_like(self.b1)
        self.vb1 = np.zeros_like(self.b1)
        self.mW2 = np.zeros_like(self.W2)
        self.vW2 = np.zeros_like(self.W2)
        self.mb2 = np.zeros_like(self.b2)
        self.vb2 = np.zeros_like(self.b2)
        self.mW3 = np.zeros_like(self.W3)
        self.vW3 = np.zeros_like(self.W3)
        self.mb3 = np.zeros_like(self.b3)
        self.vb3 = np.zeros_like(self.b3)

        # Number of optimizer steps taken so far (used for bias correction).
        self.t = 0

    def forward(self, X: np.ndarray):
        """Run the network on a batch.

        Args:
            X: Input batch of shape ``(N, input_dim)``.

        Returns:
            A pair ``(probs, cache)`` where ``probs`` holds the class
            probabilities of shape ``(N, out_dim)`` and ``cache`` is the tuple
            ``(X, z1, a1, z2, a2, probs)`` consumed by ``backward``.
        """
        z1 = X @ self.W1 + self.b1
        a1 = relu(z1)
        z2 = a1 @ self.W2 + self.b2
        a2 = relu(z2)
        logits = a2 @ self.W3 + self.b3
        probs = softmax(logits)
        return probs, (X, z1, a1, z2, a2, probs)

    def backward(self, cache, y_onehot: np.ndarray):
        """Backpropagate the batch-mean softmax cross-entropy loss.

        Args:
            cache: The tuple returned as the second element of ``forward``.
            y_onehot: One-hot targets with the same shape as the cached probs.

        Returns:
            Gradients ``(dW1, db1, dW2, db2, dW3, db3)``, each shaped like the
            parameter it belongs to.
        """
        X, z1, a1, z2, a2, probs = cache
        N = X.shape[0]

        # Softmax + cross-entropy gradient w.r.t. the logits; dividing by N
        # makes every gradient below a batch mean.
        dlogits = (probs - y_onehot) / N

        dW3 = a2.T @ dlogits
        db3 = np.sum(dlogits, axis=0, keepdims=True)

        da2 = dlogits @ self.W3.T
        dz2 = da2 * relu_grad(z2)
        dW2 = a1.T @ dz2
        db2 = np.sum(dz2, axis=0, keepdims=True)

        da1 = dz2 @ self.W2.T
        dz1 = da1 * relu_grad(z1)
        dW1 = X.T @ dz1
        db1 = np.sum(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2, dW3, db3

    def step(self, grads, lr=0.01, wd=1e-4, beta1=0.9, beta2=0.999, eps=1e-8):
        """Apply one in-place optimizer update to every parameter.

        Args:
            grads: Gradients in the order returned by ``backward``.
            lr: Learning rate.
            wd: Weight-decay coefficient; applied to weights only, not biases.
            beta1: Decay rate of the first-moment estimate.
            beta2: Decay rate of the second-moment estimate.
            eps: Constant added to the denominator for numerical stability.
        """
        dW1, db1, dW2, db2, dW3, db3 = grads
        self.t += 1
        t = self.t

        # Decoupled weight decay: shrink the weights directly instead of
        # adding an L2 term to the gradients.
        for weight in (self.W1, self.W2, self.W3):
            weight *= (1.0 - lr * wd)

        def adam(param, grad, m, v):
            # Slice assignment mutates the stored buffers in place.
            m[:] = beta1 * m + (1.0 - beta1) * grad
            v[:] = beta2 * v + (1.0 - beta2) * (grad * grad)
            # Bias-correct the zero-initialised moment estimates.
            mhat = m / (1.0 - beta1 ** t)
            vhat = v / (1.0 - beta2 ** t)
            param[:] = param - lr * mhat / (np.sqrt(vhat) + eps)

        updates = (
            (self.W1, dW1, self.mW1, self.vW1),
            (self.b1, db1, self.mb1, self.vb1),
            (self.W2, dW2, self.mW2, self.vW2),
            (self.b2, db2, self.mb2, self.vb2),
            (self.W3, dW3, self.mW3, self.vW3),
            (self.b3, db3, self.mb3, self.vb3),
        )
        for param, grad, m, v in updates:
            adam(param, grad, m, v)


# Data

# Load Iris as float32 features and int32 class labels.
iris = load_iris()
X = np.asarray(iris.data).astype(np.float32)
y = np.asarray(iris.target).astype(np.int32)
K = int(np.unique(y).size)  # number of distinct classes

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise using statistics from the training split only, then apply the
# same transform to the validation split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train).astype(np.float32)
X_val = scaler.transform(X_val).astype(np.float32)

# One-hot targets for the loss; integer labels are kept for accuracy.
y_train_oh = one_hot(y_train, K)
y_val_oh = one_hot(y_val, K)


# Live plot 

# Interactive mode so the figure can be redrawn while training runs.
plt.ion()
# One figure with two side-by-side axes: accuracy (left) and loss (right).
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))
fig.canvas.manager.set_window_title("Iris NN Training (Live)")

# Accuracy panel: the y-range is fixed just above 1 so the curves never clip.
ax_acc.set_title("Accuracy")
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
ax_acc.set_ylim(0, 1.05)
# Empty line artists; update_plot() fills in their data every redraw.
train_acc_line, = ax_acc.plot([], [], label="Train")
val_acc_line,   = ax_acc.plot([], [], label="Val")
ax_acc.grid(True)
# The legend is created after the lines so that it picks up their labels.
ax_acc.legend()

# Loss panel: the y-range is left unset here and rescaled in update_plot().
ax_loss.set_title("Loss")
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
train_loss_line, = ax_loss.plot([], [], label="Train")
val_loss_line,   = ax_loss.plot([], [], label="Val")
ax_loss.grid(True)
ax_loss.legend()

# Show the (still empty) window and force an initial draw before training.
fig.tight_layout()
fig.show()
fig.canvas.draw()
fig.canvas.flush_events()


def update_plot(train_accs, val_accs, train_losses, val_losses):
    """Redraw the live accuracy and loss curves from the metric histories.

    Args:
        train_accs: Per-epoch training accuracies recorded so far (list).
        val_accs: Per-epoch validation accuracies recorded so far (list).
        train_losses: Per-epoch training losses recorded so far (list).
        val_losses: Per-epoch validation losses recorded so far (list).
    """
    n_epochs = len(train_accs)
    epoch_axis = np.arange(1, n_epochs + 1)

    # Push the latest histories into the four line artists.
    curves = (
        (train_acc_line, train_accs),
        (val_acc_line, val_accs),
        (train_loss_line, train_losses),
        (val_loss_line, val_losses),
    )
    for line, history in curves:
        line.set_data(epoch_axis, history)

    # Keep at least ten epochs visible so early curves are not stretched.
    x_right = max(10, n_epochs)
    for axis in (ax_acc, ax_loss):
        axis.set_xlim(1, x_right)

    # Fit the loss y-range to the data with a 5% margin, never below zero.
    combined = np.array(train_losses + val_losses, dtype=np.float32)
    lower = max(0.0, float(combined.min()) * 0.95)
    upper = float(combined.max()) * 1.05
    # Guard against a degenerate (near-zero-height) range.
    upper = lower + 1.0 if upper - lower < 1e-6 else upper
    ax_loss.set_ylim(lower, upper)

    canvas = fig.canvas
    canvas.draw()
    canvas.flush_events()
    plt.pause(0.001)


# Train

# Build the network: 16 and 8 hidden units, one output per class.
model = MLP2HiddenAdamW(input_dim=X_train.shape[1], h1=16, h2=8, out_dim=K, seed=42)

# Hyperparameters
epochs = 250
batch_size = 16
lr = 0.01
wd = 1e-4
update_every = 2  # fewer redraws = smoother in VS Code

# Separate generator for shuffling so the batch order is reproducible.
rng = np.random.default_rng(42)

# Per-epoch metric histories, consumed by update_plot().
train_losses, val_losses = [], []
train_accs, val_accs = [], []

for ep in range(1, epochs + 1):
    # shuffle: one permutation applied to features and targets alike
    idx = rng.permutation(len(X_train))
    Xs = X_train[idx]
    Ys = y_train_oh[idx]

    # minibatches (the last one may be smaller than batch_size)
    for start in range(0, len(Xs), batch_size):
        xb = Xs[start:start + batch_size]
        yb = Ys[start:start + batch_size]
        probs, cache = model.forward(xb)
        grads = model.backward(cache, yb)
        model.step(grads, lr=lr, wd=wd)

    # epoch metrics (full sets; Iris is tiny, so this is fast)
    tr_probs, _ = model.forward(X_train)
    va_probs, _ = model.forward(X_val)

    tr_loss = cross_entropy(tr_probs, y_train_oh)
    va_loss = cross_entropy(va_probs, y_val_oh)
    tr_acc = accuracy(tr_probs, y_train)
    va_acc = accuracy(va_probs, y_val)

    train_losses.append(tr_loss)
    val_losses.append(va_loss)
    train_accs.append(tr_acc)
    val_accs.append(va_acc)

    # Redraw on the first epoch, every `update_every` epochs, and always on
    # the last epoch so the final figure includes every recorded point even
    # when `epochs` is not a multiple of `update_every`.
    if ep == 1 or ep % update_every == 0 or ep == epochs:
        update_plot(train_accs, val_accs, train_losses, val_losses)

    if ep == 1 or ep % 10 == 0:
        print(f"Epoch {ep:3d} | train loss {tr_loss:.4f}, acc {tr_acc:.3f} | val loss {va_loss:.4f}, acc {va_acc:.3f}")

# Leave interactive mode and block until the plot window is closed.
plt.ioff()
plt.show()
print(f"\nFinal validation accuracy: {val_accs[-1]:.3f}")


Epoch   1 | train loss 0.6343, acc 0.800 | val loss 0.6836, acc 0.700
Epoch  10 | train loss 0.1265, acc 0.950 | val loss 0.1548, acc 0.933
Epoch  20 | train loss 0.0583, acc 0.975 | val loss 0.1078, acc 0.933
Epoch  30 | train loss 0.0416, acc 0.983 | val loss 0.1038, acc 0.933
Epoch  40 | train loss 0.0329, acc 0.992 | val loss 0.1219, acc 0.900
Epoch  50 | train loss 0.0252, acc 0.992 | val loss 0.1312, acc 0.967
Epoch  60 | train loss 0.0195, acc 1.000 | val loss 0.1378, acc 0.933
Epoch  70 | train loss 0.0168, acc 0.992 | val loss 0.1710, acc 0.967
Epoch  80 | train loss 0.0100, acc 1.000 | val loss 0.1786, acc 0.933
Epoch  90 | train loss 0.0075, acc 1.000 | val loss 0.2002, acc 0.933
Epoch 100 | train loss 0.0057, acc 1.000 | val loss 0.2198, acc 0.933
Epoch 110 | train loss 0.0043, acc 1.000 | val loss 0.2242, acc 0.933
Epoch 120 | train loss 0.0035, acc 1.000 | val loss 0.2427, acc 0.933
Epoch 130 | train loss 0.0029, acc 1.000 | val loss 0.2627, acc 0.933
Epoch 140 | train lo