In [2]:
import numpy as np
from sklearn.datasets import load_iris


iris = load_iris(as_frame=True)

X = iris.data.to_numpy()  # type: ignore
y = iris.target.to_numpy()  # type: ignore

X = np.c_[np.ones(len(X)), X]  # With bias term.

In [3]:
test_ratio = 0.2
valid_ratio = 0.2
size = len(X)
test_size = int(size * test_ratio)
valid_size = int(size * valid_ratio)
train_size = size - test_size - valid_size

np.random.seed(42)
ids = np.random.permutation(size)

X_train = X[ids[:train_size]]
y_train = y[ids[:train_size]]

X_valid = X[ids[train_size:-test_size]]
y_valid = y[ids[train_size:-test_size]]

X_test = X[ids[-test_size:]]
y_test = y[ids[-test_size:]]

In [4]:
def to_one_hot(y):
    return np.diag(np.ones(y.max() + 1))[y]

In [5]:
y_train[:5]

array([1, 0, 2, 1, 1])

In [6]:
to_one_hot(y_train[:5])

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [7]:
Y_train = to_one_hot(y_train)
Y_valid = to_one_hot(y_valid)
Y_test = to_one_hot(y_test)

In [8]:
mean = X_train[:, 1:].mean(axis=0)  # Without bias term.
std = X_train[:, 1:].mean(axis=0)

X_train[:, 1:] = (X_train[:, 1:] - mean) / std
X_valid[:, 1:] = (X_valid[:, 1:] - mean) / std
X_test[:, 1:] = (X_test[:, 1:] - mean) / std

In [9]:
def softmax(logits):
    exps = np.exp(logits)
    exp_sums = exps.sum(axis=1, keepdims=True)
    return exps / exp_sums

In [10]:
n_inputs = X_train.shape[1]
n_outputs = len(np.unique(y_train))

## Default Batch Gradient Descent

In [11]:
eta = 0.5
eps = 1e-9
n_epochs = 1000
m = len(X_train)

np.random.seed(42)
Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    errors = softmax(X_train @ Theta) - Y_train
    grads = 1 / m * X_train.T @ errors
    Theta = Theta - eta * grads

    if epoch % 100 == 0:
        Y_proba_valid = softmax(X_valid @ Theta)
        xentropy_losses = -(Y_valid * np.log(np.clip(Y_proba_valid, eps, 1 - eps)))
        mean_loss = xentropy_losses.sum(axis=1).mean()
        print(f"Epoch: {epoch:3d} - Valid Loss: {mean_loss}")

Epoch:   0 - Valid Loss: 2.1069489136021304
Epoch: 100 - Valid Loss: 0.30879084881854263
Epoch: 200 - Valid Loss: 0.24408622171429012
Epoch: 300 - Valid Loss: 0.2180997345309118
Epoch: 400 - Valid Loss: 0.203850681806548
Epoch: 500 - Valid Loss: 0.1946614583857917
Epoch: 600 - Valid Loss: 0.18809465830682998
Epoch: 700 - Valid Loss: 0.18305240341590498
Epoch: 800 - Valid Loss: 0.17896927278172073
Epoch: 900 - Valid Loss: 0.17552567687462337


In [12]:
Theta

array([[ 0.47265049,  3.07489532, -2.54140741],
       [ 0.43242377, -0.04647024,  0.668786  ],
       [ 3.33127519, -0.2266624 , -1.22743964],
       [-5.27752905,  0.66376905,  4.2271726 ],
       [-6.85188537, -2.37669375,  5.83234332]])

In [13]:
Y_proba = softmax(X_valid @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_valid == y_pred).mean()
accuracy

0.9333333333333333

## L2 Regularization

In [85]:
eta = 0.5
eps = 1e-9
alpha = 1e-2  # L2 regularization strength.
n_epochs = 1000
m = len(X_train)

np.random.seed(42)
Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    errors = softmax(X_train @ Theta) - Y_train
    grads = 1 / m * X_train.T @ errors
    grads += np.r_[np.zeros((1, n_outputs)), alpha / m * Theta[1:]]  # Plus L2 term.
    Theta = Theta - eta * grads

    if epoch % 100 == 0:
        Y_proba_valid = softmax(X_valid @ Theta)
        xentropy_loss = -(Y_valid * np.log(np.clip(Y_proba_valid, eps, 1 - eps)))
        l2_loss = 2 * alpha / m * (Theta[1:] ** 2).sum()
        total_loss = xentropy_loss.sum(axis=1).mean() + l2_loss
        print(f"Epoch: {epoch:3d} - Valid Loss: {total_loss:.5f}")

Epoch:   0 - Valid Loss: 2.10948
Epoch: 100 - Valid Loss: 0.31671
Epoch: 200 - Valid Loss: 0.25671
Epoch: 300 - Valid Loss: 0.23449
Epoch: 400 - Valid Loss: 0.22341
Epoch: 500 - Valid Loss: 0.21697
Epoch: 600 - Valid Loss: 0.21284
Epoch: 700 - Valid Loss: 0.21000
Epoch: 800 - Valid Loss: 0.20794
Epoch: 900 - Valid Loss: 0.20635


In [86]:
Theta

array([[ 0.47765381,  3.01578941, -2.48730483],
       [ 0.38368129, -0.04079102,  0.65484904],
       [ 3.21336681, -0.24351742, -1.19412241],
       [-5.15268403,  0.64795731,  4.13903126],
       [-6.66748418, -2.29159497,  5.74638269]])

In [87]:
Y_proba = softmax(X_valid @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_valid == y_pred).mean()
accuracy

0.9333333333333333

## Early Stopping

In [101]:
eta = 0.5
eps = 1e-9
alpha = 3e-2  # L2 regularization strength.
n_epochs = 1000
m = len(X_train)
best_loss = np.inf

np.random.seed(42)
Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    Y_train_proba = softmax(X_train @ Theta)
    Y_valid_proba = softmax(X_valid @ Theta)
    xentropy_loss = -(Y_valid * np.log(np.clip(Y_valid_proba, eps, 1 - eps)))
    l2_loss = 2 * alpha / m * (Theta[1:] ** 2).sum()
    total_loss = xentropy_loss.sum(axis=1).mean() + l2_loss

    errors = Y_train_proba - Y_train
    grads = 1 / m * X_train.T @ errors
    grads += np.r_[np.zeros((1, n_outputs)), alpha / m * Theta[1:]]
    Theta = Theta - eta * grads

    if epoch % 100 == 0:
        print(f"Epoch: {epoch:3d} - Valid Loss: {total_loss:.5f}")

    if total_loss < best_loss:
        best_loss = total_loss
    else:
        print(f"Epoch: {epoch:3d} - Valid Loss: {total_loss:.5f} - Early Stopping")
        break

Epoch:   0 - Valid Loss: 2.47371
Epoch: 100 - Valid Loss: 0.33325
Epoch: 200 - Valid Loss: 0.28143
Epoch: 300 - Valid Loss: 0.26592
Epoch: 400 - Valid Loss: 0.26035
Epoch: 500 - Valid Loss: 0.25854
Epoch: 563 - Valid Loss: 0.25832 - Early Stopping


In [102]:
Theta

array([[ 0.37983996,  2.48490524, -1.85860681],
       [ 0.4652836 , -0.11644288,  0.61126296],
       [ 2.82416639, -0.40012038, -0.71530102],
       [-4.31627247,  0.71349348,  3.25087791],
       [-5.68788957, -1.96749677,  4.56387565]])

In [103]:
Y_proba = softmax(X_valid @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_valid == y_pred).mean()
accuracy

0.9333333333333333

In [104]:
Y_proba = softmax(X_test @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_test == y_pred).mean()
accuracy

0.9666666666666667