# Introduction to regularization and batching

In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


In [2]:
mnist = fetch_openml('mnist_784', as_frame=False)

  warn(


In [37]:
n = int(1e3)

labels = []
for val in mnist.target:
    label = np.zeros(10)
    label[int(val)] = 1
    labels.append(label)

X_train, X_test, y_train, y_test = train_test_split(
    mnist.data / 255.0,
    labels,
    test_size=0.2,
    shuffle=True
)
X_train = X_train[:n]
y_train = y_train[:n]
X_test = X_test[:n]
y_test = y_test[:n]

In [31]:
def relu(x):
    return (x >= 0) * x


def relu2deriv(x):
    return x >= 0

## Dropout

In [67]:
alpha = 0.005
epochs = 300
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((28 * 28, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 10)) - 0.1

use_dropout = True

for e in range(epochs):
    error = 0.0
    correct_cnt = 0
    for i in range(len(X_train)):
        layer_0 = X_train[i:i + 1]
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        if use_dropout:
            layer_1 *= dropout_mask * 2  # mnożymy przez 2, aby nie 'osłabić' wejścia następnej warstwy
        layer_2 = np.dot(layer_1, weights_1_2)

        error += np.sum((y_train[i:i + 1] - layer_2) ** 2)
        correct_cnt += int(np.argmax(layer_2) == np.argmax(y_train[i:i + 1]))

        layer_2_delta = y_train[i:i + 1] - layer_2
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
        if use_dropout:
            layer_1_delta *= dropout_mask * 2

        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if e % 10 == 0:
        test_error, test_correct_cnt = 0, 0

        for i in range(len(X_test)):
            layer_0 = X_test[i:i + 1]
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            test_error += np.sum((y_test[i:i + 1] - layer_2) ** 2)
            test_correct_cnt += int(
                np.argmax(layer_2) == np.argmax(y_test[i:i + 1])
            )

        print(f"Epoch: {e}")
        print(f" Train-Err: {error / len(X_train):.4}")
        print(f" Train-Acc: {correct_cnt / len(X_train):.4}")
        print(f" Test-Err: {test_error / len(X_test):.4f}")
        print(f" Test-Acc: {test_correct_cnt / len(X_test):.4}")

Epoch: 0
 Train-Err: 0.8475
 Train-Acc: 0.458
 Test-Err: 0.6176
 Test-Acc: 0.686
Epoch: 10
 Train-Err: 0.4563
 Train-Acc: 0.799
 Test-Err: 0.3996
 Test-Acc: 0.815
Epoch: 20
 Train-Err: 0.3879
 Train-Acc: 0.826
 Test-Err: 0.3968
 Test-Acc: 0.813
Epoch: 30
 Train-Err: 0.3926
 Train-Acc: 0.839
 Test-Err: 0.3853
 Test-Acc: 0.82
Epoch: 40
 Train-Err: 0.3784
 Train-Acc: 0.851
 Test-Err: 0.3946
 Test-Acc: 0.822
Epoch: 50
 Train-Err: 0.3752
 Train-Acc: 0.844
 Test-Err: 0.3975
 Test-Acc: 0.829
Epoch: 60
 Train-Err: 0.385
 Train-Acc: 0.841
 Test-Err: 0.3916
 Test-Acc: 0.812
Epoch: 70
 Train-Err: 0.3719
 Train-Acc: 0.859
 Test-Err: 0.3868
 Test-Acc: 0.822
Epoch: 80
 Train-Err: 0.3577
 Train-Acc: 0.873
 Test-Err: 0.3897
 Test-Acc: 0.816
Epoch: 90
 Train-Err: 0.3556
 Train-Acc: 0.873
 Test-Err: 0.3852
 Test-Acc: 0.812
Epoch: 100
 Train-Err: 0.3461
 Train-Acc: 0.887
 Test-Err: 0.3854
 Test-Acc: 0.812
Epoch: 110
 Train-Err: 0.3424
 Train-Acc: 0.88
 Test-Err: 0.3820
 Test-Acc: 0.808
Epoch: 120
 Train-

## Mini-batch Gradient Descent

In [66]:
batch_size = 100
alpha = 0.001
epochs = 300
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((28 * 28, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 10)) - 0.1

for e in range(epochs):
    error = 0.0
    correct_cnt = 0
    for i in range(int(len(X_train) / batch_size)):
        batch_start = i * batch_size
        batch_end = (i + 1) * batch_size

        layer_0 = X_train[batch_start: batch_end]
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        # print(layer_1, dropout_mask, dropout_mask*2, layer_1*dropout_mask*2)
        layer_1 *= dropout_mask * 2  # mnożymy przez 2, aby nie 'osłabić' wejścia następnej warstwy
        layer_2 = np.dot(layer_1, weights_1_2)

        error += np.sum((y_train[batch_start: batch_end] - layer_2) ** 2)
        for k in range(batch_size):
            correct_cnt += int(
                np.argmax(layer_2[k:k + 1]) == np.argmax(y_train[batch_start + k: batch_start + k + 1])
            )

            layer_2_delta = (y_train[batch_start: batch_end] - layer_2) / batch_size
            layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
            layer_1_delta *= dropout_mask

            weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
            weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if e % 10 == 0:
        test_error, test_correct_cnt = 0, 0

        for i in range(len(X_test)):
            layer_0 = X_test[i:i + 1]
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            test_error += np.sum((y_test[i:i + 1] - layer_2) ** 2)
            test_correct_cnt += int(
                np.argmax(layer_2) == np.argmax(y_test[i:i + 1])
            )

        print(f"Epoch: {e}")
        print(f" Train-Err: {error / len(X_train):.4}")
        print(f" Train-Acc: {correct_cnt / len(X_train):.4}")
        print(f" Test-Err: {test_error / len(X_test):.4f}")
        print(f" Test-Acc: {test_correct_cnt / len(X_test):.4}")

Epoch: 0
 Train-Err: 1.379
 Train-Acc: 0.17
 Test-Err: 0.8110
 Test-Acc: 0.394
Epoch: 10
 Train-Err: 0.6005
 Train-Acc: 0.648
 Test-Err: 0.5578
 Test-Acc: 0.713
Epoch: 20
 Train-Err: 0.5336
 Train-Acc: 0.72
 Test-Err: 0.5047
 Test-Acc: 0.762
Epoch: 30
 Train-Err: 0.5028
 Train-Acc: 0.739
 Test-Err: 0.4710
 Test-Acc: 0.785
Epoch: 40
 Train-Err: 0.4759
 Train-Acc: 0.767
 Test-Err: 0.4526
 Test-Acc: 0.793
Epoch: 50
 Train-Err: 0.4716
 Train-Acc: 0.766
 Test-Err: 0.4441
 Test-Acc: 0.791
Epoch: 60
 Train-Err: 0.4666
 Train-Acc: 0.758
 Test-Err: 0.4393
 Test-Acc: 0.805
Epoch: 70
 Train-Err: 0.4505
 Train-Acc: 0.779
 Test-Err: 0.4320
 Test-Acc: 0.81
Epoch: 80
 Train-Err: 0.4602
 Train-Acc: 0.769
 Test-Err: 0.4351
 Test-Acc: 0.797
Epoch: 90
 Train-Err: 0.4358
 Train-Acc: 0.793
 Test-Err: 0.4334
 Test-Acc: 0.796
Epoch: 100
 Train-Err: 0.4445
 Train-Acc: 0.797
 Test-Err: 0.4323
 Test-Acc: 0.801
Epoch: 110
 Train-Err: 0.4358
 Train-Acc: 0.8
 Test-Err: 0.4375
 Test-Acc: 0.794
Epoch: 120
 Train-Err

Gradient descent: od 4 min 34 s
```
Train-Err: 0.3154
Train-Acc: 0.903
Test-Err: 0.4271
Test-Acc: 0.786
```

Mini-batch gradient descent: 7 min 48 s
```
Train-Err: 0.3927
Train-Acc: 0.832
Test-Err: 0.4049
Test-Acc: 0.8
```