# Assignment 2

Imports and preprocessing

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [4]:
# Fetch the stock MNIST train/test arrays and sanity-check their shapes
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
assert (x_train.shape, y_train.shape) == ((60000, 28, 28), (60000,))
assert (x_test.shape, y_test.shape) == ((10000, 28, 28), (10000,))

# Pool all 70,000 samples so that a custom split can be drawn below
y_combined = np.concatenate((y_train, y_test), axis=0)
x_combined = np.concatenate((x_train, x_test), axis=0)

# Flatten each 28x28 image into a 784-vector, then rescale by 1/255 as float32
x_combined = x_combined.reshape(-1, 28 * 28).astype('float32') / 255.0

# Stratified 70/30 split with a fixed random_state; note that this rebinds
# x_train / x_test / y_train / y_test to the new partitions
x_train, x_test, y_train, y_test = train_test_split(
    x_combined,
    y_combined,
    test_size=0.30,
    random_state=7,
    stratify=y_combined,
)

x_train.shape, x_test.shape

((49000, 784), (21000, 784))

## Task 1: Compare 2-layer vs 3-layer vs 4-layer MLPs

### All models use 100 units per hidden layer with ReLU activation.

In [5]:
# Compare depth: 2-layer vs 3-layer vs 4-layer (hidden layers)


def train_depth_mlp(n_hidden_layers, seed=7):
    """Build, train and evaluate an MLP with `n_hidden_layers` hidden layers.

    Every hidden layer is `Dense(100, activation='relu')`; the output layer is
    `Dense(10, activation='softmax')`. The model is trained with Adam
    (learning rate 2e-4) for 50 epochs at batch size 32 on the notebook-level
    `x_train` / `y_train`, with `x_test` / `y_test` passed as validation data.

    Args:
        n_hidden_layers: Number of hidden Dense layers to stack.
        seed: Passed to `keras.utils.set_random_seed` before the model is
            built, so every depth starts from the same random state and a
            re-run repeats the same initialization. (Bit-exact results on GPU
            may additionally require deterministic ops.)

    Returns:
        A `(model, train_metrics, test_metrics)` tuple, where each metrics
        entry is the `[loss, accuracy]` list returned by `model.evaluate`.
    """
    # Seed first: layer construction below is what draws the initial weights.
    keras.utils.set_random_seed(seed)

    hidden_layers = [
        layers.Dense(100, activation='relu') for _ in range(n_hidden_layers)
    ]
    model = keras.Sequential([
        layers.Input(shape=(784,)),
        *hidden_layers,
        layers.Dense(10, activation='softmax')
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=2e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    model.fit(
        x_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(x_test, y_test),
        verbose=0
    )
    train_metrics = model.evaluate(x_train, y_train, verbose=0)
    test_metrics = model.evaluate(x_test, y_test, verbose=0)
    return model, train_metrics, test_metrics


# 2-layer MLP
model_2, (train_loss_2, train_acc_2), (test_loss_2, test_acc_2) = train_depth_mlp(2)

# 3-layer MLP
model_3, (train_loss_3, train_acc_3), (test_loss_3, test_acc_3) = train_depth_mlp(3)

# 4-layer MLP
model_4, (train_loss_4, train_acc_4), (test_loss_4, test_acc_4) = train_depth_mlp(4)

print(f"2-layer | train acc: {train_acc_2:.4f} | test acc: {test_acc_2:.4f}")
print(f"3-layer | train acc: {train_acc_3:.4f} | test acc: {test_acc_3:.4f}")
print(f"4-layer | train acc: {train_acc_4:.4f} | test acc: {test_acc_4:.4f}")


2-layer | train acc: 0.9997 | test acc: 0.9750
3-layer | train acc: 1.0000 | test acc: 0.9771
4-layer | train acc: 0.9948 | test acc: 0.9713


### 2 vs 3 vs 4 layer MLP comparison

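Overall, the 3-layer MLP performs best on MNIST (test accuracy 0.9771), followed by the 2-layer (0.9750) and the 4-layer (0.9713) models. The 4-layer model is slightly worse than the 3-layer one, likely because MNIST is a relatively simple task: the extra layer does not add useful capacity, and it appears harder to optimize within the same 50 epochs (its training accuracy, 0.9948, is also the lowest of the three). Note that all three test accuracies are within about 0.6 percentage points of each other and each comes from a single run, so this ranking may not be robust.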


## Task 2: Compare different 2-layer MLP Variants

- Weight initializations
- Regularizations
- Optimizers

In [6]:
# Weight initialization comparison on 2-layer MLP
# Each entry is a Keras initializer identifier passed as `kernel_initializer`.
init_methods = ["glorot_uniform", "he_normal"]
init_results = []  # (initializer name, train accuracy, test accuracy) per variant

for init in init_methods:
    # Re-seed before every variant so the two runs start from the same random
    # state and the cell gives repeatable numbers; the initializer is then the
    # only thing that is deliberately varied.
    keras.utils.set_random_seed(7)

    # 2-layer MLP; the initializer is applied to the output layer as well
    model_weights_init = keras.Sequential([
        layers.Input(shape=(784,)),
        layers.Dense(100, activation='relu', kernel_initializer=init),
        layers.Dense(100, activation='relu', kernel_initializer=init),
        layers.Dense(10, activation='softmax', kernel_initializer=init)
    ])
    model_weights_init.compile(
        optimizer=keras.optimizers.Adam(learning_rate=2e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    model_weights_init.fit(
        x_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(x_test, y_test),
        verbose=0
    )
    # evaluate() returns [loss, accuracy]; only the accuracy is reported
    train_loss_init, train_acc_init = model_weights_init.evaluate(x_train, y_train, verbose=0)
    test_loss_init, test_acc_init = model_weights_init.evaluate(x_test, y_test, verbose=0)
    init_results.append((init, train_acc_init, test_acc_init))

for init, train_acc, test_acc in init_results:
    print(f"{init:15s} | train acc: {train_acc:.4f} | test acc: {test_acc:.4f}")


glorot_uniform  | train acc: 1.0000 | test acc: 0.9760
he_normal       | train acc: 1.0000 | test acc: 0.9784


In [7]:
# Regularization comparison on 2-layer MLP
# Dict keys are the labels used when printing; values are the regularizer
# objects passed as `kernel_regularizer` to every Dense layer.
reg_methods = {
    "L2 (1e-4)": keras.regularizers.l2(1e-4),
    "L1 (1e-4)": keras.regularizers.l1(1e-4),
}
reg_results = []  # (label, train accuracy, test accuracy) per variant

for reg_name, reg in reg_methods.items():
    # Re-seed before every variant so both runs start from the same random
    # state and the cell gives repeatable numbers; the regularizer is then the
    # only thing that is deliberately varied.
    keras.utils.set_random_seed(7)

    # 2-layer MLP with the penalty on all three kernels (output layer included)
    model_reg = keras.Sequential([
        layers.Input(shape=(784,)),
        layers.Dense(100, activation='relu', kernel_regularizer=reg),
        layers.Dense(100, activation='relu', kernel_regularizer=reg),
        layers.Dense(10, activation='softmax', kernel_regularizer=reg)
    ])
    model_reg.compile(
        optimizer=keras.optimizers.Adam(learning_rate=2e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    model_reg.fit(
        x_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(x_test, y_test),
        verbose=0
    )
    # evaluate() returns [loss, accuracy]; only the accuracy is reported
    train_loss_reg, train_acc_reg = model_reg.evaluate(x_train, y_train, verbose=0)
    test_loss_reg, test_acc_reg = model_reg.evaluate(x_test, y_test, verbose=0)
    reg_results.append((reg_name, train_acc_reg, test_acc_reg))

for reg_name, train_acc, test_acc in reg_results:
    print(f"{reg_name:15s} | train acc: {train_acc:.4f} | test acc: {test_acc:.4f}")

L2 (1e-4)       | train acc: 0.9992 | test acc: 0.9770
L1 (1e-4)       | train acc: 0.9926 | test acc: 0.9733


In [8]:
# Optimizer comparison on 2-layer MLP
# Dict keys are the labels used when printing; each optimizer instance is used
# by exactly one model below.
# NOTE(review): SGD runs at learning_rate=1e-4 with no momentum argument, while
# Adam runs at 2e-4, so the results compare these specific settings rather than
# the optimizers at equally tuned learning rates.
opt_methods = {
    "SGD": keras.optimizers.SGD(learning_rate=0.0001),
    "Adam": keras.optimizers.Adam(learning_rate=2e-4),
}
opt_results = []  # (label, train accuracy, test accuracy) per variant

for opt_name, opt in opt_methods.items():
    # Re-seed before every variant so both models start from the same random
    # state and the cell gives repeatable numbers; the optimizer is then the
    # only thing that is deliberately varied.
    keras.utils.set_random_seed(7)

    # 2-layer MLP
    model_opt = keras.Sequential([
        layers.Input(shape=(784,)),
        layers.Dense(100, activation='relu'),
        layers.Dense(100, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])
    model_opt.compile(
        optimizer=opt,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    model_opt.fit(
        x_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(x_test, y_test),
        verbose=0
    )
    # evaluate() returns [loss, accuracy]; only the accuracy is reported
    train_loss_opt, train_acc_opt = model_opt.evaluate(x_train, y_train, verbose=0)
    test_loss_opt, test_acc_opt = model_opt.evaluate(x_test, y_test, verbose=0)
    opt_results.append((opt_name, train_acc_opt, test_acc_opt))

for opt_name, train_acc, test_acc in opt_results:
    print(f"{opt_name:15s} | train acc: {train_acc:.4f} | test acc: {test_acc:.4f}")

SGD             | train acc: 0.8737 | test acc: 0.8687
Adam            | train acc: 1.0000 | test acc: 0.9761


### Final summary

- Weight initializations
    - glorot_uniform  | train acc: 1.0000 | test acc: 0.9760
    - he_normal       | train acc: 1.0000 | test acc: 0.9784
- Regularizations
    - L2 (1e-4)       | train acc: 0.9992 | test acc: 0.9770
    - L1 (1e-4)       | train acc: 0.9926 | test acc: 0.9733
- Optimizers
    - SGD             | train acc: 0.8737 | test acc: 0.8687
    - Adam            | train acc: 1.0000 | test acc: 0.9761

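The only substantial difference among these variants is between the optimizers. Adam reached perfect training accuracy and a much higher test accuracy than SGD (0.9761 vs. 0.8687), indicating faster convergence within the 50-epoch budget; the same behavior is seen in all the other variants, which also used Adam. Note, however, that SGD was run with a learning rate of 1e-4 and no momentum, so part of this gap likely reflects that SGD had not yet converged after 50 epochs rather than an inherent weakness of the optimizer. The differences between the two weight initializations and between the two regularizers are all well below one percentage point of test accuracy.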