# Task 1: 2-layer MLP for MNIST

This notebook implements a 2-hidden-layer MLP for the MNIST dataset with the specified training setup.

In [2]:
import time

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Fetch the stock MNIST arrays and confirm each partition has the expected shapes
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
assert (x_train.shape, y_train.shape) == ((60000, 28, 28), (60000,))
assert (x_test.shape, y_test.shape) == ((10000, 28, 28), (10000,))

# Pool the two official partitions; a fresh 70/30 split is drawn from the pool below
y_combined = np.concatenate((y_train, y_test), axis=0)
x_combined = np.concatenate((x_train, x_test), axis=0)

# One 784-long row per image, cast to float32 and divided by 255.0
x_combined = x_combined.reshape(-1, 28 * 28).astype('float32') / 255.0

# Stratified 70/30 split with a fixed random_state; this rebinds the train/test names
x_train, x_test, y_train, y_test = train_test_split(
    x_combined,
    y_combined,
    test_size=0.30,
    random_state=7,
    stratify=y_combined,
)

x_train.shape, x_test.shape

((49000, 784), (21000, 784))

In [7]:
# Build 2-hidden-layer MLP

# Seed Python, NumPy and the Keras backend before any layer is created so the
# initial weights are repeatable across kernel restarts. 7 is the same value
# used as random_state for the train/test split.
keras.utils.set_random_seed(7)

model = keras.Sequential([
    layers.Input(shape=(784,)), # 28 * 28 = 784
    layers.Dense(100, activation='relu'),
    layers.Dense(100, activation='relu'),
    layers.Dense(10, activation='softmax')  # one probability per digit class
])

# Plain SGD with the learning rate used throughout this notebook
optimizer = keras.optimizers.SGD(learning_rate=0.0001)

# Labels are integer class ids (not one-hot), hence the sparse loss
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [8]:
# Fit the Task 1 model for 50 epochs in mini-batches of 32.
# The held-out split is passed as validation data so each epoch's log line
# (verbose=2 prints one line per epoch) also reports val_loss / val_accuracy.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    verbose=2,
    validation_data=(x_test, y_test),
)

Epoch 1/50
1532/1532 - 1s - 622us/step - accuracy: 0.1443 - loss: 2.2712 - val_accuracy: 0.1841 - val_loss: 2.2343
Epoch 2/50
1532/1532 - 1s - 474us/step - accuracy: 0.2289 - loss: 2.2011 - val_accuracy: 0.2745 - val_loss: 2.1670
Epoch 3/50
1532/1532 - 1s - 462us/step - accuracy: 0.3183 - loss: 2.1340 - val_accuracy: 0.3605 - val_loss: 2.0997
Epoch 4/50
1532/1532 - 1s - 464us/step - accuracy: 0.3980 - loss: 2.0649 - val_accuracy: 0.4315 - val_loss: 2.0287
Epoch 5/50
1532/1532 - 1s - 479us/step - accuracy: 0.4608 - loss: 1.9910 - val_accuracy: 0.4908 - val_loss: 1.9524
Epoch 6/50
1532/1532 - 1s - 480us/step - accuracy: 0.5156 - loss: 1.9121 - val_accuracy: 0.5392 - val_loss: 1.8712
Epoch 7/50
1532/1532 - 1s - 489us/step - accuracy: 0.5617 - loss: 1.8284 - val_accuracy: 0.5836 - val_loss: 1.7858
Epoch 8/50
1532/1532 - 1s - 476us/step - accuracy: 0.6023 - loss: 1.7411 - val_accuracy: 0.6220 - val_loss: 1.6974
Epoch 9/50
1532/1532 - 1s - 464us/step - accuracy: 0.6358 - loss: 1.6515 - val_a

In [9]:
# Score the fitted model on the training split first, then on the test split.
# With a single compiled metric, evaluate() yields a [loss, accuracy] pair.
(train_loss, train_acc), (test_loss, test_acc) = (
    model.evaluate(features, labels, verbose=0)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

print(f'Training accuracy: {train_acc:.4f}')
print(f'Testing accuracy:  {test_acc:.4f}')

Training accuracy: 0.8776
Testing accuracy:  0.8716


## Task 2: Implement 2-layer MLP for MNIST dataset 

### Compare Activation Functions

Train separate models with different activation functions (sigmoid, tanh, and ReLU) using 100 units per hidden layer.

In [4]:
# Data is already normalized and split from Task 1


def train_and_score(activation, seed=7):
    """Train one Task 1-style MLP with the given hidden activation and score it.

    The model, optimizer and metrics are all local to this function, so running
    the comparison does not overwrite the Task 1 `model`, `optimizer`,
    `train_acc` or `test_acc` variables.

    Args:
        activation: Keras activation name applied to both 100-unit hidden layers.
        seed: Passed to `keras.utils.set_random_seed` before the model is built
            so every activation is trained under the same seed.

    Returns:
        A `(train_acc, test_acc, elapsed_seconds)` tuple. The elapsed time covers
        building, compiling and fitting the model; the final evaluation calls
        are not timed.
    """
    # Re-seed for every run so differences come from the activation, not the draw
    keras.utils.set_random_seed(seed)

    # perf_counter is monotonic, unlike time.time(), so the measured duration
    # cannot be distorted by system clock adjustments
    start_time = time.perf_counter()

    # Same structure as Task 1 but with a different hidden activation
    mlp = keras.Sequential([
        layers.Input(shape=(784,)),
        layers.Dense(100, activation=activation),
        layers.Dense(100, activation=activation),
        layers.Dense(10, activation='softmax')
    ])
    mlp.compile(
        optimizer=keras.optimizers.SGD(learning_rate=0.0001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Same fit settings as Task 1. The per-epoch validation pass is not shown
    # (verbose=0) but it is part of the timed span.
    mlp.fit(
        x_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(x_test, y_test),
        verbose=0
    )
    elapsed_seconds = time.perf_counter() - start_time

    _, train_accuracy = mlp.evaluate(x_train, y_train, verbose=0)
    _, test_accuracy = mlp.evaluate(x_test, y_test, verbose=0)
    return train_accuracy, test_accuracy, elapsed_seconds


# Compare activation functions
activations = ["sigmoid", "tanh", "relu"]
results = []            # (activation, train accuracy, test accuracy)
activation_times = []   # (activation, seconds spent building + fitting)

# Loop through activation functions
for act in activations:
    act_train_acc, act_test_acc, act_seconds = train_and_score(act)
    activation_times.append((act, act_seconds))
    results.append((act, act_train_acc, act_test_acc))

for act, act_train_acc, act_test_acc in results:
    print(f"Using activation function = {act:7s} | train acc: {act_train_acc:.4f} | test acc: {act_test_acc:.4f}")

for act, duration in activation_times:
    print(f"Activation function = {act:7s} took {duration:.2f} seconds")

Using activation function = sigmoid | train acc: 0.3261 | test acc: 0.3289
Using activation function = tanh    | train acc: 0.8793 | test acc: 0.8766
Using activation function = relu    | train acc: 0.8770 | test acc: 0.8714
Activation function = sigmoid took 37.91 seconds
Activation function = tanh    took 37.85 seconds
Activation function = relu    took 37.45 seconds


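These results are quite interesting. Sigmoid performs much worse than ReLU and tanh. This may be because it doesn't handle the 0-1 ranged input data as well. Another likely factor is that sigmoid's gradient is at most 0.25, so with a learning rate of 0.0001 the weights update very slowly and the model is still far from converged after 50 epochs. All of the models may also be underfit and could have used more epochs to converge, because the train and test accuracies are about equal.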

The running times for the three activation functions were all within 1 second of each other, so the choice of activation doesn't seem to change training time much. This may change as the data and the number of epochs get larger.