# Task 1: 2-layer MLP for MNIST

This notebook implements a 2-hidden-layer MLP for the MNIST dataset with the specified training setup.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Load MNIST
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Check the shapes of the arrays returned by the loader before re-splitting
for loaded, expected_shape in (
    (x_train, (60000, 28, 28)),
    (x_test, (10000, 28, 28)),
    (y_train, (60000,)),
    (y_test, (10000,)),
):
    assert loaded.shape == expected_shape

# Pool images and labels so a fresh 70/30 split can be drawn from all samples
x_combined = np.concatenate((x_train, x_test))
y_combined = np.concatenate((y_train, y_test))

# Convert to float32 and divide by 255
x_combined = x_combined.astype(np.float32) / 255.0

# Collapse every 28x28 image into a single row vector
x_combined = x_combined.reshape(len(x_combined), -1)

# Stratified 70/30 split with a fixed random_state
x_train, x_test, y_train, y_test = train_test_split(
    x_combined,
    y_combined,
    test_size=0.30,
    random_state=7,
    stratify=y_combined,
)

x_train.shape, x_test.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


((49000, 784), (21000, 784))

In [None]:
# Build 2-hidden-layer MLP
# Seed Python, NumPy and the Keras backend before any weights are created so
# that the weight initialisation and the shuffling done by `fit` are
# repeatable on a fresh kernel. 7 is the same value used as `random_state`
# for the train/test split. (GPU kernels may still cause small run-to-run
# differences.)
keras.utils.set_random_seed(7)

model = keras.Sequential([
    layers.Input(shape=(784,)),  # 28 * 28 = 784 flattened pixels
    layers.Dense(100, activation='relu'),
    layers.Dense(100, activation='relu'),
    layers.Dense(10, activation='softmax'),  # 10 output units with softmax
])

# SGD with a learning rate of 1e-4
optimizer = keras.optimizers.SGD(learning_rate=0.0001)

model.compile(
    optimizer=optimizer,
    # "sparse" variant: targets are integer class ids rather than one-hot rows
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

In [4]:
# Train
# The 30% held-out split is passed as validation data; no callbacks are
# registered, so it is only reported per epoch and does not steer training.
fit_options = dict(epochs=50, batch_size=32, verbose=2)
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    **fit_options,
)

Epoch 1/50
1532/1532 - 1s - 645us/step - accuracy: 0.1489 - loss: 2.2830 - val_accuracy: 0.1762 - val_loss: 2.2511
Epoch 2/50
1532/1532 - 1s - 476us/step - accuracy: 0.2161 - loss: 2.2208 - val_accuracy: 0.2673 - val_loss: 2.1883
Epoch 3/50
1532/1532 - 1s - 472us/step - accuracy: 0.3219 - loss: 2.1570 - val_accuracy: 0.3756 - val_loss: 2.1222
Epoch 4/50
1532/1532 - 1s - 481us/step - accuracy: 0.4192 - loss: 2.0889 - val_accuracy: 0.4624 - val_loss: 2.0512
Epoch 5/50
1532/1532 - 1s - 483us/step - accuracy: 0.4931 - loss: 2.0157 - val_accuracy: 0.5262 - val_loss: 1.9752
Epoch 6/50
1532/1532 - 1s - 521us/step - accuracy: 0.5462 - loss: 1.9375 - val_accuracy: 0.5740 - val_loss: 1.8946
Epoch 7/50
1532/1532 - 1s - 504us/step - accuracy: 0.5856 - loss: 1.8550 - val_accuracy: 0.6093 - val_loss: 1.8102
Epoch 8/50
1532/1532 - 1s - 507us/step - accuracy: 0.6185 - loss: 1.7691 - val_accuracy: 0.6377 - val_loss: 1.7232
Epoch 9/50
1532/1532 - 1s - 481us/step - accuracy: 0.6447 - loss: 1.6814 - val_a

In [5]:
# Evaluate training and testing accuracy
# `evaluate` returns [loss, accuracy] because 'accuracy' is the only compiled metric.
train_loss, train_acc = model.evaluate(x=x_train, y=y_train, verbose=0)
test_loss, test_acc = model.evaluate(x=x_test, y=y_test, verbose=0)

print(
    f'Training accuracy: {train_acc:.4f}',
    f'Testing accuracy:  {test_acc:.4f}',
    sep='\n',
)

Training accuracy: 0.8688
Testing accuracy:  0.8652


## Task 2: Implement a 2-layer MLP for the MNIST dataset

### Compare Activation Functions

Train separate models with different activation functions (ReLU, Sigmoid, Tanh) using 100 units per hidden layer.

In [7]:
# Compare activation functions
activations = ["relu", "sigmoid", "tanh"]


def train_and_evaluate(activation, seed=7):
    """Train one 784-100-100-10 MLP with the given hidden activation and score it.

    The model lives only inside this function, so the notebook-level `model`
    (and its accuracy variables) from the earlier cells are not overwritten.

    Args:
        activation: Keras activation name used for both hidden layers.
        seed: Value passed to `keras.utils.set_random_seed` before the model
            is built.

    Returns:
        Tuple `(train_accuracy, test_accuracy)` measured on
        `x_train`/`y_train` and `x_test`/`y_test`.
    """
    # Re-seed before every run so each activation is trained from the same
    # RNG state; differences in accuracy then reflect the activation rather
    # than a different random draw. (GPU kernels may still add small noise.)
    keras.utils.set_random_seed(seed)

    mlp = keras.Sequential([
        layers.Input(shape=(784,)),
        layers.Dense(100, activation=activation),
        layers.Dense(100, activation=activation),
        layers.Dense(10, activation='softmax'),
    ])
    mlp.compile(
        optimizer=keras.optimizers.SGD(learning_rate=0.0001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    # No validation_data here: with verbose=0 and the History object unused,
    # a per-epoch validation pass would be computed and thrown away. The
    # test split is scored once below instead.
    mlp.fit(x_train, y_train, epochs=50, batch_size=32, verbose=0)

    _, train_accuracy = mlp.evaluate(x_train, y_train, verbose=0)
    _, test_accuracy = mlp.evaluate(x_test, y_test, verbose=0)
    return train_accuracy, test_accuracy


# One (activation, train accuracy, test accuracy) row per activation
results = [(act, *train_and_evaluate(act)) for act in activations]

for act, act_train_acc, act_test_acc in results:
    print(f"Using activation function = {act:7s} | train acc: {act_train_acc:.4f} | test acc: {act_test_acc:.4f}")


Using activation function = relu    | train acc: 0.8726 | test acc: 0.8696
Using activation function = sigmoid | train acc: 0.3231 | test acc: 0.3242
Using activation function = tanh    | train acc: 0.8785 | test acc: 0.8747


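These results are quite interesting. Sigmoid performs much worse than ReLU and tanh (about 32% test accuracy versus about 87%). One possible reason is that it does not handle the 0–1 ranged input data as well. A more likely contributor is that the sigmoid's gradient is at most 0.25 and its outputs are not zero-centered, so with plain SGD at a learning rate of 0.0001 the weights move very slowly and the model is still far from converged after 50 epochs. All three models may also be underfit and could have benefited from more epochs to converge, since the training and test accuracies are about equal.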