In [None]:
import numpy as np
import pickle
import pandas as pd
import time

# Seed NumPy's global RNG; weight initialization, epoch shuffling and dropout
# masks below all draw from np.random, so this makes a fresh run repeatable.
np.random.seed(74)

# Hyperparameters used by the training cells further down.
EPOCHS = 60  # number of passes over the training split
BATCH_SIZE = 128  # samples per mini-batch (the last batch may be smaller)
LEARNING_RATE = 0.01  # initial step size; `lr` starts here and may be decayed during training
DROPOUT_RATE = 0.3  # passed to forward() as dropout_rate while training
L2_LAMBDA = 0.0001  # L2 penalty coefficient used in both the loss and the gradients

In [32]:
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)).

    The input is clamped to [-500, 500] first so that the exponential
    cannot overflow for large-magnitude values.
    """
    clamped = np.clip(z, -500, 500)
    exp_neg = np.exp(-clamped)
    return 1.0 / (1.0 + exp_neg)

def sigmoid_derivat(a):
    """Sigmoid derivative written in terms of the sigmoid *output* ``a``.

    Returns ``a * (1 - a)`` element-wise; the caller is expected to pass
    the activation value, not the pre-activation.
    """
    complement = 1.0 - a
    return a * complement

def one_hot_encode(y, n_classes=10):
    """Turn integer class labels into a one-hot matrix.

    Args:
        y: sequence of integer class indices, one per sample.
        n_classes: width of the encoding (number of columns).

    Returns:
        Float array of shape ``(len(y), n_classes)`` with a single 1 in
        each row at the column given by the corresponding label.
    """
    sample_count = len(y)
    encoded = np.zeros((sample_count, n_classes))
    row_ids = np.arange(sample_count)
    encoded[row_ids, y] = 1
    return encoded

def softmax(z):
    """Row-wise softmax of a 2-D array.

    Each row has its maximum subtracted before exponentiation, which
    leaves the result unchanged mathematically but keeps ``np.exp``
    from overflowing.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_max)
    normalizer = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / normalizer

def cross_entropy_loss(a2, y, weights_input_hidden, weights_hidden_output, l2_lambda=0.0):
    """Mean cross-entropy of predicted probabilities, plus an optional L2 penalty.

    Args:
        a2: predicted class probabilities, one row per sample.
        y: integer class labels, one per row of ``a2``.
        weights_input_hidden, weights_hidden_output: weight matrices whose
            squared entries are summed for the penalty term.
        l2_lambda: penalty coefficient; the penalty is skipped when this is
            falsy or not positive.

    Returns:
        Scalar loss value.
    """
    n_rows = a2.shape[0]
    targets = one_hot_encode(y, a2.shape[1])

    # A tiny constant keeps log() finite when a probability is exactly 0.
    tiny = 1e-12
    data_term = -np.sum(targets * np.log(a2 + tiny)) / n_rows

    if not (l2_lambda and l2_lambda > 0):
        return data_term

    squared_weights = np.sum(weights_input_hidden**2) + np.sum(weights_hidden_output**2)
    return data_term + 0.5 * l2_lambda * squared_weights

In [None]:
def initialize_weights(input_size, hidden_size, output_size):
    """Create the parameters of a two-layer network.

    Weight matrices are drawn from a standard normal (NumPy global RNG) and
    scaled by ``sqrt(1 / fan_in)``; biases start at zero with shape
    ``(1, width)``.

    Returns:
        ``(weights_input_hidden, bias_hidden, weights_hidden_output, bias_output)``
    """
    # First layer: input -> hidden. Drawn first so the RNG stream order is fixed.
    first_layer_weights = np.random.randn(input_size, hidden_size) * np.sqrt(1.0 / input_size)
    first_layer_bias = np.zeros((1, hidden_size))

    # Second layer: hidden -> output.
    second_layer_weights = np.random.randn(hidden_size, output_size) * np.sqrt(1.0 / hidden_size)
    second_layer_bias = np.zeros((1, output_size))

    return (
        first_layer_weights,
        first_layer_bias,
        second_layer_weights,
        second_layer_bias,
    )

def forward(x_batch, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output, dropout_rate=0, training=True):
    """Run the two-layer network forward on a batch.

    Hidden layer: sigmoid(x @ W1 + b1), optionally followed by inverted
    dropout. Output layer: softmax(a1 @ W2 + b2).

    Args:
        x_batch: input rows, one sample per row.
        weights_input_hidden, bias_hidden: first-layer parameters.
        weights_hidden_output, bias_output: second-layer parameters.
        dropout_rate: probability of zeroing a hidden unit. Dropout is only
            applied when ``training`` is true and this is > 0.
        training: whether dropout may be applied.

    Returns:
        ``(z1, a1, z2, a2, dropout_mask)``. ``a1`` is the hidden activation
        *after* dropout (if any) and ``dropout_mask`` is the scaled keep-mask
        that was applied, or ``None`` when no dropout was applied.

    Raises:
        ValueError: if dropout would be applied with ``dropout_rate >= 1``,
            which would divide by zero (or a negative number) when rescaling.
    """
    if training and dropout_rate >= 1:
        raise ValueError(f'dropout_rate must be < 1 when training, got {dropout_rate}')

    z1 = np.dot(x_batch, weights_input_hidden) + bias_hidden
    a1 = sigmoid(z1)

    dropout_mask = None
    # Dropout is active only in training mode with a positive rate.
    if training and dropout_rate > 0:
        # Inverted dropout: surviving units are scaled by 1 / (1 - rate) so the
        # expected activation is unchanged and inference needs no rescaling.
        keep = np.random.rand(*a1.shape) > dropout_rate
        dropout_mask = keep / (1 - dropout_rate)
        a1 = a1 * dropout_mask

    # Output layer: class scores -> probabilities.
    z2 = np.dot(a1, weights_hidden_output) + bias_output
    a2 = softmax(z2)

    return z1, a1, z2, a2, dropout_mask

def backward(x_batch, y_batch, z1, a1, z2, a2, weights_input_hidden, weights_hidden_output, dropout_mask, l2_lambda=0.0001):
    """Gradients of the mean cross-entropy (+ L2 penalty) for the two-layer net.

    Args:
        x_batch: inputs used in the forward pass, one sample per row.
        y_batch: integer class labels for the batch.
        z1: hidden-layer pre-activations from the forward pass.
        a1: hidden-layer output as fed to the output layer, i.e. *after*
            dropout when a mask was applied.
        z2: output-layer pre-activations (not needed; kept for the interface).
        a2: softmax probabilities from the forward pass.
        weights_input_hidden, weights_hidden_output: current weight matrices.
        dropout_mask: scaled keep-mask applied to the hidden layer, or None.
        l2_lambda: L2 penalty coefficient added to the weight gradients.

    Returns:
        ``(dW1, db1, dW2, db2)``.
    """
    m = x_batch.shape[0]

    # Softmax + cross-entropy: dL/dz2 = a2 - one_hot(y). Subtracting 1 at the
    # label positions avoids materializing the one-hot matrix.
    delta2 = np.array(a2, dtype=float)
    delta2[np.arange(m), y_batch] -= 1.0

    # Output-layer gradients; a1 is exactly what this layer received as input,
    # so the post-dropout activation is the right one to use here.
    dW2 = np.dot(a1.T, delta2) / m + l2_lambda * weights_hidden_output
    db2 = np.sum(delta2, axis=0, keepdims=True) / m

    # The sigmoid derivative must be evaluated on the *pre-dropout* sigmoid
    # output. a1 may already be zeroed/scaled by the dropout mask, so using it
    # would give (s/(1-p)) * (1 - s/(1-p)) for kept units, which is wrong and
    # can even be negative. Recompute s from z1 with the same clipping that
    # sigmoid() uses in the forward pass.
    hidden_activation = 1.0 / (1.0 + np.exp(-np.clip(z1, -500, 500)))
    sigmoid_grad = hidden_activation * (1.0 - hidden_activation)

    # Backpropagate through the output weights and the sigmoid, then through
    # the dropout scaling (dropped units receive zero gradient).
    delta1 = np.dot(delta2, weights_hidden_output.T) * sigmoid_grad
    if dropout_mask is not None:
        delta1 = delta1 * dropout_mask

    # Hidden-layer gradients.
    dW1 = np.dot(x_batch.T, delta1) / m + l2_lambda * weights_input_hidden
    db1 = np.sum(delta1, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

def compute_accuracy(X, y, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output):
    """Fraction of rows in ``X`` whose arg-max prediction equals the label in ``y``.

    The network is evaluated in inference mode (no dropout).
    """
    outputs = forward(
        X,
        weights_input_hidden,
        bias_hidden,
        weights_hidden_output,
        bias_output,
        dropout_rate=0,
        training=False,
    )
    class_probabilities = outputs[3]  # forward() returns (z1, a1, z2, a2, mask)
    predicted_labels = np.argmax(class_probabilities, axis=1)
    hits = predicted_labels == y
    return np.mean(hits)

In [None]:
# Load data
train_file_path = '/kaggle/input/fii-nn-2025-homework-3/extended_mnist_train.pkl'
test_file_path = '/kaggle/input/fii-nn-2025-homework-3/extended_mnist_test.pkl'

# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only point these paths at files from a trusted source.
with open(train_file_path, 'rb') as fp:
    train_dataset = pickle.load(fp)
with open(test_file_path, 'rb') as fp:
    test_dataset = pickle.load(fp)

# Preprocess: flatten every image to a vector and scale pixel values by 1/255.
# Each dataset item is unpacked as an (image, label) pair.
train_images_flat = []
train_labels_list = []
for image, label in train_dataset:
    train_images_flat.append(image.flatten() / 255.0)
    train_labels_list.append(label)

# The second element of each test item is not used.
test_images_flat = [image.flatten() / 255.0 for image, _ in test_dataset]

train_features_full = np.array(train_images_flat)
train_labels_full = np.array(train_labels_list)
test_features = np.array(test_images_flat)

assert len(train_features_full) == len(train_labels_full), 'features/labels length mismatch'

# Split: first 90% for training, last 10% for validation.
# NOTE(review): the split is positional (no shuffle) -- assumes the pickle is
# not ordered by class; confirm against the dataset.
split = int(0.9 * len(train_features_full))
train_features = train_features_full[:split]
train_image_labels = train_labels_full[:split]
validation_features = train_features_full[split:]
validation_labels = train_labels_full[split:]

assert len(train_features) > 0 and len(validation_features) > 0, 'empty train or validation split'

# Initialize: the input width is taken from the data instead of hard-coding 784.
HIDDEN_SIZE = 100
N_CLASSES = 10
input_size = train_features.shape[1]
weights_input_hidden, bias_hidden, weights_hidden_output, bias_output = initialize_weights(
    input_size, HIDDEN_SIZE, N_CLASSES
)

n_samples = len(train_features)
n_batches = (n_samples + BATCH_SIZE - 1) // BATCH_SIZE  # ceil division so the last partial batch is included

# Training state shared (as globals) with the evaluation / checkpoint helpers.
best_val_acc = 0
best_weights = None
count_lr_changes = 0  # consecutive evaluations without a validation improvement
lr = LEARNING_RATE
start_time = time.time()

In [None]:
def compute_accuracies(epoch, epoch_loss):
    """Evaluate the current global weights, print a progress line, return validation accuracy.

    Reads the module-level weights, data splits, ``lr`` and ``start_time``.

    Args:
        epoch: zero-based epoch index (printed as ``epoch + 1``).
        epoch_loss: training loss for the epoch, printed as-is.

    Returns:
        Accuracy on the validation split.
    """
    train_acc = compute_accuracy(train_features, train_image_labels, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output)

    # One inference pass over the validation split serves both accuracy and loss
    # (previously the split was forwarded twice per evaluation).
    _, _, _, a2_val, _ = forward(validation_features, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output, dropout_rate=0, training=False)
    val_acc = np.mean(np.argmax(a2_val, axis=1) == validation_labels)
    val_loss = cross_entropy_loss(a2_val, validation_labels, weights_input_hidden, weights_hidden_output, L2_LAMBDA)

    print(f'epoch index {epoch+1:3d}/{EPOCHS} | '
          f'train_dataset acc: {train_acc:.4f}  val acc: {val_acc:.4f} | '
          f'train_dataset loss: {epoch_loss:.4f}  val loss: {val_loss:.4f} | '
          f'lr: {lr:.4f}  '
          f'time: {time.time() - start_time:.1f}s')
    return val_acc

def update_best_weights(val_acc):
    """Checkpoint on improvement, otherwise count towards a learning-rate decay.

    Mutates the module-level ``best_val_acc``, ``best_weights``,
    ``count_lr_changes`` and ``lr``.

    * If ``val_acc`` beats ``best_val_acc``: store it, snapshot copies of the
      current global weights, and reset the no-improvement counter.
    * Otherwise: bump the counter; on the third consecutive miss multiply
      ``lr`` by 0.7 and reset the counter.
    """
    global best_val_acc, best_weights, count_lr_changes, lr

    if val_acc <= best_val_acc:
        # No improvement at this evaluation.
        count_lr_changes += 1
        if count_lr_changes < 3:
            return
        lr *= 0.7
        count_lr_changes = 0
        print(f'reduced learning rate to {lr:.6f}')
        return

    # New best: keep copies, since training updates the live arrays in place.
    best_val_acc = val_acc
    best_weights = tuple(
        parameter.copy()
        for parameter in (weights_input_hidden, bias_hidden, weights_hidden_output, bias_output)
    )
    print(f'new model validation accuracy: {best_val_acc:.4f}')
    count_lr_changes = 0

In [None]:
# Training
# Evaluation, checkpointing and the learning-rate schedule run every
# EVAL_EVERY epochs (and always on the final epoch) to keep epochs cheap.
EVAL_EVERY = 5

# The log line now reports the batch count and the batch size separately
# (it previously printed BATCH_SIZE as if it were the number of batches).
print(f'Train {n_samples} samples using {n_batches} batches of size {BATCH_SIZE} per epoch')
print(f'Regularization parameters: dropout={DROPOUT_RATE}, L2={L2_LAMBDA}')

for epoch in range(EPOCHS):
    # Reshuffle the training split at the start of every epoch.
    index = np.random.permutation(n_samples)
    x_shuffled = train_features[index]
    y_shuffled = train_image_labels[index]

    # Sample-weighted running sum of batch losses; measured with dropout
    # active and including the L2 penalty.
    epoch_loss = 0

    for i in range(n_batches):
        begin = i * BATCH_SIZE
        end = min(begin + BATCH_SIZE, n_samples)  # last batch may be short

        x_batch = x_shuffled[begin:end]
        y_batch = y_shuffled[begin:end]

        z1, a1, z2, a2, dropout_mask = forward(
            x_batch, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output,
            dropout_rate=DROPOUT_RATE, training=True
        )

        batch_loss = cross_entropy_loss(a2, y_batch, weights_input_hidden, weights_hidden_output, L2_LAMBDA)
        epoch_loss += batch_loss * len(x_batch)

        dW1, db1, dW2, db2 = backward(
            x_batch, y_batch, z1, a1, z2, a2, weights_input_hidden, weights_hidden_output, dropout_mask, L2_LAMBDA
        )

        # Plain SGD step, applied in place on the global parameter arrays.
        weights_input_hidden -= lr * dW1
        bias_hidden -= lr * db1
        weights_hidden_output -= lr * dW2
        bias_output -= lr * db2

    epoch_loss /= n_samples

    # Periodic evaluation; update_best_weights may checkpoint or decay lr.
    is_eval_epoch = epoch % EVAL_EVERY == 0 or epoch == EPOCHS - 1
    if is_eval_epoch:
        val_acc = compute_accuracies(epoch, epoch_loss)
        update_best_weights(val_acc)

In [None]:
# Restore the best checkpoint recorded during training, if there is one.
if best_weights is not None:
    weights_input_hidden, bias_hidden, weights_hidden_output, bias_output = best_weights

training_time = time.time() - start_time

# One inference pass per dataset; accuracy, loss and predictions are all
# derived from these outputs (previously train and validation were forwarded twice).
# NOTE(review): the "training" metrics use train_features_full, which also
# contains the validation rows -- confirm this is the intended report.
_, _, _, a2_train, _ = forward(train_features_full, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output, dropout_rate=0, training=False)
_, _, _, a2_val, _ = forward(validation_features, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output, dropout_rate=0, training=False)
_, _, _, a2_test, _ = forward(test_features, weights_input_hidden, bias_hidden, weights_hidden_output, bias_output, dropout_rate=0, training=False)

train_acc_final = np.mean(np.argmax(a2_train, axis=1) == train_labels_full)
val_acc_final = np.mean(np.argmax(a2_val, axis=1) == validation_labels)
predictions = np.argmax(a2_test, axis=1)

train_loss = cross_entropy_loss(a2_train, train_labels_full, weights_input_hidden, weights_hidden_output, L2_LAMBDA)
val_loss = cross_entropy_loss(a2_val, validation_labels, weights_input_hidden, weights_hidden_output, L2_LAMBDA)

print('results\n')
print(f'training accuracy: {train_acc_final*100:.2f}%')
print(f'model validation accuracy: {val_acc_final*100:.2f}%')
print(f'training loss: {train_loss:.4f}')
print(f'validation loss: {val_loss:.4f}')  # was 'validation loss loss'
print(f'time: {training_time:.2f}s ({training_time/60:.2f}min)')

# Submission file: one row per test sample, ID is the row position.
submission = pd.DataFrame({
    'ID': range(len(predictions)),
    'target': predictions.astype(int)
})

submission.to_csv('submission.csv', index=False)
print('submission done')