In [None]:
import os, sys
sys.path.append("C:/Users/isang/OneDrive/Desktop/DL/deep-learning-from-scratch-master")

import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.util import smooth_curve
from common.multi_layer_net import MultiLayerNet

# Optimizer classes  
# Mini-batch Stochastic Gradient Descent class
class mini_SGD:
    """Plain (mini-batch) stochastic gradient descent.

    Every call to ``update`` moves each parameter one step against its
    gradient, scaled by the learning rate ``lr``.
    """

    def __init__(self, lr=0.01):
        # Step size applied to every gradient.
        self.lr = lr

    def update(self, params, grads):
        """Apply one SGD step to ``params`` in place.

        Args:
            params: mapping from parameter name to its current value.
            grads: mapping with the same keys holding the gradients.
        """
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step

# Momentum class
class Momentum:
    """SGD with a momentum (velocity) term.

    A per-parameter velocity ``v`` is kept between calls; each step decays the
    previous velocity by ``momentum`` and subtracts ``lr * grad`` from it, then
    adds the resulting velocity to the parameter.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        # Velocity buffers; created on the first call to ``update``.
        self.v = None

    def update(self, params, grads):
        """Apply one momentum step to ``params`` in place.

        Args:
            params: mapping from parameter name to its current value.
            grads: mapping with the same keys holding the gradients.
        """
        if self.v is None:
            # First call: start every velocity at zero, shaped like its parameter.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

# RMSprop class
class RMSprop:
    """RMSprop: scales each step by a running average of squared gradients.

    ``h`` holds, per parameter, an exponential moving average of ``grad ** 2``
    with decay factor ``decay_rate``. The step for a parameter is
    ``lr * grad / (sqrt(h) + epsilon)``.
    """

    def __init__(self, lr=0.01, decay_rate=0.99, epsilon=1e-8):
        self.lr = lr
        self.decay_rate = decay_rate
        # Added to the denominator so the division never hits exactly zero.
        self.epsilon = epsilon
        # Moving average of squared gradients, filled on the first update.
        self.h = {}

    def update(self, params, grads):
        """Apply one RMSprop step to ``params`` in place.

        Args:
            params: mapping from parameter name to its current value.
            grads: mapping with the same keys holding the gradients.
        """
        if len(self.h) == 0:
            # Lazily create zeroed accumulators shaped like the gradients.
            self.h.update((name, np.zeros_like(grads[name])) for name in params)

        rho = self.decay_rate
        for name in params:
            grad = grads[name]

            # Refresh the moving average of the squared gradient.
            self.h[name] = rho * self.h[name] + (1 - rho) * (grad ** 2)

            # Step with the per-element adaptive scaling.
            params[name] -= self.lr * grad / (np.sqrt(self.h[name]) + self.epsilon)

# Adam class
class Adam:
    """Adam optimizer: momentum-style first moment plus RMSprop-style second moment.

    ``m`` and ``v`` are per-parameter exponential moving averages of the
    gradient and of the squared gradient. Bias correction of both moments is
    folded into a single per-step learning rate ``lr_t`` instead of correcting
    ``m`` and ``v`` individually.

    Args:
        lr: base learning rate.
        beta1: decay factor of the first-moment average ``m``.
        beta2: decay factor of the second-moment average ``v``.
        epsilon: small constant added to ``sqrt(v)`` to avoid division by
            zero. Defaults to ``1e-7``, the value previously hard-coded in
            ``update``, so existing callers see unchanged results.
    """

    def __init__(self, lr=0.01, beta1=0.9, beta2=0.999, epsilon=1e-7):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Number of update steps taken so far; drives the bias correction.
        self.iter = 0
        # Moment buffers; created on the first call to ``update``.
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Apply one Adam step to ``params`` in place.

        Args:
            params: mapping from parameter name to its current value.
            grads: mapping with the same keys holding the gradients.
        """
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        # Bias-corrected step size: lr * sqrt(1 - beta2^t) / (1 - beta1^t).
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

        for key in params.keys():
            # Incremental form of m = beta1*m + (1-beta1)*g (and likewise for v).
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])

            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + self.epsilon)

In [None]:
# Load the MNIST train/test splits (normalize=True is forwarded to the loader)
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

# Number of training samples and the mini-batch size drawn per step
train_size = x_train.shape[0]
batch_size = 128

# One optimizer instance per method under comparison, all with the same lr
optimizers = dict(
    mini_SGD=mini_SGD(lr=0.01),
    Momentum=Momentum(lr=0.01),
    RMSprop=RMSprop(lr=0.01),
    Adam=Adam(lr=0.01),
)

# Each optimizer trains its own separately constructed network of the same architecture
networks = {
    key: MultiLayerNet(
        input_size=784,
        hidden_size_list=[100, 100, 100, 100],
        output_size=10,
    )
    for key in optimizers
}

# Per-optimizer history of recorded training losses
train_loss = {key: [] for key in optimizers}

In [None]:
# Training loop: at every step all optimizers are fed the same mini-batch
for epoch in range(2000):
    # Random mini-batch indices (np.random.choice samples with replacement by default)
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch, t_batch = x_train[batch_mask], t_train[batch_mask]

    for key, optimizer in optimizers.items():
        network = networks[key]

        # Backward pass, then one optimizer step on this network's parameters
        grads = network.gradient(x_batch, t_batch)
        optimizer.update(network.params, grads)

        # Record the loss on the same batch after the update
        loss = network.loss(x_batch, t_batch)
        train_loss[key].append(loss)

    # Only report every 100th step
    if epoch % 100 != 0:
        continue

    print(f"========== epoch: {epoch!s} ==========")
    for key in optimizers:
        loss = networks[key].loss(x_batch, t_batch)
        print(f"{key}:{loss!s}")

In [None]:
# Distinct marker per optimizer so the curves remain distinguishable.
markers = {"mini_SGD" :"v", "Momentum" :  "p", "RMSprop" : "o", "Adam" : "s"}

for key in optimizers.keys():
    smoothed = smooth_curve(train_loss[key])
    # Derive the x axis from the curve itself rather than a hard-coded step
    # count, so the plot keeps working if the number of training steps changes.
    x = np.arange(len(smoothed))
    plt.plot(x, smoothed, marker=markers[key], markevery=100, label=key)

plt.xlabel("Iterations")
plt.ylabel("Loss")
# Clip the y range to [0, 1]; losses above 1 are cut off.
plt.ylim(0, 1)
plt.legend()
plt.title("Comparison of Optimizers on MNIST")
plt.show()