In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Fetch MNIST as (images, integer labels) pairs for the train and test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Images: append a trailing channel axis -> (N, 28, 28, 1), cast to float32
# and divide by 255 so pixel values are rescaled from 0..255 to 0..1
x_train, x_test = (
    images.reshape((images.shape[0], 28, 28, 1)).astype('float32') / 255
    for images in (x_train, x_test)
)

# Labels: one-hot encode the digit classes into 10 columns
y_train, y_test = (
    to_categorical(labels, 10)
    for labels in (y_train, y_test)
)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

def create_model():
    """Build the small dense classifier used in this notebook.

    Architecture: (28, 28, 1) input -> Flatten -> Dense(128, relu)
    -> Dense(10, softmax).

    Returns:
        A new, uncompiled ``Sequential`` model. Every call constructs a
        fresh model, so weights are never shared between calls.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object. Passing
        # `input_shape=` to the first layer is the legacy pattern and makes
        # Keras 3 emit a UserWarning when the layer is constructed.
        tf.keras.Input(shape=(28, 28, 1)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(10, activation='softmax')
    ])
    return model


In [None]:
from tensorflow.keras.optimizers import SGD, Adam, Adagrad, RMSprop, Adadelta


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import SGD, Adam, Adagrad, RMSprop, Adadelta
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.losses import categorical_crossentropy


In [None]:
# NOTE: this redefinition shadows the create_model defined in an earlier cell;
# this is the version the training loop below actually calls.
def create_model():
    """Build the small dense classifier trained by each optimizer.

    Architecture: (28, 28, 1) input -> Flatten -> Dense(128, relu)
    -> Dense(10, softmax).

    Returns:
        A new, uncompiled ``Sequential`` model. Every call constructs a
        fresh model, so each optimizer trains its own independent network.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object. Passing
        # `input_shape=` to the first layer is the legacy pattern and makes
        # Keras 3 emit a UserWarning when the layer is constructed.
        tf.keras.Input(shape=(28, 28, 1)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(10, activation='softmax')
    ])
    return model


In [None]:
# Parameters
batch_size = 32  # default mini-batch size for every run
epochs = 5
seed = 42        # re-applied before each run so all runs start from the same state

# Optimizers to compare, keyed by the label used in the printed results.
# Unless an argument is given, each optimizer uses its Keras default settings.
optimizers = {
    'SGD': SGD(),
    'SGD with Momentum': SGD(momentum=0.9),
    'Adagrad': Adagrad(),
    'RMSprop': RMSprop(),
    'Adadelta': Adadelta(),
    'Adam': Adam(),
    # Same configuration as 'SGD' above (0.01 is SGD's default learning rate,
    # trained with batch_size=32): Keras' SGD with a batch size > 1 *is*
    # mini-batch gradient descent.
    'Mini-batch Gradient Descent': SGD(learning_rate=0.01),
    # Same optimizer; what makes it "stochastic" is the batch size of 1
    # applied through batch_size_overrides below.
    'Stochastic Gradient Descent': SGD(learning_rate=0.01)
}

# Per-label batch-size overrides; labels not listed use `batch_size`.
# Without this entry the 'Stochastic Gradient Descent' run would be an exact
# repeat of the 'SGD' run. One sample per update means 60,000 updates per
# epoch, so this run is by far the slowest of the comparison.
batch_size_overrides = {
    'Stochastic Gradient Descent': 1,
}

# Train and evaluate models with different optimizers
results = {}    # label -> final test accuracy
histories = {}  # label -> per-epoch metrics recorded by model.fit

for opt_name, optimizer in optimizers.items():
    print(f"Training with {opt_name}...")

    # Reset the Python, NumPy and TensorFlow seeds so every optimizer starts
    # from the same initial weights and sees the same shuffling order;
    # otherwise differences in accuracy are partly due to random initialisation.
    tf.keras.utils.set_random_seed(seed)

    model = create_model()
    model.compile(optimizer=optimizer, loss=categorical_crossentropy, metrics=[CategoricalAccuracy()])

    run_batch_size = batch_size_overrides.get(opt_name, batch_size)
    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_test, y_test),
        epochs=epochs,
        batch_size=run_batch_size,
        verbose=0,
    )
    # Keep the learning curves instead of discarding them, so convergence
    # speed can be compared in addition to the final accuracy.
    histories[opt_name] = history.history

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
    results[opt_name] = test_accuracy

# Print results
for opt_name, accuracy in results.items():
    print(f"{opt_name}: Test Accuracy = {accuracy:.4f}")


Training with SGD...


  super().__init__(**kwargs)


Training with SGD with Momentum...
Training with Adagrad...
Training with RMSprop...
Training with Adadelta...
Training with Adam...
Training with Mini-batch Gradient Descent...
Training with Stochastic Gradient Descent...
SGD: Test Accuracy = 0.9372
SGD with Momentum: Test Accuracy = 0.9762
Adagrad: Test Accuracy = 0.9046
RMSprop: Test Accuracy = 0.9757
Adadelta: Test Accuracy = 0.6188
Adam: Test Accuracy = 0.9737
Mini-batch Gradient Descent: Test Accuracy = 0.9379
Stochastic Gradient Descent: Test Accuracy = 0.9384


# Advantages and Disadvantages of Each Optimizer
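Gradient Descent (GD)
Pros: Straightforward and easy to implement; each update uses the exact gradient of the full training set, so the updates are stable.
Cons: Converges slowly, because every single update requires a pass over the entire dataset; prone to getting trapped in local minima; the learning rate is fixed and usually has to be tuned by hand.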

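Stochastic Gradient Descent (SGD)
Pros: Each update uses a single example and is therefore far cheaper than a GD update, so it usually makes progress much faster; the noise in the updates can help it escape shallow local minima.
Cons: The updates have high variance, which can cause noisy convergence and makes the loss fluctuate instead of settling smoothly.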

SGD with Momentum
Pros: Speeds up SGD in the correct direction and reduces oscillations.
Cons: Requires tuning of both the learning rate and momentum, which can be challenging.

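Mini-Batch Gradient Descent
Pros: Offers a good compromise between the stability of full-batch GD and the speed of SGD; each update is much cheaper than a full-batch GD update while being less noisy than a single-example SGD update.
Cons: Still demands careful tuning of the batch size and learning rate.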

Adagrad
Pros: Adapts learning rates for individual parameters; effective for sparse data.
Cons: The learning rate may decrease too quickly, hampering convergence.

RMSprop
Pros: Addresses Adagrad’s issue of rapidly decreasing learning rates; performs well with non-stationary objectives.
Cons: Selecting appropriate hyperparameters (e.g., decay rate) can be complex.

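Adadelta
Pros: Enhances Adagrad by preventing the learning rate from dropping too aggressively, leading to better convergence.
Cons: Requires careful parameter tuning; may be sensitive to hyperparameters.
Note: In the run recorded above, Adadelta reached only about 62% test accuracy after 5 epochs. The most likely cause is the learning rate rather than the algorithm itself: Keras' default for Adadelta is 0.001, whereas the original formulation corresponds to a value of 1.0, so training progresses very slowly at the default setting.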

Adam
Pros: Merges the advantages of Adagrad and RMSprop; generally effective with minimal hyperparameter adjustment.
Cons: May converge to suboptimal solutions; sometimes sensitive to learning rate settings.






