In [1]:
import tensorflow as tf

# Load MNIST through tf.keras and unpack it into training and test
# (inputs, labels) pairs.
mnist = tf.keras.datasets.mnist
_train_split, _test_split = mnist.load_data()
x_train, y_train = _train_split
x_test, y_test = _test_split

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l2
from keras.optimizers import Adam

# Build a small fully connected binary classifier: two ReLU hidden layers
# (64 and 32 units) with an L2 weight penalty, then one sigmoid output unit.
model = Sequential()

# Hidden layers. l2(0.01) adds 0.01 * sum(w ** 2) of each kernel to the loss;
# input_shape=(10,) declares that every sample is a vector of 10 features.
model.add(Dense(units=64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(10,)))
model.add(Dense(units=32, activation='relu', kernel_regularizer=l2(0.01)))
# Single sigmoid unit, matching the binary cross-entropy loss used below.
model.add(Dense(units=1, activation='sigmoid'))

# Compile with the Adam optimizer and binary cross-entropy loss. The step size
# is passed as `learning_rate`: the old `lr` alias is deprecated and is
# rejected by recent Keras releases.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


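Adagrad is an optimization algorithm (not a regularization technique). It is a variant of gradient descent that adapts the learning rate of each weight in the network based on the accumulated squared gradients for that weight. Weights that receive large or frequent updates get a progressively smaller effective learning rate, while weights that receive infrequent updates keep a relatively larger one. Because the accumulated sum only grows, the effective learning rate never increases during training.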

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adagrad
from keras.regularizers import l2

# Build a small fully connected binary classifier: two ReLU hidden layers
# (64 and 32 units) with an L2 weight penalty, then one sigmoid output unit.
model = Sequential()

# Hidden layers. l2(0.01) adds 0.01 * sum(w ** 2) of each kernel to the loss;
# input_shape=(10,) declares that every sample is a vector of 10 features.
model.add(Dense(units=64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(10,)))
model.add(Dense(units=32, activation='relu', kernel_regularizer=l2(0.01)))
# Single sigmoid unit, matching the binary cross-entropy loss used below.
model.add(Dense(units=1, activation='sigmoid'))

# Compile with the Adagrad optimizer and binary cross-entropy loss. The step
# size is passed as `learning_rate`: the old `lr` alias is deprecated and is
# rejected by recent Keras releases.
model.compile(optimizer=Adagrad(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])


RMSprop is another optimizer (not a regularization technique) derived from gradient descent. It adapts the learning rate of each weight by dividing the gradient by the square root of an exponentially decaying moving average of the squared gradients for that weight.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.regularizers import l2

# Build a small fully connected binary classifier: two ReLU hidden layers
# (64 and 32 units) with an L2 weight penalty, then one sigmoid output unit.
model = Sequential()

# Hidden layers. l2(0.01) adds 0.01 * sum(w ** 2) of each kernel to the loss;
# input_shape=(10,) declares that every sample is a vector of 10 features.
model.add(Dense(units=64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(10,)))
model.add(Dense(units=32, activation='relu', kernel_regularizer=l2(0.01)))
# Single sigmoid unit, matching the binary cross-entropy loss used below.
model.add(Dense(units=1, activation='sigmoid'))

# Compile with the RMSprop optimizer and binary cross-entropy loss. The step
# size is passed as `learning_rate`: the old `lr` alias is deprecated and is
# rejected by recent Keras releases.
model.compile(optimizer=RMSprop(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

Adadelta is another optimizer (not a regularization technique) derived from gradient descent, and it is similar to RMSprop. Adadelta keeps an exponentially decaying moving average of the squared gradients and a second moving average of the squared parameter updates, and it uses the ratio of the two to scale the step for each weight. It also includes a parameter called "rho", the decay rate that controls how quickly older values are forgotten in these moving averages.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adadelta
from keras.regularizers import l2

# Build a small fully connected binary classifier: two ReLU hidden layers
# (64 and 32 units) with an L2 weight penalty, then one sigmoid output unit.
model = Sequential()

# Hidden layers. l2(0.01) adds 0.01 * sum(w ** 2) of each kernel to the loss;
# input_shape=(10,) declares that every sample is a vector of 10 features.
model.add(Dense(units=64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(10,)))
model.add(Dense(units=32, activation='relu', kernel_regularizer=l2(0.01)))
# Single sigmoid unit, matching the binary cross-entropy loss used below.
model.add(Dense(units=1, activation='sigmoid'))

# Compile with the Adadelta optimizer and binary cross-entropy loss. The step
# size is passed as `learning_rate`: the old `lr` alias is deprecated and is
# rejected by recent Keras releases. `rho` is the decay rate of the moving
# averages.
model.compile(optimizer=Adadelta(learning_rate=1.0, rho=0.95), loss='binary_crossentropy', metrics=['accuracy'])

Nesterov accelerated gradient (NAG) is a variant of gradient descent that uses momentum to accelerate convergence. The momentum term accumulates previous updates, and, unlike classical momentum, the gradient is evaluated at the "look-ahead" position the weights would reach after applying that momentum rather than at the current weights. This correction can help to reduce oscillations and improve convergence speed.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.regularizers import l2

# Build a small fully connected binary classifier: two ReLU hidden layers
# (64 and 32 units) with an L2 weight penalty, then one sigmoid output unit.
model = Sequential()

# Hidden layers. l2(0.01) adds 0.01 * sum(w ** 2) of each kernel to the loss;
# input_shape=(10,) declares that every sample is a vector of 10 features.
model.add(Dense(units=64, activation='relu', kernel_regularizer=l2(0.01), input_shape=(10,)))
model.add(Dense(units=32, activation='relu', kernel_regularizer=l2(0.01)))
# Single sigmoid unit, matching the binary cross-entropy loss used below.
model.add(Dense(units=1, activation='sigmoid'))

# Compile with SGD using Nesterov momentum and binary cross-entropy loss. The
# step size is passed as `learning_rate`: the old `lr` alias is deprecated and
# is rejected by recent Keras releases. nesterov=True switches the momentum
# update from the classical form to the Nesterov form.
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])


Internal Covariate Shift (ICS) is a phenomenon that occurs when the distribution of inputs to a layer changes during training, because the parameters of the preceding layers keep changing. This forces the layer to keep re-adapting its weights, leading to slower convergence and decreased accuracy. One approach to address ICS is called Batch Normalization. In Batch Normalization, each input feature of a layer is normalized over the current mini-batch to have zero mean and unit variance, independently of the other features; a learnable scale and shift are then applied so the layer can still represent the original activations if needed.

In [None]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization

# Assemble the binary classifier in one step: each ReLU hidden layer is
# followed by a BatchNormalization layer, and the network ends in a single
# sigmoid output unit. input_shape=(10,) declares 10 input features per sample.
model = Sequential([
    Dense(units=64, activation='relu', input_shape=(10,)),
    BatchNormalization(),
    Dense(units=32, activation='relu'),
    BatchNormalization(),
    Dense(units=1, activation='sigmoid'),
])

# Train with Adam (library default settings) on binary cross-entropy and
# report accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
