<a href="https://colab.research.google.com/github/maruwrks/Deep-Learning-Task/blob/main/training_deep_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

## Vanishing/Exploding Gradients Problem

### Glorot and He Initialization

In [5]:
# A 100-unit Dense layer with ReLU activation whose kernel is initialised
# with the "he_normal" scheme. Left as the cell's last expression so the
# layer repr is displayed.
keras.layers.Dense(
    units=100,
    activation="relu",
    kernel_initializer="he_normal",
)

<Dense name=dense_4, built=False>

### Leaky ReLU

In [6]:
# Keras 3 renamed LeakyReLU's `alpha` argument to `negative_slope`; the old
# spelling is deprecated and emits a warning.
leaky_relu_activation = keras.layers.LeakyReLU(negative_slope=0.2)

# Example model with Leaky ReLU.
# - The input shape is declared with an explicit `keras.Input` instead of the
#   deprecated `input_shape=` layer argument (which triggers a UserWarning).
# - Each hidden Dense layer is followed by its own LeakyReLU instance rather
#   than the same layer object being inserted twice.
model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.Dense(300, kernel_initializer="he_normal"),
    leaky_relu_activation,
    keras.layers.Dense(100, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(negative_slope=0.2),
    keras.layers.Dense(10, activation="softmax"),
])

# Load Fashion MNIST; these arrays are reused by every training cell below.
fashion_mnist = keras.datasets.fashion_mnist
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()

# Hold out the first 5,000 training samples for validation and divide all
# pixel values by 255.0.
X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test / 255.0

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m       0/26421880[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 0s/step

  super().__init__(**kwargs)


[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.7024 - loss: 0.9332 - val_accuracy: 0.8268 - val_loss: 0.5081
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.8269 - loss: 0.4978 - val_accuracy: 0.8456 - val_loss: 0.4493
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8429 - loss: 0.4493 - val_accuracy: 0.8542 - val_loss: 0.4201
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━

### PReLU (Parametric ReLU)

In [7]:
# Instantiate a PReLU activation layer with its default arguments; as the
# cell's last expression, the layer repr is displayed.
keras.layers.PReLU()

<PReLU name=p_re_lu, built=False>

### ELU (Exponential Linear Unit)

In [8]:
# A 100-unit Dense layer using the ELU activation with "he_normal" kernel
# initialisation. Left as the cell's last expression so the repr is shown.
keras.layers.Dense(
    units=100,
    activation="elu",
    kernel_initializer="he_normal",
)

<Dense name=dense_8, built=False>

### SELU (Scaled Exponential Linear Unit)

In [9]:
# A 100-unit Dense layer using the SELU activation, paired here with the
# "lecun_normal" kernel initialiser. Left as the last expression so the
# layer repr is displayed.
keras.layers.Dense(
    units=100,
    activation="selu",
    kernel_initializer="lecun_normal",
)

<Dense name=dense_9, built=False>

## Batch Normalization

In [10]:
# Batch Normalization applied to the flattened input and after each
# activated hidden layer.
# The input shape is declared with `keras.Input` rather than the deprecated
# `input_shape=` layer argument, which emits a UserWarning in Keras 3.
model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.BatchNormalization(),  # BN on the flattened inputs
    keras.layers.Dense(300, activation="relu"),
    keras.layers.BatchNormalization(),  # BN after first hidden layer
    keras.layers.Dense(100, activation="relu"),
    keras.layers.BatchNormalization(),  # BN after second hidden layer
    keras.layers.Dense(10, activation="softmax"),
])

In [11]:
# You can also add BN before the activation function.
# The hidden Dense layers are created without an activation and without a
# bias: the BatchNormalization layer that follows adds its own learned offset
# (beta, enabled by default), so a Dense bias would be a redundant parameter.
# The activation is then applied as a separate layer after BN.
# The input shape is declared with `keras.Input` rather than the deprecated
# `input_shape=` layer argument.
model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dense(10, activation="softmax"),
])

In [12]:
# Compile the current `model` with plain SGD and train it for 10 epochs,
# evaluating on the validation split after each epoch.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.7352 - loss: 0.8296 - val_accuracy: 0.8590 - val_loss: 0.4079
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.8449 - loss: 0.4426 - val_accuracy: 0.8662 - val_loss: 0.3658
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.8637 - loss: 0.3845 - val_accuracy: 0.8808 - val_loss: 0.3510
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.8754 - loss: 0.3553 - val_accuracy: 0.8824 - val_loss: 0.3329
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.8793 - loss: 0.3384 - val_accuracy: 0.8838 - val_loss: 0.3249
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.8885 - loss: 0.3153 - val_accuracy: 0.8886 - val_loss: 0.3204
Epoch 7/10

## Gradient Clipping

In [13]:
# Two alternative ways to clip gradients. They are bound to separate names
# so that the first is not silently overwritten by the second; only the
# clip-by-norm optimizer is used for training below.
clipvalue_optimizer = keras.optimizers.SGD(clipvalue=1.0)  # clip each gradient component to [-1.0, 1.0]
optimizer = keras.optimizers.SGD(clipnorm=1.0)  # rescale gradients whose norm exceeds 1.0

# Recompile the existing `model` with the clipping optimizer and keep
# training it (its weights carry over from the previous cell).
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.9089 - loss: 0.2512 - val_accuracy: 0.8940 - val_loss: 0.2989
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.9143 - loss: 0.2408 - val_accuracy: 0.8970 - val_loss: 0.2978
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.9172 - loss: 0.2332 - val_accuracy: 0.8958 - val_loss: 0.2999
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 8ms/step - accuracy: 0.9169 - loss: 0.2283 - val_accuracy: 0.8966 - val_loss: 0.3011
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9184 - loss: 0.2263 - val_accuracy: 0.8948 - val_loss: 0.3003
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9222 - loss: 0.2176 - val_accuracy: 0.8956 - val_loss: 0.3062
Epoch 7/10

<keras.src.callbacks.history.History at 0x7ef655004b90>

## Reusing Pretrained Layers (Transfer Learning)

In [14]:
# Stand-in for a pretrained network: a small feature extractor ending in a
# 50-unit hidden layer.
# NOTE(review): in the cells visible here `base_model` is never trained
# before it is frozen, so its weights are just their initial values — in a
# real transfer-learning setup it would be loaded from a model trained on
# another task.
# The input shape is declared with `keras.Input` rather than the deprecated
# `input_shape=` layer argument.
base_model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(50, activation="relu"),
])

# New model: the base network followed by a fresh 10-way softmax head.
model = keras.models.Sequential([
    base_model,
    keras.layers.Dense(10, activation="softmax"),
])

### Freezing the base model's layers

In [15]:
# Mark every layer of the nested base model as non-trainable. The enclosing
# `model` is compiled in the next cell, after this flag has been set.
base_model.trainable = False # Freeze the base model

In [16]:
# Compile with the base model frozen and train for 10 epochs with plain SGD,
# validating on the held-out split after each epoch.
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.2068 - loss: 2.1788 - val_accuracy: 0.5306 - val_loss: 1.8273
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.5502 - loss: 1.7586 - val_accuracy: 0.6256 - val_loss: 1.5595
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.6168 - loss: 1.5343 - val_accuracy: 0.6594 - val_loss: 1.3981
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.6432 - loss: 1.3900 - val_accuracy: 0.6644 - val_loss: 1.2922
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.6545 - loss: 1.2960 - val_accuracy: 0.6730 - val_loss: 1.2163
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.6622 - loss: 1.2196 - val_accuracy: 0.6762 - val_loss: 1.1607
Epoch 7/10
[1m

### Unfreezing layers (fine-tuning)

In [17]:
# Fine-tuning step: make the base model's layers trainable again, then
# recompile with SGD at a very low learning rate (1e-4) and continue
# training for 10 more epochs.
base_model.trainable = True
model.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
)

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.6899 - loss: 0.9793 - val_accuracy: 0.7296 - val_loss: 0.8249
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7243 - loss: 0.8268 - val_accuracy: 0.7398 - val_loss: 0.7607
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.7368 - loss: 0.7755 - val_accuracy: 0.7498 - val_loss: 0.7222
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.7465 - loss: 0.7304 - val_accuracy: 0.7598 - val_loss: 0.6946
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.7506 - loss: 0.7144 - val_accuracy: 0.7668 - val_loss: 0.6730
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7642 - loss: 0.6863 - val_accuracy: 0.7752 - val_loss: 0.6560
Epoch 7/10
[1m

## Faster Optimizers

In [18]:
# Momentum optimization: SGD with a momentum term of 0.9.
optimizer = keras.optimizers.SGD(
    momentum=0.9,
    learning_rate=0.001,
)

In [19]:
# Nesterov Accelerated Gradient (NAG): momentum SGD with `nesterov=True`.
optimizer = keras.optimizers.SGD(
    nesterov=True,
    momentum=0.9,
    learning_rate=0.001,
)

In [20]:
# AdaGrad optimizer with a learning rate of 0.001.
optimizer = keras.optimizers.Adagrad(
    learning_rate=0.001,
)

In [21]:
# RMSProp optimizer with rho set to 0.9.
optimizer = keras.optimizers.RMSprop(
    rho=0.9,
    learning_rate=0.001,
)

In [22]:
# Adam optimizer (often a good default), with beta_1=0.9 and beta_2=0.999.
optimizer = keras.optimizers.Adam(
    beta_1=0.9,
    beta_2=0.999,
    learning_rate=0.001,
)

In [23]:
# Adamax optimizer, configured with the same betas as the Adam example.
optimizer = keras.optimizers.Adamax(
    beta_1=0.9,
    beta_2=0.999,
    learning_rate=0.001,
)

In [24]:
# Nadam optimizer, configured with the same betas as the Adam example.
optimizer = keras.optimizers.Nadam(
    beta_1=0.9,
    beta_2=0.999,
    learning_rate=0.001,
)

In [25]:
# Example of compiling with an optimizer object: whichever optimizer was
# most recently bound to `optimizer` is handed to the model.
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

## Regularization

### L1 and L2 Regularization

In [26]:
# A regularizer can be attached to a layer's kernel, its bias, or its output
# (activity). All three are built from the same factor so the value is
# defined once; only `kernel_regularizer` is used by the model below, the
# other two are shown for illustration.
L2_FACTOR = 0.01
kernel_regularizer = keras.regularizers.l2(L2_FACTOR)
bias_regularizer = keras.regularizers.l2(L2_FACTOR)
activity_regularizer = keras.regularizers.l2(L2_FACTOR)

# Both hidden layers apply the L2 penalty to their kernels, reusing the
# regularizer defined above instead of repeating the literal inline.
# The input shape is declared with `keras.Input` rather than the deprecated
# `input_shape=` layer argument.
model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.Dense(300, activation="relu",
                       kernel_regularizer=kernel_regularizer),
    keras.layers.Dense(100, activation="relu",
                       kernel_regularizer=kernel_regularizer),
    keras.layers.Dense(10, activation="softmax"),
])
# The reported loss includes the regularization penalty in addition to the
# cross-entropy term.
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.6786 - loss: 6.0259 - val_accuracy: 0.8324 - val_loss: 3.5889
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8171 - loss: 3.1733 - val_accuracy: 0.8356 - val_loss: 2.1252
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8295 - loss: 1.9148 - val_accuracy: 0.8330 - val_loss: 1.3977
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8342 - loss: 1.2851 - val_accuracy: 0.8468 - val_loss: 1.0032
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8346 - loss: 0.9679 - val_accuracy: 0.8500 - val_loss: 0.8047
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8361 - loss: 0.8032 - val_accuracy: 0.8502 - val_loss: 0.7052
Epoch 7/10


### Dropout

In [27]:
# Dropout with a 20% rate applied to the flattened input and after each
# hidden layer.
# The input shape is declared with `keras.Input` rather than the deprecated
# `input_shape=` layer argument, which emits a UserWarning in Keras 3.
model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.Dropout(rate=0.2),  # dropout on the flattened inputs
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dropout(rate=0.2),  # dropout after first hidden layer
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dropout(rate=0.2),  # dropout after second hidden layer
    keras.layers.Dense(10, activation="softmax"),
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

Epoch 1/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.5795 - loss: 1.2209 - val_accuracy: 0.8052 - val_loss: 0.5604
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - accuracy: 0.7725 - loss: 0.6490 - val_accuracy: 0.8278 - val_loss: 0.4875
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.7998 - loss: 0.5679 - val_accuracy: 0.8416 - val_loss: 0.4484
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8105 - loss: 0.5286 - val_accuracy: 0.8502 - val_loss: 0.4257
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.8218 - loss: 0.4994 - val_accuracy: 0.8578 - val_loss: 0.4061
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.8210 - loss: 0.4974 - val_accuracy: 0.8602 - val_loss: 0.3917
Epoch 7/10

### Alpha Dropout

In [28]:
# Use with SELU activation and lecun_normal initializer.
# AlphaDropout (20% rate) is placed on the flattened input and after each
# SELU hidden layer.
# The input shape is declared with `keras.Input` rather than the deprecated
# `input_shape=` layer argument, which emits a UserWarning in Keras 3.
model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(10, activation="softmax"),
])

### Max-Norm Regularization

In [29]:
# Implemented using a Keras constraint.
# The constraint class is taken from the `keras` namespace already imported
# at the top of the notebook, instead of a mid-notebook import from the
# standalone `keras` package; this keeps every Keras object coming from the
# same module. The `max_norm` name is kept as an alias so existing uses of
# `max_norm(...)` continue to work.
max_norm = keras.constraints.MaxNorm

# Each hidden layer's kernel is constrained to a maximum norm of 3.
# The input shape is declared with `keras.Input` rather than the deprecated
# `input_shape=` layer argument.
model = keras.models.Sequential([
    keras.Input(shape=(28, 28)),
    keras.layers.Flatten(),
    keras.layers.Dense(300, activation="relu", kernel_constraint=max_norm(3)),
    keras.layers.Dense(100, activation="relu", kernel_constraint=max_norm(3)),
    keras.layers.Dense(10, activation="softmax"),
])