## **1. SETUP**

In [6]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
mnist1 = tf.keras.datasets.mnist

(X_train_full, y_train_full), (X_test, y_test) = mnist1.load_data()

In [8]:
X_valid, X_train = X_train_full[:5000] / 255., X_train_full[5000:] / 255.
y_valid, y_train =  y_train_full[:5000], y_train_full[5000:]


# use of Early Stopping and Check-pointing to improve model 

These are all types of callback implemented during model fitting.

Early Stopping - a form of regularization used to avoid overfitting when training a learner with an iterative method, such as gradient descent. This function of Keras callbacks is used to stop the model training in between

Model Checkpoint - This function of keras callbacks is used to save the model after every epoch. We just need to define a few of the parameters like where we want to store, what we want to monitor and etc. 

In [9]:
# Early Stopping
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

In [10]:
# Model Checkpoint saving
CKPT_path = "model_ckpt_01.h5"
checkpointing_cb = tf.keras.callbacks.ModelCheckpoint(CKPT_path, save_best_only=True)

Find best initializer and compare results

In [11]:
model_1 = tf.keras.models.Sequential([tf.keras.layers.Flatten(input_shape = [28,28], name = 'input_layer'),
                                     tf.keras.layers.Dense(300,activation='relu',name= 'hidden_layer_1'),
                                     tf.keras.layers.Dense(200, activation = 'relu', name = 'hidden_layer_2'),
                                     tf.keras.layers.Dense(100, activation = 'relu', name = 'hidden_layer_3'),
                                     tf.keras.layers.Dense(10, activation='softmax', name = 'output_layer')])

LOSS_FUNCTION = "sparse_categorical_crossentropy" # use => tf.losses.sparse_categorical_crossentropy
OPTIMIZER = "SGD" # or use with custom learning rate=> tf.keras.optimizers.SGD(0.02)
METRICS = ["accuracy"]

model_1.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER, metrics=METRICS)

EPOCHS = 5
VALIDATION_SET = (X_valid, y_valid)

history = model_1.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32, callbacks= [early_stopping_cb, checkpointing_cb])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#Test using glorot_normal

In [12]:
model_2 = tf.keras.models.Sequential([tf.keras.layers.Flatten(input_shape = [28,28], name = 'input_layer'),
                                     tf.keras.layers.Dense(300,activation='relu', kernel_initializer= 'glorot_normal'),
                                     tf.keras.layers.Dense(200, activation = 'relu', kernel_initializer= 'glorot_normal'),
                                     tf.keras.layers.Dense(100, activation = 'relu', kernel_initializer= 'glorot_normal'),
                                     tf.keras.layers.Dense(10, activation='softmax')])

In [13]:
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (Flatten)       (None, 784)               0         
                                                                 
 dense (Dense)               (None, 300)               235500    
                                                                 
 dense_1 (Dense)             (None, 200)               60200     
                                                                 
 dense_2 (Dense)             (None, 100)               20100     
                                                                 
 dense_3 (Dense)             (None, 10)                1010      
                                                                 
Total params: 316,810
Trainable params: 316,810
Non-trainable params: 0
_________________________________________________________________


#Define parameters

In [14]:
LOSS_FUNCTION = "sparse_categorical_crossentropy" # use => tf.losses.sparse_categorical_crossentropy
OPTIMIZER = "SGD" # or use with custom learning rate=> tf.keras.optimizers.SGD(0.02)
METRICS = ["accuracy"]

model_2.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER, metrics=METRICS)

#Model training

In [15]:
EPOCHS = 5
VALIDATION_SET = (X_valid, y_valid)
history = model_2.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Use summary() to see model architecture

In [16]:
#Using Glorot uniform
model_3 = tf.keras.models.Sequential([tf.keras.layers.Flatten(input_shape = [28,28], name = 'input_layer'),
                                     tf.keras.layers.Dense(300,activation='relu', kernel_initializer= 'glorot_uniform'),
                                     tf.keras.layers.Dense(200, activation = 'relu', kernel_initializer= 'glorot_uniform'),
                                     tf.keras.layers.Dense(100, activation = 'relu', kernel_initializer= 'glorot_uniform'),
                                     tf.keras.layers.Dense(10, activation='softmax')])

model_3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (Flatten)       (None, 784)               0         
                                                                 
 dense_4 (Dense)             (None, 300)               235500    
                                                                 
 dense_5 (Dense)             (None, 200)               60200     
                                                                 
 dense_6 (Dense)             (None, 100)               20100     
                                                                 
 dense_7 (Dense)             (None, 10)                1010      
                                                                 
Total params: 316,810
Trainable params: 316,810
Non-trainable params: 0
_________________________________________________________________


In [17]:
LOSS_FUNCTION = "sparse_categorical_crossentropy" # use => tf.losses.sparse_categorical_crossentropy
OPTIMIZER = "SGD" # or use with custom learning rate=> tf.keras.optimizers.SGD(0.02)
METRICS = ["accuracy"]

model_3.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER, metrics=METRICS)

EPOCHS = 5
VALIDATION_SET = (X_valid, y_valid)

history = model_3.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
print("Glorot normal result is {}".format(model_2.evaluate(X_test, y_test)))
print("Glorot uniform result is {}".format(model_3.evaluate(X_test, y_test)))

Glorot normal result is [19.430076599121094, 0.9598000049591064]
Glorot uniform result is [19.432621002197266, 0.958299994468689]


In [19]:
#use he_normal as initializer and relu as activation function
model4 = tf.keras.models.Sequential([tf.keras.layers.Flatten(input_shape = [28,28], name = 'input_layer'),
                                     tf.keras.layers.Dense(300,activation='relu', kernel_initializer= 'he_normal'),
                                     tf.keras.layers.Dense(200, activation = 'relu', kernel_initializer= 'he_normal'),
                                     tf.keras.layers.Dense(100, activation = 'relu', kernel_initializer= 'he_normal'),
                                     tf.keras.layers.Dense(10, activation='softmax')])

In [20]:
LOSS_FUNCTION = "sparse_categorical_crossentropy" # use => tf.losses.sparse_categorical_crossentropy
OPTIMIZER = "SGD" # or use with custom learning rate=> tf.keras.optimizers.SGD(0.02)
METRICS = ["accuracy"]

model4.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER, metrics=METRICS)

EPOCHS = 5
VALIDATION_SET = (X_valid, y_valid)

history = model4.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
##use he_normal as initializer and elu as activation function
model5 = tf.keras.models.Sequential([tf.keras.layers.Flatten(input_shape = [28,28], name = 'input_layer'),
                                     tf.keras.layers.Dense(300,activation='elu', kernel_initializer= 'he_normal'),
                                     tf.keras.layers.Dense(200, activation = 'elu', kernel_initializer= 'he_normal'),
                                     tf.keras.layers.Dense(100, activation = 'elu', kernel_initializer= 'he_normal'),
                                     tf.keras.layers.Dense(10, activation='softmax')])

LOSS_FUNCTION = "sparse_categorical_crossentropy" # use => tf.losses.sparse_categorical_crossentropy
OPTIMIZER = "SGD" # or use with custom learning rate=> tf.keras.optimizers.SGD(0.02)
METRICS = ["accuracy"]

model5.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER, metrics=METRICS)

EPOCHS = 5
VALIDATION_SET = (X_valid, y_valid)

history = model5.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
print("For he normal is {}".format(model4.evaluate(X_test, y_test)))
print("For he normal with elu AF is {}".format(model5.evaluate(X_test, y_test)))

For he normal is [17.702665328979492, 0.9624999761581421]
For he normal with elu AF is [44.14720153808594, 0.8004999756813049]


# Compare different activiation functions - ReLu, LeakyReLu, Elu

In [23]:
#Implementation of relu activation function

model6 = tf.keras.models.Sequential([tf.keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]), tf.keras.layers.Dense(1)])
model6.compile(loss="mean_squared_error", optimizer="sgd")
history = model6.fit(X_train, y_train, epochs=5, validation_data=(X_valid, y_valid))
mse_test = model6.evaluate(X_test, y_test)
X_new = X_test[:3] # pretend these are new instances
y_pred = model6.predict(X_new)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In a similar was as shown above, diifferent activation functions can be introduced in the model. See the below code for instance:

In [24]:
leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2)
layer = tf.keras.layers.Dense(10, activation=leaky_relu, kernel_initializer="he_normal")

In [25]:
#implementation of elu and softmax
model7 = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
tf.keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal"),
tf.keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
tf.keras.layers.Dense(10, activation="softmax")
])

# Compare *Optimisers* = ['Adadelta', 'Adagrad', 'Adam', 'RMSprop', 'SGD']

In [35]:
##use he_normal as initializer and elu as activation function
def build_model(optimizer):
  model8 = tf.keras.models.Sequential([tf.keras.layers.Flatten(input_shape = [28,28], name = 'input_layer'),
                                      tf.keras.layers.Dense(300,activation='elu', kernel_initializer= 'he_normal'),
                                      tf.keras.layers.Dense(200, activation = 'elu', kernel_initializer= 'he_normal'),
                                      tf.keras.layers.Dense(100, activation = 'elu', kernel_initializer= 'he_normal'),
                                      tf.keras.layers.Dense(10, activation='softmax')])

  LOSS_FUNCTION = "sparse_categorical_crossentropy" # use => tf.losses.sparse_categorical_crossentropy
  METRICS = ["accuracy"]

  model8.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER, metrics=METRICS)
  return model8


In [36]:
EPOCHS = 5
VALIDATION_SET = (X_valid, y_valid)

optimizers = ['Adadelta', 'Adagrad', 'Adam', 'RMSprop', 'SGD']


model8 = build_model("Adadelta")
history = model8.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [37]:
model8 = build_model("Adagrad")
history = model8.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
model8 = build_model("Adam")
history = model8.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
model8 = build_model("RMSprop")
history = model8.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [40]:
model8 = build_model("SGD")
history = model8.fit(X_train, y_train, epochs=EPOCHS, validation_data=VALIDATION_SET,batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# various regularisation techniques like l1, l2, and dropout techniques

In [41]:
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%load_ext tensorboard
     

In [42]:
#Performing l2()
layer = tf.keras.layers.Dense(100, activation="elu",
                          kernel_initializer="he_normal",
                          kernel_regularizer=tf.keras.regularizers.l2(0.01))
#The l2() function returns a regularizer that will be called to compute the regularization loss, at each step during training. 
#This regularization loss is then added to the final loss.
#regularization factor = 0.01

In [45]:
# import keras modules
import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.python.keras.layers import Dense
from keras.utils.np_utils import to_categorical
#from tensorflow.keras import layers
#from tensorflow.keras import regularizers


from functools import partial
RegularizedDense = partial(tf.keras.layers.Dense, activation="elu", kernel_initializer="he_normal",
                            kernel_regularizer=keras.regularizers.l2(0.00001))
model_l2 = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
RegularizedDense(300),
RegularizedDense(100),
RegularizedDense(100),
RegularizedDense(10, activation="softmax",
kernel_initializer="glorot_uniform")
])
    

In [46]:
model_l2.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

model_l2.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 784)               0         
                                                                 
 dense_67 (Dense)            (None, 300)               235500    
                                                                 
 dense_68 (Dense)            (None, 100)               30100     
                                                                 
 dense_69 (Dense)            (None, 100)               10100     
                                                                 
 dense_70 (Dense)            (None, 10)                1010      
                                                                 
Total params: 276,710
Trainable params: 276,710
Non-trainable params: 0
_________________________________________________________________


In [49]:
history = model_l2.fit(X_train, y_train, epochs=3,
                    validation_data=(X_valid, y_valid), verbose=2)

Epoch 1/3
1719/1719 - 4s - loss: 0.4048 - accuracy: 0.8906 - val_loss: 0.3607 - val_accuracy: 0.9004 - 4s/epoch - 3ms/step
Epoch 2/3
1719/1719 - 5s - loss: 0.3687 - accuracy: 0.8994 - val_loss: 0.3329 - val_accuracy: 0.9058 - 5s/epoch - 3ms/step
Epoch 3/3
1719/1719 - 4s - loss: 0.3450 - accuracy: 0.9053 - val_loss: 0.3142 - val_accuracy: 0.9118 - 4s/epoch - 3ms/step


In [50]:
##Performing l1()
layer = tf.keras.layers.Dense(100, activation="elu",
                          kernel_initializer="he_normal",
                          kernel_regularizer=tf.keras.regularizers.l1(0.01))
#The l1() function returns a regularizer that will be called to compute the regularization loss, at each step during training. 
#This regularization loss is then added to the final loss.
#regularization factor = 0.01


In [51]:

from functools import partial
RegularizedDense = partial(tf.keras.layers.Dense, activation="elu", kernel_initializer="he_normal",
                            kernel_regularizer=keras.regularizers.l1(0.00005))
model_l1 = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
RegularizedDense(300),
RegularizedDense(100),
RegularizedDense(100),
RegularizedDense(10, activation="softmax",
kernel_initializer="glorot_uniform")
])
     

In [52]:
model_l1.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

model_l1.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_2 (Flatten)         (None, 784)               0         
                                                                 
 dense_73 (Dense)            (None, 300)               235500    
                                                                 
 dense_74 (Dense)            (None, 100)               30100     
                                                                 
 dense_75 (Dense)            (None, 100)               10100     
                                                                 
 dense_76 (Dense)            (None, 10)                1010      
                                                                 
Total params: 276,710
Trainable params: 276,710
Non-trainable params: 0
_________________________________________________________________


In [54]:
history = model_l1.fit(X_train, y_train, epochs=3,
                    validation_data=(X_valid, y_valid), verbose=2)

Epoch 1/3
1719/1719 - 5s - loss: 0.9996 - accuracy: 0.8996 - val_loss: 0.9683 - val_accuracy: 0.9116 - 5s/epoch - 3ms/step
Epoch 2/3
1719/1719 - 5s - loss: 0.9800 - accuracy: 0.9050 - val_loss: 0.9524 - val_accuracy: 0.9160 - 5s/epoch - 3ms/step
Epoch 3/3
1719/1719 - 6s - loss: 0.9643 - accuracy: 0.9093 - val_loss: 0.9392 - val_accuracy: 0.9208 - 6s/epoch - 4ms/step


In [57]:
#Performing Dropout
model_dropout = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=[28, 28]),
tf.keras.layers.Dropout(rate=0.2),
tf.keras.layers.Dense(300, activation="elu", kernel_initializer="he_normal"),
tf.keras.layers.Dropout(rate=0.2),
tf.keras.layers.Dense(100, activation="elu", kernel_initializer="he_normal"),
tf.keras.layers.Dropout(rate=0.2),
tf.keras.layers.Dense(10, activation="softmax")
])
    

In [58]:
model_dropout.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

model_dropout.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_4 (Flatten)         (None, 784)               0         
                                                                 
 dropout_3 (Dropout)         (None, 784)               0         
                                                                 
 dense_80 (Dense)            (None, 300)               235500    
                                                                 
 dropout_4 (Dropout)         (None, 300)               0         
                                                                 
 dense_81 (Dense)            (None, 100)               30100     
                                                                 
 dropout_5 (Dropout)         (None, 100)               0         
                                                                 
 dense_82 (Dense)            (None, 10)              

In [59]:
history = model_dropout.fit(X_train, y_train, epochs=3,
                    validation_data=(X_valid, y_valid), verbose=2)

Epoch 1/3
1719/1719 - 7s - loss: 1.4259 - accuracy: 0.5462 - val_loss: 0.7326 - val_accuracy: 0.8262 - 7s/epoch - 4ms/step
Epoch 2/3
1719/1719 - 7s - loss: 0.7821 - accuracy: 0.7662 - val_loss: 0.5212 - val_accuracy: 0.8636 - 7s/epoch - 4ms/step
Epoch 3/3
1719/1719 - 6s - loss: 0.6401 - accuracy: 0.8074 - val_loss: 0.4411 - val_accuracy: 0.8828 - 6s/epoch - 4ms/step


# Test the model and evaluate to see intermediate results

In [60]:
model_l2.evaluate(X_test, y_test)
model_l1.evaluate(X_test, y_test)
model_dropout.evaluate(X_test, y_test)



[23.099672317504883, 0.8490999937057495]

# Conclusion : 

1. Initialization - 
*   Glorot initialization works best with TanH, Logistic and SoftMax
*   He initialization works best with Relu and its variants

2. Optimisers 
*   'Adadelta', 'Adagrad', 'Adam', 'RMSprop', 'SGD' are equally good. 
*    With hyperparameter tuning and more epoch training, the performance can be increased.

3. Regularization 
*   Dropout is the most effective regularization technique
*   It helped to increase accuracy and reduce loss. 





