# 2. About exploding gradients

### About this notebook

This notebook was used in the 50.039 Deep Learning course at the Singapore University of Technology and Design.

**Author:** Matthieu DE MARI (matthieu_demari@sutd.edu.sg)

**Version:** 1.0 (17/12/2022)

**Requirements:**
- Python 3 (tested on v3.9.6)
- Matplotlib (tested on v3.5.1)
- Numpy (tested on v1.22.1)

### Imports

In [1]:
# Matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
# Numpy
import numpy as np
# Removing unnecessary warnings (optional, just makes notebook outputs more readable)
import warnings
warnings.filterwarnings("ignore")

### Blast from the past - Mock dataset generation

As in the previous notebook.

In [2]:
# All helper functions and constants for the mock dataset
# Bounds used when drawing surfaces and distances, and the dataset size
min_surf, max_surf = 40, 200
min_dist, max_dist = 50, 1000
n_points = 100

def surface(min_surf, max_surf):
    """Draw one surface uniformly between min_surf and max_surf, rounded to 2 decimals."""
    drawn_surface = np.random.uniform(min_surf, max_surf)
    return round(drawn_surface, 2)

def distance(min_dist, max_dist):
    """Draw one distance uniformly between min_dist and max_dist, rounded to 2 decimals."""
    drawn_distance = np.random.uniform(min_dist, max_dist)
    return round(drawn_distance, 2)

def price(surface, distance):
    """Return a noisy price for the given surface and distance.

    The noise-free price is linear in both arguments; it is multiplied by a
    random factor drawn uniformly in [0.9, 1.1), rounded to the nearest
    integer and finally divided by 1,000,000.
    """
    noise_free_price = 100000 + 14373*surface + (1000 - distance)*1286
    noise_factor = 1 + np.random.uniform(-0.1, 0.1)
    return round(noise_free_price*noise_factor)/1000000

def create_dataset(n_points, min_surf, max_surf, min_dist, max_dist):
    """Generate a mock dataset of n_points samples.

    Random numbers are consumed in a fixed order: all surfaces first, then
    all distances, then one noise draw per price.

    Returns:
        surfaces_list: array of shape (n_points,).
        distances_list: array of shape (n_points,).
        inputs: array of [surface, distance] rows.
        outputs: array of prices, of shape (n_points, 1).
    """
    # Draw all surfaces, then all distances
    drawn_surfaces = []
    for _ in range(n_points):
        drawn_surfaces.append(surface(min_surf, max_surf))
    surfaces_list = np.array(drawn_surfaces)
    drawn_distances = []
    for _ in range(n_points):
        drawn_distances.append(distance(min_dist, max_dist))
    distances_list = np.array(drawn_distances)
    # Assemble one [surface, distance] row and one price per sample
    rows = []
    for s, d in zip(surfaces_list, distances_list):
        rows.append([s, d])
    inputs = np.array(rows)
    prices = []
    for s, d in zip(surfaces_list, distances_list):
        prices.append(price(s, d))
    outputs = np.array(prices).reshape(n_points, 1)
    return surfaces_list, distances_list, inputs, outputs

In [3]:
# Generate dataset (seeded so that the same samples are drawn on every run)
np.random.seed(47)
surfaces_list, distances_list, inputs, outputs = create_dataset(
    n_points, min_surf, max_surf, min_dist, max_dist
)
# Check the shapes of all four arrays, one per line
print(*(array.shape for array in (surfaces_list, distances_list, inputs, outputs)), sep = "\n")
# Check the first ten entries of the dataset
print(inputs[:10, :])
print(outputs[:10])

(100,)
(100,)
(100, 2)
(100, 1)
[[ 58.16 572.97]
 [195.92 809.8 ]
 [156.6  349.04]
 [ 96.23  86.82]
 [153.22 817.92]
 [167.94 806.25]
 [143.29 315.92]
 [106.34 482.67]
 [152.96 427.77]
 [ 79.46 955.76]]
[[1.581913]
 [3.450274]
 [2.978769]
 [2.808258]
 [2.556398]
 [3.023983]
 [3.099523]
 [2.121069]
 [3.136544]
 [1.273443]]


### ShallowNeuralNet class with initializations

As in the previous notebook.

In [4]:
class ShallowNeuralNet():
    """Shallow (one hidden layer) network trained with plain gradient descent.

    The model computes ``(inputs @ W1 + b1) @ W2 + b2`` (there is no
    activation function) and is trained on a mean squared error (MSE) loss.

    Attributes:
        n_x, n_h, n_y: sizes of the input, hidden and output layers.
        init_type: name of the initialization scheme, one of "Normal",
            "Xavier", "He", "LeCun" or "Constant".
        W1, b1, W2, b2: trainable parameters, with shapes (n_x, n_h),
            (1, n_h), (n_h, n_y) and (1, n_y).
        loss: latest MSE value (infinity until a loss has been computed).
        losses_list: loss history, created by train().
    """
    
    def __init__(self, n_x, n_h, n_y, init_type = "Normal"):
        """Store the layer sizes and initialize the parameters.

        Raises:
            AssertionError: if init_type is not a supported scheme.
        """
        # Network dimensions
        self.n_x = n_x
        self.n_h = n_h
        self.n_y = n_y
        # Initialize parameters
        self.init_type = init_type
        self.init_parameters()
        # Loss, initialized as infinity before first calculation is made
        self.loss = float("Inf")
        
    def init_parameters(self):
        """Dispatch to the initializer selected by self.init_type."""
        if(self.init_type == "Normal"):
            self.init_parameters_normal()
        elif(self.init_type == "Xavier"):
            self.init_parameters_xavier()
        elif(self.init_type == "He"):
            self.init_parameters_he()
        elif(self.init_type == "LeCun"):
            self.init_parameters_lecun()
        elif(self.init_type == "Constant"):
            self.init_parameters_const()
        else:
            # Raised explicitly (instead of `assert False`) so that the check
            # is not stripped when Python runs with -O; same exception type.
            raise AssertionError("Invalid initialization of parameters, check your init_type.")
            
    def init_parameters_normal(self):
        """Draw every parameter from a standard normal scaled by 0.1."""
        # Weights and biases matrices (randomly initialized)
        self.W1 = np.random.randn(self.n_x, self.n_h)*0.1
        self.b1 = np.random.randn(1, self.n_h)*0.1
        self.W2 = np.random.randn(self.n_h, self.n_y)*0.1
        self.b2 = np.random.randn(1, self.n_y)*0.1
        
    def init_parameters_xavier(self):
        """Draw every parameter uniformly in [-init_val, init_val]."""
        # Weights and biases matrices (Xavier initialized)
        # NOTE(review): the same bound sqrt(6/(n_x + n_y)) is used for both
        # layers and for the biases, whereas the usual Xavier/Glorot rule uses
        # each layer's own fan-in + fan-out. Left unchanged on purpose so that
        # the outputs recorded in this notebook remain reproducible.
        init_val = np.sqrt(6.0/(self.n_x + self.n_y))
        self.W1 = np.random.uniform(-init_val, init_val, (self.n_x, self.n_h))
        self.b1 = np.random.uniform(-init_val, init_val, (1, self.n_h))
        self.W2 = np.random.uniform(-init_val, init_val, (self.n_h, self.n_y))
        self.b2 = np.random.uniform(-init_val, init_val, (1, self.n_y))
        
    def init_parameters_he(self):
        """Normal weights scaled by sqrt(2/fan_in) per layer, zero biases."""
        # Weights and biases matrices (He initialized)
        range1 = np.sqrt(2/self.n_x)
        self.W1 = np.random.randn(self.n_x, self.n_h)*range1
        self.b1 = np.zeros((1, self.n_h))
        range2 = np.sqrt(2/self.n_h)
        self.W2 = np.random.randn(self.n_h, self.n_y)*range2
        self.b2 = np.zeros((1, self.n_y))
        
    def init_parameters_lecun(self):
        """Normal weights scaled by sqrt(1/fan_in) per layer, zero biases."""
        # Weights and biases matrices (LeCun initialized)
        range1 = np.sqrt(1/self.n_x)
        self.W1 = np.random.randn(self.n_x, self.n_h)*range1
        self.b1 = np.zeros((1, self.n_h))
        range2 = np.sqrt(1/self.n_h)
        self.W2 = np.random.randn(self.n_h, self.n_y)*range2
        self.b2 = np.zeros((1, self.n_y))
        
    def init_parameters_const(self):
        """Set every parameter to the same constant value (0.1)."""
        # Weights and biases matrices (Constant initialized)
        const_val = 0.1
        # Use the dimensions stored on the instance, not notebook-level globals
        self.W1 = np.ones(shape = (self.n_x, self.n_h))*const_val
        self.b1 = np.ones(shape = (1, self.n_h))*const_val
        self.W2 = np.ones(shape = (self.n_h, self.n_y))*const_val
        self.b2 = np.ones(shape = (1, self.n_y))*const_val
        
    def forward(self, inputs):
        """Return the predictions for inputs (one row per sample)."""
        # Wx + b operation for the first layer
        Z1 = np.matmul(inputs, self.W1)
        Z1_b = Z1 + self.b1
        # Wx + b operation for the second layer
        Z2 = np.matmul(Z1_b, self.W2)
        Z2_b = Z2 + self.b2
        return Z2_b
    
    def MSE_loss(self, inputs, outputs):
        """Compute, store in self.loss and return the MSE loss.

        The squared errors are summed over all entries and divided by the
        number of samples. Targets are reshaped to the shape of the
        predictions, so 1-D targets are accepted when n_y == 1.
        """
        pred = self.forward(inputs)
        # Align targets with predictions; avoids silent broadcasting when
        # n_y > 1 or when targets are passed as a flat array
        outputs_re = np.reshape(outputs, pred.shape)
        losses = (pred - outputs_re)**2
        self.loss = np.sum(losses)/pred.shape[0]
        return self.loss
    
    def backward(self, inputs, outputs, alpha = 1e-5):
        """Run one gradient descent step with learning rate alpha.

        Prints the scaled gradients and the parameters before the update
        (display kept for this notebook), updates the parameters in place
        and refreshes self.loss.
        """
        # Get the number of samples in dataset
        m = inputs.shape[0]
        
        # Forward propagate
        Z1 = np.matmul(inputs, self.W1)
        Z1_b = Z1 + self.b1
        Z2 = np.matmul(Z1_b, self.W2)
        y_pred = Z2 + self.b2
    
        # Compute error term (targets aligned with predictions, as in
        # MSE_loss, so that flat targets do not broadcast to (m, m))
        epsilon = y_pred - np.reshape(outputs, y_pred.shape)
    
        # Compute the gradient for W2 and b2
        dL_dW2 = (2/m)*np.matmul(Z1_b.T, epsilon)
        dL_db2 = (2/m)*np.sum(epsilon, axis = 0, keepdims = True)

        # Compute the loss derivative with respect to the first layer
        dL_dZ1 = np.matmul(epsilon, self.W2.T)

        # Compute the gradient for W1 and b1
        dL_dW1 = (2/m)*np.matmul(inputs.T, dL_dZ1)
        dL_db1 = (2/m)*np.sum(dL_dZ1, axis = 0, keepdims = True)
        
        # Display (for this notebook only)
        print("- Gradients: ")
        print(alpha*dL_dW1)
        print(alpha*dL_db1)
        print(alpha*dL_dW2)
        print(alpha*dL_db2)
        print("- Parameters: ")
        print(self.W1)
        print(self.b1)
        print(self.W2)
        print(self.b2)
        
        # Update the weights and biases using gradient descent
        self.W1 -= alpha*dL_dW1
        self.b1 -= alpha*dL_db1
        self.W2 -= alpha*dL_dW2
        self.b2 -= alpha*dL_db2
        
        # Update loss
        self.MSE_loss(inputs, outputs)
    
    def train(self, inputs, outputs, N_max = 1000, alpha = 1e-5, delta = 1e-5, display = True):
        """Repeat gradient descent steps until convergence or N_max iterations.

        Training stops early when the absolute change in loss between two
        consecutive iterations falls below delta. The loss history is stored
        in self.losses_list, whose first entry is the loss before training.
        When display is True, the loss is printed on iteration 1 and then
        every 5% of N_max iterations (at least every iteration).
        """
        # List of losses, starts with the current loss
        self.losses_list = [self.loss]
        # Integer display period: the former float test
        # `iteration_number % (N_max*0.05) == 1` never fired for N_max <= 20
        display_every = max(1, N_max // 20)
        # Repeat iterations
        for iteration_number in range(1, N_max + 1):
            # Backpropagate
            self.backward(inputs, outputs, alpha)
            new_loss = self.loss
            # Update losses list
            self.losses_list.append(new_loss)
            # Display
            if(display and (iteration_number - 1) % display_every == 0):
                print("Iteration {} - Loss = {}".format(iteration_number, new_loss))
            # Check for delta value and early stop criterion
            # (a NaN difference compares False, so a diverged run keeps going)
            difference = abs(self.losses_list[-1] - self.losses_list[-2])
            if(difference < delta):
                if(display):
                    print("Stopping early - loss evolution was less than delta on iteration {}.".format(iteration_number))
                break
        else:
            # Else on for loop will execute if break did not trigger
            if(display):
                print("Stopping - Maximal number of iterations reached.")
    
    def show_losses_over_training(self):
        """Plot the loss history in linear and logarithmic scale.

        Requires train() to have been called first, since it creates
        self.losses_list.
        """
        # Initialize matplotlib
        fig, axs = plt.subplots(1, 2, figsize = (15, 5))
        axs[0].plot(list(range(len(self.losses_list))), self.losses_list)
        axs[0].set_xlabel("Iteration number")
        axs[0].set_ylabel("Loss")
        axs[1].plot(list(range(len(self.losses_list))), self.losses_list)
        axs[1].set_xlabel("Iteration number")
        axs[1].set_ylabel("Loss (in logarithmic scale)")
        axs[1].set_yscale("log")
        # Display
        plt.show()

### The signs of an exploding gradient

In the previous notebook, we have tried different initialization methods and checked their final loss values after training.

While it seems that the Normal and LeCun initializations behave nicely, we ended up with NaN values for the Xavier and He initializations.

These NaN values are a typical symptom of a phenomenon called the **exploding gradient problem**. This typically occurs when the updates applied by the gradient descent rule (alpha\*dL_dW1, alpha\*dL_db1, alpha\*dL_dW2, and alpha\*dL_db2) are far greater in magnitude than the values in the matrices $ W_1 $, $ b_1 $, $ W_2 $ and $ b_2 $.

In order to observe that, we have added the following display to the **backward()** method of our model.

```
# Display (for this notebook only)
print("- Gradients: ")
print(alpha*dL_dW1)
print(alpha*dL_db1)
print(alpha*dL_dW2)
print(alpha*dL_db2)
print("- Parameters: ")
print(self.W1)
print(self.b1)
print(self.W2)
print(self.b2)
```

When running the Xavier and He models, we can observe that this happens within the first few iterations of training, eventually leading to values so large in our matrices $ W_1 $, $ b_1 $, $ W_2 $ and $ b_2 $ that they end up being replaced by NaNs. This in turn gives a NaN value for the loss function.

In [5]:
# Define and train neural network structure (random normal initialization)
# Layer sizes: inputs, hidden units, outputs
n_x, n_h, n_y = 2, 4, 1
init_type = "Normal"
# Seed set right before construction so the initial parameters are reproducible
np.random.seed(37)
shallow_neural_net_normal = ShallowNeuralNet(n_x = n_x, n_h = n_h, n_y = n_y, init_type = init_type)
# Train and show final loss
shallow_neural_net_normal.train(
    inputs,
    outputs,
    N_max = 100,
    alpha = 1e-6,
    delta = 1e-6,
    display = True,
)
print(shallow_neural_net_normal.loss)

- Gradients: 
[[ 1.42356523e-05 -1.73577846e-06 -9.53259136e-07  1.24839272e-06]
 [ 1.32743203e-04 -1.61856153e-05 -8.88885652e-06  1.16408890e-05]]
[[ 1.27309997e-07 -1.55231348e-08 -8.52503385e-09  1.11644251e-08]]
[[ 3.86686371e-04]
 [ 2.71147622e-04]
 [ 8.05690592e-05]
 [-1.49929229e-04]]
[[2.45019007e-06]]
- Parameters: 
[[-0.00544636  0.06743081  0.0346647  -0.13003462]
 [ 0.15185119  0.09898237  0.02776809 -0.04485894]]
[[ 0.09619662 -0.08275786  0.05346571  0.12283862]]
[[ 0.05195923]
 [-0.00633548]
 [-0.00347934]
 [ 0.00455655]]
[[0.14480251]]
Iteration 1 - Loss = 6.635227700991098
- Gradients: 
[[ 1.33509060e-05 -1.71029927e-06 -9.21574694e-07  1.21839677e-06]
 [ 1.27650738e-04 -1.63525206e-05 -8.81136384e-06  1.16493403e-05]]
[[ 1.20261279e-07 -1.54059042e-08 -8.30129070e-09  1.09749821e-08]]
[[ 3.74339400e-04]
 [ 2.62301797e-04]
 [ 7.78514894e-05]
 [-1.44438980e-04]]
[[2.33188559e-06]]
- Parameters: 
[[-0.0054606   0.06743254  0.03466566 -0.13003587]
 [ 0.15171844  0.098998

In [6]:
# Define and train neural network structure (Xavier initialization)
# Layer sizes: inputs, hidden units, outputs
n_x, n_h, n_y = 2, 4, 1
init_type = "Xavier"
# Seed set right before construction so the initial parameters are reproducible
np.random.seed(37)
shallow_neural_net_xavier = ShallowNeuralNet(n_x = n_x, n_h = n_h, n_y = n_y, init_type = init_type)
# Train and show final loss
shallow_neural_net_xavier.train(
    inputs,
    outputs,
    N_max = 100,
    alpha = 1e-6,
    delta = 1e-6,
    display = True,
)
print(shallow_neural_net_xavier.loss)

- Gradients: 
[[ 0.0347825  -0.28526197 -0.24439365  0.18661634]
 [ 0.17610429 -1.44428536 -1.23736846  0.94484113]]
[[ 0.00026903 -0.00220639 -0.00189029  0.0014434 ]]
[[-0.64676422]
 [-0.55320161]
 [ 1.42348809]
 [-0.81598256]]
[[-0.00168314]]
- Parameters: 
[[ 1.25722625 -0.1015457  -0.86890687  0.23163369]
 [ 0.33964944  0.52106421 -1.12164798  0.69431032]]
[[-0.61665631  0.71679298  0.82789654  0.3603433 ]]
[[-0.15983769]
 [ 1.31087797]
 [ 1.12307381]
 [-0.85756698]]
[[0.27139079]]
Iteration 1 - Loss = 5638943.191688167
- Gradients: 
[[ 0.2638808   1.01020334 -0.16280394 -0.0225359 ]
 [ 1.39767888  5.35067306 -0.8623122  -0.11936434]]
[[ 0.00207055  0.00792661 -0.00127745 -0.00017683]]
[[ 1.12929903e+00]
 [ 5.74397848e+00]
 [-2.74959803e-03]
 [-6.93203740e-01]]
[[0.00425229]]
- Parameters: 
[[ 1.22244375  0.18371627 -0.62451323  0.04501735]
 [ 0.16354515  1.96534957  0.11572048 -0.25053081]]
[[-0.61692534  0.71899936  0.82978683  0.3588999 ]]
[[ 0.48692653]
 [ 1.86407958]
 [-0.300

In [7]:
# Define and train neural network structure (He initialization)
# Layer sizes: inputs, hidden units, outputs
n_x, n_h, n_y = 2, 4, 1
init_type = "He"
# Seed set right before construction so the initial parameters are reproducible
np.random.seed(37)
shallow_neural_net_he = ShallowNeuralNet(n_x = n_x, n_h = n_h, n_y = n_y, init_type = init_type)
# Train and show final loss
shallow_neural_net_he.train(
    inputs,
    outputs,
    N_max = 100,
    alpha = 1e-6,
    delta = 1e-6,
    display = True,
)
print(shallow_neural_net_he.loss)

- Gradients: 
[[-0.01783781  0.01534585 -0.00991419 -0.02277806]
 [-0.04953028  0.04261085 -0.02752874 -0.06324787]]
[[-1.18669845e-04  1.02091554e-04 -6.59562345e-05 -1.51535877e-04]]
[[-0.1091435 ]
 [-0.08975782]
 [-0.02931   ]
 [ 0.06676452]]
[[-0.00017446]]
- Parameters: 
[[-0.05446361  0.67430807  0.34664703 -1.30034617]
 [ 1.51851188  0.98982371  0.2776809  -0.44858935]]
[[0. 0. 0. 0.]]
[[ 0.68021285]
 [-0.58518647]
 [ 0.37805964]
 [ 0.8686002 ]]
[[0.]]
Iteration 1 - Loss = 52089.45912895485
- Gradients: 
[[ 0.02892969 -0.01815732  0.01492998  0.02938706]
 [ 0.20288867 -0.12734028  0.10470643  0.20609624]]
[[ 0.00025052 -0.00015724  0.00012929  0.00025448]]
[[ 0.4016924 ]
 [ 0.26761336]
 [ 0.09151608]
 [-0.14586699]]
[[0.00031737]]
- Parameters: 
[[-0.0366258   0.65896221  0.35656121 -1.27756812]
 [ 1.56804216  0.94721285  0.30520963 -0.38534149]]
[[ 1.18669845e-04 -1.02091554e-04  6.59562345e-05  1.51535877e-04]]
[[ 0.78935635]
 [-0.49542864]
 [ 0.40736963]
 [ 0.80183568]]
[[0.0

In [8]:
# Define and train neural network structure (LeCun initialization)
# Layer sizes: inputs, hidden units, outputs
n_x, n_h, n_y = 2, 4, 1
init_type = "LeCun"
# Seed set right before construction so the initial parameters are reproducible
np.random.seed(37)
shallow_neural_net_lecun = ShallowNeuralNet(n_x = n_x, n_h = n_h, n_y = n_y, init_type = init_type)
# Train and show final loss
shallow_neural_net_lecun.train(
    inputs,
    outputs,
    N_max = 100,
    alpha = 1e-6,
    delta = 1e-6,
    display = True,
)
print(shallow_neural_net_lecun.loss)

- Gradients: 
[[-0.00646502  0.00556185 -0.00359324 -0.00825554]
 [-0.01811392  0.01558339 -0.01006765 -0.02313064]]
[[-4.31473063e-05  3.71195863e-05 -2.39811036e-05 -5.50971050e-05]]
[[-0.03992   ]
 [-0.0327677 ]
 [-0.01068926]
 [ 0.02430487]]
[[-8.97064872e-05]]
- Parameters: 
[[-0.03851159  0.47680781  0.24511646 -0.9194836 ]
 [ 1.07375005  0.69991105  0.19635005 -0.31720057]]
[[0. 0. 0. 0.]]
[[ 0.48098312]
 [-0.41378932]
 [ 0.26732853]
 [ 0.61419309]]
[[0.]]
Iteration 1 - Loss = 3624.008757394556
- Gradients: 
[[ 0.00147599 -0.00107964  0.00078777  0.00167147]
 [ 0.02486276 -0.01818621  0.01326982  0.02815543]]
[[ 1.95905893e-05 -1.43298011e-05  1.04559411e-05  2.21850427e-05]]
[[ 0.05202399]
 [ 0.03399833]
 [ 0.01055707]
 [-0.01661798]]
[[3.7608892e-05]]
- Parameters: 
[[-0.03204657  0.47124595  0.2487097  -0.91122806]
 [ 1.09186397  0.68432766  0.20641769 -0.29406994]]
[[ 4.31473063e-05 -3.71195863e-05  2.39811036e-05  5.50971050e-05]]
[[ 0.52090312]
 [-0.38102162]
 [ 0.27801779

[[ 0.08850213  0.35277135  0.33851216 -0.76278084]
 [ 0.97646485  0.69767704  0.15198712 -0.33958535]]
[[ 0.00063845 -0.00066882  0.00048241  0.00083291]]
[[ 0.35661453]
 [-0.33224555]
 [ 0.32905105]
 [ 0.34633557]]
[[0.00171267]]
- Gradients: 
[[-0.00074882  0.00069865 -0.00069346 -0.00072581]
 [ 0.00030734 -0.00028675  0.00028462  0.00029789]]
[[-3.92495720e-06  3.66199392e-06 -3.63478629e-06 -3.80433108e-06]]
[[ 0.00065506]
 [-0.00013799]
 [-0.00058262]
 [ 0.00130964]]
[[-1.10266353e-05]]
- Parameters: 
[[ 0.08925871  0.35206647  0.33921026 -0.76204606]
 [ 0.97615446  0.69796623  0.15170071 -0.33988679]]
[[ 0.00064242 -0.00067252  0.00048607  0.00083676]]
[[ 0.35595239]
 [-0.33210438]
 [ 0.32963694]
 [ 0.34501287]]
[[0.00172379]]
- Gradients: 
[[-0.00074117  0.0006925  -0.00068885 -0.00071698]
 [ 0.00030432 -0.00028434  0.00028284  0.00029439]]
[[-3.88553668e-06  3.63038915e-06 -3.61128578e-06 -3.75874335e-06]]
[[ 0.00064808]
 [-0.00013489]
 [-0.00057937]
 [ 0.00129676]]
[[-1.093601

In [9]:
# Comparing final losses, one per line: Normal, Xavier, He, LeCun
print(
    shallow_neural_net_normal.loss,
    shallow_neural_net_xavier.loss,
    shallow_neural_net_he.loss,
    shallow_neural_net_lecun.loss,
    sep = "\n",
)

2.654115820074732
nan
nan
265.25171724341226


### Changing the learning rate to create an exploding gradient problem
    
As we said before, the exploding gradient problem typically occurs when the gradient descent rule has changes (in alpha\*dL_dW1, alpha\*dL_db1, alpha\*dL_dW2, and alpha\*dL_db2) that are far greater than the values in the matrices $ W_1 $, $ b_1 $, $ W_2 $ and $ b_2 $.

Another very easy way to force the appearance of an exploding gradient consists of using a learning rate $ \alpha $ that is simply far too big. Typically, we can force an exploding gradient problem with the random normal initialization by replacing $ \alpha = 10^{-6} $ with $ \alpha = 10^{-2} $, as shown below!

In [10]:
# Define and train neural network structure (random normal initialization, alpha = 1e-6)
# Layer sizes: inputs, hidden units, outputs
n_x, n_h, n_y = 2, 4, 1
init_type = "Normal"
# Seed set right before construction so the initial parameters are reproducible
np.random.seed(37)
shallow_neural_net_normal1 = ShallowNeuralNet(n_x = n_x, n_h = n_h, n_y = n_y, init_type = init_type)
# Train and show final loss
shallow_neural_net_normal1.train(
    inputs,
    outputs,
    N_max = 100,
    alpha = 1e-6,
    delta = 1e-6,
    display = True,
)
print(shallow_neural_net_normal1.loss)

- Gradients: 
[[ 1.42356523e-05 -1.73577846e-06 -9.53259136e-07  1.24839272e-06]
 [ 1.32743203e-04 -1.61856153e-05 -8.88885652e-06  1.16408890e-05]]
[[ 1.27309997e-07 -1.55231348e-08 -8.52503385e-09  1.11644251e-08]]
[[ 3.86686371e-04]
 [ 2.71147622e-04]
 [ 8.05690592e-05]
 [-1.49929229e-04]]
[[2.45019007e-06]]
- Parameters: 
[[-0.00544636  0.06743081  0.0346647  -0.13003462]
 [ 0.15185119  0.09898237  0.02776809 -0.04485894]]
[[ 0.09619662 -0.08275786  0.05346571  0.12283862]]
[[ 0.05195923]
 [-0.00633548]
 [-0.00347934]
 [ 0.00455655]]
[[0.14480251]]
Iteration 1 - Loss = 6.635227700991098
- Gradients: 
[[ 1.33509060e-05 -1.71029927e-06 -9.21574694e-07  1.21839677e-06]
 [ 1.27650738e-04 -1.63525206e-05 -8.81136384e-06  1.16493403e-05]]
[[ 1.20261279e-07 -1.54059042e-08 -8.30129070e-09  1.09749821e-08]]
[[ 3.74339400e-04]
 [ 2.62301797e-04]
 [ 7.78514894e-05]
 [-1.44438980e-04]]
[[2.33188559e-06]]
- Parameters: 
[[-0.0054606   0.06743254  0.03466566 -0.13003587]
 [ 0.15171844  0.098998

[[-5.50483753e-06  1.84399278e-06  7.48113045e-07 -1.06763740e-06]
 [ 1.61684442e-05 -5.41605347e-06 -2.19730809e-06  3.13579388e-06]]
[[-3.08140306e-08  1.03219849e-08  4.18765824e-09 -5.97623661e-09]]
[[ 5.93774415e-05]
 [ 3.04324692e-05]
 [ 6.40933006e-06]
 [-5.14114340e-07]]
[[-7.53914486e-07]]
- Parameters: 
[[-0.00548505  0.06741895  0.03466193 -0.13002977]
 [ 0.14841549  0.09972621  0.02810526 -0.04533649]]
[[ 0.09619544 -0.08275774  0.05346578  0.12283852]]
[[ 0.04087205]
 [-0.01369119]
 [-0.00555455]
 [ 0.00792694]]
[[0.14478017]]
Iteration 66 - Loss = 2.7340014488555004
- Gradients: 
[[-5.57024830e-06  1.87277205e-06  7.58978547e-07 -1.08196542e-06]
 [ 1.57524638e-05 -5.29613267e-06 -2.14636430e-06  3.05975968e-06]]
[[-3.13470495e-08  1.05391852e-08  4.27121680e-09 -6.08885310e-09]]
[[5.79516562e-05]
 [2.93556621e-05]
 [6.07689293e-06]
 [1.52666122e-07]]
[[-7.68071473e-07]]
- Parameters: 
[[-0.00547955  0.0674171   0.03466118 -0.1300287 ]
 [ 0.14839932  0.09973162  0.02810745

In [11]:
# Define and train neural network structure (random normal initialization, alpha = 1e-2)
# Layer sizes: inputs, hidden units, outputs
n_x, n_h, n_y = 2, 4, 1
init_type = "Normal"
# Same seed as the alpha = 1e-6 run, so only the learning rate differs
np.random.seed(37)
shallow_neural_net_normal2 = ShallowNeuralNet(n_x = n_x, n_h = n_h, n_y = n_y, init_type = init_type)
# Train and show final loss
shallow_neural_net_normal2.train(
    inputs,
    outputs,
    N_max = 100,
    alpha = 1e-2,
    delta = 1e-6,
    display = True,
)
print(shallow_neural_net_normal2.loss)

- Gradients: 
[[ 0.14235652 -0.01735778 -0.00953259  0.01248393]
 [ 1.32743203 -0.16185615 -0.08888857  0.11640889]]
[[ 1.27309997e-03 -1.55231348e-04 -8.52503385e-05  1.11644251e-04]]
[[ 3.86686371]
 [ 2.71147622]
 [ 0.80569059]
 [-1.49929229]]
[[0.0245019]]
- Parameters: 
[[-0.00544636  0.06743081  0.0346647  -0.13003462]
 [ 0.15185119  0.09898237  0.02776809 -0.04485894]]
[[ 0.09619662 -0.08275786  0.05346571  0.12283862]]
[[ 0.05195923]
 [-0.00633548]
 [-0.00347934]
 [ 0.00455655]]
[[0.14480251]]
Iteration 1 - Loss = 4354741.971946698
- Gradients: 
[[-17872.18478713 -12732.48994896  -3790.82479548   7045.27849364]
 [-96275.52328648 -68588.54399597 -20420.76249876  37952.15200341]]
[[-140.99869992 -100.45019962  -29.90688459   55.58218651]]
[[-30356.68076018]
 [  6976.86561045]
 [  3153.06278263]
 [ -4733.00388279]]
[[36.95995557]]
- Parameters: 
[[-0.14780288  0.08478859  0.04419729 -0.14251854]
 [-1.17558084  0.26083852  0.11665665 -0.16126783]]
[[ 0.09492352 -0.08260263  0.053550

In [12]:
# Final losses, one per line: alpha = 1e-6 run, then alpha = 1e-2 run
print(
    shallow_neural_net_normal1.loss,
    shallow_neural_net_normal2.loss,
    sep = "\n",
)

2.654115820074732
nan


### Controlling the exploding gradient problem

The lessons to be learned here are threefold:
- The exploding gradient problem is serious and can lead to a network being unable to train at all. A typical telltale sign is the appearance of NaNs in losses or network parameters.
- A typical way to prevent the exploding gradient problem consists of using a learning rate $ \alpha $ that is small enough to prevent changes in the gradient descent rule from being much larger than the parameters to be changed.
- And, more importantly, it is important to control the initialization of the network parameters. In fact, if we were to use the Xavier initialization from earlier, but reduce the value of the initial parameters in the matrices $ W_1 $, $ b_1 $, $ W_2 $ and $ b_2 $ before training, then we could make the exploding gradient problem disappear! We show it below, by first running the normal Xavier initialization, and then running it again, but dividing all elements in $ W_1 $, $ b_1 $, $ W_2 $ and $ b_2 $ by - say - a factor of 100. The second case has no exploding gradient issue and is able to train, although at a slower pace than Normal/LeCun models!

In [13]:
# Define and train neural network structure (Xavier initialization, unscaled)
# Layer sizes: inputs, hidden units, outputs
n_x, n_h, n_y = 2, 4, 1
init_type = "Xavier"
# Seed set right before construction so the initial parameters are reproducible
np.random.seed(37)
shallow_neural_net_xavier1 = ShallowNeuralNet(n_x = n_x, n_h = n_h, n_y = n_y, init_type = init_type)
# Train and show final loss
shallow_neural_net_xavier1.train(
    inputs,
    outputs,
    N_max = 100,
    alpha = 1e-6,
    delta = 1e-6,
    display = True,
)
print(shallow_neural_net_xavier1.loss)

- Gradients: 
[[ 0.0347825  -0.28526197 -0.24439365  0.18661634]
 [ 0.17610429 -1.44428536 -1.23736846  0.94484113]]
[[ 0.00026903 -0.00220639 -0.00189029  0.0014434 ]]
[[-0.64676422]
 [-0.55320161]
 [ 1.42348809]
 [-0.81598256]]
[[-0.00168314]]
- Parameters: 
[[ 1.25722625 -0.1015457  -0.86890687  0.23163369]
 [ 0.33964944  0.52106421 -1.12164798  0.69431032]]
[[-0.61665631  0.71679298  0.82789654  0.3603433 ]]
[[-0.15983769]
 [ 1.31087797]
 [ 1.12307381]
 [-0.85756698]]
[[0.27139079]]
Iteration 1 - Loss = 5638943.191688167
- Gradients: 
[[ 0.2638808   1.01020334 -0.16280394 -0.0225359 ]
 [ 1.39767888  5.35067306 -0.8623122  -0.11936434]]
[[ 0.00207055  0.00792661 -0.00127745 -0.00017683]]
[[ 1.12929903e+00]
 [ 5.74397848e+00]
 [-2.74959803e-03]
 [-6.93203740e-01]]
[[0.00425229]]
- Parameters: 
[[ 1.22244375  0.18371627 -0.62451323  0.04501735]
 [ 0.16354515  1.96534957  0.11572048 -0.25053081]]
[[-0.61692534  0.71899936  0.82978683  0.3588999 ]]
[[ 0.48692653]
 [ 1.86407958]
 [-0.300

In [14]:
# Define and train neural network structure (Xavier initialization, scaled down)
n_x = 2
n_h = 4
n_y = 1
init_type = "Xavier"
np.random.seed(37)
shallow_neural_net_xavier2 = ShallowNeuralNet(n_x, n_h, n_y, init_type)
# Divide initial values by 100! (in place, before any training step)
shrink_factor = 100
shallow_neural_net_xavier2.W1 /= shrink_factor
shallow_neural_net_xavier2.b1 /= shrink_factor
shallow_neural_net_xavier2.W2 /= shrink_factor
shallow_neural_net_xavier2.b2 /= shrink_factor
# Train and show final loss
shallow_neural_net_xavier2.train(inputs, outputs, N_max = 100, alpha = 1e-6, delta = 1e-6, display = True)
print(shallow_neural_net_xavier2.loss)

- Gradients: 
[[ 1.08643744e-06 -8.91020716e-06 -7.63367797e-06  5.82899367e-06]
 [ 4.17432348e-06 -3.42349093e-05 -2.93302128e-05  2.23962323e-05]]
[[ 8.17657760e-09 -6.70586248e-08 -5.74514080e-08  4.38692717e-08]]
[[-1.73842740e-05]
 [-1.29545695e-05]
 [ 3.51566967e-05]
 [-1.97254965e-05]]
[[-5.1155505e-06]]
- Parameters: 
[[ 0.01257226 -0.00101546 -0.00868907  0.00231634]
 [ 0.00339649  0.00521064 -0.01121648  0.0069431 ]]
[[-0.00616656  0.00716793  0.00827897  0.00360343]]
[[-0.00159838]
 [ 0.01310878]
 [ 0.01123074]
 [-0.00857567]]
[[0.00271391]]
Iteration 1 - Loss = 7.036477890046345
- Gradients: 
[[ 1.07422558e-06 -8.91572969e-06 -7.60698055e-06  5.81344543e-06]
 [ 4.12685197e-06 -3.42515552e-05 -2.92237341e-05  2.23335109e-05]]
[[ 8.08455398e-09 -6.70992195e-08 -5.72496560e-08  4.37516237e-08]]
[[-1.73650576e-05]
 [-1.30434025e-05]
 [ 3.50581006e-05]
 [-1.96533947e-05]]
[[-5.11359382e-06]]
- Parameters: 
[[ 0.01257118 -0.00100655 -0.00868144  0.00231051]
 [ 0.00339232  0.00524

In [15]:
# Final losses, one per line: unscaled Xavier run, then scaled-down Xavier run
print(
    shallow_neural_net_xavier1.loss,
    shallow_neural_net_xavier2.loss,
    sep = "\n",
)

nan
6.615066924166009


### What's next?

In the next notebook, we will investigate the vanishing gradient problem, which is the counterpart to the exploding gradient one.