In [None]:
import tensorflow as tf

In [2]:
# Auto Differentiation Example

In [3]:
# Declare a TensorFlow Variable. It is the quantity with respect to
# which the gradient is computed in the cells below.

x = tf.Variable(2.0)

In [4]:
with tf.GradientTape() as tape:
    # Operations executed inside this context are recorded on the tape.
    y = x ** 2

    # The squaring operation is now recorded, so its gradient dy/dx
    # can be computed once the context has been exited.


# Note: we are now outside the GradientTape context.
# Gradient calculations and variable updates should be performed
# outside the context; otherwise those operations are recorded on
# the tape as well, which increases memory usage.


dy_dx = tape.gradient(y, x)

In [5]:
# Gradient value: for y = x ** 2, dy/dx = 2x, i.e. 4.0 at x = 2.0

print(dy_dx.numpy())

4.0


In [6]:
# Back Propagation Example

In [7]:
# Define the constants (inputs, learning rate, ground truth) and the
# variables (weights and bias) used by the manual back-propagation example.

x1 = tf.constant(1.3, name="x1")
x2 = tf.constant(2.1, name="x2")
lr = tf.constant(0.1, name="learning_rate")
Y  = tf.constant(1.0, name="ground_truth")

# ---------
# Each variable's `name` matches its Python identifier so the two weights
# can be told apart when they are inspected or debugged.
w1 = tf.Variable(0.7, name="w1")
w2 = tf.Variable(-0.3, name="w2")
b  = tf.Variable(1.0, name="b")

# Text formatting: ANSI escape sequences wrapped around the labels
# printed by the cells below.
bold = "\033[1m"
end = "\033[0m"

In [8]:
# Implementation for equation A1:  x1w1 + x2w2 + b and its derivative

def wx_plus_b(x1, x2, w1, w2, b):
    """Return the weighted sum ``x1*w1 + x2*w2 + b`` (equation A1).

    Args:
        x1, x2: The two inputs.
        w1, w2: The weights multiplied with ``x1`` and ``x2`` respectively.
        b: The bias added to the weighted inputs.

    Returns:
        The value of ``x1*w1 + x2*w2 + b``.
    """
    weighted_inputs = x1 * w1 + x2 * w2
    return weighted_inputs + b


# Partial derivatives of x1*w1 + x2*w2 + b w.r.t. w1, w2 and b
def grad_wx_plus_b(x1, x2):
    """Return the derivatives of the weighted sum w.r.t. ``w1``, ``w2``, ``b``.

    Args:
        x1: First input; returned as the derivative w.r.t. ``w1``.
        x2: Second input; returned as the derivative w.r.t. ``w2``.

    Returns:
        A tuple ``(x1, x2, tf.constant(1.0))``.
    """
    d_w1 = x1
    d_w2 = x2
    d_b = tf.constant(1.0)
    return d_w1, d_w2, d_b

In [9]:
# Implementation for equation A2: Sigmoid and its derivative

def sigmoid(x):
    """Return the sigmoid of ``x``, computed as ``1 / (1 + exp(-x))``."""
    denominator = 1 + tf.math.exp(-x)
    return 1 / denominator


# Derivative of sigmoid w.r.t. its input: s * (1 - s), where s = sigmoid(x)
def grad_sigmoid(x):
    """Return the derivative of ``sigmoid`` evaluated at ``x``."""
    s = sigmoid(x)
    return s * (1.0 - s)

In [10]:
# Implementation for equation A3: Binary cross-entropy and its derivative

def bce_loss(y_hat, y):
    """Return the binary cross-entropy between ``y_hat`` and ``y``.

    Computes ``-(y * log(y_hat)) - ((1 - y) * log(1 - y_hat))``.

    Args:
        y_hat: The predicted value.
        y: The ground-truth value.
    """
    positive_term = y * tf.math.log(y_hat)
    negative_term = (1 - y) * tf.math.log(1.0 - y_hat)
    return -positive_term - negative_term


# Derivative of binary cross-entropy w.r.t. the prediction y_hat.
def grad_bce_loss(y_hat, y):
    """Return ``-(y / y_hat) + (1 - y) / (1 - y_hat)``.

    Args:
        y_hat: The predicted value.
        y: The ground-truth value.
    """
    positive_term = y / y_hat
    negative_term = (1.0 - y) / (1.0 - y_hat)
    return -positive_term + negative_term

In [11]:
def forward(x1, x2, w1, w2, b, Y):
    """Run the forward pass: weighted sum -> sigmoid -> BCE loss.

    Args:
        x1, x2: The inputs.
        w1, w2: The weights.
        b: The bias.
        Y: The ground-truth value passed to the loss.

    Returns:
        A dict with keys ``"A1"`` (weighted sum), ``"A2"`` (sigmoid of A1)
        and ``"A3"`` (binary cross-entropy of A2 against ``Y``).
    """
    weighted_sum = wx_plus_b(x1, x2, w1, w2, b)
    prediction = sigmoid(weighted_sum)
    loss = bce_loss(prediction, Y)

    return {"A1": weighted_sum, "A2": prediction, "A3": loss}

In [12]:
def backward(x1, x2, Y, A1, A2):
    """Compute the gradients of the loss A3 manually via the chain rule.

    Args:
        x1, x2: The inputs used in the forward pass.
        Y: The ground-truth value.
        A1: The weighted sum produced by the forward pass.
        A2: The sigmoid output produced by the forward pass.

    Returns:
        A dict holding the individual derivatives (``"dA3_dA2"``,
        ``"dA2_dA1"``, ``"dA1_dw1"``, ``"dA1_dw2"``, ``"dA1_db"``) and the
        chained gradients of the loss w.r.t. the weights and bias
        (``"dA3_dw1"``, ``"dA3_dw2"``, ``"dA3_db"``).
    """
    # Local derivative of each stage, from the loss back to the parameters.
    dA3_dA2 = grad_bce_loss(A2, Y)
    dA2_dA1 = grad_sigmoid(A1)
    dA1_dw1, dA1_dw2, dA1_db = grad_wx_plus_b(x1, x2)

    # dA3/dA1 is the factor shared by all three parameter gradients.
    dA3_dA1 = dA3_dA2 * dA2_dA1

    return {
        "dA3_dA2": dA3_dA2,
        "dA2_dA1": dA2_dA1,
        "dA1_dw1": dA1_dw1,
        "dA1_dw2": dA1_dw2,
        "dA1_db" : dA1_db,
        "dA3_dw1": dA3_dA1 * dA1_dw1,
        "dA3_dw2": dA3_dA1 * dA1_dw2,
        "dA3_db" : dA3_dA1 * dA1_db,
    }

In [13]:
# Performing forward pass with the initial weights and bias.
# forward() returns a dict keyed by "A1", "A2" and "A3".

forward_outputs = forward(x1, x2, w1, w2, b, Y)

print(f"{bold}Forward Pass:{end}\n")

# A3 is the loss before any weight update.
print(f"{bold}A1:{end} {forward_outputs['A1'].numpy()}")
print(f"{bold}A2:{end} {forward_outputs['A2'].numpy()}")
print(f"{bold}A3:{end} {forward_outputs['A3'].numpy()} <---{bold} Initial Loss{end}")

[1mForward Pass:[0m

[1mA1:[0m 1.2799999713897705
[1mA2:[0m 0.7824497818946838
[1mA3:[0m 0.24532553553581238 <---[1m Initial Loss[0m


In [14]:
# Performing backward pass

# The manual backward pass needs the intermediate results A1 and A2
# computed during the forward pass.
A1 = forward_outputs['A1']
A2 = forward_outputs['A2']

backward_outputs = backward(x1, x2, Y, A1, A2)

print(f"{bold}Backward Pass: Step 1{end}\n")
print(f"{bold}Individual Derivatives:{end}\n")

# Derivative of each stage w.r.t. its own input.
print(f"{bold}dA3/dA2{end} = {backward_outputs['dA3_dA2']}")
print(f"{bold}dA2/dA1{end} = {backward_outputs['dA2_dA1']}")
print(f"{bold}dA1/dw1{end} = {backward_outputs['dA1_dw1']}")
print(f"{bold}dA1/dw2{end} = {backward_outputs['dA1_dw2']}")
print(f"{bold}dA1/db {end} = {backward_outputs['dA1_db']}")

print("\n-----------------\n")

print(f"{bold}Gradient of A3 w.r.t. variables:{end}\n")

# Chain-rule products of the individual derivatives above.
print(f"{bold}dA3/dw1{end} = {backward_outputs['dA3_dw1']}")
print(f"{bold}dA3/dw2{end} = {backward_outputs['dA3_dw2']}")
print(f"{bold}dA3/db {end} = {backward_outputs['dA3_db']}")

[1mBackward Pass: Step 1[0m

[1mIndividual Derivatives:[0m

[1mdA3/dA2[0m = -1.2780373096466064
[1mdA2/dA1[0m = 0.17022211849689484
[1mdA1/dw1[0m = 1.2999999523162842
[1mdA1/dw2[0m = 2.0999999046325684
[1mdA1/db [0m = 1.0

-----------------

[1mGradient of A3 w.r.t. variables:[0m

[1mdA3/dw1[0m = -0.28281527757644653
[1mdA3/dw2[0m = -0.456855446100235
[1mdA3/db [0m = -0.21755021810531616


In [15]:
def weight_update(w1, w2, b, dw1, dw2, db, lr):
    """Apply one gradient-descent step to ``w1``, ``w2`` and ``b`` in place.

    Args:
        w1, w2, b: Parameters to update; objects of the tf.Variable class
            (each must provide ``assign_sub``).
        dw1, dw2, db: Gradients for ``w1``, ``w2`` and ``b`` respectively.
        lr: Learning rate that scales each gradient.

    Returns:
        The same ``(w1, w2, b)`` objects that were passed in, after the update.
    """
    # assign_sub mutates the variable itself: param = param - lr * grad
    for param, grad in ((w1, dw1), (w2, dw2), (b, db)):
        param.assign_sub(lr * grad)

    return w1, w2, b

In [16]:
# Gradients of the loss w.r.t. each parameter, from the manual backward pass.
w1_grad = backward_outputs["dA3_dw1"]
w2_grad = backward_outputs["dA3_dw2"]
b_grad  = backward_outputs["dA3_db"]

# Keep a copy of the old w and b for comparison, because
# weight_update() modifies w1, w2 and b in place.

w1_old = tf.identity(w1, name="old_w1")
w2_old = tf.identity(w2, name="old_w2")
b_old  = tf.identity(b,  name="old_b")

# Perform Weight Update
w1_updated, w2_updated, b_updated = weight_update(w1, w2, b, w1_grad, w2_grad, b_grad, lr)

print(f"{bold}Backward Pass: Step 2{end}\n")
print(f"{bold}Parameter Updates{end}\n")

# The "b" label is one character shorter than "w1"/"w2"; the field width
# and spacing on the last line differ to keep the columns aligned.
print(f"{bold}w1{end} --> {bold}Old:{end} {w1_old.numpy():<20} {bold}New:{end} {w1_updated.numpy()}")
print(f"{bold}w2{end} --> {bold}Old:{end} {w2_old.numpy():<20} {bold}New:{end} {w2_updated.numpy()}")
print(f"{bold}b{end}  --> {bold}Old:{end} {b_old.numpy():<19}  {bold}New:{end} {b_updated.numpy()}")

[1mBackward Pass: Step 2[0m

[1mParameter Updates[0m

[1mw1[0m --> [1mOld:[0m 0.699999988079071    [1mNew:[0m 0.7282814979553223
[1mw2[0m --> [1mOld:[0m -0.30000001192092896 [1mNew:[0m -0.25431448221206665
[1mb[0m  --> [1mOld:[0m 1.0                  [1mNew:[0m 1.0217549800872803


In [18]:
# Comparing the old and new loss
# New loss: re-run the forward pass with the updated parameters

new_forward_outputs = forward(x1, x2, w1_updated, w2_updated, b_updated, Y)

old_A3 = forward_outputs["A3"]
new_A3 = new_forward_outputs["A3"]

# Passing w1, w2 and b directly gives the same result, because
# weight_update() modified those variables in place and returned them:
# new_A3 = forward(x1, x2, w1, w2, b, Y)["A3"]

print(f"{bold}Checking New Loss{end}:\n")

print(f"{bold}LOSS{end} --> {bold}Old:{end} {old_A3.numpy():<20} {bold}New:{end} {new_A3.numpy()}")

[1mChecking New Loss[0m:

[1mLOSS[0m --> [1mOld:[0m 0.24532553553581238  [1mNew:[0m 0.21369412541389465


In [19]:
# Using GradientTape

In [20]:
# Redefine the constants and variables so the GradientTape example starts
# from the same initial values as the manual example (the variables were
# updated in place by the earlier weight update).

x1 = tf.constant(1.3, name="x1")
x2 = tf.constant(2.1, name="x2")
lr = tf.constant(0.1, name="learning_rate")
Y  = tf.constant(1.0, name="ground_truth")

# ---------
# Each variable's `name` matches its Python identifier so the two weights
# can be told apart when they are inspected or debugged.
w1 = tf.Variable(0.7, name="w1")
w2 = tf.Variable(-0.3, name="w2")
b  = tf.Variable(1.0, name="b")

In [21]:
# persistent=True lets tape.gradient() be called more than once on this
# tape; a later cell calls it several times.
with tf.GradientTape(persistent=True) as tape:
    # Record the forward pass: weighted sum -> sigmoid -> BCE loss
    A1 = w1 * x1 + w2 * x2 + b
    A2 = sigmoid(A1)
    A3 = bce_loss(A2, Y)

In [22]:
# Show the values computed under the tape; A3 is the loss before any update.
print(f"{bold}Forward Pass:{end}\n")

print(f"{bold}A1:{end} {A1.numpy()}")
print(f"{bold}A2:{end} {A2.numpy()}")
print(f"{bold}A3:{end} {A3.numpy()} <---{bold} Initial Loss{end}")

[1mForward Pass:[0m

[1mA1:[0m 1.2799999713897705
[1mA2:[0m 0.7824497818946838
[1mA3:[0m 0.24532553553581238 <---[1m Initial Loss[0m


In [23]:
print(f"{bold}Backward Pass: Step 1{end}\n")
print(f"{bold}Individual Derivatives:{end}\n")


# Query the tape once per stage; these repeated tape.gradient() calls
# rely on the tape having been created with persistent=True.
dA3_dA2 = tape.gradient(A3, A2)

dA2_dA1 = tape.gradient(A2, A1)

dA1_dw1 = tape.gradient(A1, w1)

dA1_dw2 = tape.gradient(A1, w2)

dA1_db  = tape.gradient(A1, b)


print(f"{bold}dA3/dA2{end} = {dA3_dA2}")
print(f"{bold}dA2/dA1{end} = {dA2_dA1}")
print(f"{bold}dA1/dw1{end} = {dA1_dw1}")
print(f"{bold}dA1/dw2{end} = {dA1_dw2}")
print(f"{bold}dA1/db{end} =  {dA1_db}")

print("\n-----------------\n")


# Implementing the chain rule: dA3/dparam = dA3/dA2 * dA2/dA1 * dA1/dparam
dA3_dw1 = dA3_dA2 * dA2_dA1 * dA1_dw1
dA3_dw2 = dA3_dA2 * dA2_dA1 * dA1_dw2
dA3_db  = dA3_dA2 * dA2_dA1 * dA1_db

print(f"{bold}Gradient of A3 wrt. variables:{end}\n")

print(f"{bold}dA3/dw1{end} = {dA3_dw1}")
print(f"{bold}dA3/dw2{end} = {dA3_dw2}")
print(f"{bold}dA3/db{end}  = {dA3_db}")

[1mBackward Pass: Step 1[0m

[1mIndividual Derivatives:[0m

[1mdA3/dA2[0m = -1.2780373096466064
[1mdA2/dA1[0m = 0.17022213339805603
[1mdA1/dw1[0m = 1.2999999523162842
[1mdA1/dw2[0m = 2.0999999046325684
[1mdA1/db[0m =  1.0

-----------------

[1mGradient of A3 wrt. variables:[0m

[1mdA3/dw1[0m = -0.2828153073787689
[1mdA3/dw2[0m = -0.4568554759025574
[1mdA3/db[0m  = -0.21755023300647736


In [24]:
# Direct implementation of gradient tape
# Redefine the constants and variables so this example starts from the
# same initial values as the previous ones.

x1 = tf.constant(1.3, name="x1")
x2 = tf.constant(2.1, name="x2")
lr = tf.constant(0.1, name="learning_rate")
Y  = tf.constant(1.0, name="ground_truth")

# ---------
# Each variable's `name` matches its Python identifier so the two weights
# can be told apart when they are inspected or debugged.
w1 = tf.Variable(0.7, name="w1")
w2 = tf.Variable(-0.3, name="w2")
b  = tf.Variable(1.0, name="b")

In [25]:
def compute(x1, x2, w1, w2, b, Y):
    """Run the forward pass under a GradientTape and return its gradients.

    Args:
        x1, x2: The inputs.
        w1, w2, b: The weights and bias the loss is differentiated against.
        Y: The ground-truth value.

    Returns:
        A tuple ``(outputs, grads)``: ``outputs`` is the dict returned by
        ``forward`` and ``grads`` is the result of differentiating
        ``outputs["A3"]`` w.r.t. ``[w1, w2, b]``, in that order.
    """
    # persistent=True is not needed here: gradient() is called only once.
    with tf.GradientTape() as tape:
        forward_pass = forward(x1, x2, w1, w2, b, Y)

    # Differentiate the loss A3 w.r.t. every parameter in a single call.
    loss = forward_pass["A3"]
    params = [w1, w2, b]
    grads = tape.gradient(loss, params)

    return forward_pass, grads

In [26]:
# compute() returns the forward-pass dict and the gradients of A3
# w.r.t. [w1, w2, b], in that order.
forward_outputs, gradients = compute(x1, x2, w1, w2, b, Y)

print(f"{bold}Backward Pass: Step 1{end}\n")
print(f"{bold}Direct Gradient of A3 wrt. variables using GradientTape:{end}\n")

print(f"{bold}dA3/dw1{end} = {gradients[0]}")
print(f"{bold}dA3/dw2{end} = {gradients[1]}")
print(f"{bold}dA3/db{end}  = {gradients[2]}")

[1mBackward Pass: Step 1[0m

[1mDirect Gradient of A3 wrt. variables using GradientTape:[0m

[1mdA3/dw1[0m = -0.2828153073787689
[1mdA3/dw2[0m = -0.45685550570487976
[1mdA3/db[0m  = -0.21755024790763855


In [27]:
# Keep a copy of the old w and b for comparison, because
# weight_update() modifies w1, w2 and b in place.

w1_old = tf.identity(w1, name="old_w1")
w2_old = tf.identity(w2, name="old_w2")
b_old  = tf.identity(b,  name="old_b")

# Perform Weight Update using the gradients returned by compute()
# (index 0 -> w1, 1 -> w2, 2 -> b).

w1_updated, w2_updated, b_updated = weight_update(w1, w2, b, gradients[0], gradients[1], gradients[2], lr)

print(f"{bold}Backward Pass: Step 2{end}\n")
print(f"{bold}Parameter Updates{end}\n")

print(f"{bold}w1{end} --> {bold}Old:{end} {w1_old.numpy():<20} {bold}New:{end} {w1_updated.numpy()}")
print(f"{bold}w2{end} --> {bold}Old:{end} {w2_old.numpy():<20} {bold}New:{end} {w2_updated.numpy()}")
print(f"{bold}b{end}  --> {bold}Old:{end} {b_old.numpy():<19}  {bold}New:{end} {b_updated.numpy()}")

[1mBackward Pass: Step 2[0m

[1mParameter Updates[0m

[1mw1[0m --> [1mOld:[0m 0.699999988079071    [1mNew:[0m 0.7282814979553223
[1mw2[0m --> [1mOld:[0m -0.30000001192092896 [1mNew:[0m -0.25431445240974426
[1mb[0m  --> [1mOld:[0m 1.0                  [1mNew:[0m 1.0217549800872803


In [28]:
# New loss computation: re-run the forward pass with the updated parameters

new_forward_outputs = forward(x1, x2, w1_updated, w2_updated, b_updated, Y)

old_A3 = forward_outputs["A3"]
new_A3 = new_forward_outputs["A3"]

# Passing w1, w2 and b directly gives the same result, because
# weight_update() modified those variables in place and returned them:
# new_A3 = forward(x1, x2, w1, w2, b, Y)["A3"]

print(f"{bold}Checking New Loss{end}:\n")

print(f"{bold}LOSS{end} --> {bold}Old:{end} {old_A3.numpy():<20} {bold}New:{end} {new_A3.numpy()}")

[1mChecking New Loss[0m:

[1mLOSS[0m --> [1mOld:[0m 0.24532553553581238  [1mNew:[0m 0.21369412541389465
