In [140]:
import random
import math

K = 3
# Define the input data
A = [[random.uniform(-1, 1) for _ in range(K)] for _ in range(K)]
B = [[random.uniform(-1, 1) for _ in range(K)] for _ in range(K)]
C = [[random.uniform(-1, 1) for _ in range(K)] for _ in range(K)]
# Input vector x
x = [random.uniform(-1, 1) for _ in range(K)]

In [141]:
# Define the sigmoid function
def sigmoid(value):
    return 1 / (1 + math.exp(-value))

def sigmoid_derivative(value):
    return sigmoid(value) * (1 - sigmoid(value))

def matrix_vector_multiplication(matrix, vector):
    if len(matrix) != len(matrix[0]) or len(matrix) != len(vector):
        raise ValueError("Matrix and vector dimensions don't match for multiplication")

    result = [0] * len(vector)
    for i in range(len(matrix)):
        for j in range(len(vector)):
            result[i] += matrix[i][j] * vector[j]

    return result

# Manual Backpropagation
def manual_backpropagation(x, A, B, C):
    K = len(x)
    
    # Initialze all other vectors to zero, of length K
    y, u, v, w = [0] * K, [0] * K, [0] * K, [0] * K

    # Forward propagation
    y = matrix_vector_multiplication(A, x)
    for i in range(K):
        u[i] = sigmoid(y[i])
    v = matrix_vector_multiplication(B, x)
    z = [u + v for u, v in zip(u, v)]
    w = matrix_vector_multiplication(C, z)

    L = sum(val ** 2 for val in w)

    print(f"Loss Manual Good: {L}")

    #Backpropagation
    #Compute gradients ∂L/∂A, ∂L/∂B, ∂L/∂C
    dL_dw = [2 * val for val in w]
    dL_dz = [dL_dw[i] for i in range(K)]
    dL_dy = [dL_dz[i] for i in range(K)]
    dL_dv = [dL_dy[i] * sigmoid_derivative(y[i]) for i in range(K)]

    dL_dC = [[dL_dw[i] * z[j] for j in range(K)] for i in range(K)]
    dL_dB = [[dL_dv[i] * x[j] for j in range(K)] for i in range(K)]
    dL_dA = [[dL_dy[i] * x[j] for j in range(K)] for i in range(K)]

    print("Gradient dL/dA:")
    for row in dL_dA:
        print(row)

    dL_dw = [2 * val for val in w]
    dL_dC = [[dL_dw[i] * z[j] for j in range(K)] for i in range(K)]

    dL_dz = [0] * K
    for i in range(K):
        dL_dz[i] = sum(dL_dw[j] * C[i][j] for j in range(K))

    dL_dv = [0] * K
    for i in range(K):
        dL_dv[i] = dL_dz[i]

    dL_dB = [[dL_dv[i] * x[j] for j in range(K)] for i in range(K)]

    dL_dy = [0] * K
    for i in range(K):
        dL_dy[i] = sum(dL_dz[j] * B[i][j] for j in range(K))
        dL_dy[i] *= y[i] * (1 - y[i])

    dL_dA = [[dL_dy[i] * x[j] for j in range(K)] for i in range(K)]
    print("Gradient dL/dA:")
    for row in dL_dA:
        print(row)


    return dL_dA, dL_dB, dL_dC

def manual_backpropagation_rounded(x, A, B, C):
    K = len(x)
    
    # Forward propagation
    y = [0] * K
    for i in range(K):
        y[i] = sum(A[i][j] * x[j] for j in range(K))
        y[i] = sigmoid(y[i])

    v = [0] * K
    for i in range(K):
        v[i] = sum(B[i][j] * x[j] for j in range(K))

    z = [u + v for u, v in zip(y, v)]

    w = [0] * K
    for i in range(K):
        w[i] = sum(C[i][j] * z[j] for j in range(K))

    for val in w:
        L = sum(val ** 2 for val in w)

    print(f"Loss Manual2: {L}")
    
    # Backpropagation ---- !!! Something wrong with this back propogation algorithm !!!
    # Compute gradients ∂L/∂A, ∂L/∂B, ∂L/∂C
    dL_dw = [round(2 * val, 4) for val in w]
    dL_dC = [[round(dL_dw[i] * z[j], 4) for j in range(K)] for i in range(K)]

    dL_dz = [0] * K
    for i in range(K):
        dL_dz[i] = sum(round(dL_dw[j] * C[i][j], 4) for j in range(K))

    dL_dv = [0] * K
    for i in range(K):
        dL_dv[i] = dL_dz[i]

    dL_dB = [[round(dL_dv[i] * x[j], 4) for j in range(K)] for i in range(K)]

    dL_dy = [0] * K
    for i in range(K):
        dL_dy[i] = sum(round(dL_dz[j] * B[i][j], 4) for j in range(K))
        dL_dy[i] *= round(y[i] * (1 - y[i]), 4)

    dL_dA = [[round(dL_dy[i] * x[j], 4) for j in range(K)] for i in range(K)]
    return dL_dA, dL_dB, dL_dC


def backwardpropagation(x, A, B, C):
    K = len(x)
    
    # Initialize gradients
    dA, dB, dC = [[0] * K for _ in range(K)], [[0] * K for _ in range(K)], [[0] * K for _ in range(K)]

    # Forward propagation
    y = matrix_vector_multiplication(A, x)
    u = [sigmoid(val) for val in y]
    v = matrix_vector_multiplication(B, x)
    z = [u_i + v_i for u_i, v_i in zip(u, v)]
    w = matrix_vector_multiplication(C, z)

    # Compute the loss L
    L = sum(val ** 2 for val in w)

    # Backpropagation
    # Gradients of L w.r.t. w
    dL_dw = [2 * val for val in w]

    # Gradients of L w.r.t. C
    for i in range(K):
        for j in range(K):
            dC[i][j] = dL_dw[i] * z[j]

    # Gradients of L w.r.t. z
    dL_dz = sum(dL_dw[i] * z[i] for i in range(K))

    # Gradients of L w.r.t. u and v
    dL_du = [dL_dz * sigmoid(y_i) * (1 - sigmoid(y_i)) for y_i in y]
    dL_dv = [dL_dz] * K  # Same gradient for all elements of v

    # Gradients of L w.r.t. A
    for i in range(K):
        for j in range(K):
            dA[i][j] = dL_du[i] * x[j]

    # Gradients of L w.r.t. B
    for i in range(K):
        for j in range(K):
            dB[i][j] = dL_dv[i] * x[j]

    return dA, dB, dC, L


# Manual Backpropagation
dL_dA_manual, dL_dB_manual, dL_dC_manual = manual_backpropagation(x, A, B, C)

dL_dA_manual_round, dL_dB_manual_round, dL_dC_manual_round = manual_backpropagation_rounded(x, A, B, C)

dL_dA_final, dL_dB_final, dL_dC_final, L = backwardpropagation(x, A, B, C)

#Print the gradients
print("--- Manual Backpropagation Gradient ∂L/∂A: ----")
for row in dL_dA_manual:
    print(row)

print("--- Manual Backpropagation Gradient Rounded ∂L/∂A: --- ")
for row in dL_dA_manual_round:
    print(row)

print("--- Automatic Differentiation with PyTorch Gradient ∂L/∂A: ---")
print(dL_dA_final)

# print("Manual Backpropagation Gradient ∂L/∂B:")
# for row in dL_dB_manual:
#     print(row)

# print("Automatic Differentiation with PyTorch Gradient ∂L/∂B:")
# print(dL_dB_torch)

# print("Manual Backpropagation Gradient ∂L/∂C:")
# for row in dL_dC_manual:
#     print(row)

# print("Automatic Differentiation with PyTorch Gradient ∂L/∂C:")
# print(dL_dC_torch)

Loss Manual Good: 2.5623731019213514
Gradient dL/dA:
[1.5652832703147892, 2.0823644836416766, 0.6996088004250455]
[1.4729555301955382, 1.9595368712053904, 0.6583425959393906]
[0.6015556063265368, 0.8002756135622635, 0.26886736995945326]
Gradient dL/dA:
[-0.0875055771261898, -0.11641247905978047, -0.03911092196841996]
[-1.2326785886564526, -1.6398859947231954, -0.550949981434444]
[0.15977480094479973, 0.2125553739557054, 0.07141190284660875]
Loss Manual2: 2.5623731019213514
--- Manual Backpropagation Gradient ∂L/∂A: ----
[-0.0875055771261898, -0.11641247905978047, -0.03911092196841996]
[-1.2326785886564526, -1.6398859947231954, -0.550949981434444]
[0.15977480094479973, 0.2125553739557054, 0.07141190284660875]
--- Manual Backpropagation Gradient Rounded ∂L/∂A: --- 
[0.2919, 0.3884, 0.1305]
[0.3558, 0.4734, 0.159]
[-0.1233, -0.164, -0.0551]
--- Automatic Differentiation with PyTorch Gradient ∂L/∂A: ---
[[0.8068272891978621, 1.0733574703833884, 0.3606142623835685], [0.7538935417432827, 1.0

# Verify the gradients using libraries - Torch, mxnet

In [142]:
import torch

A_torch = torch.tensor(A, requires_grad=True, dtype=torch.float32)
B_torch = torch.tensor(B, requires_grad=True, dtype=torch.float32)
C_torch = torch.tensor(C, requires_grad=True, dtype=torch.float32)
x_torch = torch.tensor(x, dtype=torch.float32)

# Define the sigmoid function
def sigmoid(value):
    return 1 / (1 + torch.exp(-value))

def forwardpropagation_torch(x, A, B, C):
    y = torch.matmul(A, x.unsqueeze(1)).squeeze()
    u = sigmoid(y)
    v = torch.matmul(B, x.unsqueeze(1)).squeeze()
    z = u + v
    w = torch.matmul(C, z.unsqueeze(1)).squeeze()
    L = torch.norm(w, p=2)**2
    return L

def backwardpropagation_torch(L_torch):
    L_torch.backward()
    dA = A_torch.grad
    dB = B_torch.grad
    dC = C_torch.grad
    return dA, dB, dC

L_torch = forwardpropagation_torch(x_torch, A_torch, B_torch, C_torch)
dA_torch, dB_torch, dC_torch = backwardpropagation_torch(L_torch)


print(f"Gradients (torch) dL/dA:\n{dA_torch}")
print(f"Gradients (torch) dL/dB:\n{dB_torch}")
print(f"Gradients (torch) dL/dC:\n{dC_torch}")

Gradients (torch) dL/dA:
tensor([[-0.6609, -0.8792, -0.2954],
        [-0.5241, -0.6972, -0.2342],
        [ 0.5251,  0.6986,  0.2347]])
Gradients (torch) dL/dB:
tensor([[-2.6469, -3.5212, -1.1830],
        [-2.2461, -2.9881, -1.0039],
        [ 2.1346,  2.8398,  0.9541]])
Gradients (torch) dL/dC:
tensor([[-2.5464, -1.7239, -1.2134],
        [-2.3962, -1.6222, -1.1419],
        [-0.9786, -0.6625, -0.4663]])


In [143]:
import mxnet as mx
from mxnet import autograd

x_mx = mx.nd.array(x_torch.detach().numpy())
A_mx = mx.nd.array(A_torch.detach().numpy())
B_mx = mx.nd.array(B_torch.detach().numpy())
C_mx = mx.nd.array(C_torch.detach().numpy())

def sigmoid(value):
    return 1 / (1 + mx.nd.exp(-value))

def forwardpropagation_mx(x, A, B, C):
    y = mx.nd.dot(A, x)
    u = sigmoid(y)
    v = mx.nd.dot(B, x)
    z = u + v
    w = mx.nd.dot(C, z)
    L = mx.nd.norm(w) ** 2
    return L

def backwardpropagation_mx(L_mx):
    L_mx.backward()
    dA = A_mx.grad
    dB = B_mx.grad
    dC = C_mx.grad
    return dA, dB, dC

A_mx.attach_grad()
B_mx.attach_grad()
C_mx.attach_grad()

with autograd.record():
    L_mx = forwardpropagation_mx(x_mx, A_mx, B_mx, C_mx)

dA_mx, dB_mx, dC_mx = backwardpropagation_mx(L_mx)

print(f"Gradients (mx) dL/dA:\n{dA_mx}")
print(f"Gradients (mx) dL/dB:\n{dB_mx}")
print(f"Gradients (mx) dL/dC:\n{dC_mx}")


Gradients (mx) dL/dA:

[[-0.66090703 -0.8792334  -0.29539472]
 [-0.5240515  -0.6971685  -0.23422666]
 [ 0.5251271   0.69859946  0.23470742]]
<NDArray 3x3 @cpu(0)>
Gradients (mx) dL/dB:

[[-2.6468637  -3.5212379  -1.183025  ]
 [-2.246134   -2.9881299  -1.0039175 ]
 [ 2.1346316   2.8397932   0.95408106]]
<NDArray 3x3 @cpu(0)>
Gradients (mx) dL/dC:

[[-2.5463698  -1.7238632  -1.213438  ]
 [-2.3961732  -1.6221819  -1.1418638 ]
 [-0.97859794 -0.6624996  -0.46633756]]
<NDArray 3x3 @cpu(0)>
