In [935]:
import numpy as np

def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_prime(z):
    s = sigmoid(z)
    return s-s*s

def Loss(y, y_pred):
    return (y_pred - y) * y_pred * (1 - y_pred)

In [955]:
w1 = np.array([[ 0.0027,  0.1021],
               [ 0.5947, -1.0786],
               [ 0.2147,  0.8666]])

w2 = np.array([[-0.0700, -0.4209, -0.8642],
               [ 0.9634,  0.5741, -0.3815]])

b1 = np.array([0., 0., 0.])
b2 = np.array([0., 0.])

x = np.array([[2, -1]])
y = np.array([0, 1])

In [1029]:
class Layer:
    def __init__(self, size=None, w=None, b=None):
        if w is None:
            w = np.random.uniform(size=size, low=-1.0, high=1.0)
        if b is None:
            b = np.zeros(size[1])
        self.w = w
        self.b = b
        
    def __call__(self, x):
        self.z = x @ self.w.T - self.b
        self.a = sigmoid(self.z)
        return self.a
    
    def compute_gradient(self, grad):
        self.grad = (grad @ self.w) * self.a * (1 - self.a)
    
    def backward(self, grad):
        self.compute_gradient(grad) 
        self.w -= self.grad @ self.a
        self.b -= np.sum(self.grad.T, axis=-1, keepdims=True)
    
class MultiLayerPerceptron:
    def __init__(self, input_shape=2, shape=(3, 2), layers=None):
        if layers is not None:
            self.layers = layers
        else:
            self.layers = [Layer(size=(shape[0], input_shape))]
            for i in range(len(shape)-1):
                self.layers.append(Layer(size=(shape[i+1], shape[i])))
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def backward(self, loss):
        self.layers[-1].backward(loss)
        for i in range(len(self.layers) - 2, -1, -1):
            gradient = self.layers[i].compute_gradient(self.layers[i+1].grad)
            self.layers[i].backward(gradient)

    def summary(self):
        for i, layer in enumerate(self.layers):
            print(f"Layer: {i} | Shape: {layer.w.shape[::-1]} | Parameters: {layer.w.size + layer.b.size}")
            
mlp = MultiLayerPerceptron(layers = [Layer(w=w1, b=b1), Layer(w=w2, b=b2)])
mlp.summary()
y_pred = mlp(x[0])
loss = Loss(y, y_pred)
loss

Layer: 0 | Shape: (2, 3) | Parameters: 9
Layer: 1 | Shape: (3, 2) | Parameters: 8


array([ 0.07214337, -0.06189344])

In [1031]:
w = mlp.layers[-1].w
a = mlp.layers[-1].a

In [1033]:
# (loss @ w) * a * (1 - a)
(loss @ w) * (a * (1 - a))

ValueError: operands could not be broadcast together with shapes (3,) (2,) 

In [1002]:
mlp.layers[-1].compute_gradient(loss)
mlp.layers[-1].grad

array([[-0.00106122, -0.01305624],
       [-0.00638096, -0.00778035],
       [-0.01310151,  0.00517018]])

In [950]:
w * np.array([loss])

ValueError: operands could not be broadcast together with shapes (2,3) (1,2) 

In [999]:
mlp.layers[-2].compute_gradient(mlp.layers[-1].grad)

ValueError: operands could not be broadcast together with shapes (2,3) (2,) 

In [625]:
mlp = MultiLayerPerceptron(layers=[
    Layer(
        w = w1,
        b = b1
    ),
    Layer(
        w = w2,
        b = b2
    )
])
y_pred = mlp(x[0])
y_pred
# [0.32000038, 0.69612414]

array([0.31998417, 0.69613687])

In [293]:
mlp.backward(loss)

In [277]:
grad = mlp.layers[0].compute_gradient(mlp.output.gradient)
grad

array([-0.00158887,  0.00940998, -0.00970491])

In [295]:
for i in range(len((2, 3, 2))-1):
    print(i)

0
1


In [302]:
np.random.uniform(size=(2, 3), low=-1.0, high=1.0)

array([[ 0.88603224,  0.55697179, -0.87157397],
       [-0.74662932,  0.31392787, -0.61538366]])

In [245]:
[i for i in range(2-2, -1, -1)]

[0]

In [570]:
import torch
from torch import nn

In [789]:
d1 = nn.Linear(2, 3)
d1.weight = nn.Parameter(torch.from_numpy(w1).float())
d1.bias = nn.Parameter(torch.from_numpy(b1).float())

d2 = nn.Linear(3, 2)
d2.weight = nn.Parameter(torch.from_numpy(w2).float())
d2.bias = nn.Parameter(torch.from_numpy(b2).float())

print(d2.weight)
print(d2.bias)
print()

model = nn.Sequential(d1, nn.Sigmoid(), d2, nn.Sigmoid())

y_pred = model(torch.tensor(x).float())
print(y_pred)
print()

loss = (y_pred - torch.tensor(y).float())
print(loss)
print()

loss = loss.sum()

optim = torch.optim.SGD(model.parameters(), lr=1)
optim.zero_grad()
loss.backward(retain_graph=True)
optim.step()

print(d2.weight)

Parameter containing:
tensor([[-0.0475, -0.3984, -0.8417],
        [ 0.9859,  0.5966, -0.3590]], requires_grad=True)
Parameter containing:
tensor([-0.0054, -0.0054], requires_grad=True)

tensor([[0.3275, 0.7034]], grad_fn=<SigmoidBackward0>)

tensor([[ 0.3275, -0.2966]], grad_fn=<SubBackward0>)

tensor([[0.1048, 0.1996, 0.0864],
        [0.0993, 0.1891, 0.0819]])


In [793]:
d2.weight.grad

tensor([[0.1048, 0.1996, 0.0864],
        [0.0993, 0.1891, 0.0819]])

In [777]:
y_pred

tensor([[0.3200, 0.6961]], grad_fn=<SigmoidBackward0>)