<a href="https://colab.research.google.com/github/oleksandrkuzmychov/DL_2024_Kuzmychov/blob/main/Lab2_DL_Kuzmychov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Backpropagation handmade

In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt

## **Backpropagation realisation class**

In [45]:
class Parameter:
    def __init__(self, value: float, name: str, children=()) -> None:
        self._value = value
        self._name = name
        self._grad = 0.0
        self._backward = lambda: None
        self._prev = set(children)

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def __mul__(self, other: 'Parameter') -> 'Parameter':
        result = Parameter(self._value * other._value, f'{self._name} * {other._name}', (self, other))

        def _backward():
            self._grad += other._value * result._grad
            other._grad += self._value * result._grad
        result._backward = _backward
        return result


    def __add__(self, other: 'Parameter') -> 'Parameter':
        result = Parameter(self._value + other._value, f'[{self._name} + {other._name}]', (self, other))

        def _backward():
            self._grad += 1.0 * result._grad
            other._grad += 1.0 * result._grad
        result._backward = _backward
        return result


    def sigmoid(self) -> 'Parameter':
        # f(x) = 1 / (1 + exp(self._value))
        # f'(x) = f(x) * (1 - f(x))
        val = 1.0 / (1.0 + math.exp(-self._value))
        result = Parameter(val, f"σ({self._name})", (self,))

        def _backward():
            self._grad = result._grad * val * (1 - val)
        result._backward = _backward
        return result


    def backward(self):
       topolog = []
       visited = set()
       def make_topo(i):
           if i not in visited:
               visited.add(i)
               for child in i._prev:
                   make_topo(child)
               topolog.append(i)
       make_topo(self)

       self._grad = 1.0
       for node in reversed(topolog):
           node._backward()


    def relu(self) -> 'Parameter':
        result = Parameter(np.maximum(0, self._value), f'ReLU({self._name})', (self,))

        def _backward():
            cond = 1 if self._value > 0 else 0
            self._grad += cond*result._grad
        result._backward = _backward
        return result


    def gelu(self) -> 'Parameter':
        result = Parameter (self._value*0.5*(1 + np.tanh(np.sqrt(2/np.pi)*(self._value + 0.044715*self._value**3))), f'GeLU({self._name})', (self,))

        def _backward():
            self._grad += result._grad*0.5*(1 + np.tanh(np.sqrt(2/np.pi)*(self._value + 0.044715*self._value**3))) + 0.5*self._value/(np.cosh(np.sqrt(2/np.pi)*(self._value + 0.044715*self._value**3)))**2 * np.sqrt(2/np.pi)*(1 + 3*0.044715*self._value**2)
        result._backward = _backward
        return result

def sgd(weights: list, lr = 0.01):
    for param in weights:
        param._value -= param._grad*lr
    return weights

## **Test (Class work)**

In [3]:
a = Parameter(3.0, 'a')
b = Parameter(2.0, 'b')
c = Parameter(5.0, 'c')
d = Parameter(5.0, 'd')

print(a)
print(b)
print(c)
print(d)

Parameter a = 3.0; dL/d[a] = 0.0
Parameter b = 2.0; dL/d[b] = 0.0
Parameter c = 5.0; dL/d[c] = 0.0
Parameter d = 5.0; dL/d[d] = 0.0


In [4]:
u = a * b
v = u + c
L = v * d

u, v, L

(Parameter a * b = 6.0; dL/d[a * b] = 0.0,
 Parameter [a * b + c] = 11.0; dL/d[[a * b + c]] = 0.0,
 Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 0.0)

In [21]:
L._grad = 1.0
L._backward()

v._name = 'v'
u._name = 'u'

print(L)
print(v)
print(d)

Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 1.0
Parameter v = 11.0; dL/d[v] = 5.0
Parameter d = 5.0; dL/d[d] = 11.0


In [24]:
v._backward()

print(u)
print(c)

Parameter u = 6.0; dL/d[u] = 5.0
Parameter c = 5.0; dL/d[c] = 5.0


In [25]:
u._backward()

print(a)
print(b)

Parameter a = 3.0; dL/d[a] = 10.0
Parameter b = 2.0; dL/d[b] = 15.0


In [26]:
x = Parameter(4.0, 'x')
f = x + x

f._grad = 1.0
f._backward()

print(f)
print(x)

Parameter [x + x] = 8.0; dL/d[[x + x]] = 1.0
Parameter x = 4.0; dL/d[x] = 2.0


In [27]:
x1 = Parameter(3.0, 'x1')
x2 = Parameter(4.0, 'x2')
w1 = Parameter(1.0, 'w1')
w2 = Parameter(2.0, 'w2')

x1w1 = x1 * w1
x2w2 = x2 * w2
xw = x1w1 + x2w2

print(xw)

out = xw.sigmoid()
out._grad = 1
out._backward()

print(out)
print(xw)

Parameter [x1 * w1 + x2 * w2] = 11.0; dL/d[[x1 * w1 + x2 * w2]] = 0.0
Parameter σ([x1 * w1 + x2 * w2]) = 0.999983298578152; dL/d[σ([x1 * w1 + x2 * w2])] = 1
Parameter [x1 * w1 + x2 * w2] = 11.0; dL/d[[x1 * w1 + x2 * w2]] = 1.670114291046157e-05


In [28]:
xw._backward()

print(x1w1)
print(x2w2)

Parameter x1 * w1 = 3.0; dL/d[x1 * w1] = 1.670114291046157e-05
Parameter x2 * w2 = 8.0; dL/d[x2 * w2] = 1.670114291046157e-05


In [29]:
x1w1._backward()
x2w2._backward()

print(x1)
print(w1)
print(x2)
print(w2)

Parameter x1 = 3.0; dL/d[x1] = 1.670114291046157e-05
Parameter w1 = 1.0; dL/d[w1] = 5.010342873138471e-05
Parameter x2 = 4.0; dL/d[x2] = 3.340228582092314e-05
Parameter w2 = 2.0; dL/d[w2] = 6.680457164184628e-05


In [36]:
o = Parameter(2.0, 'o')
k = Parameter(4.0, 'k')
ko = k*o
m = Parameter(5.0, 'm')
res = ko + m
print(res)

out = res.relu()
out._grad = 1.0
out.backward()

print(out)
print(res)
print(m)
print(ko)
print(k)
print(o)

Parameter [k * o + m] = 13.0; dL/d[[k * o + m]] = 0.0
Parameter ReLU([k * o + m]) = 13.0; dL/d[ReLU([k * o + m])] = 1.0
Parameter [k * o + m] = 13.0; dL/d[[k * o + m]] = 1.0
Parameter m = 5.0; dL/d[m] = 1.0
Parameter k * o = 8.0; dL/d[k * o] = 1.0
Parameter k = 4.0; dL/d[k] = 2.0
Parameter o = 2.0; dL/d[o] = 4.0


In [38]:
o._grad = 0.0
k._grad = 0.0
ko._grad = 0.0
m._grad = 0.0

In [39]:
o = Parameter(2.0, 'o')
k = Parameter(4.0, 'k')
ko = k*o
m = Parameter(5.0, 'm')
res = ko + m
print(res)

out = res.gelu()
out._grad = 1.0
out.backward()

print(out)
print(res)
print(m)
print(ko)
print(k)
print(o)

Parameter [k * o + m] = 13.0; dL/d[[k * o + m]] = 0.0
Parameter GeLU([k * o + m]) = 13.0; dL/d[GeLU([k * o + m])] = 1.0
Parameter [k * o + m] = 13.0; dL/d[[k * o + m]] = 1.0
Parameter m = 5.0; dL/d[m] = 1.0
Parameter k * o = 8.0; dL/d[k * o] = 1.0
Parameter k = 4.0; dL/d[k] = 2.0
Parameter o = 2.0; dL/d[o] = 4.0


## **Stochastic Gradient Descent Test (Minimization of the result)**

In [55]:
x1 = Parameter(8.0, 'x1')
x2 = Parameter(5.0, 'x2')
w1 = Parameter(3.0, 'w1')
w2 = Parameter(1.0, 'w2')

x1w1 = x1*w1
x2w2 = x2*w2
xw = x1w1.relu() + x2w2.gelu()
print(xw, "\n")

n_epoch = 10
for n in range(n_epoch):
    x1w1 = x1*w1
    x2w2 = x2*w2
    xw = x1w1.relu() + x2w2.gelu()
    print(f"Epoch {n}: {xw}")
    print(f"Weights: {w1}, {w2}\n" )
    xw.backward()
    Weights = sgd([w1, w2])
    w1 = Weights[0]
    w2 = Weights[1]

Parameter [ReLU(x1 * w1) + GeLU(x2 * w2)] = 28.99999977082038; dL/d[[ReLU(x1 * w1) + GeLU(x2 * w2)]] = 0.0 

Epoch 0: Parameter [ReLU(x1 * w1) + GeLU(x2 * w2)] = 28.99999977082038; dL/d[[ReLU(x1 * w1) + GeLU(x2 * w2)]] = 0.0
Weights: Parameter w1 = 3.0; dL/d[w1] = 0.0, Parameter w2 = 1.0; dL/d[w2] = 0.0

Epoch 1: Parameter [ReLU(x1 * w1) + GeLU(x2 * w2)] = 28.10999845560614; dL/d[[ReLU(x1 * w1) + GeLU(x2 * w2)]] = 0.0
Weights: Parameter w1 = 2.92; dL/d[w1] = 8.0, Parameter w2 = 0.9499999226819006; dL/d[w2] = 5.000007731809938

Epoch 2: Parameter [ReLU(x1 * w1) + GeLU(x2 * w2)] = 26.32997728555525; dL/d[[ReLU(x1 * w1) + GeLU(x2 * w2)]] = 0.0
Weights: Parameter w1 = 2.76; dL/d[w1] = 16.0, Parameter w2 = 0.8499994855723793; dL/d[w2] = 10.000043710952136

Epoch 3: Parameter [ReLU(x1 * w1) + GeLU(x2 * w2)] = 23.65935264848688; dL/d[[ReLU(x1 * w1) + GeLU(x2 * w2)]] = 0.0
Weights: Parameter w1 = 2.5199999999999996; dL/d[w1] = 24.0, Parameter w2 = 0.6999937842877335; dL/d[w2] = 15.000570128464