<a href="https://colab.research.google.com/github/marmurr/Deep-Learning/blob/main/Deep_Learning_Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task:
* Implement automatic backpropagation (hint: topological sort, dfs)

* Implement those activations (2 of them) that you chose in lab 1 (RELu, Softplus)

* Implement gradient descent as a separate function

# Imports

In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt

In [6]:
class Parameter:
    def __init__(self, value: float, name: str, _children=()) -> None:
        self._value = value
        self._name = name

        self._grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)

    def __repr__(self) -> str: #representation (parameter - gradient)
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def __mul__(self, other): #multiplication of parameters
        result = Parameter(
            self._value * other._value,
            f'{self._name} * {other._name}',
            _children=(self, other)
        )

        def _backward():
            self._grad += other._value * result._grad #dL / dself
            other._grad += self._value * result._grad # dL / dother

        result._backward = _backward

        return result

    def __add__(self, other):
        result = Parameter(
            self._value + other._value,
            f'[{self._name} + {other._name}]',
            _children=(self, other)
        )

        def _backward():
            self._grad += 1.0 * result._grad  #dL / dself
            other._grad += 1.0 * result._grad # dL / dother

        result._backward = _backward

        return result

    def sigmoid(self):
        # f(x) = 1 / (1 + exp(self._value))
        # f'(x) = f(x) * (1 - f(x))

        val = 1.0 / (1.0 + math.exp(-self._value))

        result = Parameter(
            val,
            f"σ({self._name})"
        )

        def _backward():
            self._grad = result._grad * val * (1 - val)

        result._backward = _backward

        return result

    def backward(self):
        topo_sort = []
        visited_nodes = set()

        def sort_topo(node):
            if node not in visited_nodes:
                visited_nodes.add(node)
                for child in node._prev:
                    sort_topo(child)
                topo_sort.append(node)
        sort_topo(self)

        self._grad = 1.0
        for node in reversed(topo_sort):
            node._backward()

    def softplus(self):
        val = np.log(1 + np.exp(self._value))
        result = Parameter(
            val,
            f"softplus({self._name})"
        )

        def _backward():
            self._grad += result._grad * (1.0 / (1.0 + np.exp(-self._value)))

        result._backward = _backward

        return result

    def ReLU(self):
        val = np.maximum(0, self._value)
        result = Parameter(
            val,
            f"ReLU({self._name})"
        )

        def _backward():
          self._grad += result._grad * np.where(self._value <= 0, 0, 1)

        result._backward = _backward

        return result

def gd(learning_rate: float, *parameters: Parameter) -> None:
    for parameter in parameters:
        parameter._value -= learning_rate * parameter._grad

# Backward method test

In [7]:
a = Parameter(3.0, 'a')
b = Parameter(2.0, 'b')
c = Parameter(5.0, 'c')
m = Parameter(5.0, 'd')
u = a * b
v = u + c
L = v * m
L.backward()
print(L)
print(v)
print(u)
print(m)
print(c)
print(b)
print(a)

Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 1.0
Parameter [a * b + c] = 11.0; dL/d[[a * b + c]] = 5.0
Parameter a * b = 6.0; dL/d[a * b] = 5.0
Parameter d = 5.0; dL/d[d] = 11.0
Parameter c = 5.0; dL/d[c] = 5.0
Parameter b = 2.0; dL/d[b] = 15.0
Parameter a = 3.0; dL/d[a] = 10.0


In [8]:
x1 = Parameter(3.0, 'x1')
x2 = Parameter(4.0, 'x2')

w1 = Parameter(1.0, 'w1')
w2 = Parameter(2.0, 'w2')

x1w1 = x1 * w1
x2w2 = x2 * w2
xw = x1w1 + x2w2
out = xw.ReLU()
out.backward()
print(out._grad)
print(xw._grad)
print(x2w2._grad)
print(x1w1._grad)
print(x1._grad)
print(w1._grad)
print(x2._grad)
print(w2._grad)

1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0


In [9]:
x1 = Parameter(3.0, 'x1')
x2 = Parameter(4.0, 'x2')

w1 = Parameter(1.0, 'w1')
w2 = Parameter(2.0, 'w2')

x1w1 = x1 * w1
x2w2 = x2 * w2
xw = x1w1 + x2w2
out = xw.softplus()
out.backward()
print(out._grad)
print(xw._grad)
print(x2w2._grad)
print(x1w1._grad)
print(x1._grad)
print(w1._grad)
print(x2._grad)
print(w2._grad)

1.0
0.999983298578152
0.0
0.0
0.0
0.0
0.0
0.0


# Gradient descent test

In [41]:
W = Parameter(0.2, 'W')
b = Parameter(0.5, 'b')
x = Parameter(1, 'x')
learning_rate = 0.001
n_epochs = 10
target = 0.75
for n in range(n_epochs):
    y = (W * x).ReLU() + b
    gd(learning_rate, W, b)
    loss = (y._value - target) ** 2
    y.backward()
    print(f"loss after {n} epochs :{loss}")

loss after 0 epochs :0.0025000000000000044
loss after 1 epochs :0.0025000000000000044
loss after 2 epochs :0.0026009999999999935
loss after 3 epochs :0.0028089999999999934
loss after 4 epochs :0.0031360000000000055
loss after 5 epochs :0.0036000000000000064
loss after 6 epochs :0.004224999999999993
loss after 7 epochs :0.005040999999999993
loss after 8 epochs :0.00608400000000001
loss after 9 epochs :0.007396000000000013


In [42]:
print((W * x).ReLU() + b)

Parameter [ReLU(W * x) + b] = 0.655; dL/d[[ReLU(W * x) + b]] = 0.0
