In [30]:
import numpy as np

In [31]:
# Task 1
# Find the roots of square equation by gradient descent
# x ** 2 - 6 * x + 4 = 0


In [32]:
# возвести в квадрат
# посчитать производную
# надо начать движение от начальной точки в направлении антградиента с заданным шагом
# x = x - lr * grad(x)
# всегда ли сойдемся за приемлемое количество шагов?
# важна ли начальная точка?
# как найти второй корень?
# как вляет ЛР?

In [33]:
# Task 2
# Realize forward and backward pass for linear layer with sigmoid activation

In [34]:
def sigmoid(x):
    return 1. / (1 + np.exp(-x))

def sigmoid_backward(da, x):
    sig = sigmoid(x)
    
    return da * sig * (1 - sig)

def relu(x):
    return np.maximum(0., x)

def relu_backward(da, x):
    da = np.array(da, copy = True)
    da[x <= 0] = 0;
    return da;

In [35]:
def mse_loss(t, y):
    return (t - y) ** 2

def d_mse_loss(t, y):
    return 2 * (y - t) 


In [36]:
class LinearLayer:
    def __init__(self, n_inp, n_out, activation='sigmoid'):
        self.w = np.random.randn(n_out, n_inp) * 0.1
        self.b = np.random.randn(n_out, 1) * 0.1
        if activation == 'sigmoid':
            self.activ = sigmoid
        if activation == 'relu':
            self.activ = relu
        elif activation == 'None':
            self.activ = None
        else:
            raise Exception(f'Unknown activation "{activation}"')
        self._clear_state()

    def _clear_state(self):
        self.lin = None
        self.inp = None
        self.d_w = None
        self.d_b = None

    def forward(self, x):
        self.inp = x
        print('====')
        print(self.b)
        self.lin = np.dot(self.w, x) + self.b
        activ = self.activ(self.lin) if self.activ is not None else self.lin

        return activ

    def backward(self, grad): # grad = d L / d z    Dout 
        # grad * dz / d lin
        if self.activ == sigmoid:
            grad_lin = sigmoid_backward(grad, self.lin) 
        elif self.activ == relu:
            grad_lin = relu_backward(grad, self.lin)
        else:
            grad_lin = grad
        # grad_lin * d lin / d w 
        m = self.inp.shape[1]
        self.d_w = np.dot(grad_lin, self.inp.T) / m    # d_in dOut
        # grad_lin * d lin / d b 
        self.d_b = np.sum(grad_lin, axis=1, keepdims=True) / m

        grad = np.dot(self.w.T, grad_lin)

        return grad

# pred = model(x)
# loss = criterion(pred, target)
# grad = d loss / d pred
# model.backward(grad)

In [37]:
from typing import Tuple

class Model:
    def __init__(self, arch: Tuple[Tuple[int, int]], activation):
        self.layers = []
        for i, p in enumerate(arch):
            self.layers.append(
                LinearLayer(p[0], p[1], 
                            activation=activation if i < len(arch)-1 else 'None')
                )
        self._clear_state()
    
    def _clear_state(self):
        for l in self.layers:
            l._clear_state()

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        
        return x

    def backward(self, grad):
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

        return grad 

In [38]:
# Task 3
# Realize SGD Momentum optimizer
# velocity = momentum * velocity - lr * gradient
# w = w + velocity

In [39]:
#для одного слоя
class SGDMomentum:
    def __init__(self, model: LinearLayer, lr=0.001, momentum=0.99):
        self.lr = lr
        self.m = momentum
        self.model = model

        self.vel_w = np.zeros_like(model.w)
        self.vel_b = np.zeros_like(model.b)

    def step(self):
        self.vel_w = self.m * self.vel_w - self.lr * self.model.d_w
        self.vel_b = self.m * self.vel_b - self.lr * self.model.d_b

        self.model.w += self.vel_w
        self.model.b += self.vel_b

        




In [40]:
#для всей модели
class SGDMomentum:
    def __init__(self, model: Model, lr= 0.0001, momentum=0.0):
        self.model = model
        self.lr = lr
        self.m = momentum
        self.vel = [[np.zeros_like(layer.w), 
                     np.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        for i, layer in enumerate(self.model.layers):
            self.vel[i][0] = self.vel[i][0] * self.m - self.lr * layer.d_w 
            self.vel[i][1] = self.vel[i][1] * self.m - self.lr * layer.d_b 
            layer.w += self.vel[i][0]
            layer.b += self.vel[i][1]
    
    def zero_grad(self):
        self.model._clear_state()

In [41]:
# pred = model(x)
# loss = criterion(pred, target)
# grad = d loss / d pred
# model.backward(grad)
# optim.step()

In [42]:
x = np.random.uniform(-2, 2, 20000)
y = x**2 + np.random.randn()*0.1


In [43]:
model = Model(((1, 100), (100, 1)), activation='relu')
optim = SGDMomentum(model)
for e in range(20):
    for i, (val, t) in enumerate(zip(x, y)):
        optim.zero_grad()
        pred = model.forward(np.array([[val]]))
        loss = mse_loss(t, pred)
        grad = d_mse_loss(t, pred)
        model.backward(grad)
        optim.step()
        
    print(e, model.forward([[1]]), model.forward([[2]]), model.forward([[-1]]), model.forward([[-2]]))

[[ 0.06211994]
 [-0.12359309]
 [-0.07508479]
 [-0.08768428]
 [ 0.16933392]
 [ 0.21531807]
 [-0.04161817]
 [-0.05994589]
 [-0.07757003]
 [-0.11696517]
 [-0.05058536]
 [ 0.05584556]
 [-0.07037821]
 [-0.16961345]
 [ 0.0289412 ]
 [-0.12208779]
 [ 0.00578146]
 [-0.04020215]
 [-0.15105295]
 [-0.06817943]
 [ 0.05032674]
 [ 0.0532597 ]
 [ 0.00420154]
 [-0.028298  ]
 [-0.03601022]
 [ 0.004916  ]
 [-0.00715813]
 [-0.01525943]
 [ 0.13443368]
 [ 0.08602232]
 [ 0.13219727]
 [ 0.02413521]
 [ 0.16127646]
 [ 0.24386041]
 [ 0.19836039]
 [ 0.00410931]
 [ 0.01584528]
 [ 0.15566372]
 [ 0.04470795]
 [-0.09236429]
 [-0.19910149]
 [-0.0281225 ]
 [-0.26927175]
 [ 0.07372726]
 [ 0.00363585]
 [ 0.01191357]
 [ 0.17067493]
 [-0.07201614]
 [-0.13720632]
 [ 0.04869287]
 [-0.010361  ]
 [ 0.0442447 ]
 [-0.01181616]
 [ 0.14347441]
 [-0.01493051]
 [-0.1038864 ]
 [-0.0453732 ]
 [-0.02037094]
 [ 0.10838888]
 [-0.01972945]
 [ 0.03202716]
 [-0.0444611 ]
 [ 0.08328998]
 [ 0.05694203]
 [ 0.08953361]
 [-0.11243017]
 [-0.11328