# Блоки для тестового задания

В jupyter-notebook, чтобы было видно результаты численных проверок градиента

Что нужно реализовать: 
- Embedding (полносвязный линейный слой)
- Sigmoid +
- ReLU +
- Linear +
- Softmax +
- Tanh +
- check_gradient + (сравнение с численным градиентом)
- LSTM

In [1]:
import numpy as np
from scipy.special import logsumexp

## Численный градиент

In [2]:
def check_gradient(func, X, gradient):
    '''
    Computes numerical gradient and compares it with analytcal.
    func: callable, function of which gradient we are interested. Example call: func(X)
    X: np.array of size (n x m)
    gradient: np.array of size (n x m)
    Returns: maximum absolute diviation between numerical gradient and analytical.
    '''
    eps = 10**(-5)
    
    f_grad = np.zeros(gradient.shape)
    
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            X[i, j] += eps            
            f_r = func(X)
            X[i, j] -= 2 * eps  
            f_l = func(X)
            X[i, j] += eps
                    
            f_grad[i, j] = (f_r - f_l) / (2 * eps)            
    
    return np.max(np.abs(gradient - f_grad))

In [3]:
# Проверим корректность работы проверки градиента
np.random.seed(159)
x = np.random.rand(10, 5)
func = lambda x: np.sum(x ** 2)
gradient = 2 * x
check_gradient(func, x, gradient)

1.574186336839034e-10

# Собираем блоки

In [4]:
class Linear:
    def __init__(self, input_size, output_size):
        '''
        Creates weights and biases for linear layer.
        Dimention of inputs is *input_size*, of output: *output_size*.
        '''
        self.W = np.random.randn(input_size, output_size)*0.01
        self.b = np.zeros(output_size)

    def forward(self, X):
        '''
        Passes objects through this layer.
        X is np.array of size (N, input_size).
        Returns output of size (N, output_size).
        '''
        self.X = X
        return X.dot(self.W)+self.b

    def backward(self, dLdy):
        '''
        1. Compute dLdw and dLdx.
        2. Store dLdw for step() call
        3. Return dLdx
        '''
        self.dLdW = self.X.T.dot(dLdy)
        self.dLdb = dLdy.sum(0)
        self.dLdx = dLdy.dot(self.W.T)
        return self.dLdx

    def step(self, learning_rate):
        '''
        1. Apply gradient dLdw to network:
        w <- w - learning_rate*dLdw
        '''
        self.W = self.W - learning_rate * self.dLdW
        self.b = self.b - learning_rate * self.dLdb

In [5]:
np.random.seed(777)

X = np.array([[0, 1], [1, 1]], dtype=float)
Y = np.array([[0], [1]])

l = Linear(2, 1)

def loss(W):
    l.W = W
    res = l.forward(X)
    return np.sum((res - Y)**2)

res = l.forward(X)
dLdy = 2*(res - Y)

l.backward(dLdy)

check_gradient(loss, l.W, l.dLdW)

9.583001059354501e-12

### Нелинейности

In [6]:
class Sigmoid:
    def __init__(self):
        pass
    
    def forward(self, X):
        '''
        Passes objects through this layer.
        X is np.array of size (N, d)
        '''
        self.sigm = 1.0 / (1 + np.exp(-X))
        
        return self.sigm
    
    def backward(self, dLdy):
        '''
        1. Compute dLdx.
        2. Return dLdx
        '''
        
        self.dLdx = dLdy * self.sigm * (1 - self.sigm)
        return self.dLdx
    
    def step(self, learning_rate):
        pass

In [7]:
np.random.seed(777)

X = np.array([[0, 1], [1, 1]], dtype=float)
Y = np.array([[0], [1]])

l = Sigmoid()

def loss(X):
    res = l.forward(X)
    
    return np.sum((res - Y)**2)

res = l.forward(X)
dLdy = 2*(res - Y)

l.backward(dLdy)

check_gradient(loss, X, l.dLdx)

4.783617946202412e-12

In [8]:
class ReLU:
    def __init__(self):
        pass
    
    def forward(self, X):
        '''
        Passes objects through this layer.
        X is np.array of size (N, d)
        '''
        self.mask = np.zeros(X.shape)
        self.mask[X > 0] = 1
        
        return self.mask * X
    
    def backward(self, dLdy):
        '''
        1. Compute dLdx.
        2. Return dLdx
        '''
        self.dLdx = dLdy * self.mask
        return self.dLdx
    
    def step(self, learning_rate):
        pass

In [9]:
np.random.seed(777)

X = np.array([[0, 1], [1, 1]], dtype=float)
Y = np.array([[0], [1]])

l = ReLU()

def loss(X):
    res = l.forward(X)
    
    return np.sum((res - Y)**2)

res = l.forward(X)
dLdy = 2*(res - Y)

l.backward(dLdy)

check_gradient(loss, X, l.dLdx)

5.000000413701855e-06

In [10]:
class LogSoftmax:
    def __init__(self):
        pass
    
    def forward(self, X):
        self.X = X
        
        max_prob = np.max(X, axis=1, keepdims=True)
        p_exp = np.exp(self.X - max_prob)
        loss = self.X - np.log(np.sum(p_exp, axis=1, keepdims=True)) - max_prob
        return loss
    
    def backward(self, dLdy):
        prob = np.exp(self.X)
        prob /= np.sum(prob, axis=1, keepdims=True)
        self.dLdx = dLdy * (1 - prob)
        return self.dLdx
        
    def step(self, learning_rate):
        pass

In [11]:
np.random.seed(777)

X = np.array([[0, 1, 0], [1, 1, 1]], dtype=float)
Y = np.array([[0], [1]])

l = LogSoftmax()

mask = np.zeros(X.shape)
mask[np.arange(X.shape[0]), Y.flatten()] = 1
prob = np.exp(X)
prob /= np.sum(prob, axis=1, keepdims=True)


def loss(X):
    res = l.forward(X)
    
    curr_loss = -np.mean(np.sum(res * mask, axis=1), axis=0)
    
    return curr_loss

res = l.forward(X)
print(res)
dLdy = (- 1 + (1 - mask) / (1 - prob)) / X.shape[0]
print(dLdy)

l.backward(dLdy)
print(l.dLdx)

check_gradient(loss, X, l.dLdx)

[[-1.55144471 -0.55144471 -1.55144471]
 [-1.09861229 -1.09861229 -1.09861229]]
[[-0.5         0.67957046  0.13447071]
 [ 0.25       -0.5         0.25      ]]
[[-0.39402922  0.28805844  0.10597078]
 [ 0.16666667 -0.33333333  0.16666667]]


1.0144218798302518e-11

In [12]:
class Tanh:
    '''
    Tanh(x) = 2 * Sigmoid(2x) - 1
    '''
    
    def __init__(self, a=1):
        self.a = a

    def forward(self, X):
        '''
        Passes objects through this layer.
        X is np.array of size (N, d)
        '''
        self.tanh = 2 / (1 + np.exp(-2 * X)) - 1
        return self.tanh
    
    def backward(self, dLdy):
        '''
        1. Compute dLdx.
        2. Return dLdx
        '''
        self.dLdx = dLdy * (1 - self.tanh ** 2)
        return self.dLdx

    def step(self, learning_rate):
        pass

In [13]:
np.random.seed(777)

X = np.array([[0, 1], [1, 1]], dtype=float)
Y = np.array([[0], [1]])

l = Tanh()

def loss(X):
    res = l.forward(X)
    
    return np.sum((res - Y)**2)

res = l.forward(X)
dLdy = 2*(res - Y)

l.backward(dLdy)

check_gradient(loss, X, l.dLdx)

2.9894808850627896e-11

### LSTM

Реализация не закончена, есть только прямой проход

In [17]:
class LSTM:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.gate_size = 4 * self.hidden_size
        
        self.W_i = np.zeros((self.input_size, self.gate_size))
        
        self.W_h = np.zeros((self.hidden_size, self.gate_size))
        
        self.b = np.zeros(self.gate_size)
        
        
    def forward(self, inputs, hidden):
        
        h, c = hidden
        bs = h.size(0)
        
        seq_length = inputs.size(0)
        out = np.array(seq_length, bs, self.hidden_size)
        
        for i, inp in enumerate(inputs):
            S = (np.matmul(inp, self.W_i) + np.matmul(h, self.W_h) + self.b)
            S = S.view(bs, 4, self.hidden_size)
            
            i_1 = Sigmoid(S[:,0,:])
            f_1 = Sigmoid(S[:,1,:])
            g_1 = Tanh(S[:,2,:])
            o_1 = Sigmoid(S[:,3,:])
            
            c = f_1 * c + i_1 * g_1
            h = o_1 * Tanh(c)
            out[i] = h
        return out, (h, c)