In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Feedforward

In [None]:
# Linear layer implementation
class Linear(object):
    def __init__(self, in_feats, out_feats):
        super(Linear, self).__init__()
        # need to know this
        self.weight = torch.Tensor(in_feats, out_feats) 
        self.bias = torch.Tensor(1, out_feats) 
        # until here
        self.init_params()

    def init_params(self, std=0.1):
        self.weight = std * torch.randn_like(self.weight)
        self.bias = torch.rand_like(self.bias)

    def forward(self, x):
        # and this
        return torch.matmul(x, self.weight) + self.bias

# testing the linear layer
n_samples, in_feats, out_feats = 2, 3, 4
x = torch.randn((n_samples, in_feats))
layer = Linear(in_feats, out_feats)
y = layer.forward(x)
assert(y.shape == torch.Size([n_samples, out_feats]))

# linear layer the pytorch way
torch_layer = nn.Linear(in_feats, out_feats)
torch_y = torch_layer(x)

In [None]:
class ReLU(object):
    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, x):
        # also need to know this
        return torch.clamp(x, min=0)

class Sigmoid(object):
    def __init__(self):
        super(Sigmoid, self).__init__()
    
    def forward(self, x):
        # and this
        return 1.0 / (1.0 + torch.exp(-x))

# usage
y_relu = ReLU().forward(x)
y_sigmoid = Sigmoid().forward(x)

# the pytorch way
torch_relu = nn.ReLU()
torch_y_relu = torch_relu(x)
torch_sigmoid = nn.Sigmoid()
torch_y_sigmoid = torch_sigmoid(x)

In [None]:
class Net(object):
    def __init__(self, layers):
        self.layers = layers
    
    def reset_params(self, std=0.1):
        for layer in self.layers:
            if hasattr(layer, 'init_params'):
                layer.init_params(std=std)
    
    def forward(self, x):
        # need to know this
        for layer in self.layers:
            x = layer.forward(x)
        return x

class TorchNet(nn.Module):
    def __init__(self, in_feats, hidden_dim, out_feats):
        super(TorchNet, self).__init__()
        # and this
        self.layer1 = nn.Linear(in_feats, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, out_feats)
    
    def forward(self, x):
        # and this
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

In [None]:
# how to manually set parameters to a torch layer:
# need to know that
hidden_dim = 5
torch_net = TorchNet(in_feats, hidden_dim, out_feats)
torch_net.layer1.weight = nn.Parameter(torch.Tensor(1))  # torch.Tensor is a placeholder here

In [None]:
def MSELoss(y_true, y_pred):
    # need to know this
    return torch.mean((y_pred - y_true) ** 2)
    # or return (1.0 / len(y_pred)) * torch.sum((y_true - y_pred) ** 2)

In [None]:
# toy training loop for the xor problem
x_xor = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y_xor = torch.tensor([[1, 0], [0, 1], [0, 1], [1, 0]]) # one-hot encoded
in_feats, hidden_dim, out_feats = 2, 2, 2
layers = [Linear(in_feats, hidden_dim),
          ReLU(),
          Linear(hidden_dim, out_feats)]
net = Net(layers)
acc = 0
losses = []
while acc < 1:
    net.reset_params(std=0.5)
    y_pred = net.forward(x_xor)
    losses.append(MSELoss(y_xor, y_pred))
    # calculate accuracy (reverse one-hot with argmax)
    acc = torch.sum(y_pred.argmax(1) == y_xor.argmax(1)) / len(y_pred)
print(y_pred)

tensor([[1.5119, 1.3014],
        [1.0014, 1.0033],
        [1.0416, 1.1037],
        [1.0000, 1.0000]])


# Backpropagation

**Recap**:
$$ y = xw + b $$
```
                        Linear Layer 
                  ______________________
x -------------> |  w, b, (cache the x) |--------> y
                 |        locals        |
<---downstream---|______________________|<--upstream

```
- upstream = $\frac{\partial{L}}{\partial{y}}$
- $\frac{\partial{y}}{\partial{w}}=x$ (cached)
- $\frac{\partial{y}}{\partial{w}}=1$
- locals:
  - $\frac{\partial{L}}{\partial{w}}=\frac{\partial{L}}{\partial{y}}\cdot \frac{\partial{y}}{\partial{w}}=$ upstream $\cdot x$
  - $\frac{\partial{L}}{\partial{b}}=\frac{\partial{L}}{\partial{y}}\cdot \frac{\partial{y}}{\partial{b}}=$ upstream $\cdot 1$
- downstream = $\frac{\partial{L}}{\partial{x}}=\frac{\partial{L}}{\partial{y}}\cdot \frac{\partial{y}}{\partial{x}}=$ upstream $\cdot w$ (the message that needs to be passed upwards according to the backprop algorithm)

In [None]:
class Linear(object):
    def __init__(self, in_feats, out_feats):
        super(Linear, self).__init__()
        self.weight = torch.Tensor(in_feats, out_feats)
        self.bias = torch.Tensor(out_feats)
        self.init_params()
        self.cache = None
        self.weight_grad = None
        self.bias_grad = None
    
    def init_params(self, std=0.1):
        self.weight = std * torch.randn_like(self.weight)
        self.bias = torch.rand_like(self.bias)
    
    def forward(self, x):
        y = torch.matmul(x, self.weight) + self.bias
        # know this caching
        self.cache = x
        return y 
    
    def backward(self, dupstream):
        # and these
        self.weight_grad = torch.matmul(self.cache.T, dupstream)
        self.bias_grad = torch.sum(dupstream, dim=0)
        dx = torch.matmul(dupstream, self.weight.T)
        return dx

In [None]:
class ReLU(object):
    def __init__(self):
        super(ReLU, self).__init__()
        self.cache = None
    
    def forward(self, x):
        y = torch.clamp(x, min=0)
        # know this - cache the activation not the input!
        self.cache = y
        return y
    
    def backward(self, dupstream):
        # know these
        dx = dupstream.clone() # make sure we don't modify the upstream
        dx[self.cache == 0] = 0
        return dx

class Sigmoid(object):
    def __init__(self):
        super(Sigmoid, self).__init__()
        self.cache = None
    
    def forward(self, x):
        y = 1.0 / (1.0 + torch.exp(-x))
        # know this - cache the activation, not the input!
        self.cache = y
        return y
    
    def backward(self, dupstream):
        # know this
        return dupstream * self.cache * (1 - self.cache)

In [None]:
# the torch way
x.requires_grad = True
torch_y = torch_layer(x)
torch_y.backward(dy)

In [None]:
class Net(object):
    def __init__(self, layers):
        self.layers = layers
        self.reset_params()

    def reset_params(self, std=1.):
        for layer in self.layers:
            if hasattr(layer, 'init_params'):
                layer.init_params(std=std)
    
    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, dupstream):
        # know this
        dx = dupstream
        for layer in reversed(self.layers):
            dx = layer.backward(dx)
        return dx
    
    def optimizer_step(self, lr):
        # know this
        for layer in self.layers:
            if hasattr(layer, 'weight'):
                layer.weight -= lr * layer.weight_grad
            if hasattr(layer, 'bias'):
                layer.bias -= lr * layer.bias_grad

In [None]:
def MSELoss(y_true, y_pred):
    loss = torch.mean((y_pred - y_true) ** 2)
    # know this
    grad = 2 * (y_pred - y_true) # dL/dy_pred
    return loss, grad

In [None]:
# revisiting the xor problem and the training loop
x_xor = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y_xor = torch.tensor([[1, 0], [0, 1], [0, 1], [1, 0]]) # one-hot encoded
in_features, hidden_dim, out_features = 2, 10, 2
learning_rate = 1e-2
optim_steps = 100
layers = [Linear(in_features, hidden_dim),
          ReLU(),
          Linear(hidden_dim, out_features)]
net = Net(layers)
losses = []
accs = []
for i in range(optim_steps):
    y_pred = net.forward(x_xor)
    # know from here
    loss, grad = MSELoss(y_xor, y_pred)
    losses.append(loss)
    net.backward(grad)
    net.optimizer_step(learning_rate)
    # to here
    correct = torch.argmax(y_pred, axis=1) == torch.argmax(y_xor, axis=1)
    accs.append(torch.sum(correct)/len(y_pred))

print(y_pred)

tensor([[0.9096, 0.0114],
        [0.1047, 0.9534],
        [0.0541, 0.9737],
        [0.9306, 0.0456]])


In [None]:
def Softmax(z):
    # know this
    e = torch.exp(z)
    return e / torch.sum(e, dim=1, keepdim=True)

In [None]:
# you don't need to know the implementation of this
def CrossEntropyLoss(y_true, y_pred):
    softmax = Softmax(y_pred)
    y_true = torch.argmax(y_true, axis=1)
    n = y_true.shape[0]
    log_likelihood = -torch.log(softmax[torch.arange(n),y_true])
    loss = torch.mean(log_likelihood)
    
    grad = softmax
    softmax[torch.arange(n), y_true] -= 1
    grad /= n
    
    return loss, grad


# CNN

In [39]:
# you only need to know the shapes here and the feedforward part
class Conv2d(object):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.weight = torch.Tensor(out_channels, in_channels, kernel_size, kernel_size)
        self.bias = torch.Tensor(out_channels)

        self.init_params()

    def init_params(self, std=0.7071):  # why 1 / sqrt(2) ?
        self.weight = std * torch.randn_like(self.weight)
        self.bias = torch.rand_like(self.bias)

    def forward(self, x):
        # dims of x: (N, C, H, W)
        x_padded = torch.nn.functional.pad(x, [self.padding] * 4) 
        N, _, H, W = x.shape
        Hp = 1 + (H + 2 * self.padding - self.kernel_size) // self.stride
        Wp = 1 + (W + 2 * self.padding - self.kernel_size) // self.stride
        y = torch.empty((N, self.out_channels, Hp, Wp), dtype=x.dtype, device=x.device)
        # you need to know the part below
        for i in range(Hp):
            for j in range(Wp):
                h_offset = i * self.stride
                w_offset = j * self.stride
                window = x_padded[:, :, h_offset:h_offset+self.kernel_size, w_offset:w_offset+self.kernel_size]
                for k in range(N):
                    y[k, :, i, j] = torch.sum(window[k] * self.weight, dim=(1, 2, 3)) + self.bias
        # until here
        self.cache = x_padded
        return y
    
    def backward(self, dupstream):
        x_padded = self.cache
        dx_padded = torch.zeros_like(x_padded)
        self.weight_grad = torch.zeros_like(self.weight)
        N, _, Hp, Wp = dupstream.shape
        for i in range(Hp):
            for j in range(Wp):
                h_offset = i * self.stride
                w_offset = j * self.stride
                window = x_padded[:, :, h_offset:h_offset+self.kernel_size, w_offset:w_offset+self.kernel_size]
                dwindow = dx_padded[:, :, h_offset:h_offset+self.kernel_size, w_offset:w_offset+self.kernel_size]
                for k in range(N):
                    dwindow[k] += (self.weight * dupstream[k, :, i, j].view(-1, 1, 1, 1)).sum(dim=0)
                    self.weight_grad += window[k].view(1, self.in_channels, self.kernel_size, self.kernel_size) * dupstream[k, :, i, j].view(-1, 1, 1, 1)
        H = x_padded.shape[2] - 2 * self.padding
        W = x_padded.shape[3] - 2 * self.padding
        dx = dx_padded[:, :, self.padding:self.padding+H, self.padding:self.padding+W]
        self.bias_grad = dupstream.sum(dim=(0, 2, 3))
        return dx

In [45]:
class MaxPool2d(object):
    def __init__(self, kernel_size, stride=1, padding=0):
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
    
    def forward(self, x):
        x_padded = torch.nn.functional.pad(x, [self.padding] * 4)
        N, C, H, W = x.shape
        KS = self.kernel_size
        Hp = 1 + (H + 2 * self.padding - KS) // self.stride
        Wp = 1 + (W + 2 * self.padding - KS) // self.stride
        y = torch.empty((N*C, Hp, Wp), dtype=x.dtype, device=x.device)
        # part you should know - begin
        for i in range(Hp):
            for j in range(Wp):
                h_offset = i * self.stride
                w_offset = j * self.stride
                window = x_padded[:, :, h_offset:h_offset+self.kernel_size, w_offset:w_offset+self.kernel_size]
                window = window.reshape(N * C, -1)
                y[:, i, j] = window.max(dim=1).values
        # end
        y = y.reshape(N, C, Hp, Wp)
        self.cache = x
        return y
    
    def backward(self, dupstream):
        x = self.cache
        dx = torch.zeros_like(x)
        N, C, Hp, Wp = dupstream.shape
        for i in range(Hp):
            for j in range(Wp):
                h_offset = i * self.stride
                w_offset = j * self.stride
                window = x[:, :, h_offset:h_offset+self.kernel_size, w_offset:w_offset+self.kernel_size].reshape(N*C, -1)
                indices = window.argmax(dim=1)
                dwindow = torch.zeros_like(window)
                dwindow[torch.arange(N*C), indices] += dupstream[:, :, i, j].view(-1)
                dx[:, :, h_offset:h_offset+self.kernel_size, w_offset:w_offset+self.kernel_size] += dwindow.reshape(N, C, self.kernel_size, self.kernel_size)

        return dx

In [None]:
# convolutional layers with dimensions
n_samples, height, width, in_channels = 1, 16, 16, 3
hidden_channels, out_features = [5, 6], 2

layers = [
    Conv2d(in_channels, hidden_channels[0], kernel_size=3, padding=1),
    ReLU(),
    Conv2d(hidden_channels[0], hidden_channels[1], kernel_size=5, padding=2),
    ReLU(),
    Linear(16*16*hidden_channels[1], out_features)  # !

]

# with torch
class TorchCNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_feats):
        super(TorchCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, hidden_channels[0], kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(hidden_channels[0], hidden_channels[1], kernel_size=5, padding=2)
        self.relu2 = nn.ReLU()
        self.fc = nn.Linear(16 * 16 * hidden_channels[1], out_feats)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = x.flatten(1) # !
        x = self.fc(x)

# Q: how many params?
# A: 140 + 0 + 756 + 0 + 3074 = 3970

# Optimization

In [None]:
# EWMA update
s_prev = 0
s_cur = rho * s_prev + (1 - rho) * y
# EWMA with bias correction
s_cur_bc = s_cur / (1 - rho**(i + 1)) # i+1 !

# GD with momentum
v = rho * v_prev + (1 - rho) * gradient
X = X - learning_rate * v

# RMSProp
r = rho * r_prev + (1 - rho) * gradient**2
X = X - (learning_rate / np.sqrt(r + delta)) * gradient

# Adam
v = rho_v * v_prev + (1 - rho_v) * gradient
v_bc = v / (1 + rho_v**index)
r = rho_r * r_prev + (1 - rho_r) * gradient**2
r_bc = r / (1 - rho_r**index)
X = X - (learning_rate / np.sqrt(r_bc + delta)) * v_bc # do not forget the delta !

# Torch
optimizer = torch.optim.SGD(net.parameters(), lr=5e-2, momentum=0.9)
optimizer = torch.optim.RMSprop(net.parameters(), lr=1e-3)
optimizer = torch.optim.Adam(net.parameters(), lr=5e-4)

In [None]:
# typical training loop (not sure if they want us to know that)
def train(train_loader, net, optimizer, criterion):
    avg_loss = 0
    correct = 0
    total = 0
    for i, data in enumerate(train_loader):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        avg_loss += loss
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    return avg_loss/len(train_loader), 100 * correct / total
        
def test(test_loader, net, criterion):
    avg_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            avg_loss += loss
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return avg_loss/len(test_loader), 100 * correct / total

writer = SummaryWriter()
epochs = 100
net = FCNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=5e-1)
for epoch in tqdm(range(epochs)):  # loop over the dataset multiple times
    train_loss, train_acc = train(train_loader, net, optimizer, criterion)
    test_loss, test_acc = test(test_loader, net, criterion)
    writer.add_scalars("Loss", {'Train': train_loss, 'Test':test_loss}, epoch)            
    writer.add_scalars('Accuracy', {'Train': train_acc, 'Test':test_acc} , epoch)

print('Finished Training')
writer.flush()
writer.close()

# Regularization 

In [None]:
# L2 regularization (L = L_0 + (lambda / 2)*sum(w**2))
l2 = 0
for p in net.parameters():
    l2 += torch.sum(p**2)
loss += 0.5 * wd * l2 # wd is the weight decay (L2 penalty)

# the torch way:
optimizer = torch.optim.SGD(net.parameters(), lr=5e-2, weight_decay=3e-3)

In [None]:
# early stopping
patience = 0
val_acc_best = 0
patience_cnt = 0
for epoch in range(epochs):
    train_loss, train_acc = train(train_loader, net, optimizer, criterion)
    val_loss, val_acc = test(val_loader, net, criterion)
    # this part
    if val_acc > val_acc_best:
        patience_cnt = 0
        val_acc_best = val_acc
    else:
        patience_cnt += 1
        if patience_cnt == patience:
            break

In [None]:
# dropout
self.do = nn.Dropout(p=0.4) # 40% neurons activated (the rest are set to zero)

# six fully-connected layers with residual connection and dropout layers
self.fc1 = nn.Linear(40, 500)
...
self.fc6 = nn.Linear(500, 10)

h = F.relu(self.fc1(x))
h = h + F.relu(self.fc2(h))
h = self.do1(h)
h = h + F.relu(self.fc3(h))
h = h + F.relu(self.fc4(h))
h = self.do2(h)
h = h + F.relu(self.fc5(h))

# Recurrent

In [None]:
# Elman (vanilla) RNN
class VanillaRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(VanillaRNN, self).__init__()
        self.hidden_size = hidden_size
        self.weight_xh = None
        self.weight_hh = None
        self.bias_xh = None
        self.bias_hh = None

        # need to know this
        self.weight_xh = nn.Parameter(torch.Tensor(input_size, hidden_size)) # input_size = D
        self.weight_hh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.bias_xh = nn.Parameter(torch.Tensor(hidden_size))
        self.bias_hh = nn.Parameter(torch.Tensor(hidden_size))
        # until here

        self.reset_params()

    def reset_params(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        self.weight_xh.data.uniform_(-std, std)
        self.weight_hh.data.uniform_(-std, std)
        self.bias_xh.data.uniform_(-std, std)
        self.bias_hh.data.uniform_(-std, std)

    def forward(self, x):
        # X (N, T, D) = (samples, timestep, input size)
        x = x.transpose(0, 1) # (T, N, D) after transposing
        T, N = x.shape[0], x.shape[1]
        h0 = torch.zeros(N, self.hidden_size, device=x.device)
        y = []

        # need to know this
        # ht​ = tanh(W_xh * ​xt ​+ b_xh ​+ W_hh * ​h(t−1) ​+ b_hh​)
        ht_1 = h0
        for t in range(T):
            xh = torch.addmm(self.bias_xh, x[t], self.weight_xh)
            hh = torch.addmm(self.bias_hh, ht_1, self.weight_hh)
            ht = torch.tanh(xh + hh)
            y.append(ht)
            ht_1 = ht
        # until here

        y = torch.stack(y)
        y = y.transpose(0, 1) # (T, N, H)
        # y (N, T, H) after transposing
        return y

In [None]:
# GRU
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.weight_xh = None
        self.weight_hh = None
        self.bias_xh = None
        self.bias_hh = None

        # need to know the dimensions
        self.weight_xh = nn.Parameter(torch.Tensor(input_size, 3 * hidden_size)) # r, z, n concatenated
        self.weight_hh = nn.Parameter(torch.Tensor(hidden_size, 3 * hidden_size))
        self.bias_xh = nn.Parameter(torch.Tensor(3 * hidden_size))
        self.bias_hh = nn.Parameter(torch.Tensor(3 * hidden_size))

        self.reset_params()

    def reset_params(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        self.weight_xh.data.uniform_(-std, std)
        self.weight_hh.data.uniform_(-std, std)
        self.bias_xh.data.uniform_(-std, std)
        self.bias_hh.data.uniform_(-std, std)

    def forward(self, x):
        x = x.transpose(0, 1)
        T, N, H = x.shape[0], x.shape[1], self.hidden_size
        h0 = torch.zeros(N, H, device=x.device)
        y = []

        # part that you should know below
        # rt​ = σ(W_xr * ​xt​ + b_xr​ + W_hr * ​h(t−1) ​+ b_hr​)
        # zt​ = σ(W_xz * ​xt​ + b_xz​ + W_hz * ​h(t−1) ​+ b_hz​)
        # nt​ = tanh(W_xn * ​xt ​+ b_xn ​+ rt​(W_hn * ​h(t−1) ​+ b_hn​))
        # ht​ = (1 − zt​) ⊙ nt ​+ zt​ ⊙ h(t−1)​
        ht_1 = h0
        for t in range(T):
            xh = torch.addmm(self.bias_xh, x[t], self.weight_xh) 
            hh = torch.addmm(self.bias_hh, ht_1, self.weight_hh)
            rt = torch.sigmoid(xh[:, 0:H] + hh[:, 0:H])
            zt = torch.sigmoid(xh[:, H:2*H] + hh[:, H:2*H])
            nt = torch.tanh(xh[:, 2*H:3*H] + rt*hh[:, 2*H:3*H])
            ht = (1 - zt) * nt + zt * ht_1
            y.append(ht)
            ht_1 = ht
        # until here

        y = torch.stack(y)
        y = y.transpose(0, 1)
        return y

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.weight_xh = None
        self.weight_hh = None
        self.bias_xh = None
        self.bias_hh = None

        # need to know this
        self.weight_xh = nn.Parameter(torch.Tensor(input_size, 4*hidden_size))
        self.weight_hh = nn.Parameter(torch.Tensor(hidden_size, 4*hidden_size))
        self.bias_xh = nn.Parameter(torch.Tensor(4*hidden_size))
        self.bias_hh = nn.Parameter(torch.Tensor(4*hidden_size))

        self.reset_params()

    def reset_params(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        self.weight_xh.data.uniform_(-std, std)
        self.weight_hh.data.uniform_(-std, std)
        self.bias_xh.data.uniform_(-std, std)
        self.bias_hh.data.uniform_(-std, std)

    def forward(self, x):
        x = x.transpose(0, 1)  # (N, T, D) -> (T, N, D)
        T, N, H = x.shape[0], x.shape[1], self.hidden_size
        h0 = torch.zeros(N, H, device=x.device)
        c0 = torch.zeros(N, H, device=x.device)
        y = []

        # it ​= σ(W_xi * ​xt ​+ b_xi ​+ W_hi * ​h(t−1) ​+ b_hi​)
        # ft​ = σ(W_xf * ​xt ​+ b_xf ​+ W_hf * ​h(t−1) ​+ b_hf​)
        # gt ​= tanh(W_xg * ​xt ​+ b_xg ​+ W_hg * ​h(t−1) ​+ b_hg​)
        # ot ​= σ(W_xo * ​xt ​+ b_xo​ + W_ho ​h(t−1) ​+ b_ho​)
        # ct ​= ft​ ⊙ c(t−1) ​+ it ​⊙ gt​
        # ht ​= ot​ ⊙ tanh(ct​)​
        # ht_1 = h0
        ct_1 = c0
        ht_1 = h0
        for t in range(T):
            xh = torch.addmm(self.bias_xh, x[t], self.weight_xh) 
            hh = torch.addmm(self.bias_hh, ht_1, self.weight_hh)
            it = torch.sigmoid(xh[:, 0:H] + hh[:, 0:H])
            ft = torch.sigmoid(xh[:, H:2*H] + hh[:, H:2*H])
            gt = torch.tanh(xh[:, 2*H:3*H] + hh[:, 2*H:3*H])
            ot = torch.sigmoid(xh[:, 3*H:4*H] + hh[:, 3*H:4*H])
            ct = ft * ct_1 + it * gt
            ht = ot * torch.tanh(ct) # ct! and don't forget the tanh
            y.append(ht)
            ct_1 = ct
            ht_1 = ht

        y = torch.stack(y)
        y = y.transpose(0, 1) # (T, N, H) -> (N, T, H)
        return y

# Attention

In [None]:
# must know this
class BasicSelfAttention(nn.Module):
    def forward(self, x):
        # dimensions of x: (b, t, k) (batch size, sequence length, embedding dimension)
        w_prime = torch.bmm(x, x.transpose(1, 2))
        w = F.softmax(w_prime, dim=2)  # sum over the sequences not over vector!
        y = torch.bmm(w, x)
        return y

In [None]:
class SelfAttention(nn.Module):
    def __init__(self, k):
        super(SelfAttention, self).__init__()
        self.tokeys    = nn.Linear(k, k, bias=False)
        self.toqueries = nn.Linear(k, k, bias=False)
        self.tovalues  = nn.Linear(k, k, bias=False)

    def forward(self, x):
        b, t, k = x.size()
        # you must know this
        queries = self.toqueries(x)
        keys = self.tokeys(x)
        values = self.tovalues(x)
        w_prime = torch.bmm(queries, keys.transpose(1, 2)) # !
        w_prime = w_prime / (k**0.5)
        w = F.softmax(w_prime, dim=2)
        y = torch.bmm(w, values) # !
        # until here
        return y

In [None]:
# wide multi-head attention layer
class MultiHeadAttention(nn.Module):
    def __init__(self, k, heads=8):
        super(MultiHeadAttention, self).__init__()
        self.heads = heads
        self.tokeys    = nn.Linear(k, k * heads, bias=False)
        self.toqueries = nn.Linear(k, k * heads, bias=False)
        self.tovalues  = nn.Linear(k, k * heads, bias=False)
        self.unifyheads = nn.Linear(k * heads, k)
        
    def forward(self, x):
        b, t, k = x.size()
        h = self.heads
        # must know all this
        queries = self.toqueries(x).view(b, t, h, k)
        keys = self.tokeys(x).view(b, t, h, k)
        values = self.tovalues(x).view(b, t, h, k)
        queries = queries.transpose(1, 2).reshape(b * h, t, k)  # fold heads into the batch dim
        keys = keys.transpose(1, 2).reshape(b * h, t, k)
        values = values.transpose(1, 2).reshape(b * h, t, k)
        w_prime = torch.bmm(queries, keys.transpose(1, 2)) / (k ** 0.5) # compute attention weights
        w = F.softmax(w_prime, dim=2)
        y = torch.bmm(w, values).view(b, h, t, k) # apply self-attention to the values
        y = y.transpose(1, 2).reshape(b, t, h * k) # swap h, t back, unify heads
        y = self.unifyheads(y)
        return y

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, k, heads):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(k, heads=heads)
        self.norm1 = nn.LayerNorm(k)
        self.ff = nn.Sequential(
            nn.Linear(k, 4 * k),
            nn.ReLU(),
            nn.Linear(4 * k, k)
        )
        self.norm2 = nn.LayerNorm(k)
    def forward(self, x):
        # need to know this
        att = self.att(x)
        res1 = x + att
        norm1 = self.norm1(res1)
        ff = self.ff(norm1)
        res2 = ff + norm1
        y = self.norm2(res2)
        return y

# Autoencoders

In [None]:
class Encoder(nn.Module):
    def __init__(self, latent_dims, s_img, hdim):
        super(Encoder, self).__init__()
        # know these
        self.linear1 = nn.Linear(s_img * s_img, hdim[0])
        self.linear2 = nn.Linear(hdim[0], hdim[1])
        self.linear3 = nn.Linear(hdim[1], latent_dims)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        # and these
        x = torch.flatten(x, start_dim=1)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x) # no activation!
        return x

class Decoder(nn.Module):
    def __init__(self, latent_dims, s_img, hdim):
        super(Decoder, self).__init__()
        self.linear1 = nn.Linear(latent_dims, hdim[1])
        self.linear2 = nn.Linear(hdim[1], hdim[0])
        self.linear3 = nn.Linear(hdim[0], s_img*s_img)
        self.relu    = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, z):
        # and these
        z = self.relu(self.linear1(z))
        z = self.relu(self.linear2(z))
        z = self.sigmoid(self.linear3(z))
        z = z.reshape((-1, 1, s_img, s_img))
        return z

class Autoencoder(nn.Module):
    def __init__(self, latent_dims, s_img, hdim = [100, 50]):
        super(Autoencoder, self).__init__()
        self.encoder = Encoder(latent_dims, s_img, hdim)
        self.decoder = Decoder(latent_dims, s_img, hdim)

    def forward(self, x):
        # and these
        z = self.encoder(x)
        y = self.decoder(z)
        return y

# when you train this, use MSE loss, not CrossEntropy

In [None]:
class VarEncoder(nn.Module):
    def __init__(self, latent_dims, s_img, hdim):
        super(VarEncoder, self).__init__()
        self.linear1_1 = nn.Linear(s_img*s_img, hdim[0])
        self.linear2_1 = nn.Linear(hdim[0], hdim[1])
        self.linear3_1 = nn.Linear(hdim[1], latent_dims)
        
        self.linear1_2 = nn.Linear(s_img*s_img, hdim[0])
        self.linear2_2 = nn.Linear(hdim[0], hdim[1])
        self.linear3_2 = nn.Linear(hdim[1], latent_dims)
        self.relu    = nn.ReLU()

        self.N = torch.distributions.Normal(0, 1)
        self.N.loc = self.N.loc.to(try_gpu()) # hack to get sampling on the GPU
        self.N.scale = self.N.scale.to(try_gpu())
        self.kl = 0

    # need to know this
    def kull_leib(self, mu, sig): # the KL loss is added to the loss during training
        return (sig**2 + mu**2 - torch.log(sig) - 1/2).sum()

    # and this
    def reparameterize(self, mu, sig):
        return mu + sig * self.N.sample(mu.shape)

    def forward(self, x):
        # and all this
        x = torch.flatten(x, start_dim=1)

        sig = self.relu(self.linear1_1(x))
        sig = self.relu(self.linear2_1(sig))
        sig = self.linear3_1(sig)

        sig = torch.exp(sig)  # <-- sigma needs exp too for some reason?

        mu = self.relu(self.linear1_2(x))
        mu = self.relu(self.linear2_2(mu))
        mu = self.linear3_2(mu)
        # until here
        z = self.reparameterize(mu, sig)
        self.kl = self.kull_leib(mu, sig)
        return z

class VarAutoencoder(nn.Module):
    def __init__(self, latent_dims, s_img, hdim = [100, 50]):
        super(VarAutoencoder, self).__init__()
        self.encoder = VarEncoder(latent_dims, s_img, hdim)
        self.decoder = Decoder(latent_dims, s_img, hdim)

    def forward(self, x):
        z = self.encoder(x)
        y = self.decoder(z)
        return y

# Other notes

## Normalization

Suppose we have:
```
X = torch.Tensor([
    [ [[2, 3]], [[5, 7]], [[11, 13]], [[17, 19]] ],
    [ [[0, 1]], [[1, 2]], [[3, 5]], [[8, 13]] ],
    [ [[1, 2]], [[3, 4]], [[5, 6]], [[7, 8]] ]
])
```

The shape of X is (3, 4, 1, 2) (essentially Batch, Channel, Height, Width)

Normalization: $x_i := \frac{x_i - m_i}{\sqrt{s^2 + e}}$, where e is a small value to prevent division by zero)

Batch Normalization: (per channel)
$m_i = mean(2, 3, 0, 1, 1, 2)$
$s_i^2 = var(2, 3, 0, 1, 1, 2)$

Layer Normalization: (per batch) (independent of batch size, and of other features in a batch)
$m_i = mean(2, 3, 5, 7, 11, 13, 17, 19)$
$s_i^2 = var(2, 3, 5, 7, 11, 13, 17, 19)$

Instance Normalization: (per feature/height-width) (like Batch-Norm w/ batch size 1 - not really used)
$m_i = mean(2, 3)$
$s_i^2 = var(2, 3)$

Group Normalization: (like Layer Norm but with groups)
$m_i = mean(2, 3, 5, 7)$
$s_i^2 = var(2, 3, 5, 7)$

## How to avoid vanishing/exploding gradients

- Use ReLu-like activation functions (ReLU, Leaky-ReLu, Randomized ReLu etc)
- Use Batch Normalization (BN) (or GN, especially when using sigmoid or tanh)
- Add residual connections ("highways" for gradients to flow backwards unchanged)
- Try a smaller learning rate
- Use proper weight initialization
- You can try gradient clipping