In [1]:
import sys
sys.path.append('../../pyutils')

import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

import metrics

# Matrix multiplication

## Forward pass

$$Z = X Y, \space X \in \mathbb{R}^{m*n}, Y \in \mathbb{R}^{n*p}, Z \in \mathbb{R}^{m*p}$$

## Backward pass

$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z} Y^T$$
$$\frac{\partial E}{\partial Y} = X^T \frac{\partial E}{\partial Z}$$

In [13]:
# Seed NumPy's global RNG so the np.random.randn draws in this cell are reproducible.
np.random.seed(12)

def matmul(x, y):
    """Forward pass of matrix multiplication.

    Args:
        x: left operand.
        y: right operand.

    Returns:
        The product ``x @ y``.
    """
    product = x @ y
    return product

def matmul_dx(x, y, dout):
    """Backward pass of ``z = x @ y`` with respect to ``x``.

    Args:
        x: left operand of the forward pass (unused here).
        y: right operand of the forward pass.
        dout: upstream gradient with respect to ``z``.

    Returns:
        ``dout @ y.T``, the gradient with respect to ``x``.
    """
    y_transposed = y.T
    return dout @ y_transposed

def matmul_dy(x, y, dout):
    """Backward pass of ``z = x @ y`` with respect to ``y``.

    Args:
        x: left operand of the forward pass.
        y: right operand of the forward pass (unused here).
        dout: upstream gradient with respect to ``z``.

    Returns:
        ``x.T @ dout``, the gradient with respect to ``y``.
    """
    x_transposed = x.T
    return x_transposed @ dout

# NumPy side: random float32 operands, forward pass, and hand-written gradients.
x = np.random.randn(4, 6).astype(np.float32)
y = np.random.randn(6, 9).astype(np.float32)
z = matmul(x, y)
loss = np.sum(z**2)
# Gradient of loss = sum(z**2) with respect to z.
dz = 2*z
dx = matmul_dx(x, y, dz)
dy = matmul_dy(x, y, dz)


# PyTorch side: the same computation, with gradients obtained from autograd.
tx = torch.from_numpy(x).requires_grad_(True)
ty = torch.from_numpy(y).requires_grad_(True)
tz = tx @ ty
tloss = torch.sum(tz**2)
tloss.backward()
tdx = tx.grad
tdy = ty.grad

# Compare the NumPy results against PyTorch. `.detach()` is used instead of the
# legacy `.data` accessor to obtain a tensor that can be converted to NumPy.
# NOTE(review): metrics.tdist is a project helper; presumably it returns a
# distance between the two arrays (0 meaning identical) - confirm.
print(metrics.tdist(z, tz.detach().numpy()))
print(metrics.tdist(dx, tdx.detach().numpy()))
print(metrics.tdist(dy, tdy.detach().numpy()))

0.0
0.0
0.0


# Add bias

This operation takes a 3D tensor $X$ and a vector $y$, and adds $y$ to $X$ along the second axis of $X$ (broadcasting over the first and third axes).  
It is a general operation that can implement many kinds of bias addition by reshaping the inputs appropriately.

## Forward pass

$$Z = \text{add_bias}(X, y) \space X, Z \in \mathbb{R}^{n*m*p}, y \in \mathbb{R}^m$$

$$Z_{ijk} = X_{ijk} + y_{j}$$

## Backward pass

$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}$$
$$\frac{\partial E}{\partial y_j} = \sum_{i=1}^n \sum_{k=1}^p \frac{\partial E}{\partial Z_{ijk}}$$

In [38]:
# Seed NumPy's global RNG so the np.random.randn draws in this cell are reproducible.
np.random.seed(12)

def add_bias(x, y):
    """Forward pass of the add-bias operation.

    Args:
        x: input tensor; the bias is aligned with its second axis.
        y: bias values, reshaped to ``(1, -1, 1)`` before the addition.

    Returns:
        ``x`` plus the reshaped bias (NumPy broadcasting applies).
    """
    bias = y.reshape(1, -1, 1)
    return x + bias

def add_bias_dx(x, y, dout):
    """Backward pass of add-bias with respect to ``x``.

    Args:
        x: input of the forward pass (unused here).
        y: bias of the forward pass (unused here).
        dout: upstream gradient with respect to the output.

    Returns:
        ``dout`` itself, unchanged (no copy is made).
    """
    grad_x = dout
    return grad_x

def add_bias_dy(x, y, dout):
    """Backward pass of add-bias with respect to the bias ``y``.

    Args:
        x: input of the forward pass (unused here).
        y: bias of the forward pass (unused here).
        dout: upstream gradient with respect to the output.

    Returns:
        ``dout`` summed over its first and third axes.
    """
    reduced_axes = (0, 2)
    return np.sum(dout, axis=reduced_axes)

# NumPy side: random float32 inputs, forward pass, and hand-written gradients.
x = np.random.randn(4, 6, 3).astype(np.float32)
y = np.random.randn(6).astype(np.float32)
z = add_bias(x, y)
loss = np.sum(z**2)
# Gradient of loss = sum(z**2) with respect to z.
dz = 2*z
dx = add_bias_dx(x, y, dz)
dy = add_bias_dy(x, y, dz)


# PyTorch side: the same computation, with gradients obtained from autograd.
tx = torch.from_numpy(x).requires_grad_(True)
ty = torch.from_numpy(y).requires_grad_(True)
tz = tx + ty.view(1, -1, 1)
tloss = torch.sum(tz**2)
tloss.backward()
tdx = tx.grad
tdy = ty.grad

# Compare the NumPy results against PyTorch. `.detach()` is used instead of the
# legacy `.data` accessor to obtain a tensor that can be converted to NumPy.
# NOTE(review): metrics.tdist is a project helper; presumably it returns a
# distance between the two arrays (0 meaning identical) - confirm.
print(metrics.tdist(z, tz.detach().numpy()))
print(metrics.tdist(dx, tdx.detach().numpy()))
print(metrics.tdist(dy, tdy.detach().numpy()))

0.0
0.0
5.415829e-06


# Linear transformation (FC layer)

Perform a linear transformation with bias, with $p$ input units and $k$ output units.

## Forward pass

$$Y = XW^T + b, \space X \in \mathbb{R}^{N*p}, W \in \mathbb{R}^{k*p}, b \in \mathbb{R}^k, Y \in \mathbb{R}^{N*k}$$


## Backward pass

$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Y} W$$
$$\frac{\partial E}{\partial W} = \left(\frac{\partial E}{\partial Y}\right)^T X$$
$$\frac{\partial E}{\partial b_j} = \sum_{i=1}^N \frac{\partial E}{\partial Y_{ij}}$$

In [59]:
# Seed NumPy's global RNG so the np.random.randn draws in this cell are reproducible.
np.random.seed(12)

def linear(x, w, b):
    """Forward pass of a fully connected (linear) layer.

    Args:
        x: input batch.
        w: weight matrix; it is transposed before the product.
        b: bias, added to the product by broadcasting.

    Returns:
        ``x @ w.T + b``.
    """
    projected = x @ w.T
    return projected + b

def linear_dx(x, w, b, dout):
    """Backward pass of the linear layer with respect to the input ``x``.

    Args:
        x: input of the forward pass (unused here).
        w: weight matrix of the forward pass.
        b: bias of the forward pass (unused here).
        dout: upstream gradient with respect to the layer output.

    Returns:
        ``dout @ w``.
    """
    grad_x = dout @ w
    return grad_x

def linear_dw(x, w, b, dout):
    """Backward pass of the linear layer with respect to the weights ``w``.

    Args:
        x: input of the forward pass.
        w: weight matrix of the forward pass (unused here).
        b: bias of the forward pass (unused here).
        dout: upstream gradient with respect to the layer output.

    Returns:
        ``dout.T @ x``.
    """
    dout_transposed = dout.T
    return dout_transposed @ x

def linear_db(x, w, b, dout):
    """Backward pass of the linear layer with respect to the bias ``b``.

    Args:
        x: input of the forward pass (unused here).
        w: weight matrix of the forward pass (unused here).
        b: bias of the forward pass (unused here).
        dout: upstream gradient with respect to the layer output.

    Returns:
        ``dout`` summed over its first axis.
    """
    grad_b = np.sum(dout, axis=0)
    return grad_b

# NumPy side: random float32 inputs, forward pass, and hand-written gradients.
x = np.random.randn(18, 14).astype(np.float32)
w = np.random.randn(5, 14).astype(np.float32)
b = np.random.randn(5).astype(np.float32)
y = linear(x, w, b)
loss = np.sum(y**2)
# Gradient of loss = sum(y**2) with respect to y.
dy = 2*y
dx = linear_dx(x, w, b, dy)
dw = linear_dw(x, w, b, dy)
db = linear_db(x, w, b, dy)

# PyTorch side: the same layer via F.linear, with gradients obtained from autograd.
tx = torch.from_numpy(x).requires_grad_(True)
tw = torch.from_numpy(w).requires_grad_(True)
tb = torch.from_numpy(b).requires_grad_(True)
ty = F.linear(tx, tw, tb)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad
tdw = tw.grad
tdb = tb.grad

# Compare the NumPy results against PyTorch. `.detach()` is used instead of the
# legacy `.data` accessor to obtain a tensor that can be converted to NumPy.
# NOTE(review): metrics.tdist is a project helper; presumably it returns a
# distance between the two arrays (0 meaning identical) - confirm.
print(metrics.tdist(y, ty.detach().numpy()))
print(metrics.tdist(dx, tdx.detach().numpy()))
print(metrics.tdist(dw, tdw.detach().numpy()))
print(metrics.tdist(db, tdb.detach().numpy()))

0.0
0.0
0.0
0.0


# Sum

## Forward pass

$$y = \sum_{i=1}^n X_i, \space X \in \mathbb{R}^n, y \in \mathbb{R}$$

## Backward pass

$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial y} * \mathbb{1}_n$$

In [66]:
# Seed NumPy's global RNG so the np.random.randn draws in this cell are reproducible.
np.random.seed(12)

def vsum(x):
    """Forward pass of the sum operation.

    Args:
        x: input array.

    Returns:
        The sum of all elements of ``x`` (``np.sum`` with no axis argument).
    """
    total = np.sum(x)
    return total

def vsum_dx(x, dout):
    """Backward pass of the sum operation with respect to ``x``.

    Every element of ``x`` contributes to the sum with weight 1, so the
    gradient is the upstream gradient repeated over the shape of ``x``.

    Args:
        x: input of the forward pass; only its shape and dtype are used.
        dout: upstream gradient with respect to the (scalar) sum.

    Returns:
        An array with the shape of ``x`` where every element equals ``dout``.
    """
    # np.ones_like keeps the dtype of x. np.ones(x.shape) is always float64,
    # which silently promoted a float32 gradient to float64.
    return dout * np.ones_like(x)

# NumPy side: random float32 input, forward pass, and hand-written gradient.
x = np.random.randn(4, 6, 3).astype(np.float32)
y = vsum(x)
loss = np.sum(y**2)
# Gradient of loss = y**2 with respect to the scalar y.
dy = 2*y
dx = vsum_dx(x, dy)

# PyTorch side: the same computation, with the gradient obtained from autograd.
tx = torch.from_numpy(x).requires_grad_(True)
ty = torch.sum(tx)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

# Compare the NumPy results against PyTorch. `.detach()` is used instead of the
# legacy `.data` accessor to obtain a tensor that can be converted to NumPy.
# NOTE(review): metrics.tdist is a project helper; presumably it returns a
# distance between the two arrays (0 meaning identical) - confirm.
print(metrics.tdist(y, ty.detach().numpy()))
print(metrics.tdist(dx, tdx.detach().numpy()))

9.536743e-07
1.6184389828183307e-05
