In [1]:
import sys
sys.path.append('../../pyutils')

import numpy as np
import torch

import metrics
import utils

# Convolutional Layer

## Convolution Operation

$X$: input tensor of size $(N, C_X, H_X, W_X)$  
$K$: kernel tensor of size $(C_Y, C_X, H_K, W_K)$  
$Y$: output tensor of size $(N, C_Y, H_Y, W_Y)$

$S_H$: Stride on the y-axis.  
$S_W$: Stride on the x-axis.

$$Y = \text{Conv2D}(X, K, S_H, S_W)$$

$$H_Y = \lfloor \frac{H_X - H_K}{S_H} \rfloor + 1$$  
$$W_Y = \lfloor \frac{W_X - W_K}{S_W} \rfloor + 1$$

$$Y_{n,f,y,x} = \sum_{c=0}^{C_X-1} \sum_{i=0}^{H_K-1} \sum_{j=0}^{W_K-1} X_{n,c,y*S_H+i,x*S_W+j} K_{f,c,i,j} $$

In [2]:
def compute_val(X, K, n, f, y, x):
    """Return a single output element of the cross-correlation of X with K.

    Sums X[n, c, y+i, x+j] * K[f, c, i, j] over every channel c and every
    kernel offset (i, j) of filter f. (y, x) is the top-left corner of the
    window inside image n of X.
    """
    total = 0
    # np.ndindex walks (c, i, j) in row-major order, i.e. channels outermost,
    # kernel columns innermost.
    for c, i, j in np.ndindex(*K.shape[1:]):
        total += X[n, c, y + i, x + j] * K[f, c, i, j]
    return total

def conv2d(X, K, sh, sw):
    """Strided 2D cross-correlation without padding (naive reference version).

    Args:
        X: input array of shape (N, C_X, H_X, W_X).
        K: kernel array of shape (C_Y, C_X, H_K, W_K).
        sh: stride along the height axis (integer >= 1).
        sw: stride along the width axis (integer >= 1).

    Returns:
        float64 array of shape (N, C_Y, H_Y, W_Y), with
        H_Y = floor((H_X - H_K) / sh) + 1 and
        W_Y = floor((W_X - W_K) / sw) + 1.

    Raises:
        ValueError: if a stride is smaller than 1, if X and K disagree on the
            number of input channels, or if the kernel is larger than the
            input along height or width.
    """
    if sh < 1 or sw < 1:
        raise ValueError("strides must be >= 1, got ({}, {})".format(sh, sw))
    # Without this check extra channels of X would be silently ignored.
    if X.shape[1] != K.shape[1]:
        raise ValueError(
            "channel mismatch: X has {} channels, K expects {}".format(
                X.shape[1], K.shape[1]))
    if K.shape[2] > X.shape[2] or K.shape[3] > X.shape[3]:
        raise ValueError("kernel is larger than the input")

    # Integer floor division implements the floor of the size formula exactly.
    h_y = (X.shape[2] - K.shape[2]) // sh + 1
    w_y = (X.shape[3] - K.shape[3]) // sw + 1

    Y = np.empty((X.shape[0], K.shape[0], h_y, w_y))

    for n, f, y, x in np.ndindex(*Y.shape):
        # Output position (y, x) reads the window starting at (y*sh, x*sw).
        Y[n, f, y, x] = compute_val(X, K, n, f, y * sh, x * sw)

    return Y

In [3]:
# Test: stride 1 on both axes.
# Compares the NumPy conv2d defined above with torch.nn.functional.conv2d
# on random inputs.

X = np.random.randn(2, 3, 17, 23)
K = np.random.randn(4, 3, 5, 8)
Y = conv2d(X, K, 1, 1)
# torch.Tensor(...) builds a float32 tensor by default, so the reference is
# computed in single precision while Y is float64.
tX = torch.Tensor(X)
tK = torch.Tensor(K)
tY = torch.nn.functional.conv2d(tX, tK, stride=(1, 1))

# Shapes must match; metrics.tdist is presumably a distance between the two
# arrays (project helper, not shown here) and should be close to 0.
print(Y.shape)
print(tY.shape)
print(metrics.tdist(tY.data.numpy(), Y))

# First ten values of each result for a visual comparison.
print(Y.ravel()[:10])
print(tY.data.numpy().ravel()[:10])

(2, 4, 13, 16)
torch.Size([2, 4, 13, 16])
8.76177517283893e-05
[12.27027611 -1.85225329 -5.01949694 -8.9430606   8.00193804 21.22102025
 -9.40190511 11.04296152 -0.60304749  5.9764414 ]
[12.270277  -1.8522543 -5.019496  -8.943059   8.001938  21.221014
 -9.401903  11.04296   -0.6030485  5.9764423]


In [4]:
# Test: strides (3, 4) that tile the input exactly
# ((17 - 5) is a multiple of 3 and (24 - 8) is a multiple of 4).

X = np.random.randn(2, 3, 17, 24)
K = np.random.randn(4, 3, 5, 8)
Y = conv2d(X, K, 3, 4)
tX = torch.Tensor(X)
tK = torch.Tensor(K)
tY = torch.nn.functional.conv2d(tX, tK, stride=(3, 4))

# Same checks as for stride 1: shapes, distance, first values.
print(Y.shape)
print(tY.shape)
print(metrics.tdist(tY.data.numpy(), Y))

print(Y.ravel()[:10])
print(tY.data.numpy().ravel()[:10])

(2, 4, 5, 5)
torch.Size([2, 4, 5, 5])
3.068909616939018e-05
[-36.17271351  16.92498969   7.10963938  12.88219436  -8.32642641
  12.03880214  15.00547853  -7.8010264   25.09452249   0.91864666]
[-36.17271     16.924992     7.109639    12.882193    -8.326427
  12.0388      15.005477    -7.8010254   25.094528     0.91864663]


In [5]:
# Test: strides (3, 4) that do NOT tile the input exactly
# ((19 - 5) is not a multiple of 3, (26 - 8) is not a multiple of 4), so the
# floor in the output-size formula matters.

X = np.random.randn(2, 3, 19, 26)
K = np.random.randn(4, 3, 5, 8)
Y = conv2d(X, K, 3, 4)
tX = torch.Tensor(X)
tK = torch.Tensor(K)
tY = torch.nn.functional.conv2d(tX, tK, stride=(3, 4))

# Same checks as above: shapes, distance, first values.
print(Y.shape)
print(tY.shape)
print(metrics.tdist(tY.data.numpy(), Y))

print(Y.ravel()[:10])
print(tY.data.numpy().ravel()[:10])

(2, 4, 5, 5)
torch.Size([2, 4, 5, 5])
2.944308609853652e-05
[ 5.87410606 -2.94072337 -5.19944722  3.12111597  6.34410141 -7.72689006
 -0.64374334  4.17813088 16.82332271 -6.28946122]
[ 5.8741064 -2.9407241 -5.1994486  3.121117   6.3440995 -7.7268906
 -0.6437428  4.17813   16.823324  -6.28946  ]


## Padded Convolution

To give the output of the convolution a specific size, the input is usually padded with zeros on both sides of the x and y axes.

$X$: input tensor of size $(N, C_X, H_X, W_X)$  
$K$: kernel tensor of size $(C_Y, C_X, H_K, W_K)$  
$Y$: output tensor of size $(N, C_Y, H_Y, W_Y)$

$S_H$: Stride on the y-axis.  
$S_W$: Stride on the x-axis.  
$P_H$: Padding on the y-axis.  
$P_W$: Padding on the x-axis.

$$Y = \text{Conv2D}(X, K, S_H, S_W, P_H, P_W)$$

$$H_Y = \lfloor \frac{H_X - H_K + 2P_H}{S_H} \rfloor + 1$$
$$W_Y = \lfloor \frac{W_X - W_K + 2P_W}{S_W} \rfloor + 1$$

In [6]:
def pad_tensor(X, ptop, pbot, pleft, pright):
    """Zero-pad the last two axes of a 4D array.

    Adds ptop / pbot rows of zeros above / below and pleft / pright columns
    of zeros to the left / right of every (H, W) plane. Returns a new float64
    array; X itself is left untouched.
    """
    n, c, h, w = X.shape
    padded = np.zeros((n, c, h + ptop + pbot, w + pleft + pright))
    # Location of the original data inside the padded planes.
    rows = slice(ptop, ptop + h)
    cols = slice(pleft, pleft + w)
    padded[:, :, rows, cols] = X
    return padded

def conv2d_padded(X, K, sh, sw, ph, pw):
    """Strided 2D convolution of X with K after symmetric zero padding.

    ph rows of zeros are added above and below, pw columns of zeros to the
    left and right; the padded input is then handed to conv2d with strides
    (sh, sw).
    """
    X_padded = pad_tensor(X, ph, ph, pw, pw)
    return conv2d(X_padded, K, sh, sw)

In [7]:
# Test: padding (0, 0) must give the same result as the unpadded convolution,
# so the PyTorch reference is called without a padding argument.
X = np.random.randn(2, 3, 19, 26)
K = np.random.randn(4, 3, 5, 8)
Y = conv2d_padded(X, K, 3, 4, 0, 0)
tX = torch.Tensor(X)
tK = torch.Tensor(K)
tY = torch.nn.functional.conv2d(tX, tK, stride=(3, 4))

# Shapes, distance (metrics.tdist: project helper, presumably a distance
# between the arrays) and first values.
print(Y.shape)
print(tY.shape)
print(metrics.tdist(tY.data.numpy(), Y))

print(Y.ravel()[:10])
print(tY.data.numpy().ravel()[:10])

(2, 4, 5, 5)
torch.Size([2, 4, 5, 5])
3.297999823558242e-05
[  2.75597412   1.76427656  -1.32040894 -14.56122892  -3.84040831
 -31.07545898  -4.92394088  -5.23159529  11.75772608  22.49719166]
[  2.755973    1.7642767  -1.3204103 -14.561233   -3.8404062 -31.075459
  -4.923938   -5.2315955  11.757724   22.497198 ]


In [8]:
# Test: padding (7, 10) with strides (3, 4), compared with PyTorch using the
# same stride and padding.
X = np.random.randn(2, 3, 19, 26)
K = np.random.randn(4, 3, 5, 8)
Y = conv2d_padded(X, K, 3, 4, 7, 10)
tX = torch.Tensor(X)
tK = torch.Tensor(K)
tY = torch.nn.functional.conv2d(tX, tK, stride=(3, 4), padding=(7, 10))

print(Y.shape)
print(tY.shape)
print(metrics.tdist(tY.data.numpy(), Y))

(2, 4, 10, 10)
torch.Size([2, 4, 10, 10])
3.447534606364203e-05


## Backpropagation

$$Y = \text{tr}(X, d1, d2, d3, d4)$$

Operation to transpose a 4D tensor, such that first dimension becomes $d1$, second becomes $d2$, and so on.  

In the convolution, the last 2 dimensions represent the y and x axes of the kernel, input, and output tensors, so we never need to change them.  
The only transpose needed is a swap of the first 2 axes.  
Let's define $X^T = \text{tr}(X, 1, 0, 2, 3)$.

$$Y = \text{stride0}(X, h, w)$$

Extend the 2 last axes of the tensor $X$ by adding $h$ rows of $0$ between the original rows, and $w$ columns of $0$ between the original columns.

In [9]:
def stride0(X, h, w):
    """Dilate the last two axes of a 4D array with zeros.

    Inserts h rows of zeros between consecutive rows and w columns of zeros
    between consecutive columns of every (H, W) plane. Nothing is added
    around the border.

    Args:
        X: array of shape (A, B, H, W).
        h: number of zero rows inserted between two original rows (>= 0).
        w: number of zero columns inserted between two original columns (>= 0).

    Returns:
        New float64 array of shape
        (A, B, 1 + (h + 1) * (H - 1), 1 + (w + 1) * (W - 1)).
    """
    Y = np.zeros((X.shape[0], X.shape[1],
                  1 + (h + 1) * (X.shape[2] - 1),
                  1 + (w + 1) * (X.shape[3] - 1)))

    # Original element (i, j) lands at (i * (h + 1), j * (w + 1)); a strided
    # slice selects exactly those positions, so no Python loop is needed.
    Y[:, :, ::h + 1, ::w + 1] = X

    return Y

$$Y = \text{rot180}(X)$$

Rotate the 2 last axes of the tensor $X$ by 180 degrees.

In [10]:
def rot180(X):
    """Rotate every (H, W) plane of a 4D array by 180 degrees.

    Element (i, j) of each plane moves to (H - 1 - i, W - 1 - j), i.e. both
    of the last two axes are reversed.

    Args:
        X: array of shape (A, B, H, W).

    Returns:
        New float64 array with the same shape as X.
    """
    Y = np.empty(X.shape)
    # Reversing both trailing axes is the 180 degree rotation; copying into
    # Y keeps the result an independent float64 array.
    Y[...] = X[:, :, ::-1, ::-1]
    return Y

$$\frac{\partial E}{\partial K} = \text{conv2d}(\text{pad}(X^T, P_H, P_W), \text{stride0}(\nabla_Y E^T, S_H-1, S_W-1))^T $$

In [11]:
def conv2d_padded_dk(X, dY, sh, sw, ph, pw, kh=None, kw=None):
    """Gradient of the padded, strided convolution with respect to the kernel.

    Computes conv2d(pad(X)^T, stride0(dY^T, sh - 1, sw - 1))^T, where ^T swaps
    the first two axes.

    Args:
        X: forward input of shape (N, C_X, H_X, W_X).
        dY: gradient of the loss w.r.t. the convolution output,
            shape (N, C_Y, H_Y, W_Y).
        sh, sw: strides used in the forward pass.
        ph, pw: zero padding used in the forward pass.
        kh, kw: optional kernel height / width. When the stride does not tile
            the padded input exactly, the raw result has more rows / columns
            than the kernel; only the first H_K x W_K entries are kernel
            gradients. Pass kh / kw to crop to that size. With the default
            None nothing is cropped.

    Returns:
        Array of shape (C_Y, C_X, H, W); H x W equals the kernel size when the
        stride tiles the padded input exactly or when kh / kw are given.
    """
    # Channels become the batch axis so that the sum over images n is carried
    # out by the channel reduction of conv2d.
    Xtr = np.transpose(pad_tensor(X, ph, ph, pw, pw), (1, 0, 2, 3))
    f_dY = np.transpose(dY, (1, 0, 2, 3))
    # Dilating dY by the stride lets a stride-1 convolution pair dY[y, x] with
    # input position (y * sh + i, x * sw + j).
    f_dY = stride0(f_dY, sh - 1, sw - 1)
    o_dK = conv2d(Xtr, f_dY, 1, 1)
    dK = np.transpose(o_dK, (1, 0, 2, 3))
    # Slicing with None keeps the full axis, so the default is a no-op.
    return dK[:, :, :kh, :kw]

$$\frac{\partial E}{\partial X} = \text{conv2d}(\text{pad}(\text{stride0}(\nabla_Y E, S_H - 1, S_W - 1), H_K - 1, W_K - 1), \text{rot180}(K^T))_{:,:,P_H:H_X+P_H,P_W:W_X+P_W}$$

In [12]:
def conv2d_padded_dx(K, dY, sh, sw, ph, pw, hx=None, wx=None):
    """Gradient of the padded, strided convolution with respect to its input.

    With the default hx / wx this is also the forward pass of the transposed
    convolution: the output has (H_Y - 1) * sh - 2 * ph + H_K rows and
    (W_Y - 1) * sw - 2 * pw + W_K columns.

    Args:
        K: kernel of shape (C_Y, C_X, H_K, W_K).
        dY: gradient w.r.t. the convolution output (or the input of the
            transposed convolution), shape (N, C_Y, H_Y, W_Y).
        sh, sw: strides of the forward convolution.
        ph, pw: zero padding of the forward convolution.
        hx, wx: optional height / width of the forward input. Needed when the
            stride does not tile the padded input exactly: the trailing rows /
            columns that no window touched have zero gradient and are not
            produced by the convolution below. When given, the result is
            zero-extended as needed and cropped to rows ph:ph+hx and columns
            pw:pw+wx. With the default None, ph rows and pw columns are
            removed from each side of the full result.

    Returns:
        Array of shape (N, C_X, hx, wx).
    """
    k_h, k_w = K.shape[2], K.shape[3]
    # Full correlation: dilate dY by the stride, pad by kernel size - 1.
    pdY = pad_tensor(stride0(dY, sh - 1, sw - 1),
                     k_h - 1, k_h - 1,
                     k_w - 1, k_w - 1)
    K180 = np.transpose(rot180(K), (1, 0, 2, 3))
    dX_full = conv2d(pdY, K180, 1, 1)

    if hx is None:
        hx = dX_full.shape[2] - 2 * ph
    if wx is None:
        wx = dX_full.shape[3] - 2 * pw

    # Rows / columns of the padded input beyond the last window: zero gradient.
    miss_h = max(0, ph + hx - dX_full.shape[2])
    miss_w = max(0, pw + wx - dX_full.shape[3])
    if miss_h or miss_w:
        dX_full = pad_tensor(dX_full, 0, miss_h, 0, miss_w)

    dX = dX_full[:, :, ph:ph + hx, pw:pw + wx]
    return dX

In [13]:
# Gradient check of the padded, strided convolution against PyTorch autograd.
X = np.random.randn(2, 3, 17, 24).astype(np.float32)
K = np.random.randn(4, 3, 5, 8).astype(np.float32)
Y = conv2d_padded(X, K, 3, 4, 6, 8)
Yf = np.reshape(Y, (-1,))
# Scalar loss e = sum(Y ** 2), hence dE/dY = 2 * Y further down.
e = Yf @ Yf


# Same computation in PyTorch; backward() fills tX.grad and tK.grad.
tX = torch.tensor(X, requires_grad=True)
tK = torch.tensor(K, requires_grad=True)
tY = torch.nn.functional.conv2d(tX, tK, stride=(3, 4), padding=(6, 8))
tY = tY.view(-1)
te = torch.dot(tY, tY)
te.backward()

# NumPy gradients, fed with dE/dY = 2 * Y.
dK = conv2d_padded_dk(X, 2 * Y, 3, 4, 6, 8)
dX = conv2d_padded_dx(K, 2 * Y, 3, 4, 6, 8)

dK_sol = tK.grad.data.numpy()
dX_sol = tX.grad.data.numpy()

# Kernel gradient: distance and first values (reference first).
print(metrics.tdist(dK_sol, dK))
print(dK_sol.ravel()[:10])
print(dK.ravel()[:10])

# Input gradient: distance and first values (reference first).
print(metrics.tdist(dX_sol, dX))
print(dX_sol.ravel()[:10])
print(dX.ravel()[:10])

0.0008505403649264535
[   4.2319527 -294.97983    -62.27544    -36.04886    -44.115356
  135.40556     99.77838     74.368034   333.33154   -204.06589  ]
[   4.23194802 -294.97981392  -62.27546192  -36.04880602  -44.11533931
  135.40561936   99.77840562   74.36800223  333.33151138 -204.06588999]
0.0007010260194984067
[-13.444337  -18.262794  125.35315   -45.83321   -22.022026   33.528214
 -25.425465   10.127198   64.602066    2.9770412]
[-13.44432102 -18.26277261 125.35316172 -45.83321508 -22.02201179
  33.52820121 -25.42547419  10.12720413  64.60206918   2.97704919]


## Convolution transpose  (deconvolution)

Also called fractionally-strided convolution, it is an operation to upsample 2D data.  
Its operation is the same as computing the gradient of the convolution with respect to $X$.

$X$: input tensor of size $(N, C_X, H_X, W_X)$  
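$K$: kernel tensor of size $(C_X, C_Y, H_K, W_K)$ (same layout as the kernel of the corresponding convolution, whose output channels are the input channels here)  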
$Y$: output tensor of size $(N, C_Y, H_Y, W_Y)$

$S_H$: Stride on the y-axis.  
$S_W$: Stride on the x-axis.  
$P_H$: Padding on the y-axis (corresponding to padding for conv input).  
$P_W$: Padding on the x-axis (corresponding to padding for conv input).

$$Y = \text{Conv2D_transpose}(X, K, S_H, S_W, P_H, P_W)$$

$$H_Y = (H_X - 1) S_H - 2 * P_H + H_K$$
$$W_Y = (W_X - 1) S_W - 2 * P_W + W_K$$

In [14]:
# Apply the conv-transpose output-size formula to Y, K and X left over from
# the gradient-check cell (strides (3, 4), padding (6, 8)); the result should
# equal the spatial size of X.
HX = (Y.shape[2] - 1) * 3 - 2 * 6 + K.shape[2]
WX = (Y.shape[3] - 1) * 4 - 2 * 8 + K.shape[3]

print(HX, WX)
print(X.shape)

17 24
(2, 3, 17, 24)


Creating a conv transpose with arguments $S_H$, $S_W$, $P_H$ and $P_W$ gives exactly the gradient operation (with respect to the input) of the conv2d with the same arguments.  
The same way, the gradient with respect to the input of the convolution transpose is the actual convolution.

In [15]:
# Transposed convolution check. X2 has the shape of a convolution output and
# K2 the shape of its kernel (Y and K come from an earlier cell).
X2 = np.random.randn(*Y.shape)
K2 = np.random.randn(*K.shape)
# Forward pass of the transposed convolution = input gradient of conv2d.
Y2 = conv2d_padded_dx(K2, X2, 3, 4, 6, 8)

# Loss e = sum(Y2 ** 2), so dE/dY2 = 2 * Y2; the gradients of the transposed
# convolution are obtained from the forward convolution and its kernel
# gradient with the roles of input and output swapped.
Y2f = Y2.reshape(-1)
e = Y2f @ Y2f
dX2 = conv2d_padded(Y2 * 2, K2, 3, 4, 6, 8)
dK2 = conv2d_padded_dk(Y2 * 2, X2, 3, 4, 6, 8)

# PyTorch reference; torch.tensor keeps the float64 dtype of X2 and K2.
tX2 = torch.tensor(X2, requires_grad=True)
tK2 = torch.tensor(K2, requires_grad=True)
tY2 = torch.nn.functional.conv_transpose2d(tX2, tK2, stride=(3, 4), 
                                           padding=(6, 8))
tY2f = tY2.view(-1)
te = torch.dot(tY2f, tY2f)
te.backward()

# Loss values, then distances for output, input gradient and kernel gradient.
print(e)
print(te.data.numpy())
print(metrics.tdist(tY2.data.numpy(), Y2))
print(metrics.tdist(tX2.grad.data.numpy(), dX2))
print(metrics.tdist(tK2.grad.data.numpy(), dK2))

35486.62502238485
35486.62502238486
3.111882376880343e-14
7.669622831608581e-13
1.0476004925131232e-12


## Convolution AddBias

It's possible to add a vector $b$ of size $C_Y$ to the output of the convolution.  
The operation is broadcasted to every position in the input.  

$$Y = X + b, \space X, Y \in \mathbb{R}^{N, C, H, W}, \space b \in \mathbb{R}^C$$ 
$$Y_{n,c,i,j} = X_{n,c,i,j} + b_c$$

$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Y}$$
$$\frac{\partial E}{\partial b_c} = \sum_{n=0}^{N-1} \sum_{i=0}^{H-1} \sum_{j=0}^{W-1} \frac{\partial E}{\partial Y_{n,c,i,j}}$$

In [16]:
def conv2d_bias_add(X, b):
    """Add a per-channel bias to a 4D array laid out as (N, C, H, W).

    b[c] is added to every element X[n, c, i, j]. The channel axis is moved
    to the last position so that NumPy broadcasting lines b up with it, then
    moved back.
    """
    channels_last = np.swapaxes(X, 1, 3)   # (N, W, H, C)
    shifted = channels_last + b            # b broadcasts over the last axis
    return np.swapaxes(shifted, 1, 3)      # back to (N, C, H, W)

# Shape check: one bias value per channel (6 channels), output shape must be
# unchanged.
X = np.random.randn(4, 6, 8, 9)
b = np.random.randn(6)

print(conv2d_bias_add(X, b).shape)

(4, 6, 8, 9)


In [17]:
# Forward and backward check of convolution + bias against PyTorch.
X = np.random.randn(2, 3, 17, 24).astype(np.float32)
K = np.random.randn(4, 3, 5, 8).astype(np.float32)
b = np.random.randn(4).astype(np.float32)
Y = conv2d_padded(X, K, 3, 4, 6, 8)
Y = conv2d_bias_add(Y, b)
Yf = np.reshape(Y, (-1,))
# Scalar loss e = sum(Y ** 2).
e = Yf @ Yf


# PyTorch reference with the bias passed directly to conv2d.
tX = torch.tensor(X, requires_grad=True)
tK = torch.tensor(K, requires_grad=True)
tb = torch.tensor(b, requires_grad=True)
tY = torch.nn.functional.conv2d(tX, tK, bias=tb,
                                stride=(3, 4), padding=(6, 8))
tYf = tY.view(-1)
te = torch.dot(tYf, tYf)
te.backward()

# dE/dY = 2 * Y; the bias gradient sums it over every axis except channels,
# the bias add passes the gradient through unchanged to the convolution.
dY = 2 * Y
db = np.sum(dY, axis=(0, 2, 3))
dK = conv2d_padded_dk(X, dY, 3, 4, 6, 8)
dX = conv2d_padded_dx(K, dY, 3, 4, 6, 8)

dK_sol = tK.grad.data.numpy()
dX_sol = tX.grad.data.numpy()

# Loss values, output distance, then kernel / input / bias gradients.
print(e)
print(te.data.numpy())
print(metrics.tdist(Y, tY.data.numpy()))
print(metrics.tdist(dK_sol, dK))
print(dK_sol.ravel()[:10])
print(dK.ravel()[:10])
print(metrics.tdist(dX_sol, dX))
print(dX_sol.ravel()[:10])
print(dX.ravel()[:10])
print(db)
print(tb.grad.data.numpy())
print(metrics.tdist(tb.grad.data.numpy(), db))

31180.443713126795
31180.445
3.9100816682472546e-05
0.0008785277254920673
[ 129.81822   -22.769493  108.71103   139.54231    51.764565 -167.12312
    2.09487    62.73004   285.674    -295.93918 ]
[ 129.81824269  -22.76947692  108.71101132  139.54227146   51.76462611
 -167.12318224    2.09485557   62.73000509  285.67403302 -295.93914631]
0.0008570048285231473
[ -15.412226   53.548717   28.9291     92.12505   -20.475239  -42.663155
 -104.22132    12.306933   20.25196   -14.48221 ]
[ -15.41222737   53.54872246   28.92910638   92.1250647   -20.47525047
  -42.66316142 -104.22131218   12.30692726   20.25195988  -14.48219793]
[-183.75158732 -159.68335086 -311.66476433  101.68080914]
[-183.7516  -159.6833  -311.6647   101.68082]
7.877045010856777e-05


The same operation can be applied after a convolution transpose

In [18]:
# Transposed convolution + bias check. Y and K come from the previous cell;
# b2 has 3 entries, one per output channel of the transposed convolution.
X2 = np.random.randn(*Y.shape)
K2 = np.random.randn(*K.shape)
b2 = np.random.randn(3)
Y2 = conv2d_padded_dx(K2, X2, 3, 4, 6, 8)
Y2 = conv2d_bias_add(Y2, b2)
Y2f = Y2.reshape(-1)
# Loss e = sum(Y2 ** 2), so dE/dY2 = 2 * Y2.
e = Y2f @ Y2f
dY2 = Y2 * 2
dX2 = conv2d_padded(dY2, K2, 3, 4, 6, 8)
dK2 = conv2d_padded_dk(dY2, X2, 3, 4, 6, 8)
db2 = np.sum(dY2, axis=(0, 2, 3))

# PyTorch reference.
tX2 = torch.tensor(X2, requires_grad=True)
tK2 = torch.tensor(K2, requires_grad=True)
tb2 = torch.tensor(b2, requires_grad=True)
tY2 = torch.nn.functional.conv_transpose2d(tX2, tK2, bias=tb2, 
                                           stride=(3, 4), 
                                           padding=(6, 8))
tY2f = tY2.view(-1)
te = torch.dot(tY2f, tY2f)
te.backward()

# Loss values, then distances for output and the three gradients.
print(e)
print(te.data.numpy())
print(metrics.tdist(tY2.data.numpy(), Y2))
print(metrics.tdist(tX2.grad.data.numpy(), dX2))
print(metrics.tdist(tK2.grad.data.numpy(), dK2))
print(metrics.tdist(tb2.grad.data.numpy(), db2))

39335.24326852916
39335.24326852916
3.1822902218206984e-14
7.888666479021034e-13
9.046082080092663e-13
2.2737367544323206e-13


## Convolution by matrix multiplication

$X$: input tensor of size $(N, C_X, H_X, W_X)$  
$K$: kernel tensor of size $(C_Y, C_X, H_K, W_K)$  
$Y$: output tensor of size $(N, C_Y, H_Y, W_Y)$

$S_H$: Stride on the y-axis.    
$S_W$: Stride on the x-axis.  

Can be transformed into matrix multiplication: $M_K$ $M_X$.

$M_K$ is just $K$ reshaped to size $(C_Y, C_X * H_K * W_K)$. Each row contains the ordered values for the dot product for each feature map of the kernel.

$M_X$ is of size $(C_X * H_K * W_K, N * H_Y * W_Y)$. Each column contains the ordered values for the dot product with the kernel, for each image and position in the output.  
This matrix contains a lot of duplicates and can be quite big.  

Finally the result is reshaped to a 4D-tensor of size $(C_Y, N, H_Y, W_Y)$, and then transposed to get the same order as $Y$

In [19]:
def x2mat(X, K, sh, sw):
    """Unroll every receptive field of X into one row of a 2D matrix.

    For each image n and output position (i, j), the patch
    X[n, :, i*sh : i*sh + H_K, j*sw : j*sw + W_K] is flattened into a row.
    Only the shape of K is used.

    Returns:
        float64 array of shape (N * H_Y * W_Y, C_X * H_K * W_K); rows are
        ordered by image, then output row, then output column.
    """
    n_img, n_chan = X.shape[0], X.shape[1]
    k_h, k_w = K.shape[2], K.shape[3]
    hy = int((X.shape[2] - k_h) / sh + 1)
    wy = int((X.shape[3] - k_w) / sw + 1)

    patch_len = n_chan * k_h * k_w
    mx = np.empty((n_img, hy, wy, patch_len))

    for n, i, j in np.ndindex(n_img, hy, wy):
        top, left = i * sh, j * sw
        mx[n, i, j] = X[n, :, top:top + k_h, left:left + k_w].ravel()

    # Collapse (image, row, column) into a single row index.
    return mx.reshape(-1, patch_len)

def conv2d_matmul(X, K, sh, sw):
    """Strided 2D convolution expressed as a single matrix product.

    The kernel is flattened to (C_Y, C_X * H_K * W_K), the input patches are
    unrolled by x2mat, and their product is reshaped back to (N, C_Y, H_Y, W_Y).
    """
    n_img, n_filt = X.shape[0], K.shape[0]
    hy = int((X.shape[2] - K.shape[2]) / sh + 1)
    wy = int((X.shape[3] - K.shape[3]) / sw + 1)

    patches = x2mat(X, K, sh, sw)          # one row per (image, y, x)
    weights = K.reshape(n_filt, -1)        # one row per filter
    flat = weights @ patches.T             # (C_Y, N * hy * wy)

    # Columns are ordered (image, y, x): unfold them, then put the image axis
    # first.
    return flat.reshape(n_filt, n_img, hy, wy).transpose(1, 0, 2, 3)

# The matrix-product convolution must agree with the naive conv2d
# (strides (3, 4), no padding).
X = np.random.randn(2, 3, 17, 24)
K = np.random.randn(4, 3, 5, 8)
Y = conv2d(X, K, 3, 4)
Y2 = conv2d_matmul(X, K, 3, 4)

print(metrics.tdist(Y, Y2))

3.8507597872912736e-14


# Pooling layer

## Max Pooling

$X$: input tensor of size $(N, C, H_X, W_X)$  
$Y$: output tensor of size $(N, C, H_Y, W_Y)$

$K_H$: Kernel height.  
$K_W$: Kernel width.  
$S_H$: Stride on the y-axis.    
$S_W$: Stride on the x-axis.

$$H_Y = \lfloor \frac{H_X - K_H}{S_H} \rfloor + 1$$  
$$W_Y = \lfloor \frac{W_X - K_W}{S_W} \rfloor + 1$$

$$Y_{n,c,i,j} = \max(X_{n,c,i*S_H:i*S_H+K_H,j*S_W:j*S_W+K_W})$$
The kernel returns the maximum value in the applied region

In [20]:
def maxpool(X, kh, kw, sh, sw):
    """Max pooling over the last two axes of a 4D array.

    Each output element is the maximum of a kh x kw window of one (H, W)
    plane of X; windows start every sh rows and sw columns.

    Returns:
        float64 array of shape (N, C, H_Y, W_Y).
    """
    n_img, n_chan, h_in, w_in = X.shape
    hy = int((h_in - kh) / sh) + 1
    wy = int((w_in - kw) / sw) + 1

    Y = np.empty((n_img, n_chan, hy, wy))

    for n, c, i, j in np.ndindex(*Y.shape):
        top, left = i * sh, j * sw
        Y[n, c, i, j] = X[n, c, top:top + kh, left:left + kw].max()
    return Y

In [21]:
# Forward check of maxpool against torch.nn.functional.max_pool2d with a
# 2 x 3 window and strides (4, 5).
X = np.random.randn(2, 3, 18, 17).astype(np.float32)
Y = maxpool(X, 2, 3, 4, 5)

tX = torch.tensor(X, requires_grad=True)
tY = torch.nn.functional.max_pool2d(tX, (2, 3), (4, 5))

print(tY.shape)
print(Y.shape)
print(metrics.tdist(Y, tY.data.numpy()))

torch.Size([2, 3, 5, 3])
(2, 3, 5, 3)
0.0


$$\frac{\partial E}{\partial X_I} = \sum_{Y_J \in \text{Succ}(X_I)} \frac{\partial E}{\partial Y_J} \mathbb{1}_{X_I = Y_J}$$

In [22]:
def maxpool_dk(X, Y, dout, kh, kw, sh, sw):
    """Gradient of max pooling with respect to its input.

    Despite the ``_dk`` suffix, the result is the gradient w.r.t. X.

    Args:
        X: pooling input of shape (N, C, H_X, W_X).
        Y: pooling output maxpool(X, kh, kw, sh, sw), shape (N, C, H_Y, W_Y).
        dout: gradient of the loss w.r.t. Y, same shape as Y.
        kh, kw: window height / width.
        sh, sw: strides along height / width.

    Returns:
        float64 array with the shape of X. Every input element equal to the
        maximum of a window receives that window's gradient; contributions of
        overlapping windows add up, and tied maxima all receive the gradient.
    """
    dX = np.zeros(X.shape)

    for n, c, i, j in np.ndindex(*Y.shape):
        rows = slice(i * sh, i * sh + kh)
        cols = slice(j * sw, j * sw + kw)
        # Boolean mask of the positions holding the window maximum.
        is_max = X[n, c, rows, cols] == Y[n, c, i, j]
        dX[n, c, rows, cols] += is_max * dout[n, c, i, j]

    return dX

In [23]:
# Gradient check, 2 x 2 windows with stride 2 (windows do not overlap).
X = np.random.randn(1, 1, 6, 6)
Y = maxpool(X, 2, 2, 2, 2)
# Loss is sum(Y ** 2) (see the torch.dot below), hence dout = 2 * Y.
dX = maxpool_dk(X, Y, 2*Y, 2, 2, 2, 2)

tX = torch.tensor(X, requires_grad=True)
tY = torch.nn.functional.max_pool2d(tX, (2, 2), (2, 2))
tYf = tY.view(-1)
te = torch.dot(tYf, tYf)
te.backward()

dX_sol = tX.grad.data.numpy()

print(metrics.tdist(Y, tY.data.numpy()))
print(metrics.tdist(dX, dX_sol))

0.0
0.0


In [24]:
# Gradient check, 2 x 2 windows with stride 1 (windows overlap, so one input
# element can receive gradient from several outputs).
X = np.random.randn(1, 1, 5, 5)
Y = maxpool(X, 2, 2, 1, 1)
dX = maxpool_dk(X, Y, 2*Y, 2, 2, 1, 1)

tX = torch.tensor(X, requires_grad=True)
tY = torch.nn.functional.max_pool2d(tX, (2, 2), (1, 1))
tYf = tY.view(-1)
te = torch.dot(tYf, tYf)
te.backward()

dX_sol = tX.grad.data.numpy()

print(metrics.tdist(Y, tY.data.numpy()))
print(metrics.tdist(dX, dX_sol))

0.0
0.0


In [25]:
# Gradient check with several images and channels, a rectangular 3 x 2 window
# and different strides (2, 1) on the two axes.
X = np.random.randn(2, 3, 9, 7)
Y = maxpool(X, 3, 2, 2, 1)
dX = maxpool_dk(X, Y, 2*Y, 3, 2, 2, 1)

tX = torch.tensor(X, requires_grad=True)
tY = torch.nn.functional.max_pool2d(tX, (3, 2), (2, 1))
tYf = tY.view(-1)
te = torch.dot(tYf, tYf)
te.backward()

dX_sol = tX.grad.data.numpy()

print(metrics.tdist(Y, tY.data.numpy()))
print(metrics.tdist(dX, dX_sol))

0.0
0.0


# Local Response Normalization

$$b^i_{x,y} = a^i_{x,y} \left( k + \alpha \sum_{j=\max(0,i-n/2)}^{\min(N-1,i+n/2)} (a^j_{x,y})^2 \right) ^{-\beta}$$

This operation takes a 4D tensor as input, it is applied right after a 2D Convolution, and helps for generalization.  
It implements a form of lateral inhibition, creating competition for large activities among neurons computed by different kernels.  
$k$, $n$, $\alpha$ and $\beta$ are hyperparameters.

In [26]:
def local_response_norm(n, alpha, beta, k, X):
    """Local response normalization across channels.

    Y[e, i, x, y] = X[e, i, x, y] / (k + alpha * S) ** beta, where S is the
    sum of X[e, j, x, y] ** 2 over the n channels
    j = i - n // 2, ..., i + (n - 1) // 2 that exist (the window is clipped
    at the first and last channel).

    For even n this is the window [i - n/2, i + n/2 - 1]; for odd n it is the
    symmetric window [i - (n-1)/2, i + (n-1)/2].

    Args:
        n: number of neighbouring channels in the window.
        alpha, beta, k: hyperparameters of the normalization.
        X: input array of shape (N, C, H, W).

    Returns:
        float64 array with the same shape as X.
    """
    n_chan = X.shape[1]
    squares = X ** 2

    Y = np.empty(X.shape)
    for i in range(n_chan):
        lo = max(0, i - n // 2)
        # Exclusive upper bound; (n - 1) // 2 channels after i so that the
        # window holds n channels for odd n as well.
        hi = min(n_chan, i + (n - 1) // 2 + 1)
        ss = np.sum(squares[:, lo:hi], axis=1)
        Y[:, i] = X[:, i] / (k + alpha * ss) ** beta

    return Y

def local_response_norm_dX(n, alpha, beta, k, X, dY):
    """Gradient of local response normalization with respect to its input.

    The forward operation is Y_i = X_i * D_i ** (-beta) with
    D_i = k + alpha * sum_{j in W(i)} X_j ** 2, where W(i) is the window of
    channels i - n // 2, ..., i + (n - 1) // 2 clipped to the valid range.
    Differentiating gives

        dX_m = dY_m * D_m ** (-beta)
               - 2 * alpha * beta * X_m
                 * sum_{i : m in W(i)} dY_i * X_i * D_i ** (-beta - 1)

    Args:
        n: number of neighbouring channels in the window.
        alpha, beta, k: hyperparameters of the normalization.
        X: forward input of shape (N, C, H, W).
        dY: gradient of the loss w.r.t. the output, same shape as X.

    Returns:
        Array with the shape of X holding the gradient of the loss w.r.t. X.
    """
    n_chan = X.shape[1]
    squares = X ** 2

    # Window bounds [lo, hi) of every channel.
    bounds = [(max(0, i - n // 2), min(n_chan, i + (n - 1) // 2 + 1))
              for i in range(n_chan)]

    den = np.empty(X.shape)
    for i, (lo, hi) in enumerate(bounds):
        den[:, i] = k + alpha * np.sum(squares[:, lo:hi], axis=1)

    # Direct term: channel m through its own numerator.
    dX = dY * den ** (-beta)

    # Indirect term: channel i's denominator depends on every channel of W(i).
    coupling = dY * X * den ** (-beta - 1)
    for i, (lo, hi) in enumerate(bounds):
        dX[:, lo:hi] -= 2 * alpha * beta * X[:, lo:hi] * coupling[:, i:i + 1]

    return dX
    
    

# Forward check of local_response_norm against torch.nn.LocalResponseNorm
# with a window of 6 channels.
X = np.random.randn(2, 14, 9, 7)
Y = local_response_norm(6, 0.0001, 0.75, 1, X)

tX = torch.tensor(X, requires_grad=True)
# The torch layer is given alpha * 6 (alpha times the window size),
# presumably because torch divides alpha by the window size -- confirm in
# the torch documentation.
lrn = torch.nn.LocalResponseNorm(6, 0.0001 * 6, 0.75, 1)
tY = lrn(tX)
tYf = tY.view(-1)
te = torch.dot(tYf, tYf)
te.backward()
# Reference input gradient; it is not compared with anything in this cell.
dX_sol = tX.grad.data.numpy()

print(metrics.tdist(Y, tY.data.numpy()))

5.087681048627601e-16
