In [139]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tensorflow as tf

In [140]:
# Synthetic input batch in NCHW layout: B * C * H * W.
# Filled with consecutive integers so every value is easy to trace by hand.
batch = 6
orig_img = np.arange(batch * 5 * 5 * 5).reshape(batch, 5, 5, 5)
batch, C, H, W = orig_img.shape

In [141]:
# original kernels
# n_f * C * F1 * F2
F1 = 1  # Height of kernels
F2 = 2  # Width of kernels
n_f = 5   # Number of kernels
# Draw the weights from a locally seeded generator so that a fresh
# "Restart & Run All" reproduces the same kernels (and therefore the same
# convolution outputs) without touching NumPy's global random state.
orig_kernels = np.random.RandomState(0).randn(n_f, C, F1, F2)
n_f, C, F1, F2 = orig_kernels.shape

In [142]:
# https://stackoverflow.com/a/40840048
# Refer above link for awesome answer on im2col

def im2col(image,kernel_shape,strides=(1,1)):
    """
    Gather every sliding-window patch of a batched image into columns.

    image: array of shape batch * channels * height * width
    kernel_shape: (height, width) of the sliding window
    strides: (stride along height, stride along width)

    returns an array of shape
        batch * (channels * kernel_height * kernel_width) * n_windows
    where n_windows is the number of window positions visited, in
    row-major order.
    """
    n_batch, depth, height, width = image.shape
    k_h, k_w = kernel_shape[0], kernel_shape[1]
    s_h, s_w = strides[0], strides[1]

    # Flat offset of the first element of each image in the batch
    batch_base = (np.arange(n_batch) * depth * height * width).reshape(-1, 1, 1)

    # Flat offsets of the elements of one window inside a single channel
    window = (np.arange(k_h) * width)[:, None] + np.arange(k_w)[None, :]

    # Replicate the window offsets for every channel (depth index)
    channel_base = np.arange(depth) * height * width
    patch_offsets = (channel_base[:, None] + window.ravel()[None, :]).ravel()

    # Flat offsets of the top-left corner of every visited window position
    corner_rows = np.arange(height - k_h + 1)[::s_h]
    corner_cols = np.arange(width - k_w + 1)[::s_w]
    corners = (corner_rows[:, None] * width + corner_cols[None, :]).ravel()

    # Combine the three offset families and index the flattened input
    flat_index = (batch_base +
                  patch_offsets[None, :, None] +
                  corners[None, None, :])

    return np.take(image, flat_index)

In [143]:
def conv_2D(input, kernel, stride=(1,1), padding=(0,0)):
    """
    Performs 2D convolution operation on image with tensorflow style padding.

    input: input image of shape batch * channels * height * width
    kernel: kernel of shape no_of_kernels * channels * height * width
    stride: Stride for height and width
    padding: One of
            - the string 'same' or 'valid' (case sensitive),
            - a pair [[pad_top, pad_bottom], [pad_left, pad_right]],
            - a pair of integers (pad_height, pad_width); each integer is
              applied to both sides of its axis. The default (0, 0) therefore
              means "no padding".

    returns convoluted feature map of shape
            batch * no_of_kernels * out_height * out_width

    raises ValueError if the channel counts of input and kernel differ, or
            if padding is a string other than 'same' / 'valid'.
    """
    def _pad_pair(p):
        # A bare integer pads both sides equally; otherwise expect (before, after).
        if np.ndim(p) == 0:
            return int(p), int(p)
        before, after = p
        return int(before), int(after)

    # Work in float32, as the image is compared against float32 frameworks.
    input = np.float32(input)
    S1, S2 = stride
    N_K, C_k, K1, K2 = kernel.shape
    B, C, H, W = input.shape

    if C_k != C:
        raise ValueError(
            "kernel has %d channels but input has %d channels" % (C_k, C))

    if isinstance(padding, str):
        if padding == "same":
            # Total padding per axis, clamped at zero when the stride is
            # larger than the kernel.
            pad_along_height = max(K1 - (S1 if H % S1 == 0 else H % S1), 0)
            pad_along_width = max(K2 - (S2 if W % S2 == 0 else W % S2), 0)

            # Any odd leftover pixel goes to the bottom / right side.
            pad_top = int(pad_along_height // 2)
            pad_bottom = int(pad_along_height - pad_top)
            pad_left = int(pad_along_width // 2)
            pad_right = int(pad_along_width - pad_left)
        elif padding == "valid":
            pad_top, pad_bottom = (0, 0)
            pad_left, pad_right = (0, 0)
        else:
            raise ValueError(
                "padding must be 'same', 'valid' or explicit pad sizes, got %r"
                % (padding,))
    else:
        # Explicit padding: accepts both [[top, bottom], [left, right]] and
        # the symmetric (pad_height, pad_width) form used by the default.
        pad_top, pad_bottom = _pad_pair(padding[0])
        pad_left, pad_right = _pad_pair(padding[1])

    padded = np.pad(input, [(0,0),(0,0),(pad_top,pad_bottom),(pad_left,pad_right)])
    B, C, H, W = padded.shape

    # Output feature map height and width
    # (builtin int / floor division: the np.int alias was removed from NumPy)
    H_ = int((H - K1) // S1 + 1)
    W_ = int((W - K2) // S2 + 1)

    # im2col: B * (C*K1*K2) * (H_*W_)
    cols = im2col(padded, (K1,K2), stride)

    # One row of flattened weights per kernel: N_K * (C*K1*K2)
    weights = kernel.reshape(N_K, -1)

    # matmul broadcasts the weight matrix over the batch axis,
    # giving B * N_K * (H_*W_) directly.
    feature_map = np.matmul(weights, cols)

    return feature_map.reshape(B, N_K, H_, W_)

In [144]:
# Strided, unpadded convolution using the NumPy implementation above
out = conv_2D(orig_img, orig_kernels, padding='valid', stride=(2, 1))
print(out.shape)
print(out)

(6, 5, 3, 4)
[[[[   84.92851672    81.70750048    78.48648425    75.26546801]
   [   52.71835436    49.49733812    46.27632189    43.05530565]
   [   20.508192      17.28717577    14.06615953    10.8451433 ]]

  [[  197.93237723   198.38250837   198.83263951   199.28277066]
   [  202.43368865   202.88381979   203.33395093   203.78408207]
   [  206.93500006   207.38513121   207.83526235   208.28539349]]

  [[ -327.87244368  -334.03572717  -340.19901066  -346.36229415]
   [ -389.50527858  -395.66856207  -401.83184556  -407.99512905]
   [ -451.13811348  -457.30139697  -463.46468046  -469.62796395]]

  [[ -496.16974039  -499.97438094  -503.77902149  -507.58366204]
   [ -534.21614591  -538.02078646  -541.82542701  -545.63006757]
   [ -572.26255143  -576.06719198  -579.87183254  -583.67647309]]

  [[  444.66445357   448.46409522   452.26373686   456.06337851]
   [  482.66087001   486.46051166   490.2601533    494.05979495]
   [  520.65728645   524.4569281    528.25656974   532.05621139]]]




In [145]:
# Comparing with pytorch conv2d
torch_out = F.conv2d(torch.Tensor(orig_img), torch.Tensor(orig_kernels),stride=(2,1))
print(torch_out.shape)
print(torch_out)

# Verify the match programmatically instead of by eye. PyTorch computes in
# float32 while conv_2D returns float64, hence the loose tolerances.
np.testing.assert_allclose(out, torch_out.numpy(), rtol=1e-4, atol=1e-2)

torch.Size([6, 5, 3, 4])
tensor([[[[   84.9285,    81.7075,    78.4865,    75.2655],
          [   52.7183,    49.4973,    46.2763,    43.0553],
          [   20.5082,    17.2872,    14.0662,    10.8451]],

         [[  197.9324,   198.3825,   198.8327,   199.2828],
          [  202.4337,   202.8838,   203.3340,   203.7841],
          [  206.9350,   207.3851,   207.8353,   208.2854]],

         [[ -327.8725,  -334.0357,  -340.1990,  -346.3623],
          [ -389.5053,  -395.6686,  -401.8318,  -407.9951],
          [ -451.1381,  -457.3014,  -463.4647,  -469.6279]],

         [[ -496.1697,  -499.9744,  -503.7790,  -507.5837],
          [ -534.2161,  -538.0208,  -541.8254,  -545.6301],
          [ -572.2626,  -576.0672,  -579.8718,  -583.6765]],

         [[  444.6645,   448.4641,   452.2638,   456.0634],
          [  482.6609,   486.4605,   490.2602,   494.0598],
          [  520.6573,   524.4569,   528.2566,   532.0562]]],


        [[[ -317.6985,  -320.9195,  -324.1405,  -327.3615],
   

In [146]:
# TensorFlow expects channels-last data, so move the axes:
#   image   NCHW               --> NHWC
#   kernels n_f * C * F1 * F2  --> F1 * F2 * C * n_f
orig_img_ = orig_img.transpose(0, 2, 3, 1).astype(np.float64)
orig_kernels_ = orig_kernels.transpose(2, 3, 1, 0)
type(orig_img_[0, 1, 1, 1])

numpy.float64

In [147]:
tf_out = tf.nn.conv2d(tf.convert_to_tensor(orig_img_),tf.convert_to_tensor(orig_kernels_),(2,1),padding='VALID')
print(tf_out.shape)
print(tf_out)

# Verify the match programmatically instead of by eye: bring the NHWC result
# back to NCHW and compare it with the NumPy implementation's output.
np.testing.assert_allclose(np.transpose(tf_out.numpy(), [0, 3, 1, 2]), out, rtol=1e-6, atol=1e-6)

(6, 3, 4, 5)
tf.Tensor(
[[[[   84.92851672   197.93237723  -327.87244368  -496.16974039
      444.66445357]
   [   81.70750048   198.38250837  -334.03572717  -499.97438094
      448.46409522]
   [   78.48648425   198.83263951  -340.19901066  -503.77902149
      452.26373686]
   [   75.26546801   199.28277066  -346.36229415  -507.58366204
      456.06337851]]

  [[   52.71835436   202.43368865  -389.50527858  -534.21614591
      482.66087001]
   [   49.49733812   202.88381979  -395.66856207  -538.02078646
      486.46051166]
   [   46.27632189   203.33395093  -401.83184556  -541.82542701
      490.2601533 ]
   [   43.05530565   203.78408207  -407.99512905  -545.63006757
      494.05979495]]

  [[   20.508192     206.93500006  -451.13811348  -572.26255143
      520.65728645]
   [   17.28717577   207.38513121  -457.30139697  -576.06719198
      524.4569281 ]
   [   14.06615953   207.83526235  -463.46468046  -579.87183254
      528.25656974]
   [   10.8451433    208.28539349  -469.62796395

In [148]:
def max_pool_2d(input, kernel_size=(2,2), stride=None, padding='valid'):
    """
    Performs max pooling on input feature map.

    input: input of shape B * C * H * W
    kernel_size: height and width of the pooling window
    stride: tuple of two integers, denoting strides along height and width
            of the image; if stride is None, the stride is the same as the
            kernel size.
    padding: 'valid' (no padding) or 'same' (tensorflow style padding)

    returns the pooled feature map of shape B * C * out_height * out_width
            as float32, or None (after printing a message) when padding is
            neither 'valid' nor 'same'.
    """
    if not stride:
        # Stride same as kernel size
        stride = kernel_size

    input = np.float32(input)
    S1, S2 = stride
    K1, K2 = kernel_size
    B, C, H, W = input.shape

    if padding == 'valid':
        # No padding
        pad_top, pad_bottom = (0,0)
        pad_left, pad_right = (0,0)
    elif padding == 'same':
        # Total padding per axis. The builtin max clamps at zero; the former
        # np.max(x, 0) treated 0 as an *axis*, so a stride larger than the
        # kernel produced a negative pad width and np.pad failed.
        pad_along_height = max(K1 - (S1 if H % S1 == 0 else H % S1), 0)
        pad_along_width = max(K2 - (S2 if W % S2 == 0 else W % S2), 0)

        # Any odd leftover pixel goes to the bottom / right side.
        pad_top = int(pad_along_height // 2)
        pad_bottom = int(pad_along_height - pad_top)
        pad_left = int(pad_along_width // 2)
        pad_right = int(pad_along_width - pad_left)
    else:
        print(r'only same and valid padding is possible')
        return

    # Pad with -inf so padded cells can never win the max.
    # (-np.inf instead of np.NINF, which was removed in NumPy 2.0)
    padded = np.pad(input, [(0,0),(0,0),(pad_top,pad_bottom),(pad_left,pad_right)],
                    mode='constant', constant_values=-np.inf)
    B, C, H, W = padded.shape

    # Output feature map height and width
    # (builtin int / floor division: the np.int alias was removed from NumPy)
    H_ = int((H - K1) // S1 + 1)
    W_ = int((W - K2) // S2 + 1)

    # im2col: B * (C*K1*K2) * (H_*W_)
    cols = im2col(padded, kernel_size, stride)

    # Separate the channel axis from the window axis, then reduce each window.
    windows = cols.reshape(B, C, K1*K2, H_*W_)

    return np.max(windows, 2).reshape(B, C, H_, W_)

In [149]:
# 2x2 max pooling with 'same' padding; the result is in NCHW format
max_pool_2d(orig_img, kernel_size=(2, 2), padding="same")

array([[[[  6.,   8.,   9.],
         [ 16.,  18.,  19.],
         [ 21.,  23.,  24.]],

        [[ 31.,  33.,  34.],
         [ 41.,  43.,  44.],
         [ 46.,  48.,  49.]],

        [[ 56.,  58.,  59.],
         [ 66.,  68.,  69.],
         [ 71.,  73.,  74.]],

        [[ 81.,  83.,  84.],
         [ 91.,  93.,  94.],
         [ 96.,  98.,  99.]],

        [[106., 108., 109.],
         [116., 118., 119.],
         [121., 123., 124.]]],


       [[[131., 133., 134.],
         [141., 143., 144.],
         [146., 148., 149.]],

        [[156., 158., 159.],
         [166., 168., 169.],
         [171., 173., 174.]],

        [[181., 183., 184.],
         [191., 193., 194.],
         [196., 198., 199.]],

        [[206., 208., 209.],
         [216., 218., 219.],
         [221., 223., 224.]],

        [[231., 233., 234.],
         [241., 243., 244.],
         [246., 248., 249.]]],


       [[[256., 258., 259.],
         [266., 268., 269.],
         [271., 273., 274.]],

        [[281., 2

In [150]:
# Reference result from Keras. It is returned in NHWC format, so the printed
# layout differs from the NCHW output of max_pool_2d above.
max_pool_layer = tf.keras.layers.MaxPool2D(pool_size=(2, 2), padding='same', dtype='float64')
max_pool_layer(tf.convert_to_tensor(orig_img_))

<tf.Tensor: id=100, shape=(6, 3, 3, 5), dtype=float64, numpy=
array([[[[  6.,  31.,  56.,  81., 106.],
         [  8.,  33.,  58.,  83., 108.],
         [  9.,  34.,  59.,  84., 109.]],

        [[ 16.,  41.,  66.,  91., 116.],
         [ 18.,  43.,  68.,  93., 118.],
         [ 19.,  44.,  69.,  94., 119.]],

        [[ 21.,  46.,  71.,  96., 121.],
         [ 23.,  48.,  73.,  98., 123.],
         [ 24.,  49.,  74.,  99., 124.]]],


       [[[131., 156., 181., 206., 231.],
         [133., 158., 183., 208., 233.],
         [134., 159., 184., 209., 234.]],

        [[141., 166., 191., 216., 241.],
         [143., 168., 193., 218., 243.],
         [144., 169., 194., 219., 244.]],

        [[146., 171., 196., 221., 246.],
         [148., 173., 198., 223., 248.],
         [149., 174., 199., 224., 249.]]],


       [[[256., 281., 306., 331., 356.],
         [258., 283., 308., 333., 358.],
         [259., 284., 309., 334., 359.]],

        [[266., 291., 316., 341., 366.],
         [268., 29