In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tensorflow as tf

In [None]:
# Synthetic input batch, laid out as B * C * H * W.
# Every element holds its own flat index (0 .. 125*batch - 1), which makes
# the results of the operations below easy to verify by hand.
batch = 6
orig_img = np.reshape(np.arange(batch * 125), (batch, 5, 5, 5))
# Read the dimensions back from the array so the names always match the data.
batch, C, H, W = orig_img.shape

In [None]:
# Random filter bank, laid out as n_f * C * F1 * F2.
F1, F2 = 1, 2   # kernel height, kernel width
n_f = 5         # number of kernels
orig_kernels = np.random.randn(n_f, C, F1, F2)
# Read the dimensions back from the array so the names always match the data.
n_f, C, F1, F2 = orig_kernels.shape

In [None]:
# https://stackoverflow.com/a/40840048
# Refer above link for awesome answer on im2col

def im2col(image, kernel_shape, strides=(1, 1)):
    """
    Gather every sliding window of a batched image into columns.

    image: array of shape batch * channels * height * width
    kernel_shape: (window_height, window_width)
    strides: (stride_height, stride_width)

    returns an array of shape
        batch * (channels * window_height * window_width) * n_windows
    Along axis 1 the values are ordered channel first, then window row,
    then window column. Along axis 2 the windows are ordered row-major by
    their top-left corner.
    """
    n_batch, depth, height, width = image.shape
    win_rows, win_cols = kernel_shape[0], kernel_shape[1]
    step_rows, step_cols = strides[0], strides[1]
    plane = height * width  # number of elements in one channel

    # Flat offsets of the cells of one window inside a single channel.
    window = np.arange(win_rows)[:, None] * width + np.arange(win_cols)

    # Repeat that window for every channel: one offset per (channel, cell).
    patch = (np.arange(depth)[:, None] * plane + window.ravel()).ravel()

    # Flat offsets of every admissible top-left corner, thinned by the strides.
    corners = (np.arange(height - win_rows + 1)[:, None] * width
               + np.arange(width - win_cols + 1))
    corners = corners[::step_rows, ::step_cols].ravel()

    # Flat offset of the first element of every sample in the batch.
    sample_start = np.arange(n_batch) * depth * plane

    # Broadcast to (batch, patch, corner) and read from the flattened image.
    flat_idx = (sample_start[:, None, None]
                + patch[None, :, None]
                + corners[None, None, :])
    return np.take(image, flat_idx)

In [None]:
def conv_2D(input, kernel, stride=(1,1), padding=(0,0)):
    """
    Performs 2D convolution operation on image with tensorflow style padding.

    The kernel is applied without flipping (cross-correlation), computed as
    a single matrix product over the im2col patches of the input.

    input: input image of shape batch * channels * height * width
    kernel: kernel of shape no_of_kernels * channels * height * width
    stride: Stride for height and width
    padding: one of
            - the string 'same' or 'valid' (case sensitive)
            - (pad_height, pad_width): integers, applied on both sides
            - [[pad_top, pad_bottom], [pad_left, pad_right]]

    returns convoluted feature map of shape
            batch * no_of_kernels * out_height * out_width

    raises ValueError on an unknown padding string or when the channel
    counts of input and kernel differ.
    """
    input = np.float32(input)
    S1, S2 = stride
    N_K, C_K, K1, K2 = kernel.shape
    B, C, H, W = input.shape

    if C_K != C:
        raise ValueError(
            "kernel has %d channels but input has %d" % (C_K, C))

    if isinstance(padding, str):
        if padding == "same":
            # TensorFlow 'SAME' rule: total padding is K - S when the size is
            # a multiple of the stride, K - (size % S) otherwise, never < 0.
            pad_along_height = max(K1 - (H % S1 or S1), 0)
            pad_along_width = max(K2 - (W % S2 or S2), 0)

            # An odd total puts the extra cell at the bottom / right.
            pad_top = int(pad_along_height // 2)
            pad_bottom = int(pad_along_height - pad_top)
            pad_left = int(pad_along_width // 2)
            pad_right = int(pad_along_width - pad_left)
        elif padding == "valid":
            pad_top = pad_bottom = pad_left = pad_right = 0
        else:
            raise ValueError(
                "padding must be 'same', 'valid' or explicit amounts, got %r"
                % (padding,))
    else:
        # Each axis entry is either one integer (same amount on both sides,
        # which is what the default (0,0) means) or a (before, after) pair.
        pad_h, pad_w = padding[0], padding[1]
        pad_top, pad_bottom = (pad_h, pad_h) if np.ndim(pad_h) == 0 else pad_h
        pad_left, pad_right = (pad_w, pad_w) if np.ndim(pad_w) == 0 else pad_w

    input = np.pad(input, [(0,0),(0,0),(pad_top,pad_bottom),(pad_left,pad_right)])
    B, C, H, W = input.shape

    # Output feature map height and width
    H_ = (H - K1) // S1 + 1
    W_ = (W - K2) // S2 + 1

    # im2col: (B, C*K1*K2, H_*W_), one column per kernel position
    columns = im2col(input, (K1,K2), stride)

    # One row per kernel; matmul broadcasts over the batch axis and yields
    # (B, N_K, H_*W_).
    flat_kernel = kernel.reshape(N_K, C * K1 * K2)
    feature_map = np.matmul(flat_kernel, columns)

    return feature_map.reshape(B, N_K, H_, W_)

In [None]:
# NumPy convolution: stride 2 along height, stride 1 along width, no padding.
out = conv_2D(orig_img, orig_kernels, padding='valid', stride=(2, 1))
print(out.shape, out, sep='\n')

In [None]:
# Reference result from PyTorch's conv2d on the same image, kernels and strides.
torch_out = F.conv2d(
    torch.Tensor(orig_img),
    torch.Tensor(orig_kernels),
    stride=(2, 1),
)
print(torch_out.shape, torch_out, sep='\n')

In [None]:
# Re-order the axes for the TensorFlow comparison:
#   image:   NCHW             --> NHWC
#   kernels: (n_f, C, F1, F2) --> (F1, F2, C, n_f)
orig_img_ = orig_img.transpose(0, 2, 3, 1).astype(np.float64)
orig_kernels_ = orig_kernels.transpose(2, 3, 1, 0)
# Cell output: the element type of the converted image.
type(orig_img_[0, 1, 1, 1])

In [None]:
# Reference result from TensorFlow's conv2d on the NHWC copies of the data.
tf_out = tf.nn.conv2d(
    tf.convert_to_tensor(orig_img_),
    tf.convert_to_tensor(orig_kernels_),
    strides=(2, 1),
    padding='VALID',
)
print(tf_out.shape, tf_out, sep='\n')

In [None]:
def max_pool_2d(input, kernel_size=(2,2), stride=None, padding='valid'):
    """
    Performs max pooling on input feature map.

    input: input of shape B * C * H * W
    kernel_size: height and width of the pooling window
    stride: tuple of two integers, denoting strides along height and width
            of the image; if stride is None, the stride is the kernel size.
    padding: 'valid' (no padding) or 'same' (tensorflow style padding)

    returns the pooled feature map of shape B * C * out_height * out_width.
    Any other padding value prints a message and returns None.

    As a side effect the position of each maximum inside its window
    (flattened window index) is printed.
    """
    if not stride:
        # Stride same as kernel size
        stride = kernel_size

    input = np.float32(input)
    S1, S2 = stride
    K1, K2 = kernel_size
    B, C, H, W = input.shape

    if padding == 'valid':
        # No padding
        pad_top = pad_bottom = pad_left = pad_right = 0
    elif padding == 'same':
        # TensorFlow 'SAME' rule: total padding is K - S when the size is a
        # multiple of the stride, K - (size % S) otherwise. The builtin max
        # clamps at zero; np.max(x, 0) would read the 0 as an axis and let a
        # negative amount through whenever the stride exceeds the kernel.
        pad_along_height = max(K1 - (H % S1 or S1), 0)
        pad_along_width = max(K2 - (W % S2 or S2), 0)

        # An odd total puts the extra cell at the bottom / right.
        pad_top = int(pad_along_height // 2)
        pad_bottom = int(pad_along_height - pad_top)
        pad_left = int(pad_along_width // 2)
        pad_right = int(pad_along_width - pad_left)
    else:
        print(r'only same and valid padding is possible')
        return

    # Pad with -inf so that a padded cell can never be selected as a maximum.
    input = np.pad(input, [(0,0),(0,0),(pad_top,pad_bottom),(pad_left,pad_right)], mode='constant', constant_values=-np.inf)
    B, C, H, W = input.shape

    # Output feature map height and width
    H_ = (H - K1) // S1 + 1
    W_ = (W - K2) // S2 + 1

    # im2col, then separate the channel axis from the window-cell axis:
    # (B, C*K1*K2, H_*W_) -> (B, C, K1*K2, H_*W_)
    windows = im2col(input, kernel_size, stride).reshape(B, C, K1*K2, H_*W_)

    # Show where inside each window the maximum was found.
    print(np.argmax(windows, 2).reshape(B, C, H_, W_))

    return np.max(windows, 2).reshape(B, C, H_, W_)

In [None]:
# 2x2 max pooling with 'same' padding; the result is in NCHW format.
max_pool_2d(orig_img, kernel_size=(2, 2), padding="same")

In [None]:
# Keras reference. It outputs NHWC, so the printed layout differs from the
# NCHW result of max_pool_2d above.
max_pool_layer = tf.keras.layers.MaxPool2D(
    pool_size=(2, 2),
    padding='same',
    dtype='float64',
)
max_pool_layer(tf.convert_to_tensor(orig_img_))

In [68]:
def avg_pool_2d(input, kernel_size=(2,2), stride=None, padding='valid'):
    """
    Performs average pooling on input feature map.

    input: input of shape B * C * H * W
    kernel_size: height and width of the pooling window
    stride: tuple of two integers, denoting strides along height and width
            of the image; if stride is None, the stride is the kernel size.
    padding: 'valid' (no padding) or 'same' (tensorflow style padding)

    returns the pooled feature map of shape B * C * out_height * out_width.
    Padded cells are excluded from the averages. Any other padding value
    prints a message and returns None.
    """
    if not stride:
        # Stride same as kernel size
        stride = kernel_size

    input = np.float32(input)
    S1, S2 = stride
    K1, K2 = kernel_size
    B, C, H, W = input.shape

    if padding == 'valid':
        # No padding
        pad_top = pad_bottom = pad_left = pad_right = 0
    elif padding == 'same':
        # TensorFlow 'SAME' rule: total padding is K - S when the size is a
        # multiple of the stride, K - (size % S) otherwise. The builtin max
        # clamps at zero; np.max(x, 0) would read the 0 as an axis and let a
        # negative amount through whenever the stride exceeds the kernel.
        pad_along_height = max(K1 - (H % S1 or S1), 0)
        pad_along_width = max(K2 - (W % S2 or S2), 0)

        # An odd total puts the extra cell at the bottom / right.
        pad_top = int(pad_along_height // 2)
        pad_bottom = int(pad_along_height - pad_top)
        pad_left = int(pad_along_width // 2)
        pad_right = int(pad_along_width - pad_left)
    else:
        print(r'only same and valid padding is possible')
        return

    # Padding with nan's because we don't want to include them while calculating average
    input = np.pad(input, [(0,0),(0,0),(pad_top,pad_bottom),(pad_left,pad_right)], mode='constant', constant_values=np.nan)
    B, C, H, W = input.shape

    # Output feature map height and width
    H_ = (H - K1) // S1 + 1
    W_ = (W - K2) // S2 + 1

    # im2col, then separate the channel axis from the window-cell axis:
    # (B, C*K1*K2, H_*W_) -> (B, C, K1*K2, H_*W_)
    windows = im2col(input, kernel_size, stride).reshape(B, C, K1*K2, H_*W_)

    # nanmean skips the nan padding, so each average covers real cells only.
    return np.nanmean(windows, 2).reshape(B, C, H_, W_)

In [None]:
# 2x2 average pooling with 'same' padding; the result is in NCHW format.
avg_pool_2d(orig_img, kernel_size=(2, 2), padding='same')

In [None]:
# Keras reference for average pooling; it outputs NHWC format.
avg_pool_layer = tf.keras.layers.AveragePooling2D(
    pool_size=(2, 2),
    padding='same',
    dtype='float64',
)
avg_pool_layer(tf.convert_to_tensor(orig_img_))