# Implementing convolution and max-pooling operations from scratch using the NumPy library

In [1]:
import numpy as np

In [2]:


def convolution_operation_2D_Image(input_image, kernel, stride, pad):
    """
    Slide a 2D kernel over a zero-padded 2D image and return the response map.

    Every output element is the sum of the element-wise product between the
    kernel and the image window it currently covers (the kernel is applied
    as given, without flipping).

    Args:
    input_image (numpy array): a 2D array holding the input image
    kernel (numpy array): a 2D array holding the convolution weights
    stride (int): the step between two consecutive windows
    pad (int): the number of zero rows/columns added on every side of the image

    Returns:
    final_output (numpy array): a 2D float array of shape (output_height, output_width)
    """
    img_rows, img_cols = input_image.shape
    k_rows, k_cols = kernel.shape

    # Surround the image with `pad` zeros on each side
    padded = np.pad(input_image, pad, 'constant', constant_values=(0, 0))

    # Output size along each axis: int((input - kernel + 2 * pad) / stride) + 1
    out_rows = int((img_rows - k_rows + 2 * pad) / stride) + 1
    out_cols = int((img_cols - k_cols + 2 * pad) / stride) + 1

    result = np.zeros((out_rows, out_cols))

    # Visit every output position in row-major order
    for row, col in np.ndindex(out_rows, out_cols):
        top = row * stride
        left = col * stride

        # Window of the padded image currently covered by the kernel
        window = padded[top:top + k_rows, left:left + k_cols]

        # Multiply element-wise and add everything up
        result[row, col] = np.sum(np.multiply(window, kernel))

    return result



In [3]:
# Sanity check: 5x5 image, 3x3 kernel, stride 1 and no padding
input_image = np.array([
    [3, 3, 2, 1, 0],
    [0, 0, 1, 3, 1],
    [3, 1, 2, 2, 3],
    [2, 0, 0, 2, 2],
    [2, 0, 0, 0, 1],
])
kernel = np.array([
    [0, 1, 2],
    [2, 2, 0],
    [0, 1, 2],
])

result = convolution_operation_2D_Image(input_image, kernel, stride=1, pad=0)
print(result)

[[12. 12. 17.]
 [10. 17. 19.]
 [ 9.  6. 14.]]


In [4]:
import numpy as np

def convolution_operation_3D_Image(input_image, kernel, stride, pad):
    """
    Convolve a multi-channel image with a multi-channel kernel, one channel at a time.

    Channel c of the kernel is slid over channel c of the zero-padded image.
    The channels are NOT summed together: every processed channel produces its
    own output plane.

    Args:
    input_image (numpy array): a 3D array (height, width, channels) holding the input image
    kernel (numpy array): a 3D array (height, width, channels) holding the convolution weights
    stride (int): the step between two consecutive windows
    pad (int): the number of zero rows/columns added around the two spatial axes

    Returns:
    final_output (numpy array): a 3D float array of shape
                                (output_height, output_width, input_channels); only the planes
                                with index below the kernel's channel count are written,
                                the remaining ones stay zero
    """
    img_rows, img_cols, img_depth = input_image.shape
    k_rows, k_cols, k_depth = kernel.shape

    # Pad the two spatial axes only; padding the channel axis as well would
    # turn e.g. a 9x9x3 image into 11x11x5 instead of the wanted 11x11x3
    spatial_padding = ((pad, pad), (pad, pad), (0, 0))
    padded = np.pad(input_image, spatial_padding, mode='constant', constant_values=(0, 0))

    # Output size along each spatial axis: int((input - kernel + 2 * pad) / stride) + 1
    out_rows = int((img_rows - k_rows + 2 * pad) / stride) + 1
    out_cols = int((img_cols - k_cols + 2 * pad) / stride) + 1

    # One output plane per input channel
    result = np.zeros((out_rows, out_cols, img_depth))

    # Visit every (row, column, kernel channel) triple, channel index varying fastest
    for row, col, channel in np.ndindex(out_rows, out_cols, k_depth):
        top = row * stride
        left = col * stride

        # Matching single-channel slices of the image window and of the kernel
        window = padded[top:top + k_rows, left:left + k_cols, channel:channel + 1]
        weights = kernel[:, :, channel:channel + 1]

        # Multiply element-wise and add everything up
        result[row, col, channel] = np.sum(np.multiply(window, weights))

    return result


In [5]:
import numpy as np

def convolution_operation_batch_3D_images(input_images_batch, kernel, stride, pad):
    """
    Perform a multi-filter 2D convolution on a batch of multi-channel images.

    For every image, every output position and every filter, the output value is
    the sum over height, width and channels of the element-wise product between
    the filter and the image window it covers.

    Args:
    input_images_batch (np.array): A 4D numpy array of shape
                                   (batch_size, input_height, input_width, input_channels).
    kernel (np.array): A 4D numpy array of shape
                       (num_filters, kernel_height, kernel_width, num_channels).
                       num_channels is expected to match input_channels so that each
                       filter lines up with the image window element by element.
    stride (int): The stride of the convolution operation.
    pad (int): The number of zero pixels added around the two spatial axes of every image.

    Returns:
    final_output (np.array): A 4D float numpy array of shape
                             (batch_size, output_height, output_width, num_filters),
                             i.e. one output plane per filter.
    """

    # Get the shape of the input batch and of the filter bank
    batch_size, input_height, input_width, input_channels = input_images_batch.shape
    num_filters, kernel_height, kernel_width, num_channels = kernel.shape

    # Compute the output image dimensions
    output_height = int((input_height - kernel_height + 2 * pad) / stride) + 1
    output_width = int((input_width - kernel_width + 2 * pad) / stride) + 1

    # Zero-pad the spatial axes only; the batch and channel axes are left untouched
    batch_padded_image = np.pad(input_images_batch, ((0, 0), (pad, pad), (pad, pad), (0, 0)),
                                mode='constant', constant_values=(0, 0))

    # Create an empty numpy array to hold the output images
    final_output = np.zeros((batch_size, output_height, output_width, num_filters))

    # Loop through each input image in the batch
    for index in range(batch_size):
        current_padded_image = batch_padded_image[index]
        # Loop through each pixel in the output image
        for h in range(output_height):
            h_start = h * stride
            h_end = h_start + kernel_height
            for w in range(output_width):
                w_start = w * stride
                w_end = w_start + kernel_width
                # The window does not depend on the filter, so extract it once per position
                image_patch = current_padded_image[h_start:h_end, w_start:w_end, :]
                # Loop through each filter of the kernel
                for f in range(num_filters):
                    # Element-wise product of the window with the filter, summed over all axes
                    final_output[index, h, w, f] = np.sum(np.multiply(image_patch, kernel[f]))

    return final_output


In [6]:
import numpy as np

def max_pool_batch_3D_images(input_images_batch, filter_size, stride):
    """
    Apply max pooling to a batch of multi-channel images.

    A square window of side filter_size is slid over the two spatial axes of
    every image; each output element is the maximum of the window, taken
    independently for every image and every channel.

    Args:
    input_images_batch (np.array): A 4D numpy array of shape
                                   (batch_size, input_height, input_width, input_channels).
    filter_size (int): The side length of the square pooling window.
    stride (int): The step between two consecutive windows.

    Returns:
    final_output (np.array): A 4D float numpy array of shape
                             (batch_size, output_height, output_width, input_channels).
    """
    batch_size, input_height, input_width, input_channels = input_images_batch.shape

    # Compute the output image dimensions
    output_height = int(1 + (input_height - filter_size) / stride)
    output_width = int(1 + (input_width - filter_size) / stride)

    # Initialize the output array
    final_output = np.zeros((batch_size, output_height, output_width, input_channels))

    # Loop through each pixel in the output image
    for h in range(output_height):
        h_start = h * stride
        h_end = h_start + filter_size
        for w in range(output_width):
            w_start = w * stride
            w_end = w_start + filter_size

            # The window must be read from the INPUT batch (not from the zero-filled
            # output buffer); all images and channels are handled in one go
            image_patch = input_images_batch[:, h_start:h_end, w_start:w_end, :]

            # Reduce over the two spatial axes only; replacing np.max with np.mean
            # here would give average pooling instead
            final_output[:, h, w, :] = np.max(image_patch, axis=(1, 2))

    return final_output

In [8]:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D
import numpy as np


np.random.seed(1)
# Generate random data:
# 4 random RGB images of size 9x9x3 and 8 filters of size 5x5x3
input_image_batch = np.random.randn(4, 9, 9, 3).astype(np.float32)
kernel = np.random.randn(8, 5, 5, 3).astype(np.float32)

# Apply convolution_operation_batch_3D_images
# (pad=2 with a 5x5 kernel and stride 1 keeps the 9x9 spatial size)
output_custom = convolution_operation_batch_3D_images(input_image_batch, kernel, stride=1, pad=2)
print('Output shape of custom convolution')
print(output_custom.shape)


# Apply TensorFlow's Conv2D layer with the same weights; the kernel is stored
# filters-first here, so the filter axis is moved to the end for Conv2D
init = tf.constant_initializer(kernel.transpose(1, 2, 3, 0))

conv_layer = Conv2D(filters=8, kernel_size=5, strides=1, padding='same', use_bias=False,
                    kernel_initializer=init)

output_tensorflow = conv_layer(tf.constant(input_image_batch))
output_tensorflow = output_tensorflow.numpy()
print('Output shape of tensorflow convolution')
print(output_tensorflow.shape)
# Compare outputs
print('*'*50)
# Compare the raw values with an explicit tolerance. Rounding both sides to two
# decimals first can fail spuriously when two nearly equal values fall on either
# side of a rounding boundary (e.g. 1.23499 vs 1.23501).
assert np.allclose(output_tensorflow, output_custom, rtol=1e-4, atol=1e-3)
print("Outputs of both methods are the same")
print('*'*50)

Output shape of custom convolution
(4, 9, 9, 8)
Output shape of tensorflow convolution
(4, 9, 9, 8)
**************************************************
Outputs of both methods are the same
**************************************************


In [None]:
import torch
