# Implementing pooling layer

* Reduces the spatial size of the representation
* Reduces the dependence on the dimensionality of the original image
* Alleviates excessive sensitivity of conv layer to the location of the feature in the original image

In [1]:
import torch
import torch.nn as nn

In [2]:
# leaving out padding and stride in this basic implementation
def pool2d(X, pool_size, mode='max'):
    """Apply pooling over a tensor with stride 1 and no padding.

    A window of size ``pool_size`` is slid over the first two dimensions
    of ``X`` one element at a time, and each window is reduced to a
    single value.

    Args:
        X: Input tensor, indexed along its first two dimensions
            (intended for a 2-D height x width input).
        pool_size: Pair ``(p_h, p_w)`` giving the window height and width.
        mode: ``'max'`` for maximum pooling or ``'avg'`` for average
            pooling.

    Returns:
        A tensor of shape ``(X.shape[0] - p_h + 1, X.shape[1] - p_w + 1)``
        created with ``torch.zeros`` (default dtype) and filled with the
        pooled values.

    Raises:
        ValueError: If ``mode`` is neither ``'max'`` nor ``'avg'``.
    """
    # Resolve the reduction once, up front, so that an unsupported mode
    # fails loudly instead of silently returning an all-zero tensor.
    if mode == 'max':
        reduce = torch.max
    elif mode == 'avg':
        reduce = torch.mean
    else:
        raise ValueError(
            f"unsupported pooling mode {mode!r}; expected 'max' or 'avg'"
        )

    p_h, p_w = pool_size
    Y = torch.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            # The window's top-left corner is (i, j) in the input.
            Y[i, j] = reduce(X[i: i + p_h, j: j + p_w])
    return Y

**Note:** When processing multi-channel input data, the pooling layer pools each input channel separately, rather than
summing the inputs over channels as in a convolutional layer. This means that the number of output
channels for the pooling layer is the same as the number of input channels.
