## 8. CONVOLUTIONAL NEURAL NETWORKS

#### 8.2.1 The Cross-Correlation Operator 

In [1]:
from mxnet import autograd, nd 
from mxnet.gluon import nn

# Save to the d2l package. 
def corr2d(X, K):
    """Compute the 2D cross-correlation of input ``X`` with kernel ``K``.

    Both arguments are indexed as two-dimensional arrays. The kernel is slid
    over every position where it fits entirely inside ``X`` (no padding,
    stride 1); at each position the overlapping window is multiplied
    element-wise with ``K`` and summed.

    Returns an array of shape
    ``(X.shape[0] - K.shape[0] + 1, X.shape[1] - K.shape[1] + 1)``.
    """
    k_rows, k_cols = K.shape
    out_rows = X.shape[0] - k_rows + 1
    out_cols = X.shape[1] - k_cols + 1
    Y = nd.zeros((out_rows, out_cols))

    for r in range(out_rows):
        for c in range(out_cols):
            # Window of X currently covered by the kernel
            window = X[r: r + k_rows, c: c + k_cols]
            Y[r, c] = (window * K).sum()

    return Y


In [2]:
# Worked example: cross-correlating a 3x3 input with a 2x2 kernel yields a
# 2x2 output (one value per position where the kernel fits inside the input)
X = nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]) 
K = nd.array([[0, 1], [2, 3]]) 
corr2d(X, K)


[[19. 25.]
 [37. 43.]]
<NDArray 2x2 @cpu(0)>

#### 8.2.2 Convolutional Layers

In [3]:
class Conv2D(nn.Block):
    """Minimal two-dimensional convolutional layer built on ``corr2d``.

    The layer owns two parameters: a ``weight`` whose shape is the given
    ``kernel_size`` and a ``bias`` of shape ``(1,)``.
    """

    def __init__(self, kernel_size, **kwargs):
        super().__init__(**kwargs)
        # Register the kernel and bias with the block's parameter dictionary
        self.weight = self.params.get('weight', shape=kernel_size)
        self.bias = self.params.get('bias', shape=(1,))

    def forward(self, x):
        """Cross-correlate ``x`` with the kernel, then add the bias."""
        correlated = corr2d(x, self.weight.data())
        return correlated + self.bias.data()


#### 8.2.3 Object Edge Detection in Images 

In [4]:
# Build a 6x8 "image" of ones whose columns 2..5 are zero, so it contains two
# vertical edges: a 1 -> 0 transition and a 0 -> 1 transition
X = nd.ones((6, 8)) 
X[:, 2:6] = 0 
X


[[1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]
 [1. 1. 0. 0. 0. 0. 1. 1.]]
<NDArray 6x8 @cpu(0)>

In [5]:
# A 1x2 kernel [1, -1] computes the difference between horizontally adjacent
# elements: 0 where they are equal, 1 at the 1 -> 0 edge, -1 at the 0 -> 1 edge
K = nd.array([[1, -1]])
Y = corr2d(X, K) 
Y


[[ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0. -1.  0.]]
<NDArray 6x7 @cpu(0)>

In [6]:
# In the transposed image the edges run horizontally; this kernel only
# compares horizontal neighbours, so the result is all zeros
corr2d(X.T, K)


[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
<NDArray 8x5 @cpu(0)>

#### 8.2.4 Learning a Kernel

In [13]:
from mxnet.gluon import nn
# Construct a convolutional layer with 1 output channel 
# (channels will be introduced in the following section) 
# and a kernel array shape of (1, 2) 
conv2d = nn.Conv2D(1, kernel_size=(1, 2)) 
conv2d.initialize()

# The two-dimensional convolutional layer uses four-dimensional input and 
# output in the format of (example, channel, height, width), where the batch 
# size (number of examples in the batch) and the number of channels are both 1 
# X and Y are the edge-detection input and target computed in the cells above
X = X.reshape((1, 1, 6, 8)) 
Y = Y.reshape((1, 1, 6, 7))

# Run 10 manual gradient-descent steps so that conv2d(X) approaches Y
for i in range(10):
    with autograd.record():
        Y_hat = conv2d(X) 
        # Element-wise squared error between prediction and target
        l = (Y_hat - Y) ** 2
    l.backward() 
    
    # For the sake of simplicity, we ignore the bias here 
    # In-place weight update with a fixed learning rate of 3e-2
    conv2d.weight.data()[:] -= 3e-2 * conv2d.weight.grad() 
    
    # Report the summed loss on every second iteration
    if (i + 1) % 2 == 0:
        print('batch %d, loss %.3f' % (i + 1, l.sum().asscalar()))


batch 2, loss 5.154
batch 4, loss 0.872
batch 6, loss 0.149
batch 8, loss 0.026
batch 10, loss 0.005


In [8]:
conv2d.weight.data().reshape((1, 2))


[[ 0.9895    -0.9873705]]
<NDArray 1x2 @cpu(0)>

In [9]:
conv2d

Conv2D(1 -> 1, kernel_size=(1, 2), stride=(1, 1))

In [10]:
conv2d.weight

Parameter conv0_weight (shape=(1, 1, 1, 2), dtype=<class 'numpy.float32'>)

### 8.3 Padding and Stride

#### 8.3.1 Padding 

In [1]:
from mxnet import nd 
from mxnet.gluon import nn

# For convenience, we define a function to calculate the convolutional layer. 
# This function initializes the convolutional layer weights and performs 
# corresponding dimensionality elevations and reductions on the input and 
# output 
def comp_conv2d(conv2d, X):
    """Initialize ``conv2d``, apply it to ``X`` and return the result.

    ``X`` is given without batch and channel axes; two leading axes of
    size 1 are added before the layer is called and the first two axes of
    the layer's output are dropped again before returning.
    """
    conv2d.initialize()
    # Prepend batch and channel axes, both of size 1
    batched = X.reshape((1, 1) + X.shape)
    out = conv2d(batched)
    # Keep only the trailing axes; batch and channel are not of interest here
    return out.reshape(out.shape[2:])

# Note that here 1 row or column is padded on either side, so a total of 2 
# rows or columns are added 
conv2d = nn.Conv2D(1, kernel_size=3, padding=1) 
X = nd.random.uniform(shape=(8, 8)) 
comp_conv2d(conv2d, X).shape

(8, 8)

In [2]:
# Here, we use a convolution kernel with a height of 5 and a width of 3. The 
# padding numbers on both sides of the height and width are 2 and 1, 
# respectively 
conv2d = nn.Conv2D(1, kernel_size=(5, 3), padding=(2, 1)) 
comp_conv2d(conv2d, X).shape

(8, 8)

#### 8.3.2 Stride 

In [18]:
# With a 3x3 kernel, padding 1 and stride 2, the 8x8 input is halved in both
# height and width: floor((8 + 2*1 - 3) / 2) + 1 = 4
conv2d = nn.Conv2D(1, kernel_size=3, padding=1, strides=2) 
comp_conv2d(conv2d, X).shape

(4, 4)

In [19]:
# Different kernel size, padding and stride per dimension:
#   height: floor((8 + 2*0 - 3) / 3) + 1 = 2
#   width:  floor((8 + 2*1 - 5) / 4) + 1 = 2
conv2d = nn.Conv2D(1, kernel_size=(3, 5), padding=(0, 1), strides=(3, 4))
comp_conv2d(conv2d, X).shape

(2, 2)

### 8.4 Multiple Input and Output Channels

#### 8.4.1 Multiple Input Channels 

In [9]:
import d2l 
from mxnet import nd

def corr2d_multi_in(X, K):
    """Cross-correlate a multi-channel input with a multi-channel kernel.

    ``X`` and ``K`` are iterated in lockstep along their 0th (channel)
    dimension; each channel pair is passed to ``d2l.corr2d`` and the
    per-channel results are summed into a single array.
    """
    per_channel = []
    for x, k in zip(X, K):
        per_channel.append(d2l.corr2d(x, k))
    # Unpack the list so each per-channel result is a positional argument
    return nd.add_n(*per_channel)


In [10]:
X = nd.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])

K = nd.array([[[0, 1], [2, 3]], [[1, 2], [3, 4]]])

corr2d_multi_in(X, K)


[[ 56.  72.]
 [104. 120.]]
<NDArray 2x2 @cpu(0)>

#### 8.4.2 Multiple Output Channels 

In [11]:
def corr2d_multi_in_out(X, K):
    """Cross-correlate ``X`` with every kernel along the 0th dimension of ``K``.

    Each slice ``k`` of ``K`` is applied to the full input via
    ``corr2d_multi_in``; the resulting arrays are stacked, so the 0th
    dimension of the output indexes the kernels of ``K``.
    """
    per_output = []
    for k in K:
        per_output.append(corr2d_multi_in(X, k))
    return nd.stack(*per_output)

In [12]:
K = nd.stack(K, K + 1, K + 2) 
K.shape

(3, 2, 2, 2)

In [13]:
corr2d_multi_in_out(X, K)


[[[ 56.  72.]
  [104. 120.]]

 [[ 76. 100.]
  [148. 172.]]

 [[ 96. 128.]
  [192. 224.]]]
<NDArray 3x2x2 @cpu(0)>

#### 8.4.3 1 x 1 Convolutional Layer 

In [14]:
from mxnet import nd

def corr2d_multi_in_out_1x1(X, K):
    """Apply a 1x1 multi-channel kernel to ``X`` as a matrix multiplication.

    ``X`` has shape ``(c_i, h, w)`` and ``K`` is reshaped to ``(c_o, c_i)``,
    where ``c_o`` is ``K.shape[0]``. The spatial positions of ``X`` are
    flattened into columns, multiplied by the kernel matrix, and the product
    is reshaped back to ``(c_o, h, w)``.
    """
    c_i, h, w = X.shape
    c_o = K.shape[0]
    flat_inputs = X.reshape((c_i, h * w))
    weights = K.reshape((c_o, c_i))
    # Matrix multiplication in the fully connected layer
    product = nd.dot(weights, flat_inputs)
    return product.reshape((c_o, h, w))


In [19]:
# Check that the 1x1 convolution written as a matrix multiplication gives the
# same result as the general multi-input, multi-output cross-correlation
X = nd.random.uniform(shape=(3, 3, 3)) 
print(X)
K = nd.random.uniform(shape=(2, 3, 1, 1))
print(K)
Y1 = corr2d_multi_in_out_1x1(X, K) 
print(Y1)
Y2 = corr2d_multi_in_out(X, K)
print(Y2)
# The two results should agree up to floating-point rounding
(Y1 - Y2).norm().asscalar() < 1e-6



[[[0.00469548 0.14644176 0.6778165 ]
  [0.5696184  0.27000797 0.70373726]
  [0.735194   0.28847644 0.96218854]]

 [[0.43328807 0.24875315 0.7561067 ]
  [0.57615733 0.3960983  0.5920419 ]
  [0.8960384  0.5722519  0.6389211 ]]

 [[0.22308163 0.8915544  0.952749  ]
  [0.68005556 0.44712538 0.44919774]
  [0.84640867 0.97857094 0.6994793 ]]]
<NDArray 3x3x3 @cpu(0)>

[[[[0.11620191]]

  [[0.29743695]]

  [[0.7670237 ]]]


 [[[0.81379783]]

  [[0.41182014]]

  [[0.39650574]]]]
<NDArray 2x3x1x1 @cpu(0)>

[[[0.3005304  0.7748486  1.0344387 ]
  [0.75917995 0.49214545 0.60241604]
  [1.0011613  0.95431745 0.8383641 ]]

 [[0.2707111  0.575122   1.240756  ]
  [0.97047335 0.5601409  0.9946241 ]
  [1.3029119  0.8584354  1.323495  ]]]
<NDArray 2x3x3 @cpu(0)>

[[[0.3005304  0.7748486  1.0344387 ]
  [0.75917995 0.49214545 0.60241604]
  [1.0011615  0.9543175  0.83836406]]

 [[0.27071106 0.575122   1.240756  ]
  [0.9704734  0.56014097 0.99462414]
  [1.3029119  0.8584354  1.323495  ]]]
<NDArray 2x3x3 @cpu(0)>

True

### 8.5 Pooling


#### 8.5.1 Maximum Pooling and Average Pooling 

In [51]:
from mxnet import nd 
from mxnet.gluon import nn

def pool2d(X, pool_size, mode='max'):
    """Apply 2D max or average pooling to ``X`` with stride 1 and no padding.

    Args:
        X: Input indexed as a two-dimensional array.
        pool_size: Pair ``(p_h, p_w)`` giving the pooling window height and
            width.
        mode: ``'max'`` to take the maximum of each window or ``'avg'`` to
            take its mean.

    Returns:
        An array of shape ``(X.shape[0] - p_h + 1, X.shape[1] - p_w + 1)``
        holding the pooled value of every window position.

    Raises:
        ValueError: If ``mode`` is neither ``'max'`` nor ``'avg'``.
    """
    # Reject unknown modes up front; previously they silently produced an
    # all-zero output because neither branch of the loop body ran
    if mode not in ('max', 'avg'):
        raise ValueError("mode must be 'max' or 'avg', got %r" % (mode,))

    p_h, p_w = pool_size
    Y = nd.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))

    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            window = X[i: i + p_h, j: j + p_w]
            Y[i, j] = window.max() if mode == 'max' else window.mean()
    return Y

In [52]:
X = nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]) 

pool2d(X, (2, 2))


[[4. 5.]
 [7. 8.]]
<NDArray 2x2 @cpu(0)>

In [53]:
pool2d(X, (2, 2), 'avg')


[[2. 3.]
 [5. 6.]]
<NDArray 2x2 @cpu(0)>

#### 8.5.2 Padding and Stride 

In [54]:
X = nd.arange(16).reshape((1, 1, 4, 4)) 
print(X)


[[[[ 0.  1.  2.  3.]
   [ 4.  5.  6.  7.]
   [ 8.  9. 10. 11.]
   [12. 13. 14. 15.]]]]
<NDArray 1x1x4x4 @cpu(0)>


In [55]:
# NOTE: this rebinds the name `pool2d`, shadowing the pool2d function defined
# earlier in this section; from here on `pool2d` is a Gluon MaxPool2D layer.
# As the printed layer shows, the stride defaults to the pooling window size.
pool2d = nn.MaxPool2D(3)
print(pool2d)

MaxPool2D(size=(3, 3), stride=(3, 3), padding=(0, 0), ceil_mode=False, global_pool=False, pool_type=max, layout=NCHW)


In [56]:
pool2d(X)


[[[[10.]]]]
<NDArray 1x1x1x1 @cpu(0)>

In [57]:
# Padding and stride can be specified explicitly: 3x3 window, padding 1,
# stride 2
pool2d = nn.MaxPool2D(3, padding=1, strides=2) 
pool2d(X)


[[[[ 5.  7.]
   [13. 15.]]]]
<NDArray 1x1x2x2 @cpu(0)>

In [58]:
# Non-square pooling window (height 2, width 3) with separate padding (1, 2)
# and strides (2, 3) for height and width
pool2d = nn.MaxPool2D((2, 3), padding=(1, 2), strides=(2, 3))
pool2d(X)


[[[[ 0.  3.]
   [ 8. 11.]
   [12. 15.]]]]
<NDArray 1x1x3x2 @cpu(0)>

#### 8.5.3 Multiple Channels 

In [59]:
# Concatenate X and X + 1 along the channel dimension (dim=1) to build an
# input with 2 channels
X = nd.concat(X, X + 1, dim=1)
X



[[[[ 0.  1.  2.  3.]
   [ 4.  5.  6.  7.]
   [ 8.  9. 10. 11.]
   [12. 13. 14. 15.]]

  [[ 1.  2.  3.  4.]
   [ 5.  6.  7.  8.]
   [ 9. 10. 11. 12.]
   [13. 14. 15. 16.]]]]
<NDArray 1x2x4x4 @cpu(0)>

In [60]:
# Pooling is applied to each channel separately, so the output still has
# 2 channels
pool2d = nn.MaxPool2D(3, padding=1, strides=2) 
pool2d(X)


[[[[ 5.  7.]
   [13. 15.]]

  [[ 6.  8.]
   [14. 16.]]]]
<NDArray 1x2x2x2 @cpu(0)>

### 8.6 Convolutional Neural Networks (LeNet)

#### 8.6.1 LeNet 

In [1]:
import d2l 
from mxnet import autograd, gluon, init, nd 
from mxnet.gluon import nn

# LeNet: two convolution + average-pooling stages followed by three dense
# layers, with sigmoid activations everywhere except the final output layer
net = nn.Sequential() 
net.add(
    # 6 output channels; padding=2 with a 5x5 kernel keeps height and width
    nn.Conv2D(channels=6, kernel_size=5, padding=2, activation='sigmoid'),
    # 2x2 average pooling with stride 2 halves height and width
    nn.AvgPool2D(pool_size=2, strides=2), 
    # 16 output channels; no padding, so height and width shrink by 4
    nn.Conv2D(channels=16, kernel_size=5, activation='sigmoid'), 
    nn.AvgPool2D(pool_size=2, strides=2), 
    # Dense will transform the input of the shape (batch size, channel, 
    # height, width) into the input of the shape (batch size, 
    # channel * height * width) automatically by default 
    nn.Dense(120, activation='sigmoid'), 
    nn.Dense(84, activation='sigmoid'), 
    # Output layer with 10 units and no activation
    nn.Dense(10))


In [2]:
# Pass one random single-channel 28x28 example through the network one layer
# at a time and print the output shape after each layer
X = nd.random.uniform(shape=(1,1,28,28))
net.initialize()
for layer in net:
    # Each layer's output becomes the next layer's input
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

conv0 output shape:	 (1, 6, 28, 28)
pool0 output shape:	 (1, 6, 14, 14)
conv1 output shape:	 (1, 16, 10, 10)
pool1 output shape:	 (1, 16, 5, 5)
dense0 output shape:	 (1, 120)
dense1 output shape:	 (1, 84)
dense2 output shape:	 (1, 10)


In [None]:
import d2l

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

In [None]:
# Save to the d2l package 
def evaluate_accuracy_gpu(net, data_iter, ctx=None):
    """Evaluate the accuracy of ``net`` over ``data_iter`` on a given device.

    Args:
        net: Model to evaluate; called as ``net(X)`` on each batch.
        data_iter: Iterable yielding ``(X, y)`` batches.
        ctx: Device to copy each batch to. When omitted, the first context
            of the network's first parameter is used.

    Returns:
        The accumulated ``d2l.accuracy`` values divided by the accumulated
        number of label elements (presumably the fraction of correctly
        classified examples; confirm against ``d2l.accuracy``).
    """
    if not ctx:  # Query the first device that the first parameter is on
        ctx = list(net.collect_params().values())[0].list_ctx()[0]
    metric = d2l.Accumulator(2)  # num_correct_examples, num_examples

    for X, y in data_iter:
        # Move the batch to the same device as the model before evaluating
        X, y = X.as_in_context(ctx), y.as_in_context(ctx)
        metric.add(d2l.accuracy(net(X), y), y.size)

    # Fixed: was `metric[0].metric[1]`, an attribute access on a number
    return metric[0] / metric[1]

In [None]:

def train_ch5(net, train_iter, test_iter, num_epochs, lr, ctx=None):
    """Train ``net`` with SGD on a chosen device, plotting progress as it goes.

    Args:
        net: Gluon model; its parameters are re-initialized with Xavier
            initialization on ``ctx`` before training.
        train_iter: Iterable yielding ``(X, y)`` training batches; must
            support ``len()``.
        test_iter: Iterable yielding ``(X, y)`` batches used for the
            end-of-epoch accuracy evaluation.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate handed to the ``'sgd'`` trainer.
        ctx: Device to train on. Defaults to ``d2l.try_gpu()`` when omitted.

    Prints the final training loss, training accuracy, test accuracy and the
    measured throughput in examples per second.
    """
    # `ctx` used to be an undefined free variable; resolve it explicitly
    if ctx is None:
        ctx = d2l.try_gpu()

    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})

    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])

    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # train_loss, train_acc, num_examples
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            # Here is the only difference compared to train_epoch_ch3
            # (fixed: the assignment was split across two lines, leaving a
            # 1-tuple on the right-hand side that cannot unpack into X, y)
            X, y = X.as_in_context(ctx), y.as_in_context(ctx)

            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            trainer.step(X.shape[0])
            metric.add(l.sum().asscalar(), d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            # Running averages over the examples seen so far in this epoch
            train_loss, train_acc = metric[0]/metric[2], metric[1]/metric[2]

            # Update the training curves every 50 batches
            if (i+1) % 50 == 0:
                animator.add(epoch + i/len(train_iter),
                             (train_loss, train_acc, None))
        # Evaluate on the test set once per epoch
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch+1, (None, None, test_acc))

    print('loss %.3f, train acc %.3f, test acc %.3f' % (train_loss, train_acc, test_acc))
    print('%.1f exampes/sec on %s'%(metric[2]*num_epochs/timer.sum(), ctx))

            


## 9. MODERN CONVOLUTIONAL NETWORKS

### 9.1 Deep Convolutional Neural Networks (AlexNet)

#### 9.1.1 Learning Feature Representation 