(dl/04-training)=
# Activations and Gradients

![Status](https://img.shields.io/static/v1.svg?label=Status&message=Finished&color=brightgreen)
[![Source](https://img.shields.io/static/v1.svg?label=GitHub&message=Source&color=181717&logo=GitHub)](https://github.com/particle1331/ok-transformer/blob/master/docs/nb/dl/04-training.ipynb)
[![Stars](https://img.shields.io/github/stars/particle1331/ok-transformer?style=social)](https://github.com/particle1331/ok-transformer)

---

**Readings:**  {cite}`timviera`

## Introduction

In this notebook, we start by defining a small convolutional network (`celeba_model`) and inspecting its layers.

In [1]:
import torch.nn as nn
import torchsummary

def celeba_model():
    """Build the convolutional classifier as a named ``nn.Sequential``.

    The network consists of four stages, each made of
    conv -> batch norm -> ReLU -> pooling (registered as ``conv{i}``,
    ``norm{i}``, ``relu{i}``, ``pool{i}`` for ``i = 1..4``), followed by a
    ``flatten`` layer and a linear layer ``fc`` with 2 outputs.

    Returns:
        nn.Sequential: a newly constructed model; every call creates new layers.
    """
    # One row per stage:
    # (in_channels, out_channels, conv kernel, conv padding, pooling class, pooling kernel)
    stage_specs = (
        (3, 64, 5, 3, nn.MaxPool2d, 2),
        (64, 128, 3, 1, nn.MaxPool2d, 2),
        (128, 128, 3, 1, nn.MaxPool2d, 2),
        # The last stage average-pools with kernel 8; for a 3x64x64 input the
        # feature map is 8x8 at this point, so this leaves 256 features.
        (128, 256, 3, 1, nn.AvgPool2d, 8),
    )

    net = nn.Sequential()
    for idx, (c_in, c_out, k, pad, pool_cls, pool_k) in enumerate(stage_specs, start=1):
        net.add_module(f'conv{idx}', nn.Conv2d(c_in, c_out, kernel_size=k, padding=pad))
        net.add_module(f'norm{idx}', nn.BatchNorm2d(c_out))
        net.add_module(f'relu{idx}', nn.ReLU())
        net.add_module(f'pool{idx}', pool_cls(kernel_size=pool_k))

    # Classification head: flatten the pooled features and map them to 2 outputs.
    net.add_module('flatten', nn.Flatten())
    net.add_module('fc', nn.Linear(256, 2))

    return net


# Print a layer-by-layer summary (output shapes and parameter counts) of a
# freshly built model for a single 3-channel 64x64 input.
torchsummary.summary(celeba_model(), input_size=(3, 64, 64))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 66, 66]           4,864
       BatchNorm2d-2           [-1, 64, 66, 66]             128
              ReLU-3           [-1, 64, 66, 66]               0
         MaxPool2d-4           [-1, 64, 33, 33]               0
            Conv2d-5          [-1, 128, 33, 33]          73,856
       BatchNorm2d-6          [-1, 128, 33, 33]             256
              ReLU-7          [-1, 128, 33, 33]               0
         MaxPool2d-8          [-1, 128, 16, 16]               0
            Conv2d-9          [-1, 128, 16, 16]         147,584
      BatchNorm2d-10          [-1, 128, 16, 16]             256
             ReLU-11          [-1, 128, 16, 16]               0
        MaxPool2d-12            [-1, 128, 8, 8]               0
           Conv2d-13            [-1, 256, 8, 8]         295,168
      BatchNorm2d-14            [-1, 25

---

In [13]:
model = celeba_model()
# modules() yields the Sequential container itself first, followed by its layers;
# [1:-3] drops that container and the last three layers (pool4, flatten, fc),
# leaving the conv/norm/relu/pool stack up to and including relu4.
list(model.modules())[1:-3]

[Conv2d(3, 64, kernel_size=(5, 5), stride=(1, 1), padding=(3, 3)),
 BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
 ReLU(),
 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
 Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
 ReLU(),
 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
 Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
 ReLU(),
 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
 Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
 ReLU()]

In [12]:
# Full module list; the first entry is the Sequential container itself.
list(model.modules())

[Sequential(
   (conv1): Conv2d(3, 64, kernel_size=(5, 5), stride=(1, 1), padding=(3, 3))
   (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (relu1): ReLU()
   (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (relu2): ReLU()
   (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   (conv3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (norm3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (relu3): ReLU()
   (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
   (conv4): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (norm4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_ru

■