# PyTorch parameter
Reference
- [Constructing parameter groups in pytorch](https://stackoverflow.com/questions/69774137/constructing-parameter-groups-in-pytorch)
- [PyTorch: What's the difference between state_dict and parameters()?](https://stackoverflow.com/questions/54746829/pytorch-whats-the-difference-between-state-dict-and-parameters)
- [What pytorch means by buffers?](https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266)

In [1]:
import torch
import torch.nn as nn
from torch.optim import SGD
import torch.nn.functional as F
from copy import deepcopy

class Net(nn.Module):
    """Fully connected regressor with layer sizes 16 -> 64 -> 64 -> 1.

    ReLU is applied after each of the two hidden layers; the output layer
    is left linear.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Map an input whose last dimension is 16 to an output whose last dimension is 1."""
        hidden = x
        # Both hidden layers share the same "linear then ReLU" pattern.
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

# Instantiate the network and hand all of its parameters to a plain SGD optimizer.
model = Net()
opt = SGD(model.parameters(), lr=0.01)
# Printing an nn.Module lists its registered sub-modules.
print(model)

Net(
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)


## Get parameters
- `model.named_parameters()` returns a generator
- each `next()` on that generator yields a tuple `(parameter_name, torch.nn.parameter.Parameter)`

In [2]:
# named_parameters() is a generator function: every call returns a new generator.
print(type(model.named_parameters()))
# next() on a fresh generator therefore yields the first (name, Parameter) pair.
next(model.named_parameters())

<class 'generator'>


('fc1.weight',
 Parameter containing:
 tensor([[-0.0052,  0.1259,  0.1436,  ...,  0.1291, -0.2167,  0.1890],
         [-0.1982,  0.2269,  0.1691,  ...,  0.0108,  0.2383, -0.0038],
         [ 0.0908, -0.0608,  0.0193,  ...,  0.1351,  0.2323,  0.1127],
         ...,
         [ 0.2033, -0.1474, -0.1043,  ..., -0.0095,  0.0411, -0.2085],
         [-0.1480,  0.1185,  0.2237,  ...,  0.1508,  0.0838, -0.0118],
         [-0.2084,  0.1216, -0.1767,  ..., -0.2042,  0.1350,  0.1368]],
        requires_grad=True))

## Fix parameters
- To fix (freeze) a `torch.nn.parameter.Parameter` (say `model.fc1.weight`), set `.requires_grad = False`
- Calling `.backward()` will then not accumulate a gradient on `model.fc1.weight` (its `.grad` stays `None`). Therefore, `optimizer.step()` will not change `model.fc1.weight`

In [21]:
# Snapshot the current weights (detached from autograd) so the effect of the
# optimizer step can be measured afterwards.
fc1_weight_copy = deepcopy(model.fc1.weight.detach())
fc2_weight_copy = deepcopy(model.fc2.weight.detach())

# Freeze the first layer: backward() will not populate .grad for these tensors.
model.fc1.weight.requires_grad = False
model.fc1.bias.requires_grad = False

# Clear any gradients left over from earlier steps.
opt.zero_grad() 

# One random, unbatched sample with 16 features.
x = torch.normal(0, 1, [16])
y = model(x)

# Regress the single output towards the constant target 5.0.
loss = F.mse_loss(y , torch.tensor([5.0]))
loss.backward()

In [22]:
# Display the scalar MSE loss computed in the previous cell.
loss

tensor(23.1273, grad_fn=<MseLossBackward0>)

In [23]:
# The frozen fc1 weight has no gradient (None); the trainable fc2 weight does.
print(model.fc1.weight.grad)
print(model.fc2.weight.grad)

None
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.2686],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])


In [24]:
# Apply one SGD update; parameters whose .grad is None are left untouched.
opt.step()

In [30]:
# Compare against the snapshots: fc1 (frozen) is unchanged, fc2 has moved.
# NOTE(review): a signed sum can cancel to zero even when individual entries
# changed; (a - b).abs().sum() or torch.equal(a, b) would be a stricter check.
print((model.fc1.weight - fc1_weight_copy).sum())
print((model.fc2.weight - fc2_weight_copy).sum())

tensor(0.)
tensor(0.1701, grad_fn=<SumBackward0>)


## Parameters vs buffer vs state_dict
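- parameters: `nn.Parameter` tensors registered on the module and returned by `model.parameters()`; they require gradients by default (but can be frozen, as shown above) and are what the optimizer updates
- buffer: tensors registered on the module that are not parameters and are not updated by the optimizer (e.g., the running mean and running variance in batchnorm layers)
- state_dict: contains entries from both parameters and (persistent) buffers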

In [47]:
import torch
import torch.nn as nn
from torch.optim import SGD
import torch.nn.functional as F
from copy import deepcopy

class Net(nn.Module):
    """16 -> 64 -> 64 -> 1 regressor with batch normalisation after ``fc1``.

    ``bn1`` contributes learnable parameters (``weight``, ``bias``) as well
    as buffers (``running_mean``, ``running_var``, ``num_batches_tracked``),
    which makes this model a convenient example for comparing
    ``parameters()``, ``buffers()`` and ``state_dict()``.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x: Tensor of shape ``(N, 16)``, or a single unbatched sample of
                shape ``(16,)``.

        Returns:
            Tensor of shape ``(N, 1)``, or ``(1,)`` for an unbatched sample.

        Raises:
            ValueError: Propagated from ``BatchNorm1d`` when the module is in
                training mode and the batch holds a single sample, because
                batch statistics cannot be estimated from one value per
                channel.
        """
        # BatchNorm1d needs a batch dimension; add one temporarily for a
        # lone sample and strip it again before returning.
        unbatched = x.dim() == 1
        if unbatched:
            x = x.unsqueeze(0)

        # Apply bn1 between fc1 and the activation so that its affine
        # parameters receive gradients and its running statistics (the
        # buffers) are updated during training.
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x.squeeze(0) if unbatched else x

# Rebuild the model (now containing a BatchNorm1d layer) and its optimizer.
model = Net()
opt = SGD(model.parameters(), lr=0.01)
# The printed module tree now includes bn1.
print(model)

Net(
  (fc1): Linear(in_features=16, out_features=64, bias=True)
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)


In [57]:
# List the (name, tensor) pairs of all registered buffers; here they all belong to bn1.
list(model.named_buffers())

[('bn1.running_mean',
  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])),
 ('bn1.running_var',
  tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])),
 ('bn1.num_batches_tracked', tensor(0))]

In [70]:
# state_dict() is keyed by dotted names and also contains parameters such as
# these; the values print as plain tensors rather than "Parameter containing:".
print(model.state_dict()['bn1.weight'])
print(model.state_dict()['fc1.weight'])

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
tensor([[-1.3200e-01, -5.9882e-02, -9.9831e-02,  ..., -1.5544e-01,
         -1.9045e-01,  5.3080e-02],
        [-5.7073e-02, -9.8052e-02, -1.7600e-01,  ..., -1.1753e-02,
          1.1653e-01, -2.3358e-01],
        [ 7.1093e-02,  2.1064e-01, -1.9982e-01,  ..., -1.4521e-01,
         -1.4367e-01, -5.0642e-02],
        ...,
        [ 2.4479e-01,  1.0007e-01, -8.1331e-05,  ..., -1.4318e-01,
         -2.2091e-01,  6.5968e-02],
        [ 9.6643e-02,  1.4006e-01, -2.4466e-02,  ...,  1.0410e-01,
          1.3157e-01,  2.1909e-01],
        [ 2.3638e-01,  1.4358e-01, -1.5742e-01,  ...,  2.1863e-01,
          2.7791e-02, -1.6912e-01]])


## Parameter Group
- For a complex network, we can declare a parameter group and define update rules to the parameter group.
- Procedure: define an instance attribute, say `self.params`, as an `nn.ModuleDict` and group the layers using `nn.ModuleList`s
- Can fix a parameter group or give it a different learning rate
    - For SGD, the `params` argument can be either an iterable of parameters to optimize or an iterable of dicts defining parameter groups
    - In the example below, `[fc1, fc2]` have a different learning rate and momentum than `fc3`

In [35]:
class Net(nn.Module):
    """16 -> 64 -> 64 -> 1 regressor whose layers are also exposed as named groups.

    ``self.params`` maps a group name to an ``nn.ModuleList`` of layers so
    that an optimizer can be given per-group options:

    - ``'base'``: ``fc1`` and ``fc2``
    - ``'regressor'``: ``fc3``

    NOTE(review): each layer is reachable under two names (e.g. ``fc1`` and
    ``params.base.0``), so ``state_dict()`` is expected to list its tensors
    under both prefixes - confirm before relying on checkpoint key names.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

        # The groups hold references to the layers above, not copies.
        base_layers = nn.ModuleList([self.fc1, self.fc2])
        regressor_layers = nn.ModuleList([self.fc3])
        self.params = nn.ModuleDict({'base': base_layers, 'regressor': regressor_layers})

    def forward(self, x):
        """Map an input whose last dimension is 16 to an output whose last dimension is 1."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Instantiate the grouped model; the optimizer is built in the next cell.
model = Net()

In [40]:
# Per-group options override the optimizer-wide defaults given as keyword arguments.
opt = SGD(
    [
        # 'base' (fc1, fc2): no overrides, so it uses lr=1e-2 and momentum=0.9.
        {'params': model.params.base.parameters()},
        # 'regressor' (fc3): smaller learning rate and no momentum.
        {'params': model.params.regressor.parameters(), 'lr': 1e-3, 'momentum': 0}
    ], 
    lr=1e-2, 
    momentum=0.9
)

In [41]:
# The optimizer repr lists the effective hyper-parameters of each parameter group.
opt

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)