In [13]:
import torch
import torch.nn as nn

# Fixed seed so the random data, weight initialisation and batch shuffling
# drawn from torch's global RNG are repeatable after Restart Kernel -> Run All.
SEED = 0
torch.manual_seed(SEED)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        in_features: Size of each input sample (default 100).
        hidden_features: Width of the hidden layer (default 50).
        out_features: Number of values produced per sample (default 1).

    The forward pass returns the raw output of the second linear layer;
    no final activation is applied.
    """

    def __init__(self, in_features=100, hidden_features=50, out_features=1):
        super().__init__()
        # The attribute names `first_layer` / `second_layer` are addressed by
        # name when the optimizer's parameter groups are built, so keep them.
        self.first_layer = nn.Linear(in_features, hidden_features)
        self.second_layer = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Apply the hidden layer with ReLU, then the output layer."""
        hidden = nn.functional.relu(self.first_layer(x))
        return self.second_layer(hidden)


# Default sizes: 100 inputs -> 50 hidden units -> 1 output.
mlp = MLP()

In [4]:
from torch.utils.data import TensorDataset

# Synthetic inputs: 1000 samples of 100 standard-normal features, on `device`.
x = torch.randn(1000, 100).to(device=device)
# Synthetic binary targets stored as floats: 1.0 where a uniform draw
# exceeds 0.5, otherwise 0.0.
y = torch.rand(1000).gt(0.5).float().to(device=device)

In [5]:
from torch.utils.data import TensorDataset, DataLoader

# Pair every feature row with its label so both are indexed together.
dataset = TensorDataset(x, y)
# Serve the dataset in shuffled mini-batches of 100 samples.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=100)

In [6]:
# Walk through every mini-batch, showing its feature tensor and then its
# label tensor.
for features, labels in dataloader:
    print(features)
    print(labels)

tensor([[ 0.4217, -1.6034,  0.8309,  ..., -0.0828,  0.4816, -0.1056],
        [ 0.8294, -0.0793, -0.4452,  ...,  0.1234, -0.3097,  0.4904],
        [ 1.4083, -1.7829,  1.1165,  ...,  0.8527,  0.0048,  1.2228],
        ...,
        [-1.1427,  0.9442,  0.2447,  ..., -0.5201, -1.3575, -0.1999],
        [ 0.6190,  1.1396,  0.2771,  ..., -0.5554, -1.0536, -1.2033],
        [ 1.6340,  0.4454,  1.3743,  ...,  0.3292, -0.7257,  1.1535]],
       device='cuda:0')
tensor([1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
        0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
        1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1.,
        1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
        1., 0., 1., 0., 1., 1., 0., 1., 1., 1.], device='cuda:0')
tensor([[ 0.1825,  0.2435,  1.7229,  ..., -0.1683, -0.7597,  0.2902],
      

# Set up per-layer learning rates

In [15]:
import torch.optim as optim

# Place the model on the target device BEFORE constructing the optimizer, so
# the optimizer is built over parameters that already live where training
# will run.
mlp.to(device)

# One parameter group per layer, each with its own learning rate. The
# top-level `lr=2e-2` is only the default for groups that do not set `lr`;
# both groups below override it.
optimizer = optim.Adam([
    {"params": mlp.first_layer.parameters(), "lr": 1e-2},
    {"params": mlp.second_layer.parameters(), "lr": 1e-3},
], lr=2e-2)

# Binary cross-entropy on raw logits (the model applies no final sigmoid).
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

BCEWithLogitsLoss()

In [16]:
n_epoch = 5
for epoch in range(n_epoch):
    for batch in dataloader:
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Loop-local names: unpacking into `x, y` would overwrite the
        # notebook-level full-dataset tensors with the last mini-batch.
        inputs, targets = batch

        # squeeze(-1) removes only the trailing output dimension, so the
        # logits stay 1-D and match `targets` even for a batch of one sample.
        predictions = mlp(inputs).squeeze(-1)

        loss = criterion(predictions, targets)

        # Backpropagate and apply one optimizer update.
        loss.backward()

        optimizer.step()

In [17]:
# Report the learning rate currently held by each optimizer parameter group.
for group_lr in (group["lr"] for group in optimizer.param_groups):
    print(group_lr)

0.01
0.001


In [18]:
# Write the optimizer's state dict to disk so it can be restored later.
torch.save(obj=optimizer.state_dict(), f="optimizer.pt")

In [19]:
# Build a second optimizer with the same two parameter groups as the saved
# one, then restore the saved state into it.
optimizer2 = optim.Adam([
    {"params": mlp.first_layer.parameters(), "lr": 1e-2},
    {"params": mlp.second_layer.parameters(), "lr": 1e-3},
], lr=2e-2)
# map_location remaps the tensors stored in the checkpoint onto `device`, so
# a file written from a CUDA session can still be loaded where CUDA is
# unavailable.
optimizer2.load_state_dict(torch.load("optimizer.pt", map_location=device))

In [20]:
# Show the restored optimizer's instance attributes (defaults, state, groups).
vars(optimizer2)

{'defaults': {'lr': 0.02,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'weight_decay': 0,
  'amsgrad': False,
  'maximize': False},
 '_zero_grad_profile_name': 'Optimizer.zero_grad#Adam.zero_grad',
 'state': defaultdict(dict,
             {Parameter containing:
              tensor([[ 0.1257,  0.0408, -0.0288,  ...,  0.0996,  0.0468,  0.0352],
                      [ 0.0171, -0.0554, -0.1298,  ..., -0.1226, -0.0520,  0.0684],
                      [ 0.0326,  0.0748,  0.1145,  ..., -0.1658, -0.1641,  0.0314],
                      ...,
                      [ 0.1509,  0.0560,  0.0803,  ..., -0.0011,  0.1267, -0.0259],
                      [ 0.0034, -0.0171,  0.0084,  ...,  0.0181, -0.1051, -0.0074],
                      [-0.0355,  0.0833,  0.0555,  ..., -0.1282, -0.0403, -0.1128]],
                     device='cuda:0', requires_grad=True): {'step': 50,
               'exp_avg': tensor([[-4.8652e-04, -1.4053e-03, -1.5291e-03,  ..., -4.9121e-04,
                         4.4010e-04, -6.219