In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm


# Batch Normalization

In [2]:
class Model_BN(nn.Module):
    def __init__(self):
        super(Model_BN, self).__init__()
        self.flat = nn.Flatten()
        self.bn1 = nn.BatchNorm1d(784)
        self.linear1 = nn.Linear(784, 300)
        self.relu = nn.ReLU()
        self.bn2 = nn.BatchNorm1d(300)
        self.linear2 = nn.Linear(300, 100)
        self.bn3 = nn.BatchNorm1d(100)
        self.linear3 = nn.Linear(100, 10)
        self.smax = nn.Softmax(dim=1)
            
    def forward(self, X):
        out = self.flat(X)
        out = self.bn1(out)
        out = self.linear1(out)
        out = self.relu(out)
        out = self.bn2(out)
        out = self.linear2(out)
        out = self.relu(out)
        out = self.bn3(out)
        out = self.linear3(out)
        out = self.smax(out)
        return out

In [3]:
model1 = Model_BN()

from torchsummary import summary
summary(model1, (1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
       BatchNorm1d-2                  [-1, 784]           1,568
            Linear-3                  [-1, 300]         235,500
              ReLU-4                  [-1, 300]               0
       BatchNorm1d-5                  [-1, 300]             600
            Linear-6                  [-1, 100]          30,100
              ReLU-7                  [-1, 100]               0
       BatchNorm1d-8                  [-1, 100]             200
            Linear-9                   [-1, 10]           1,010
          Softmax-10                   [-1, 10]               0
Total params: 268,978
Trainable params: 268,978
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 1.03
Estimated T

Note that the Number of trainable parameters is the same as in Keras. Only, there are additional non-trainable parameters (the running averages of mean and variance in BN layer) which don't appear in PyTorch (not stored?)

For models with BN, it doesn't really make sense to have bias with linear layers (since its effect is cancelled by the BN layer)

In [4]:
class Model_BN_nb(nn.Module):
    def __init__(self):
        super(Model_BN_nb, self).__init__()
        self.flat = nn.Flatten()
        self.bn1 = nn.BatchNorm1d(784)
        self.linear1 = nn.Linear(784, 300, bias=False)
        self.relu = nn.ReLU()
        self.bn2 = nn.BatchNorm1d(300)
        self.linear2 = nn.Linear(300, 100, bias=False)
        self.bn3 = nn.BatchNorm1d(100)
        self.linear3 = nn.Linear(100, 10)
        self.smax = nn.Softmax(dim=1)
            
    def forward(self, X):
        out = self.flat(X)
        out = self.bn1(out)
        out = self.linear1(out)
        out = self.relu(out)
        out = self.bn2(out)
        out = self.linear2(out)
        out = self.relu(out)
        out = self.bn3(out)
        out = self.linear3(out)
        out = self.smax(out)
        return out

In [5]:
model2 = Model_BN_nb()

from torchsummary import summary
summary(model2, (1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
       BatchNorm1d-2                  [-1, 784]           1,568
            Linear-3                  [-1, 300]         235,200
              ReLU-4                  [-1, 300]               0
       BatchNorm1d-5                  [-1, 300]             600
            Linear-6                  [-1, 100]          30,000
              ReLU-7                  [-1, 100]               0
       BatchNorm1d-8                  [-1, 100]             200
            Linear-9                   [-1, 10]           1,010
          Softmax-10                   [-1, 10]               0
Total params: 268,578
Trainable params: 268,578
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 1.02
Estimated T

# Gradient Clipping

This needs to be done inside the PyTorchTrainer

### By Value

```
# backward pass and optimize
                loss.backward()
                nn.utils.clip_grad_value_(self.model.parameters(), clipvalue=1.0)
                self.optim.step()
```

### By Norm

```
# backward pass and optimize
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0, norm_type=2)
                self.optim.step()
```

# Reusing a PyTorch Model

In [6]:
train_dataset = torchvision.datasets.FashionMNIST(
                    root='.',
                    train=True,
                    transform=transforms.ToTensor(), # This will automatically normalize data!
                    download=True
                    )

test_dataset = torchvision.datasets.FashionMNIST(
                    root='.',
                    train=False,
                    transform=transforms.ToTensor(), # This will automatically normalize data!
                    download=True
                    )

In [7]:
def split_dataset(X, y):
    y_5_or_6 = (y == 5) | (y == 6) # sandals or shirts
    y_A = y[~y_5_or_6]
    y_A[y_A > 6] -= 2 # class indices 7, 8, 9 should be moved to 5, 6, 7
    y_B = torch.from_numpy((y[y_5_or_6] == 6).numpy().astype(np.float32)) # binary classification task: is it a shirt (class 6)?
    return ((X[~y_5_or_6], y_A),
            (X[y_5_or_6], y_B))

(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(train_dataset.data, train_dataset.targets)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(test_dataset.data, test_dataset.targets)
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]

In [8]:
X_train_A = X_train_A.type(torch.FloatTensor)
X_train_B = X_train_B.type(torch.FloatTensor)
X_test_A = X_test_A.type(torch.FloatTensor)
X_test_B = X_test_B.type(torch.FloatTensor)

y_train_A = y_train_A.type(torch.LongTensor)
y_train_B = y_train_B.type(torch.LongTensor)
y_test_A = y_test_A.type(torch.LongTensor)
y_test_B = y_test_B.type(torch.LongTensor)

In [9]:
train_a_dataset = torch.utils.data.TensorDataset(X_train_A, y_train_A)
test_a_dataset = torch.utils.data.TensorDataset(X_test_A, y_test_A)

train_b_dataset = torch.utils.data.TensorDataset(X_train_B, y_train_B)
test_b_dataset = torch.utils.data.TensorDataset(X_test_B, y_test_B)

In [10]:
batch_size = 32 # The default in Keras
train_loader = torch.utils.data.DataLoader(
                        dataset=train_dataset,
                        batch_size=batch_size,
                        shuffle=False
                        )


test_loader = torch.utils.data.DataLoader(
                        dataset=test_dataset,
                        batch_size=batch_size,
                        shuffle=False # Not necessary!
                        )

In [11]:
batch_size = 32 # The default in Keras
train_a_loader = torch.utils.data.DataLoader(
                        dataset=train_a_dataset,
                        batch_size=batch_size,
                        shuffle=False
                        )


test_a_loader = torch.utils.data.DataLoader(
                        dataset=test_a_dataset,
                        batch_size=batch_size,
                        shuffle=False # Not necessary!
                        )

train_b_loader = torch.utils.data.DataLoader(
                        dataset=train_b_dataset,
                        batch_size=batch_size,
                        shuffle=False
                        )


test_b_loader = torch.utils.data.DataLoader(
                        dataset=test_b_dataset,
                        batch_size=batch_size,
                        shuffle=False # Not necessary!
                        )

In [12]:
class Model_TL_A(nn.Module):
    def __init__(self):
        super(Model_TL_A, self).__init__()
        self.flatten = nn.Flatten()
        current_dim = 784
        layers = []
        for n_hidden in (300, 100, 50, 50, 50):
            layers.append(nn.Linear(current_dim, n_hidden))
            layers.append(nn.SELU())
            current_dim = n_hidden
        self.layers = nn.Sequential(*layers)
        self.linear1 = nn.Linear(current_dim, 8)
            
    def forward(self, X):
        out = self.flatten(X)
        for layer in self.layers:
            out = layer(out)
        out = self.linear1(out)
        return out

In [13]:
model_tl_a = Model_TL_A()

from torchsummary import summary
summary(model_tl_a, (1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                  [-1, 300]         235,500
              SELU-3                  [-1, 300]               0
            Linear-4                  [-1, 100]          30,100
              SELU-5                  [-1, 100]               0
            Linear-6                   [-1, 50]           5,050
              SELU-7                   [-1, 50]               0
            Linear-8                   [-1, 50]           2,550
              SELU-9                   [-1, 50]               0
           Linear-10                   [-1, 50]           2,550
             SELU-11                   [-1, 50]               0
           Linear-12                    [-1, 8]             408
Total params: 276,158
Trainable params: 276,158
Non-trainable params: 0
-------------------------------

In [14]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_tl_a.parameters(), lr=0.001)

In [15]:
class PyTorchTrainer(object):
    def __init__(self,
                model,
                optim,
                loss_fn,
                device="cpu"):
        self.model = model
        self.optim = optim
        self.loss_fn = loss_fn
        self.device = device
        
    def fit(self,
            train_loader,
            test_loader,
            n_epochs: int=100,
            eval_inter: int=1):
        history = {}
        history['epoch'] = []
        history['training_acc'] = []
        history['test_acc'] = []
        history['training_loss'] = []
        history['test_loss'] = []
        # Training Loop

        history['p_test'] = []
        p_test = np.array([]) # for listing all predictions - last epoch only

        for epoch in range(n_epochs):
            history['epoch'].append(epoch+1)
            
            self.model.train()
            
            train_loss = []
            n_correct = 0.
            n_total = 0.
            for inputs, targets in tqdm(train_loader, leave=False):
                # Move data to GPU
                inputs, targets = inputs.to(self.device), targets.to(self.device)

                # zero the gradient
                self.optim.zero_grad()

                # forward pass
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, targets)

                # get prediction
                _, predictions = torch.max(outputs, 1)

                # update counts
                n_correct += (predictions == targets).sum().item()
                n_total += targets.shape[0]

                # backward pass and optimize
                loss.backward()
                self.optim.step()

                train_loss.append(loss.item())

            train_loss = np.mean(train_loss)
            history['training_acc'].append(n_correct / n_total * 100) 
            # save losses
            history['training_loss'].append(train_loss) 
    
           
            self.model.eval()
            with torch.no_grad():
                test_loss = []
                n_correct = 0.
                n_total = 0.
                for inputs, targets in test_loader:
                    # Move data to GPU
                    inputs, targets = inputs.to(self.device), targets.to(self.device)

                    # forward pass
                    outputs_test = self.model(inputs)
                    loss_test = self.loss_fn(outputs_test, targets)

                    # get prediction
                    _, predictions = torch.max(outputs_test, 1)

                    # update list of predictions  - done for last epoch only!
                    if epoch == (n_epochs - 1):
                        p_test = np.concatenate((p_test, predictions.cpu().numpy()))

                    # update counts
                    n_correct += (predictions == targets).sum().item()
                    n_total += targets.shape[0]

                    test_loss.append(loss_test.item())
            history['p_test'] = p_test
            test_loss = np.mean(test_loss)
            history['test_acc'].append(n_correct / n_total * 100) 
            # save losses
            history['test_loss'].append(test_loss) 
            
            if (epoch +1) % eval_inter == 0:
                print(f"Epoch: {epoch+1}/{n_epochs}, Train Accuracy: {history['training_acc'][-1]:.2f}%, Test Accuracy: {history['test_acc'][-1]:.2f}%, Train Loss: {history['training_loss'][-1]:.4f}, Test Loss: {history['test_loss'][-1]:.4f}", end='')
        return history

In [16]:
# If GPU...
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model_tl_a.to(device)

cpu


Model_TL_A(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=300, bias=True)
    (1): SELU()
    (2): Linear(in_features=300, out_features=100, bias=True)
    (3): SELU()
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): SELU()
    (6): Linear(in_features=50, out_features=50, bias=True)
    (7): SELU()
    (8): Linear(in_features=50, out_features=50, bias=True)
    (9): SELU()
  )
  (linear1): Linear(in_features=50, out_features=8, bias=True)
)

In [17]:
trainer = PyTorchTrainer(model_tl_a, optimizer, loss_fn,device)
history={}
history = trainer.fit(train_a_loader,test_a_loader,20,1)

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 1/20, Train Accuracy: 86.07%, Test Accuracy: 87.79%, Train Loss: 0.4087, Test Loss: 0.3434

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 2/20, Train Accuracy: 90.15%, Test Accuracy: 89.04%, Train Loss: 0.2848, Test Loss: 0.3051

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 3/20, Train Accuracy: 91.19%, Test Accuracy: 89.89%, Train Loss: 0.2561, Test Loss: 0.2862

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 4/20, Train Accuracy: 91.76%, Test Accuracy: 90.11%, Train Loss: 0.2378, Test Loss: 0.2770

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 5/20, Train Accuracy: 92.27%, Test Accuracy: 90.44%, Train Loss: 0.2240, Test Loss: 0.2661

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 6/20, Train Accuracy: 92.54%, Test Accuracy: 90.71%, Train Loss: 0.2127, Test Loss: 0.2575

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 7/20, Train Accuracy: 92.83%, Test Accuracy: 90.95%, Train Loss: 0.2030, Test Loss: 0.2506

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 8/20, Train Accuracy: 93.14%, Test Accuracy: 91.21%, Train Loss: 0.1940, Test Loss: 0.2450

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 9/20, Train Accuracy: 93.45%, Test Accuracy: 91.38%, Train Loss: 0.1858, Test Loss: 0.2393

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 10/20, Train Accuracy: 93.68%, Test Accuracy: 91.70%, Train Loss: 0.1783, Test Loss: 0.2330

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 11/20, Train Accuracy: 93.91%, Test Accuracy: 91.70%, Train Loss: 0.1712, Test Loss: 0.2324

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 12/20, Train Accuracy: 94.16%, Test Accuracy: 91.72%, Train Loss: 0.1648, Test Loss: 0.2311

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 13/20, Train Accuracy: 94.38%, Test Accuracy: 91.84%, Train Loss: 0.1586, Test Loss: 0.2284

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 14/20, Train Accuracy: 94.63%, Test Accuracy: 91.80%, Train Loss: 0.1528, Test Loss: 0.2278

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 15/20, Train Accuracy: 94.80%, Test Accuracy: 92.16%, Train Loss: 0.1474, Test Loss: 0.2248

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 16/20, Train Accuracy: 94.99%, Test Accuracy: 92.15%, Train Loss: 0.1421, Test Loss: 0.2264

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 17/20, Train Accuracy: 95.20%, Test Accuracy: 92.06%, Train Loss: 0.1371, Test Loss: 0.2280

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 18/20, Train Accuracy: 95.39%, Test Accuracy: 92.07%, Train Loss: 0.1325, Test Loss: 0.2279

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 19/20, Train Accuracy: 95.51%, Test Accuracy: 92.00%, Train Loss: 0.1278, Test Loss: 0.2293

  0%|          | 0/1500 [00:00<?, ?it/s]

Epoch: 20/20, Train Accuracy: 95.67%, Test Accuracy: 92.11%, Train Loss: 0.1232, Test Loss: 0.2300

In [18]:
torch.save(model_tl_a.state_dict(), 'model_tl_a.pth')

In [17]:
class Model_TL_B(nn.Module):
    def __init__(self):
        super(Model_TL_B, self).__init__()
        self.flatten = nn.Flatten()
        current_dim = 784
        layers = []
        for n_hidden in (300, 100, 50, 50, 50):
            layers.append(nn.Linear(current_dim, n_hidden))
            layers.append(nn.SELU())
            current_dim = n_hidden
        self.layers = nn.Sequential(*layers)
        self.linear1 = nn.Linear(current_dim, 2)
            
    def forward(self, X):
        out = self.flatten(X)
        for layer in self.layers:
            out = layer(out)
        out = self.linear1(out)
        return out

In [18]:
model_tl_b = Model_TL_B()

from torchsummary import summary
summary(model_tl_b, (1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                  [-1, 300]         235,500
              SELU-3                  [-1, 300]               0
            Linear-4                  [-1, 100]          30,100
              SELU-5                  [-1, 100]               0
            Linear-6                   [-1, 50]           5,050
              SELU-7                   [-1, 50]               0
            Linear-8                   [-1, 50]           2,550
              SELU-9                   [-1, 50]               0
           Linear-10                   [-1, 50]           2,550
             SELU-11                   [-1, 50]               0
           Linear-12                    [-1, 2]             102
Total params: 275,852
Trainable params: 275,852
Non-trainable params: 0
-------------------------------

In [19]:
# If GPU...
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model_tl_b.to(device)

cpu


Model_TL_B(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=300, bias=True)
    (1): SELU()
    (2): Linear(in_features=300, out_features=100, bias=True)
    (3): SELU()
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): SELU()
    (6): Linear(in_features=50, out_features=50, bias=True)
    (7): SELU()
    (8): Linear(in_features=50, out_features=50, bias=True)
    (9): SELU()
  )
  (linear1): Linear(in_features=50, out_features=2, bias=True)
)

In [20]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_tl_b.parameters(), lr=0.001)

In [21]:
trainer = PyTorchTrainer(model_tl_b, optimizer, loss_fn,device)
history={}
history = trainer.fit(train_b_loader,test_b_loader,20,1)

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 1/20, Train Accuracy: 72.50%, Test Accuracy: 81.05%, Train Loss: 0.5325, Test Loss: 0.4075

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 2/20, Train Accuracy: 88.50%, Test Accuracy: 91.75%, Train Loss: 0.3144, Test Loss: 0.2796

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 3/20, Train Accuracy: 94.50%, Test Accuracy: 95.70%, Train Loss: 0.2177, Test Loss: 0.2130

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 4/20, Train Accuracy: 96.50%, Test Accuracy: 97.10%, Train Loss: 0.1654, Test Loss: 0.1742

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 5/20, Train Accuracy: 98.50%, Test Accuracy: 97.55%, Train Loss: 0.1337, Test Loss: 0.1497

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 6/20, Train Accuracy: 98.50%, Test Accuracy: 97.85%, Train Loss: 0.1121, Test Loss: 0.1324

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 7/20, Train Accuracy: 99.00%, Test Accuracy: 97.95%, Train Loss: 0.0961, Test Loss: 0.1184

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 8/20, Train Accuracy: 99.00%, Test Accuracy: 98.25%, Train Loss: 0.0839, Test Loss: 0.1079

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 9/20, Train Accuracy: 99.00%, Test Accuracy: 98.30%, Train Loss: 0.0742, Test Loss: 0.0994

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 10/20, Train Accuracy: 99.50%, Test Accuracy: 98.35%, Train Loss: 0.0663, Test Loss: 0.0922

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 11/20, Train Accuracy: 99.50%, Test Accuracy: 98.35%, Train Loss: 0.0598, Test Loss: 0.0861

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 12/20, Train Accuracy: 100.00%, Test Accuracy: 98.50%, Train Loss: 0.0542, Test Loss: 0.0809

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 13/20, Train Accuracy: 100.00%, Test Accuracy: 98.65%, Train Loss: 0.0494, Test Loss: 0.0763

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 14/20, Train Accuracy: 100.00%, Test Accuracy: 98.65%, Train Loss: 0.0452, Test Loss: 0.0724

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 15/20, Train Accuracy: 100.00%, Test Accuracy: 98.70%, Train Loss: 0.0417, Test Loss: 0.0689

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 16/20, Train Accuracy: 100.00%, Test Accuracy: 98.70%, Train Loss: 0.0386, Test Loss: 0.0659

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 17/20, Train Accuracy: 100.00%, Test Accuracy: 98.70%, Train Loss: 0.0358, Test Loss: 0.0632

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 18/20, Train Accuracy: 100.00%, Test Accuracy: 98.70%, Train Loss: 0.0334, Test Loss: 0.0607

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 19/20, Train Accuracy: 100.00%, Test Accuracy: 98.70%, Train Loss: 0.0312, Test Loss: 0.0585

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 20/20, Train Accuracy: 100.00%, Test Accuracy: 98.70%, Train Loss: 0.0292, Test Loss: 0.0565

In [24]:
class Model_TL_B_on_A(nn.Module):
    def __init__(self, Model_TL_A):
        super(Model_TL_B_on_A, self).__init__()
        model_tl_a = Model_TL_A()
        self.model = nn.Sequential(*list(model_tl_a.children())[:-1])
        self.linear1 = nn.Linear(50, 2)
            
    def forward(self, X):
        out = self.model(X)
        out = self.linear1(out)
        return out

In [25]:
model_tl_b_on_a = Model_TL_B_on_A(Model_TL_A)
from torchsummary import summary
summary(model_tl_b_on_a, (1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                  [-1, 300]         235,500
              SELU-3                  [-1, 300]               0
            Linear-4                  [-1, 100]          30,100
              SELU-5                  [-1, 100]               0
            Linear-6                   [-1, 50]           5,050
              SELU-7                   [-1, 50]               0
            Linear-8                   [-1, 50]           2,550
              SELU-9                   [-1, 50]               0
           Linear-10                   [-1, 50]           2,550
             SELU-11                   [-1, 50]               0
           Linear-12                    [-1, 2]             102
Total params: 275,852
Trainable params: 275,852
Non-trainable params: 0
-------------------------------

In [26]:
for parameter in model_tl_b_on_a.parameters():
    parameter.requires_grad = False

model_tl_b_on_a.linear1.weight.requires_grad = True
model_tl_b_on_a.linear1.bias.requires_grad = True

In [27]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_tl_b_on_a.parameters(), lr=0.001)

In [28]:
trainer = PyTorchTrainer(model_tl_b_on_a, optimizer, loss_fn,device)
history={}
history = trainer.fit(train_b_loader,test_b_loader,4,1)

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 1/4, Train Accuracy: 53.00%, Test Accuracy: 51.75%, Train Loss: 0.7895, Test Loss: 0.8038

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 2/4, Train Accuracy: 55.50%, Test Accuracy: 52.75%, Train Loss: 0.7394, Test Loss: 0.7582

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 3/4, Train Accuracy: 58.50%, Test Accuracy: 53.80%, Train Loss: 0.7012, Test Loss: 0.7236

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 4/4, Train Accuracy: 60.00%, Test Accuracy: 55.35%, Train Loss: 0.6720, Test Loss: 0.6969

In [29]:
for parameter in model_tl_b_on_a.parameters():
    parameter.requires_grad = True

In [30]:
history={}
history = trainer.fit(train_b_loader,test_b_loader,16,1)

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 1/16, Train Accuracy: 74.50%, Test Accuracy: 85.40%, Train Loss: 0.5003, Test Loss: 0.3831

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 2/16, Train Accuracy: 93.50%, Test Accuracy: 93.10%, Train Loss: 0.2907, Test Loss: 0.2743

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 3/16, Train Accuracy: 97.50%, Test Accuracy: 95.65%, Train Loss: 0.2048, Test Loss: 0.2142

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 4/16, Train Accuracy: 97.50%, Test Accuracy: 96.50%, Train Loss: 0.1567, Test Loss: 0.1790

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 5/16, Train Accuracy: 99.00%, Test Accuracy: 97.15%, Train Loss: 0.1271, Test Loss: 0.1544

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 6/16, Train Accuracy: 99.00%, Test Accuracy: 97.55%, Train Loss: 0.1056, Test Loss: 0.1363

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 7/16, Train Accuracy: 99.00%, Test Accuracy: 97.80%, Train Loss: 0.0895, Test Loss: 0.1225

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 8/16, Train Accuracy: 99.00%, Test Accuracy: 98.20%, Train Loss: 0.0769, Test Loss: 0.1102

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 9/16, Train Accuracy: 99.00%, Test Accuracy: 98.35%, Train Loss: 0.0669, Test Loss: 0.1006

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 10/16, Train Accuracy: 100.00%, Test Accuracy: 98.30%, Train Loss: 0.0593, Test Loss: 0.0928

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 11/16, Train Accuracy: 100.00%, Test Accuracy: 98.35%, Train Loss: 0.0531, Test Loss: 0.0866

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 12/16, Train Accuracy: 100.00%, Test Accuracy: 98.35%, Train Loss: 0.0480, Test Loss: 0.0812

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 13/16, Train Accuracy: 100.00%, Test Accuracy: 98.45%, Train Loss: 0.0436, Test Loss: 0.0765

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 14/16, Train Accuracy: 100.00%, Test Accuracy: 98.55%, Train Loss: 0.0400, Test Loss: 0.0725

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 15/16, Train Accuracy: 100.00%, Test Accuracy: 98.65%, Train Loss: 0.0368, Test Loss: 0.0690

  0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 16/16, Train Accuracy: 100.00%, Test Accuracy: 98.65%, Train Loss: 0.0340, Test Loss: 0.0659