# Transfer learning - Learning Rate


Readings:
- https://arxiv.org/abs/1506.01186 (Cyclical Learning Rates for Training Neural Networks)
- Jeremy Jordan post: https://www.jeremyjordan.me/nn-learning-rate/
- Learning rate annealing: https://cs231n.github.io/neural-networks-3/#annealing-the-learning-rate
- Experimental results show that 3e-4 is a good starting point for an Adam optimizer

## Fine Tuning

In [1]:
# Setting seeds to try and ensure we have the same results - this is not guaranteed across PyTorch releases.
import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
    ])

trainset = datasets.CIFAR10(root='data/CIFAR10',train=True, download=True,transform=transform)
testset = datasets.CIFAR10(root='data/CIFAR10',train=False, transform=transform)

trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=False)

Files already downloaded and verified


In [6]:
for images, labels in trainloader:
  print(images.size(), labels.size())
  break

torch.Size([64, 3, 224, 224]) torch.Size([64])


In [7]:
model = models.vgg16(pretrained=True)
model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)

In [8]:
for param in model.parameters():
  param.requires_grad = False

In [9]:
for i in range(0,7):
  model.classifier[i].requires_grad = True

In [10]:
model.classifier[6] = nn.Sequential(
                      nn.Linear(4096,512),
                      nn.ReLU(),
                      nn.Dropout(0.5),
                      nn.Linear(512,10),
                      nn.LogSoftmax(dim=1)
                      )


In [11]:
model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [12]:
criterion = nn.NLLLoss()

In [13]:
from torch.optim import Adam

model = model.to(device)



## Training from the Fully Connected Network onwards

### Re-training the model

In [14]:
from torch.optim import Adam

lr = 3e-4
optimizer = Adam([
    { 'params': model.classifier[0].parameters(), 'lr': lr},
    { 'params': model.classifier[3].parameters(), 'lr': lr},
    { 'params': model.classifier[6].parameters(), 'lr': lr}
    ], lr=lr)

In [14]:
model = model.to(device)
#optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

Epoch(0/1 : Batch number(1/782) : Batch loss : 2.3528125286102295
Epoch(0/1 : Batch number(2/782) : Batch loss : 2.281886577606201
Epoch(0/1 : Batch number(3/782) : Batch loss : 2.0992794036865234
Epoch(0/1 : Batch number(4/782) : Batch loss : 2.1647257804870605
Epoch(0/1 : Batch number(5/782) : Batch loss : 2.119939088821411
Epoch(0/1 : Batch number(6/782) : Batch loss : 1.9770513772964478
Epoch(0/1 : Batch number(7/782) : Batch loss : 1.9793083667755127
Epoch(0/1 : Batch number(8/782) : Batch loss : 1.9261523485183716
Epoch(0/1 : Batch number(9/782) : Batch loss : 1.7236722707748413
Epoch(0/1 : Batch number(10/782) : Batch loss : 1.6771641969680786
Epoch(0/1 : Batch number(11/782) : Batch loss : 1.5943920612335205
Epoch(0/1 : Batch number(12/782) : Batch loss : 1.6339595317840576
Epoch(0/1 : Batch number(13/782) : Batch loss : 1.5308122634887695
Epoch(0/1 : Batch number(14/782) : Batch loss : 1.6044483184814453
Epoch(0/1 : Batch number(15/782) : Batch loss : 1.3871948719024658
Epoch(

### The accuracy of the model

In [15]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

Batch (1/157)
Batch (2/157)
Batch (3/157)
Batch (4/157)
Batch (5/157)
Accuracy of the model on 320 test images: 79.0625% 


## Un-freezing & training on the LAST CNN block onwards

### Re-training the model

In [0]:
for i in range(24,31):
  model.features[i].requires_grad = True


In [0]:
from torch.optim import Adam

lr = 3e-4
optimizer = Adam([
    { 'params': model.features[24].parameters(), 'lr': lr},
    { 'params': model.features[26].parameters(), 'lr': lr},
    { 'params': model.features[28].parameters(), 'lr': lr},
    { 'params': model.classifier[0].parameters(), 'lr': lr},
    { 'params': model.classifier[3].parameters(), 'lr': lr},
    { 'params': model.classifier[6].parameters(), 'lr': lr}
    ], lr=lr)

In [18]:
model = model.to(device)
#optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 3
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

Epoch(0/3 : Batch number(1/782) : Batch loss : 0.37384113669395447
Epoch(0/3 : Batch number(2/782) : Batch loss : 0.5146580934524536
Epoch(0/3 : Batch number(3/782) : Batch loss : 0.5885090827941895
Epoch(0/3 : Batch number(4/782) : Batch loss : 0.31350016593933105
Epoch(0/3 : Batch number(5/782) : Batch loss : 0.4741794764995575
Epoch(0/3 : Batch number(6/782) : Batch loss : 0.33164510130882263
Epoch(0/3 : Batch number(7/782) : Batch loss : 0.5082323551177979
Epoch(0/3 : Batch number(8/782) : Batch loss : 0.5119118094444275
Epoch(0/3 : Batch number(9/782) : Batch loss : 0.5857939720153809
Epoch(0/3 : Batch number(10/782) : Batch loss : 0.4682360887527466
Epoch(0/3 : Batch number(11/782) : Batch loss : 0.3581196963787079
Epoch(0/3 : Batch number(12/782) : Batch loss : 0.3989426791667938
Epoch(0/3 : Batch number(13/782) : Batch loss : 0.5390603542327881
Epoch(0/3 : Batch number(14/782) : Batch loss : 0.3741486966609955
Epoch(0/3 : Batch number(15/782) : Batch loss : 0.6649900078773499
E

KeyboardInterrupt: ignored

### The accuracy of the model

In [19]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

Batch (1/157)
Batch (2/157)
Batch (3/157)
Batch (4/157)
Batch (5/157)
Accuracy of the model on 320 test images: 82.8125% 


## Un-freezing & training on the LAST TWO CNN block onwards

### Re-training the model

In [0]:
for i in range(17,24):
  model.features[i].requires_grad = True

In [0]:
from torch.optim import Adam

lr = 3e-4
optimizer = Adam([
    { 'params': model.features[17].parameters(), 'lr': lr},
    { 'params': model.features[19].parameters(), 'lr': lr},
    { 'params': model.features[21].parameters(), 'lr': lr},
    { 'params': model.features[24].parameters(), 'lr': lr},
    { 'params': model.features[26].parameters(), 'lr': lr},
    { 'params': model.features[28].parameters(), 'lr': lr},
    { 'params': model.classifier[0].parameters(), 'lr': lr},
    { 'params': model.classifier[3].parameters(), 'lr': lr},
    { 'params': model.classifier[6].parameters(), 'lr': lr}
    ], lr=lr)

In [22]:
model = model.to(device)
#optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()))

num_epochs = 1
batch_loss = 0
cum_epoch_loss = 0

for e in range(num_epochs):
  cum_epoch_loss = 0
  
  for batch, (images, labels) in enumerate(trainloader,1):
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    logps = model(images)
    loss = criterion(logps, labels)
    loss.backward()
    optimizer.step()
    
    batch_loss += loss.item()
    print(f'Epoch({e}/{num_epochs} : Batch number({batch}/{len(trainloader)}) : Batch loss : {loss.item()}')
    
  print(f'Training loss : {batch_loss/len(trainloader)}')  
    

Epoch(0/1 : Batch number(1/782) : Batch loss : 0.3778439164161682
Epoch(0/1 : Batch number(2/782) : Batch loss : 0.23955854773521423
Epoch(0/1 : Batch number(3/782) : Batch loss : 0.5916640758514404
Epoch(0/1 : Batch number(4/782) : Batch loss : 0.3050242066383362
Epoch(0/1 : Batch number(5/782) : Batch loss : 0.5073587894439697
Epoch(0/1 : Batch number(6/782) : Batch loss : 0.28958654403686523
Epoch(0/1 : Batch number(7/782) : Batch loss : 0.4297773838043213
Epoch(0/1 : Batch number(8/782) : Batch loss : 0.24711520969867706
Epoch(0/1 : Batch number(9/782) : Batch loss : 0.3713686764240265
Epoch(0/1 : Batch number(10/782) : Batch loss : 0.3815925121307373
Epoch(0/1 : Batch number(11/782) : Batch loss : 0.38709938526153564
Epoch(0/1 : Batch number(12/782) : Batch loss : 0.4454638659954071
Epoch(0/1 : Batch number(13/782) : Batch loss : 0.357410728931427
Epoch(0/1 : Batch number(14/782) : Batch loss : 0.43848666548728943
Epoch(0/1 : Batch number(15/782) : Batch loss : 0.31920498609542847

### The accuracy of the model

In [23]:
model.to('cpu')

model.eval()
with torch.no_grad():
    num_correct = 0
    total = 0

    #set_trace()
    for batch, (images, labels) in enumerate(testloader,1):
        
        logps = model(images)
        output = torch.exp(logps)
        
        pred = torch.argmax(output, 1)
        total += labels.size(0)
        num_correct += (pred == labels).sum().item()
        print(f'Batch ({batch}/{len(testloader)})')
        
        if batch == 5:
          break

    print(f'Accuracy of the model on {total} test images: {num_correct * 100 / total}% ')

Batch (1/157)
Batch (2/157)
Batch (3/157)
Batch (4/157)
Batch (5/157)
Accuracy of the model on 320 test images: 84.0625% 
