* changed by nov05 on 2024-11-28  
* Udacity AWS MLE Nanodegree (ND189)  
  Course 4, 3.7 Exercise: Training a Convolutional Neural Network  
* `conda activate drlnd_py310` with cuda enabled   
* Training:  
  * freeze all the layers of resnet18, check [the W&B logs](https://wandb.ai/nov05/udacity-awsmle-resnet18-cifar10/runs/28gkg0jn)    
    Testing Accuracy: 80.86, Testing Loss: 0.5680697995185852   
  * freeze all layers of resnet18 except the last residual block (`layer4`), check [the W&B logs](https://wandb.ai/nov05/udacity-awsmle-resnet18-cifar10/runs/ouiinmmp)  
    Testing Accuracy: 91.55, Testing Loss: 0.27510506587028505  

In [None]:
## use the repo root folder as the working directory
## (the training data is stored in the data\ folder under the repo root)
%cd ..
%pwd

d:\github\udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


'd:\\github\\udacity-CD0387-deep-learning-topics-within-computer-vision-nlp-project-starter'

## Solution: Training a CNN

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.datasets import CIFAR10
from torchvision import transforms
from torchvision.models import resnet18, ResNet18_Weights
from torch.optim.lr_scheduler import CosineAnnealingLR
import time
from tqdm import tqdm
import wandb

## W&B logging is on by default; uncomment the line below to disable it (this also requires `import os`)
# os.environ['WANDB_MODE'] = 'disabled'

In [None]:
class Config:
    """Hyperparameters and runtime switches shared by the cells of this notebook."""

    ## class-level fallback device; the instance created below overrides it
    DEVICE = torch.device('cpu')

    def __init__(self):
        ## W&B switch (not read by any code visible in this notebook)
        self.wandb = True
        ## training schedule: maximum number of epochs and loader batch size
        self.epochs, self.batch_size = 30, 256
        ## optimizer learning rate and weight decay
        self.opt_lr, self.opt_weight_decay = 1e-4, 1e-4
        ## when True, layer4 of resnet18 is fine-tuned together with the new head
        self.unfreeze_top_layers = True


config = Config()
## use the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU
config.DEVICE = (
    torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
)
print(f"👉 Running on device type: {config.DEVICE}")

In [None]:
def test(model, test_loader, criterion):
    """Evaluate `model` on every batch of `test_loader` and print accuracy and loss.

    Args:
        model: network to evaluate; it is switched to eval mode here.
        test_loader: iterable of `(inputs, labels)` batches that also exposes
            `.dataset`, whose length is used to average the metrics.
        criterion: callable `criterion(outputs, labels)` returning the batch loss.

    Returns:
        None. The accuracy (in percent) and the average loss are printed.
    """
    print("Testing model on whole testing dataset...")
    model.eval()
    running_loss = 0
    running_corrects = 0

    ## nothing is back-propagated here, so do not let autograd record the
    ## forward pass (saves memory and time, results are unchanged)
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(config.DEVICE)
            labels = labels.to(config.DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
            ## weight the batch loss by the batch size so the final division
            ## gives a per-sample average even when the last batch is smaller
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels).item()

    total_loss = running_loss / len(test_loader.dataset)
    total_acc = running_corrects / len(test_loader.dataset)
    print(f"🟢 Testing Accuracy: {100*total_acc}, Testing Loss: {total_loss}")
    

def train(model, train_loader, validation_loader, criterion, optimizer, lr_scheduler=None):
    """Train `model`, alternating a 'train' and an 'eval' phase in every epoch.

    Runs for at most `config.epochs` epochs and stops early once the evaluation
    loss has failed to improve for two consecutive epochs. The per-batch loss,
    a periodic running accuracy and the per-epoch loss/accuracy of each phase
    are sent to `wandb.log`.

    Args:
        model: network to optimize. Batches are moved to `config.DEVICE` before
            the forward pass, so the model presumably already lives there.
        train_loader: loader iterated in the 'train' phase; must expose `.dataset`.
        validation_loader: loader iterated in the 'eval' phase; must expose `.dataset`.
        criterion: callable `criterion(outputs, labels)` returning the batch loss.
        optimizer: optimizer stepped after every training batch.
        lr_scheduler: learning-rate scheduler stepped once after every training
            phase. When omitted, the module-level `scheduler` is used, which is
            what this function relied on before the parameter existed.

    Returns:
        The same `model` object (it is trained in place).
    """
    if lr_scheduler is None:
        ## backward compatibility: fall back to the notebook-level scheduler
        lr_scheduler = scheduler

    best_loss = float("inf")
    loaders = {'train': train_loader, 'eval': validation_loader}
    epochs_without_improvement = 0
    patience = 2        ## eval epochs without improvement before stopping
    report_every = 2000  ## samples between two progress reports
    total_steps = 0      ## batches seen so far, counting both phases

    for epoch in tqdm(range(config.epochs)):
        for phase in ['train', 'eval']:
            print(f"👉 Epoch {epoch}, Phase \"{phase}\"")
            is_training = phase == 'train'
            model.train(is_training)  ## train(False) is the same as eval()
            loader = loaders[phase]
            dataset_size = len(loader.dataset)
            running_loss = 0.0
            running_corrects = 0
            running_samples = 0
            next_report_at = report_every

            for inputs, labels in loader:
                total_steps += 1
                inputs = inputs.to(config.DEVICE)
                labels = labels.to(config.DEVICE)
                ## record the autograd graph only when it will be used
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                ## log a plain float rather than a graph-attached tensor
                batch_loss = loss.item()
                wandb.log({"loss": batch_loss}, step=total_steps)

                if is_training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                running_loss += batch_loss * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()
                running_samples += len(inputs)

                ## Report roughly every `report_every` samples and at the end of
                ## the dataset. An exact `% 2000 == 0` test almost never fires
                ## when the batch size does not divide 2000 (e.g. 256).
                if running_samples >= next_report_at or running_samples == dataset_size:
                    next_report_at = (running_samples // report_every + 1) * report_every
                    ## NOTE: the logged value is a fraction in [0, 1] although the
                    ## key says "(%)"; the key is kept so runs stay comparable
                    accuracy = running_corrects / running_samples
                    wandb.log({"accuracy (%)": accuracy}, step=total_steps)
                    print(f"Step {total_steps}, Images [{running_samples}/{dataset_size} "
                          f"({100.0 * (running_samples / dataset_size):.0f}%)] "
                          f"Loss: {batch_loss:.2f} Accuracy: {running_corrects}/{running_samples} "
                          ## for measuring time for testing, remove for students and in the formatting
                          f"({accuracy*100.:.2f}%) Time: {time.asctime()}")

                ## NOTE: Uncomment the lines below to run each phase on only
                ## about 20% of its dataset instead of the whole dataset
                # if running_samples>(0.2*dataset_size):
                #     break

            epoch_loss = running_loss / running_samples
            wandb.log({f"epoch_loss_{phase}": epoch_loss}, step=total_steps)
            epoch_accuracy = running_corrects / running_samples
            wandb.log({f"epoch_accuracy_{phase} (%)": epoch_accuracy}, step=total_steps)

            if is_training:
                lr_scheduler.step()
            elif epoch_loss < best_loss:
                best_loss = epoch_loss
                epochs_without_improvement = 0  ## reset
            else:
                epochs_without_improvement += 1

        ## early stop if the eval loss stops decreasing
        if epochs_without_improvement >= patience:
            break
    return model

def create_model():
    """Build an ImageNet-pretrained resnet18 with a new 10-output classifier head.

    All pretrained parameters are frozen. When `config.unfreeze_top_layers`
    equals True, the last residual block (`layer4`) is made trainable again.
    The replacement `fc` head is newly created and therefore trainable.

    Returns:
        The configured resnet18 module.
    """
    # model = resnet18(pretrained=True)  ## future warning: obsolete
    net = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
    net.requires_grad_(False)  ## freeze resnet18
    if config.unfreeze_top_layers == True:
        ## Unfreeze the last residual block (layer4)
        net.layer4.requires_grad_(True)
    head_inputs = net.fc.in_features
    net.fc = nn.Sequential(nn.Linear(head_inputs, 10))
    return net

## per-channel mean / std used to normalize the images (the standard ImageNet
## statistics, matching the ImageNet-pretrained weights loaded in create_model)
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

## training pipeline: random horizontal flip, then resize / tensor / normalize
training_transform = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
    ]
)
## testing pipeline: same preprocessing without the random flip
testing_transform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
    ]
)


########################################################################
## create model
########################################################################
model = create_model().to(config.DEVICE)
criterion = nn.CrossEntropyLoss()  ## CIFAR10 has 10 classes
if config.unfreeze_top_layers == True:
    ## Use an optimizer that only updates unfrozen layers
    optimizer = optim.Adam(
        [p for p in model.parameters() if p.requires_grad],
        lr=config.opt_lr,
        weight_decay=config.opt_weight_decay,
    )
else:
    ## only the new classifier head is trainable in this configuration
    optimizer = optim.AdamW(
        model.fc.parameters(),
        lr=config.opt_lr,
        weight_decay=config.opt_weight_decay,
    )
    # optimizer = torch.optim.SGD(model.fc.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)
## cosine learning-rate schedule spanning the configured number of epochs
scheduler = CosineAnnealingLR(optimizer, T_max=config.epochs)

## CIFAR10 training split (shuffled) and test split (fixed order)
trainset = CIFAR10(
    root='./data', train=True, download=True, transform=training_transform
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=config.batch_size, shuffle=True
)
testset = CIFAR10(
    root='./data', train=False, download=True, transform=testing_transform
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=config.batch_size, shuffle=False
)

## set the wandb project where this run will be logged
wandb.init(project="udacity-awsmle-resnet18-cifar10", config=config)

########################################################################
## training
########################################################################
## the test split doubles as the validation set during training
train(model, trainloader, testloader, criterion, optimizer)

## froze all layers of resnet18: 
##     Epoch 0, Phase train
##     Images [2000/50000 (4%)] Loss: 1.16 Accuracy: 846/2000 (42.30%) Time: Thu Nov 28 16:48:06 2024
##     Epoch 20, Phase eval
##     Step 39, Images [10000/10000 (100%)] Loss: 0.52 Accuracy: 8073/10000 (80.73%) Time: Thu Nov 28 20:46:01 2024
##     20 epochs, 101m 35.8s
## unfreeze layer4 of resnet18:
##     👉 Epoch 5, Phase "eval"
##     Step 1416, Images [10000/10000 (100%)] Loss: 0.03 Accuracy: 9155/10000 (91.55%) Time: Thu Nov 28 21:42:58 2024 
##     32m    

👉 Running on device type: cuda:0
Files already downloaded and verified
Files already downloaded and verified


0,1
accuracy (%),▁▂▄▆▆▅██▅
epoch_accuracy_eval (%),▁█▇
epoch_accuracy_train (%),▁▆█
epoch_loss_eval,█▁█
epoch_loss_train,█▃▁
loss,▇█▇▇▆▄▄▃▄▃▃▃▂▂▂▂▂▃▂▂▄▃▁▁▁▁▁▁▂▁▁▁▂▂▂▃▁▁▁▁

0,1
accuracy (%),0.9069
epoch_accuracy_eval (%),0.9069
epoch_accuracy_train (%),0.96734
epoch_loss_eval,0.301
epoch_loss_train,0.10426
loss,0.04952


  0%|          | 0/30 [00:00<?, ?it/s]

👉 Epoch 0, Phase "train"
👉 Epoch 0, Phase "eval"


  3%|▎         | 1/30 [05:09<2:29:40, 309.68s/it]

👉 Epoch 1, Phase "train"
👉 Epoch 1, Phase "eval"


  7%|▋         | 2/30 [10:51<2:33:14, 328.37s/it]

👉 Epoch 2, Phase "train"
👉 Epoch 2, Phase "eval"


 10%|█         | 3/30 [16:25<2:28:56, 330.99s/it]

👉 Epoch 3, Phase "train"
👉 Epoch 3, Phase "eval"


 13%|█▎        | 4/30 [21:43<2:21:10, 325.79s/it]

👉 Epoch 4, Phase "train"
👉 Epoch 4, Phase "eval"


 17%|█▋        | 5/30 [27:16<2:16:49, 328.37s/it]

👉 Epoch 5, Phase "train"
👉 Epoch 5, Phase "eval"


 17%|█▋        | 5/30 [32:23<2:41:57, 388.72s/it]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
test(model, testloader, criterion)
## Recorded results of this cell for the two training configurations:
## Freeze all ResNet18 layers, Testing Accuracy: 80.86, Testing Loss: 0.5680697995185852
## Unfreeze layer4 of ResNet18, Testing Accuracy: 91.55, Testing Loss: 0.27510506587028505
## Recorded run time: 24.5s

Testing model on whole testing dataset...
🟢 Testing Accuracy: 91.55, Testing Loss: 0.27510506587028505


```python
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer3): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): BasicBlock(
      (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Sequential(
    (0): Linear(in_features=512, out_features=10, bias=True)
  )
)
```