In [412]:
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch import Tensor
from torch.utils.data import DataLoader
from torchsummary import summary
from torchvision import datasets
from tqdm import tqdm

In [413]:
def get_device() -> torch.device:
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    x = torch.ones(1, device=device)

    return device

In [414]:
ROOT = Path().resolve()
print(f"Working directory: {ROOT}")

Working directory: /Users/luca/Documents/GitRepos/dlo/tutorials


In [415]:
device = get_device()
print(f"device: {device}")

device: mps


In [None]:
BATCH_SIZE = 64

dataset_transform = transforms.Compose(
    [
        # Resize to 28x28
        transforms.Resize((28, 28)),
        # Normalize to [0, 1]
        transforms.ToTensor(),
        # Flatten 2D image to 1D vector
        transforms.Lambda(lambda x: x.view(-1)),
    ]
)

# Initialize datasets
dataset_train = datasets.MNIST(
    root="images" / ROOT,
    train=True,
    download=True,
    transform=dataset_transform,
)
dataset_test = datasets.MNIST(
    root="images" / ROOT,
    train=False,
    download=True,
    transform=dataset_transform,
)

# Initialize dataloaders
dataloader_train = DataLoader(
    dataset=dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
dataloader_test = DataLoader(
    dataset=dataset_test,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

print(f"dataset_train: {dataset_train}")
print(f"dataset_test: {dataset_test}")

print(f"dataloader_train: {dataloader_train}")
print(f"dataloader_test: {dataloader_test}")

print(f"len(dataset_train): {len(dataset_train)}")
print(f"len(dataset_test): {len(dataset_test)}")

print(f"classes: {dataset_train.classes}")

dataset_train: Dataset MNIST
    Number of datapoints: 60000
    Root location: /Users/luca/Documents/GitRepos/dlo/tutorials
    Split: Train
    StandardTransform
Transform: Compose(
               Resize(size=(28, 28), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
               Lambda()
           )
dataset_test: Dataset MNIST
    Number of datapoints: 10000
    Root location: /Users/luca/Documents/GitRepos/dlo/tutorials
    Split: Test
    StandardTransform
Transform: Compose(
               Resize(size=(28, 28), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
               Lambda()
           )
dataloader_train: <torch.utils.data.dataloader.DataLoader object at 0x17cc4fb50>
dataloader_test: <torch.utils.data.dataloader.DataLoader object at 0x17cc4fd50>
len(dataset_train): 60000
len(dataset_test): 10000
classes: ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', '5 - five', '6 - six', '7 - seven', '8 - ei

In [417]:
# Custom model
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

        self.fc1 = nn.Linear(1 * 28 * 28, 128)  # C x H x W
        self.fc2 = nn.Linear(self.fc1.out_features, 64)
        self.fc3 = nn.Linear(self.fc2.out_features, len(dataset_train.classes))

    def forward(self, x: Tensor) -> Tensor:
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x


model = Model().to(device)
print(f"model: {model}")

model: Model(
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=10, bias=True)
)


In [418]:
# Summary
summary(model.to("cpu"), (model.fc1.in_features,))
model = model.to(device)
print(next(model.parameters()).device)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 128]         100,480
            Linear-2                   [-1, 64]           8,256
            Linear-3                   [-1, 10]             650
Total params: 109,386
Trainable params: 109,386
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.42
Estimated Total Size (MB): 0.42
----------------------------------------------------------------
mps:0


In [419]:
# Optimizer
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    # momentum=0.9,
    # nesterov=True,
)
optimizer = optim.Adagrad(
    model.parameters(),
    lr=0.01,
)
optimizer = optim.RMSprop(
    model.parameters(),
    lr=0.01,
)
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.01,
)
print(f"optimizer: {optimizer}")

for param_group in optimizer.param_groups:
    print(param_group)

print(f"Available optimizers: {torch.optim.__all__}")

optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0.01
)
{'params': [Parameter containing:
tensor([[-0.0209,  0.0355, -0.0357,  ...,  0.0305,  0.0037,  0.0237],
        [ 0.0254, -0.0068, -0.0073,  ...,  0.0167, -0.0259, -0.0133],
        [-0.0105, -0.0237, -0.0241,  ...,  0.0346,  0.0077, -0.0248],
        ...,
        [-0.0174, -0.0125, -0.0073,  ...,  0.0302,  0.0335,  0.0163],
        [-0.0221,  0.0013,  0.0259,  ...,  0.0345, -0.0268, -0.0013],
        [-0.0228, -0.0354,  0.0195,  ..., -0.0144, -0.0228, -0.0277]],
       device='mps:0', requires_grad=True), Parameter containing:
tensor([-0.0259, -0.0275, -0.0119,  0.0087, -0.0148,  0.0341,  0.0120,  0.0151,
         0.0188,  0.0048, -0.0315,  0.0068, -0.0018,  0.0087, -0.0282,  0.0262,
        -0.0211,  0.0355,  0.0186, -0.0198, -0.0285, -0.0044,  0.0023, 

In [420]:
# LR scheduler
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
)
print(f"lr_scheduler: {lr_scheduler}")

print(f"Available schedulers: {torch.optim.lr_scheduler.__all__}")

lr_scheduler: <torch.optim.lr_scheduler.ReduceLROnPlateau object at 0x16a1956e0>
Available schedulers: ['LambdaLR', 'MultiplicativeLR', 'StepLR', 'MultiStepLR', 'ConstantLR', 'LinearLR', 'ExponentialLR', 'SequentialLR', 'CosineAnnealingLR', 'ChainedScheduler', 'ReduceLROnPlateau', 'CyclicLR', 'CosineAnnealingWarmRestarts', 'OneCycleLR', 'PolynomialLR', 'LRScheduler']


In [421]:
# Loss function
loss_fn = nn.CrossEntropyLoss()
print(f"loss_fn: {loss_fn}")

print(f"Available loss functions: {torch.nn.modules.loss.__all__}")

loss_fn: CrossEntropyLoss()
Available loss functions: ['L1Loss', 'NLLLoss', 'NLLLoss2d', 'PoissonNLLLoss', 'GaussianNLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', 'HingeEmbeddingLoss', 'MultiLabelMarginLoss', 'SmoothL1Loss', 'HuberLoss', 'SoftMarginLoss', 'CrossEntropyLoss', 'MultiLabelSoftMarginLoss', 'CosineEmbeddingLoss', 'MarginRankingLoss', 'MultiMarginLoss', 'TripletMarginLoss', 'TripletMarginWithDistanceLoss', 'CTCLoss']


In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    model.train()

    running_train_loss: float = 0.0
    correct: int = 0
    total: int = 0
    for images, labels in tqdm(dataloader_train, desc="Training", leave=False):
        images: Tensor = images.to(device)
        labels: Tensor = labels.to(device).long()

        optimizer.zero_grad()
        outputs: Tensor = model(images)  # shape: [BATCH_SIZE, 10]

        loss: Tensor = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    avg_train_loss = running_train_loss / len(dataloader_train)
    accuracy = correct / total * 100

    lr_scheduler.step(avg_train_loss)

    print(f"Train Loss: {avg_train_loss:.4f} | Accuracy: {accuracy:.2f}%")

    # Manually adjust learning rate
    if epoch % 2 == 0:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.9
            print(f"New learning rate: {param_group['lr']:.6f}")


Epoch 1/10


                                                            

Train Loss: 0.2483 | Accuracy: 92.53%
New learning rate: 0.009000

Epoch 2/10


                                                            

Train Loss: 0.1309 | Accuracy: 96.10%

Epoch 3/10


                                                            

Train Loss: 0.1171 | Accuracy: 96.66%
New learning rate: 0.008100

Epoch 4/10


                                                            

Train Loss: 0.0924 | Accuracy: 97.22%

Epoch 5/10


                                                            

Train Loss: 0.0902 | Accuracy: 97.27%
New learning rate: 0.007290

Epoch 6/10


                                                            

Train Loss: 0.0725 | Accuracy: 97.86%

Epoch 7/10


                                                            

Train Loss: 0.0669 | Accuracy: 98.01%
New learning rate: 0.006561

Epoch 8/10


                                                            

Train Loss: 0.0583 | Accuracy: 98.24%

Epoch 9/10


                                                            

Train Loss: 0.0605 | Accuracy: 98.19%
New learning rate: 0.005905

Epoch 10/10


                                                            

Train Loss: 0.0427 | Accuracy: 98.72%


