In [None]:
# Import necessary libraries and packages
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from typing import Type
import numpy as np
from torch.utils.data import Subset

In [None]:
# Global parameters shared by every experiment in this notebook
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
num_epochs = 10  # full passes over the training loader per experiment
batch_size = 64  # samples per mini-batch for the data loaders

# 1. Train the base model

In [None]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by batch normalization.

    The block input is added back to the output of the second convolution
    before the final ReLU. When the stride is not 1 or the channel count
    changes, the skip path uses a 1x1 convolution plus batch normalization
    so that both tensors have the same shape; otherwise it is an empty
    ``nn.Sequential`` (identity).

    Args:
        in_planes: number of input channels.
        planes: number of channels produced by both 3x3 convolutions.
        stride: stride of the first convolution (and of the projection shortcut).
    """

    # Output channels are ``expansion * planes``
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path: only the first convolution may change the spatial size
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Skip path: project only when the input shape would not match the main path
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        hidden += self.shortcut(x)
        return F.relu(hidden)

class ResNet(nn.Module):
    """ResNet-style classifier for 3-channel images.

    Layout: 7x7 stride-2 convolution + batch norm + ReLU, 3x3 stride-2 max
    pooling, four residual stages with 64/128/256/512 planes, global average
    pooling and a linear classifier.

    Args:
        block: residual block class; must expose an ``expansion`` attribute and
            accept ``(in_planes, planes, stride)``.
        num_blocks: sequence whose first four entries give the number of blocks per stage.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count fed to the next block; updated by _make_layer
        self.in_planes = 64

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual stages layer1..layer4: (planes, stride of the first block)
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage_idx, (planes, first_stride) in enumerate(stage_specs):
            stage = self._make_layer(block, planes, num_blocks[stage_idx], stride=first_stride)
            setattr(self, f'layer{stage_idx + 1}', stage)

        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``, the rest use 1."""
        stage_blocks = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage_blocks.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        out = self.maxpool(F.relu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)

        # Global average pooling, then flatten to (batch, features) for the classifier
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.shape[0], -1)
        return self.linear(out)

def ResNet18():
    """Build a `ResNet` from `BasicBlock` with two blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage)

In [None]:
# Load and normalize the SVHN dataset
# Each RGB channel is normalized with mean 0.5 and std 0.5 after conversion to a tensor
transform_svhn = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Training split (shuffled every epoch)
trainset_svhn = datasets.SVHN(root='./data', split='train', transform=transform_svhn, download=True)
trainloader_svhn = DataLoader(
    trainset_svhn,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

# Test split (fixed order)
testset_svhn = datasets.SVHN(root='./data', split='test', transform=transform_svhn, download=True)
testloader_svhn = DataLoader(
    testset_svhn,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)



# Load and normalize the MNIST dataset so it can be fed to a model trained on SVHN
transform_mnist = transforms.Compose([
    # Resize to be compatible with SVHN
    transforms.Resize((32, 32)),
    # Convert 1 channel grayscale to 3 channel grayscale
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    # Single mean/std value; presumably broadcast over all three channels by torchvision
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Test split only (fixed order)
testset_mnist = datasets.MNIST(root='./data', train=False, transform=transform_mnist, download=True)
testloader_mnist = DataLoader(
    testset_mnist,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

In [None]:
def train(model, trainloader, criterion, optimizer, device, epochs):
    """Train `model` for `epochs` passes over `trainloader` and plot the mean loss per epoch.

    Args:
        model: network to optimise; switched to training mode once before the first epoch.
        trainloader: iterable of ``(inputs, labels)`` batches that also supports ``len()``.
        criterion: callable mapping ``(outputs, labels)`` to a scalar loss tensor.
        optimizer: optimizer whose ``zero_grad()`` and ``step()`` are called once per batch.
        device: device every batch is moved to before the forward pass.
        epochs: number of full passes over `trainloader`.

    Returns:
        None. The mean batch loss is printed after every epoch and the loss
        curve is shown with matplotlib once training has finished.
    """
    model.train()
    epoch_losses = []  # mean batch loss of every finished epoch

    for epoch_idx in range(epochs):
        loss_sum = 0.0
        for inputs, labels in trainloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Standard optimisation step: reset gradients, forward, backward, update
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()

        # Report the mean of the per-batch losses for this epoch
        mean_loss = loss_sum / len(trainloader)
        print(f'[{epoch_idx + 1}] loss: {mean_loss:.3f}')
        epoch_losses.append(mean_loss)

    # Plot the training loss curve (one point per epoch)
    plt.figure(figsize=(10, 5))
    plt.plot(epoch_losses, label='Training Loss')
    plt.title('Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()



def evaluate(model, testloader, device):
    """Return the top-1 accuracy of `model` on `testloader`, in percent.

    Args:
        model: network to score; switched to evaluation mode and left in that mode.
        testloader: iterable of ``(images, labels)`` batches.
        device: device every batch is moved to before the forward pass.

    Returns:
        ``100 * correct / total`` as a float, where a sample counts as correct
        when the index of its largest output equals its label.
    """
    model.eval()
    num_correct = 0
    num_seen = 0

    # Gradients are not needed for scoring
    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images)
            _, predicted = torch.max(logits, 1)

            num_seen += labels.size(0)
            num_correct += (predicted == labels).sum().item()

    return 100 * num_correct / num_seen

In [None]:
# Setup: fresh ResNet-18 on the target device, cross-entropy loss, SGD with momentum
net = ResNet18().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)

# Train on the batches served by `trainloader_svhn`
train(net, trainloader_svhn, criterion, optimizer, device, num_epochs)

In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')

# 2. Enhance the generalization

## 2.1 Model architecture modification

In [None]:
class BasicBlockNoBN(nn.Module):
    """Residual block with two 3x3 convolutions and no batch normalization.

    Every convolution carries a bias term because no normalization layer
    follows it. When the stride is not 1 or the channel count changes, the
    skip path is a single 1x1 convolution; otherwise it is an empty
    ``nn.Sequential`` (identity).

    Args:
        in_planes: number of input channels.
        planes: number of channels produced by both 3x3 convolutions.
        stride: stride of the first convolution (and of the projection shortcut).
    """

    # Output channels are ``expansion * planes``
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path (bias enabled, batch normalization removed)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=True)

        # Skip path: project only when the input shape would not match the main path
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=True),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(conv2(relu(conv1(x))) + shortcut(x))``."""
        hidden = self.conv2(F.relu(self.conv1(x)))
        hidden += self.shortcut(x)
        return F.relu(hidden)

class ResNetNoBN(nn.Module):
    """ResNet-style classifier for 3-channel images without batch normalization in the stem.

    Layout: 7x7 stride-2 convolution (with bias) + ReLU, 3x3 stride-2 max
    pooling, four residual stages with 64/128/256/512 planes, global average
    pooling and a linear classifier.

    Args:
        block: residual block class; must expose an ``expansion`` attribute and
            accept ``(in_planes, planes, stride)``.
        num_blocks: sequence whose first four entries give the number of blocks per stage.
        num_classes: size of the classifier output.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count fed to the next block; updated by _make_layer
        self.in_planes = 64

        # Stem: bias enabled because no batch normalization follows the convolution
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual stages layer1..layer4: (planes, stride of the first block)
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage_idx, (planes, first_stride) in enumerate(stage_specs):
            stage = self._make_layer(block, planes, num_blocks[stage_idx], stride=first_stride)
            setattr(self, f'layer{stage_idx + 1}', stage)

        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``, the rest use 1."""
        stage_blocks = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage_blocks.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        out = self.maxpool(F.relu(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)

        # Global average pooling, then flatten to (batch, features) for the classifier
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.shape[0], -1)
        return self.linear(out)

def ResNet18NoBN():
    """Build a `ResNetNoBN` from `BasicBlockNoBN` with two blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return ResNetNoBN(BasicBlockNoBN, blocks_per_stage)

In [None]:
# Setup: ResNet-18 without batch normalization, same loss and optimizer settings as the base run
net = ResNet18NoBN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)

# Train on the batches served by `trainloader_svhn`
train(net, trainloader_svhn, criterion, optimizer, device, num_epochs)

In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')

## 2.2. Loss function

You can find several implementations of label smoothing in PyTorch at this link: https://stackoverflow.com/questions/55681502/label-smoothing-in-pytorch

In this code, I check my implementation against PyTorch's built-in label-smoothing option.

In [None]:
class LabelSmoothingCrossEntropyLoss(nn.Module):
    """Cross-entropy loss computed against label-smoothed target distributions.

    The true class receives probability ``1 - smoothing`` and the remaining
    ``smoothing`` mass is split evenly over the other ``num_classes - 1``
    classes. Note that this differs from ``nn.CrossEntropyLoss(label_smoothing=...)``,
    which mixes the one-hot target with a uniform distribution over *all*
    classes, so the two losses are not numerically identical.

    Args:
        smoothing: probability mass moved away from the true class, in ``[0, 1]``.

    Raises:
        ValueError: if ``smoothing`` lies outside ``[0, 1]``.
    """

    def __init__(self, smoothing=0.1):
        super().__init__()
        if not 0.0 <= smoothing <= 1.0:
            raise ValueError(f"smoothing must be in [0, 1], got {smoothing}")
        self.smoothing = smoothing

    def forward(self, input, target):
        """Return the mean smoothed cross-entropy.

        Args:
            input: logits with the class dimension last.
            target: integer class indices, shaped like ``input`` without its last dimension.

        Returns:
            Scalar tensor: the loss averaged over all samples.
        """
        log_probs = F.log_softmax(input, dim=-1)

        # Compute the smoothed labels
        num_classes = input.size(-1)
        # With a single class there is no other class to receive the smoothed mass;
        # guard the division so this case does not raise ZeroDivisionError.
        off_value = self.smoothing / (num_classes - 1) if num_classes > 1 else 0.0
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(off_value)
            # Scatter along the class dimension (last), the same one used by the softmax and the sum
            true_dist.scatter_(-1, target.unsqueeze(-1), 1.0 - self.smoothing)

        # Return the average loss
        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))

In [None]:
# Setup: fresh ResNet-18 trained with label-smoothed cross-entropy
net = ResNet18().to(device)

# PyTorch's built-in label smoothing; every other argument keeps its default value.
# The custom alternative is LabelSmoothingCrossEntropyLoss(smoothing=0.25).
criterion = nn.CrossEntropyLoss(label_smoothing=0.25)

optimizer = optim.SGD(
    net.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)

# Train on the batches served by `trainloader_svhn`
train(net, trainloader_svhn, criterion, optimizer, device, num_epochs)

In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')

## 2.3 Data augmentation

In [None]:
# Define the augmentation pipeline
augmentation_transforms = transforms.Compose([
    # Disabled alternatives, kept for reference:
    #     transforms.RandomRotation(degrees=10),  # Rotate +/- 10 degrees
    #     transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),  # 10% translation

    # Random crop covering 80%-100% of the image area, resized back to 32x32.
    # `scale` is a fraction of the source image area, so an upper bound above 1.0
    # asks for a crop larger than the image: such draws can never fit and only
    # make the sampler retry or fall back to its default crop.
    transforms.RandomResizedCrop(size=(32, 32), scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),  # Randomly change brightness, contrast, and saturation
    transforms.RandomPerspective(distortion_scale=0.5),  # perspective transformations

    # Experiment notes:
    # Top gener: Color + Rotate + Size
    # Top acc: Perspective + Color + Size
    # Intersection: Color + Size
    # All

    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Apply the augmentation pipeline only to the training data.
# NOTE: this rebinds `trainloader_svhn`, so every later cell that uses that name
# trains on augmented data until the original loader is rebuilt.
# num_workers matches the other loaders in this notebook.
trainloader_svhn = torch.utils.data.DataLoader(
    datasets.SVHN(root='./data', split='train', download=True, transform=augmentation_transforms),
    batch_size=batch_size, shuffle=True, num_workers=2)

In [None]:
# Setup: fresh ResNet-18, cross-entropy loss, SGD with momentum
net = ResNet18().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)

# Train on the batches served by `trainloader_svhn`
train(net, trainloader_svhn, criterion, optimizer, device, num_epochs)

In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')

## 2.4 Transfer learning

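This section trains on the original (un-augmented) SVHN data; re-run the earlier data-loading cell first if `trainloader_svhn` still refers to the augmented loader from section 2.3.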

In [None]:
# Setup and repeat Train
# Section 2.3 rebinds `trainloader_svhn` to an augmented loader. Build a plain loader
# from the un-augmented `trainset_svhn` here so this experiment does not depend on
# re-running an earlier cell by hand (Restart & Run All gives the intended result).
trainloader_svhn_plain = DataLoader(trainset_svhn, batch_size=batch_size, shuffle=True, num_workers=2)

# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of the
# `weights=` argument; switch once the minimum torchvision version is confirmed.
net = models.resnet18(pretrained=True)
# Change class numbers to 10
net.fc = nn.Linear(net.fc.in_features, 10)
net = net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)

train(net, trainloader_svhn_plain, criterion, optimizer, device, num_epochs)


In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')

## 2.5 Optimizer

In [None]:
# Setup: fresh ResNet-18 and cross-entropy loss, optimised with Adam instead of SGD
net = ResNet18().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    net.parameters(),
    lr=0.001,
    weight_decay=5e-4,
)

# Train on the batches served by `trainloader_svhn`
train(net, trainloader_svhn, criterion, optimizer, device, num_epochs)


In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')

# 3. Reverse training

## 3.1. Unsupervised
Choose the best setting from the previous step and retrain the model on MNIST instead of SVHN.

We need a new data augmentation pipeline because the MNIST dataset contains single-channel grayscale images, whereas the earlier normalization was set up for three-channel (RGB) images.

In [None]:
# Augmentation pipeline for MNIST training images: first make them look like
# SVHN inputs (32x32, three channels), then apply the same random transforms
# as the SVHN augmentation pipeline.
mnist_augmentation_transforms = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.Grayscale(num_output_channels=3),

    # Random crop covering 80%-100% of the image area, resized back to 32x32.
    # `scale` is a fraction of the source image area, so an upper bound above 1.0
    # asks for a crop larger than the image and can never be satisfied.
    transforms.RandomResizedCrop(size=(32, 32), scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomPerspective(distortion_scale=0.5),

    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [None]:
# Reload the MNIST dataset
# Un-augmented training split (not consumed by the loader below; kept so the name stays available)
trainset_mnist = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform_mnist)

# Apply the augmentation pipeline only to the training data (augmentation from section 2.3).
# Use the global `batch_size` and the same worker count as the other loaders
# instead of a separately hard-coded batch size.
trainloader_mnist = torch.utils.data.DataLoader(
    datasets.MNIST(root='./data', train=True, download=True, transform=mnist_augmentation_transforms),
    batch_size=batch_size, shuffle=True, num_workers=2)

testset_mnist = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform_mnist)
testloader_mnist = DataLoader(testset_mnist, batch_size=batch_size, shuffle=False, num_workers=2)


# Reload the SVHN dataset
testset_svhn = torchvision.datasets.SVHN(root='./data', split='test', download=True, transform=transform_svhn)
testloader_svhn = DataLoader(testset_svhn, batch_size=batch_size, shuffle=False, num_workers=2)

In [None]:
# Setup: combine the settings from section 2 and train on MNIST
# Pre-trained ResNet-18 with a 10-class head (section 2.4)
net = models.resnet18(pretrained=True)
net.fc = nn.Linear(net.fc.in_features, 10)
net = net.to(device)

# Label smoothing via the custom loss (section 2.2)
criterion = LabelSmoothingCrossEntropyLoss(smoothing=0.25)

# Adam optimizer (section 2.5)
optimizer = optim.Adam(
    net.parameters(),
    lr=0.001,
    weight_decay=5e-4,
)

# Train on the batches served by `trainloader_mnist`
train(net, trainloader_mnist, criterion, optimizer, device, num_epochs)

In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')

## 3.2. Supervised

### Fine-tune

In [None]:
# Collect the classifier parameters once; they are the only ones left trainable
head_params = list(net.fc.parameters())

# Freeze every parameter of the network ...
for param in net.parameters():
    param.requires_grad = False

# ... then unfreeze the classifier layer again
for param in head_params:
    param.requires_grad = True

# Optimizer that updates the classifier parameters only
optimizer = optim.Adam(head_params, lr=0.001, weight_decay=5e-4)

In [None]:
# After training on MNIST, fine-tune the classifier using a subset of SVHN test set
num_samples = 500
# Draw the subset from a seeded generator so the same images are selected on every run
subset_rng = np.random.default_rng(0)
indices = subset_rng.choice(len(testset_svhn), size=num_samples, replace=False)
subset_svhn = Subset(testset_svhn, indices)
# NOTE(review): these images come from `testset_svhn`, and the evaluation cell that
# follows scores the model on the full `testloader_svhn`, which still contains them.
# The reported SVHN accuracy is therefore not measured on strictly held-out data.

# Create a DataLoader for the SVHN subset
subset_loader_svhn = DataLoader(subset_svhn, batch_size=batch_size, shuffle=True, num_workers=2)

# Continue training (fine-tuning) on the SVHN subset
train(net, subset_loader_svhn, criterion, optimizer, device, num_epochs)

In [None]:
# Test: score the current `net` on both test loaders, then report the two accuracies
svhn_accuracy = evaluate(net, testloader_svhn, device)
mnist_accuracy = evaluate(net, testloader_mnist, device)

print(f'Accuracy of the network on the SVHN test images: {svhn_accuracy:.2f}%')
print(f'Accuracy of the network on the MNIST test images: {mnist_accuracy:.2f}%')