In [None]:
"""
Notes from the paper:

The Lenet paper summrizes previous work done on character recognition, including SGD, Convolutions and Neural Networks.

Goal:
Character Recognition, by building a character classifier

Dataset Used:
MNIST

Method Used:
Build a Convolution based Feature Extractor, followed by a Fully Connected Neural Network Classifier

Architecture:
Input (32, 32)
-> Convolution (5x5, 6 filters) (6, 28, 28)
-> Sub Sampling (6, 14, 14)
-> Sigmoid
-> Convolution (5x5, 16 filters) (16, 10, 10)
-> Sub Sampling (16, 5, 5)
-> Sigmoid
-> Convolution (5x5, 120 filters) (120, 1, 1)
-> Sigmoid
-> Fully Connected (120)
-> Sigmoid
-> Fully Connected (84)
-> Sigmoid
-> RBF (10)

Training Parameters / Hyperparamters:
- Important to note detail is the the dataset is 28 x 28. Padding is added to the image to better extract stroke-endpoints on the edges on the images
- Image is norrmalized to have zero mean and equal variance.
- Sumsampling means, in a 2x2 pixel area, all values are arred, multiplied by a weight and added to a bias. This IS NOT THE SAME AS MAX POOLING.
- Stride for subsampling is 2, so that the output is half the size of the input and the area of sub-sampling is non overlaping
- S2 and C3 have some weird associations which I will ignore probably
- The last layer is a layer of RBF units instead of neurons. The Paper explains, "In probabilistic terms, the RBF output can be interpreted as the unnormalized negative loglikelihood of a Gaussian distribution in the space of configurations of layer F6"
- Loss function is MSE, but they modify it and make it scary. We will just just MSE loss

- Ran three Experiments
- 1. Images were centered into a 28 x 28 image and then padded to 32 x 32. This was called the "Regular" dataset
- 2. Images were deslanted and cropped into a 20 x 20 image. This was called the "Deslanted" dataset
- 3. Images were centered into a 16 x 16 image. The Author forgot to name this dataset like it was his middle child.

I will only be using the Regular Dataset.

- Trained for 20 epochs
- 60k training images, 10k test images
- Learning Rate was 0.0005 for the first 2 epochs, and 0.0002 for the next 3, 0.0001 fir the next 3, 0.00005 for the next 4 and 0.00001 thereafter.
- Author obeserver no over-fitting? Is he Jesus? The Author says this is because the learning rates are too high? LMFAO
- 

Metrics Defined:
Error Rate
- Number of misclassified test samples / Total number of test samples

Results:

"""

In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Input (32, 32)
# -> Convolution (5x5, 6 filters) (6, 28, 28)
# -> Sub Sampling (6, 14, 14)
# -> Sigmoid
# -> Convolution (5x5, 16 filters) (16, 10, 10)
# -> Sub Sampling (16, 5, 5)
# -> Sigmoid
# -> Convolution (5x5, 120 filters) (120, 1, 1)
# -> Sigmoid
# -> Fully Connected (120
# )
# -> Sigmoid
# -> Fully Connected (84)
# -> Sigmoid
# -> RBF (10)

# Since the Sub Sampling as mentioned by Yunn LeCun is not the same as Average Pooling, I will implement it as a trainable layer
class TrainableAvgPool2d(nn.Module):
    def __init__(self, kernel_size, stride=None):
        super(TrainableAvgPool2d, self).__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size, stride=stride)
        # Learnable weight and bias
        self.weight = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = self.avg_pool(x)
        return self.weight * x + self.bias

# Well guess what, RBFs are also extinct. So I gotta implement my own
class RBFLayer(nn.Module):
    def __init__(self, input_dim, output_dim, gamma=1.0):
        super(RBFLayer, self).__init__()
        # Learnable RBF centers with shape [output_dim, input_dim]
        self.centers = nn.Parameter(torch.randn(output_dim, input_dim))
        self.gamma = gamma

    def forward(self, x):
        # Compute squared Euclidean distance between input and RBF centers
        # x shape: [batch_size, input_dim], centers shape: [output_dim, input_dim]
        dists = torch.cdist(x.unsqueeze(1), self.centers.unsqueeze(0)) ** 2
        # Apply Gaussian function to get RBF output
        rbf_out = torch.exp(-self.gamma * dists.squeeze(1))
        return rbf_out

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = TrainableAvgPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = TrainableAvgPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.rbf = RBFLayer(84, 10)
        
        
    def forward(self, x):
        x = F.sigmoid(self.s2(self.c1(x)))
        x = self.s4(self.c3(x))
        x = self.c5(x)
        x = x.view(-1, 120)
        x = self.f6(x)
        x = self.rbf(x)
        return x

In [65]:
# Let's test the model class

model = Lenet5()
image = torch.randn(5, 1, 28, 28)

output = model(image)
output.shape

torch.Size([5, 10])

In [66]:
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, one_hot(torch.tensor(label), num_classes=10).float()


# normalize the image so that black corresponds to 0.1 and white corresponds to 1.175
# These numbers are taken directly from the paper and Author claims these give us a mean of 0 and std of 1
def custom_normalize(img: torch.Tensor, target_min = -0.1, target_max = 1.175) -> torch.Tensor:
    # Assuming img is a torch tensor with pixel values in [0, 255]
    img = img.float()  # Ensure the tensor is float for proper scaling
    img = (img - 0) * (target_max - target_min) / (255 - 0) + target_min
    return img
    

transforms = Compose([
    ToTensor(),
    Lambda(custom_normalize)
])

mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

batch_size = 64
train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)

In [67]:
x, y = mnist_train_dataset[0]
print(x.shape, y.shape)

torch.Size([1, 28, 28]) torch.Size([10])


In [68]:
from torch.optim.lr_scheduler import LambdaLR

# Custom LR schedule as per the paper
# We set an initial Learning RAte and the scheduler will adjust it based on the epoch
def lr_lambda(epoch):
    if epoch < 2:
        return 1.0  # 0.0005 (initial LR)
    elif epoch < 5:
        return 0.4  # 0.0002
    elif epoch < 8:
        return 0.2  # 0.0001
    elif epoch < 12:
        return 0.1  # 0.00005
    else:
        return 0.02  # 0.00001

In [69]:
from torch.optim import SGD

# Hyperparameters
model = Lenet5().cuda()
epochs = 20
optimizer = SGD(model.parameters(), lr=0.0005)
scheduler = LambdaLR(optimizer, lr_lambda)
loss = nn.MSELoss()

In [None]:
def accuracy(y_pred, y_true):
    # Convert y_true from one-hot to class indices
    y_true_classes = y_true.argmax(dim=1)
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true_classes).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")

    scheduler.step()


Output:
Epoch 1: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 1: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>
Epoch 2: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 2: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>
Epoch 3: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 3: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>
Epoch 4: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 4: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>
Epoch 5: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 5: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>
Epoch 6: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 6: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>
Epoch 7: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 7: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>
Epoch 8: Test Loss = 0.100000, Train Loss = 0.100000 <br>
Epoch 8: Test Accuracy = 0.0978, Train Accuracy = 0.0987 <br>


# Improvemnts

In [None]:
# The Paper replication clearly failed. The main reason being that pytorch modules assume modern standards are bsing used. And also because of vanishing gradients

### Improvemnt 1: Increase the learning rate and remove the decay

In [75]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class TrainableAvgPool2d(nn.Module):
    def __init__(self, kernel_size, stride=None):
        super(TrainableAvgPool2d, self).__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size, stride=stride)
        self.weight = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = self.avg_pool(x)
        return self.weight * x + self.bias

class RBFLayer(nn.Module):
    def __init__(self, input_dim, output_dim, gamma=1.0):
        super(RBFLayer, self).__init__()
        self.centers = nn.Parameter(torch.randn(output_dim, input_dim))
        self.gamma = gamma

    def forward(self, x):
        dists = torch.cdist(x.unsqueeze(1), self.centers.unsqueeze(0)) ** 2
        rbf_out = torch.exp(-self.gamma * dists.squeeze(1))
        return rbf_out

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = TrainableAvgPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = TrainableAvgPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.rbf = RBFLayer(84, 10)
        
        
    def forward(self, x):
        x = F.sigmoid(self.s2(self.c1(x)))
        x = self.s4(self.c3(x))
        x = self.c5(x)
        x = x.view(-1, 120)
        x = self.f6(x)
        x = self.rbf(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, one_hot(torch.tensor(label), num_classes=10).float()


def custom_normalize(img: torch.Tensor, target_min = -0.1, target_max = 1.175) -> torch.Tensor:
    img = img.float()  # Ensure the tensor is float for proper scaling
    img = (img - 0) * (target_max - target_min) / (255 - 0) + target_min
    return img
    
batch_size = 64

transforms = Compose([
    ToTensor(),
    Lambda(custom_normalize)
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.01)
loss = nn.MSELoss()

def accuracy(y_pred, y_true):
    # Convert y_true from one-hot to class indices
    y_true_classes = y_true.argmax(dim=1)
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true_classes).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")

Epoch 1: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 1: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 2: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 2: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 3: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 3: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 4: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 4: Test Accuracy = 0.1136, Train Accuracy = 0.1123
Epoch 5: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 5: Test Accuracy = 0.1136, Train Accuracy = 0.1124


### Improvement #2 : Update the model architecture <br>
- Replace Sigmod with ReLU activation function
- Replace the RBF layer with a fully connected layer
- Add activation on the last layer
- Replace the trainable Average pool with a maxpool

In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = nn.MaxPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = nn.MaxPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.f7 = nn.Linear(84, 10)
        
        
    def forward(self, x):
        x = F.relu(self.s2(self.c1(x)))
        x = F.relu(self.s4(self.c3(x)))
        x = F.relu(self.c5(x))
        x = x.view(-1, 120)
        x = F.relu(self.f6(x))
        x = self.f7(x)
        return F.sigmoid(x)
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, one_hot(torch.tensor(label), num_classes=10).float()


def custom_normalize(img: torch.Tensor, target_min = -0.1, target_max = 1.175) -> torch.Tensor:
    img = img.float()  # Ensure the tensor is float for proper scaling
    img = (img - 0) * (target_max - target_min) / (255 - 0) + target_min
    return img
    
batch_size = 64

transforms = Compose([
    ToTensor(),
    Lambda(custom_normalize)
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.01)
loss = nn.MSELoss()

def accuracy(y_pred, y_true):
    # Convert y_true from one-hot to class indices
    y_true_classes = y_true.argmax(dim=1)
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true_classes).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")

Epoch 1: Test Loss = 0.205365, Train Loss = 0.231430
Epoch 1: Test Accuracy = 0.1011, Train Accuracy = 0.1022
Epoch 2: Test Loss = 0.090314, Train Loss = 0.141966
Epoch 2: Test Accuracy = 0.0983, Train Accuracy = 0.0993
Epoch 3: Test Loss = 0.089986, Train Loss = 0.090044
Epoch 3: Test Accuracy = 0.1136, Train Accuracy = 0.1015
Epoch 4: Test Loss = 0.089972, Train Loss = 0.089980
Epoch 4: Test Accuracy = 0.1136, Train Accuracy = 0.1123
Epoch 5: Test Loss = 0.089969, Train Loss = 0.089974
Epoch 5: Test Accuracy = 0.1136, Train Accuracy = 0.1124


### Improvement #3: Change the Loss to CrossEntropyLoss <br>
To do that, Remove the activation layer on the last layer of the model

In [82]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = nn.MaxPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = nn.MaxPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.f7 = nn.Linear(84, 10)
        
        
    def forward(self, x):
        x = F.relu(self.s2(self.c1(x)))
        x = F.relu(self.s4(self.c3(x)))
        
        x = F.relu(self.c5(x))
        x = x.view(-1, 120)
        x = F.relu(self.f6(x))
        x = self.f7(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, label


def custom_normalize(img: torch.Tensor, target_min = -0.1, target_max = 1.175) -> torch.Tensor:
    img = img.float()  # Ensure the tensor is float for proper scaling
    img = (img - 0) * (target_max - target_min) / (255 - 0) + target_min
    return img
    
batch_size = 64

transforms = Compose([
    ToTensor(),
    Lambda(custom_normalize)
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.01)
loss = nn.CrossEntropyLoss()

def accuracy(y_pred, y_true):
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")

Epoch 1: Test Loss = 2.301208, Train Loss = 2.302107
Epoch 1: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 2: Test Loss = 2.301031, Train Loss = 2.301321
Epoch 2: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 3: Test Loss = 2.301044, Train Loss = 2.301251
Epoch 3: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 4: Test Loss = 2.301025, Train Loss = 2.301240
Epoch 4: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 5: Test Loss = 2.301021, Train Loss = 2.301231
Epoch 5: Test Accuracy = 0.1136, Train Accuracy = 0.1124


### Improvement #4: Change the normalization to 0-1 instead of the whacky number.<br>
Not normalizing to mean 0 and std 1, because all rely layers zero all negative outputs anyways.

In [83]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = nn.MaxPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = nn.MaxPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.f7 = nn.Linear(84, 10)
        
        
    def forward(self, x):
        x = F.relu(self.s2(self.c1(x)))
        x = F.relu(self.s4(self.c3(x)))
        
        x = F.relu(self.c5(x))
        x = x.view(-1, 120)
        x = F.relu(self.f6(x))
        x = self.f7(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, label

batch_size = 64

transforms = Compose([
    ToTensor()
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.01)
loss = nn.CrossEntropyLoss()

def accuracy(y_pred, y_true):
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")

Epoch 1: Test Loss = 2.179829, Train Loss = 2.282292
Epoch 1: Test Accuracy = 0.4022, Train Accuracy = 0.1871
Epoch 2: Test Loss = 0.322339, Train Loss = 0.796592
Epoch 2: Test Accuracy = 0.9026, Train Accuracy = 0.7783
Epoch 3: Test Loss = 0.185626, Train Loss = 0.273590
Epoch 3: Test Accuracy = 0.9455, Train Accuracy = 0.9151
Epoch 4: Test Loss = 0.139431, Train Loss = 0.184453
Epoch 4: Test Accuracy = 0.9570, Train Accuracy = 0.9439
Epoch 5: Test Loss = 0.115000, Train Loss = 0.142690
Epoch 5: Test Accuracy = 0.9645, Train Accuracy = 0.9564


# Improvement #5: Use Adam optimizer

In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD, Adam

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = nn.MaxPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = nn.MaxPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.f7 = nn.Linear(84, 10)
        
        
    def forward(self, x):
        x = F.relu(self.s2(self.c1(x)))
        x = F.relu(self.s4(self.c3(x)))
        
        x = F.relu(self.c5(x))
        x = x.view(-1, 120)
        x = F.relu(self.f6(x))
        x = self.f7(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, label

batch_size = 64

transforms = Compose([
    ToTensor()
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = Adam(model.parameters(), lr=0.0001)
loss = nn.CrossEntropyLoss()

def accuracy(y_pred, y_true):
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")

Epoch 1: Test Loss = 0.364424, Train Loss = 0.868267
Epoch 1: Test Accuracy = 0.8932, Train Accuracy = 0.7449
Epoch 2: Test Loss = 0.234410, Train Loss = 0.304749
Epoch 2: Test Accuracy = 0.9304, Train Accuracy = 0.9092
Epoch 3: Test Loss = 0.176403, Train Loss = 0.217226
Epoch 3: Test Accuracy = 0.9460, Train Accuracy = 0.9347
Epoch 4: Test Loss = 0.141677, Train Loss = 0.170702
Epoch 4: Test Accuracy = 0.9583, Train Accuracy = 0.9480
Epoch 5: Test Loss = 0.117975, Train Loss = 0.141245
Epoch 5: Test Accuracy = 0.9625, Train Accuracy = 0.9575


### Experiment, If we keep everything else the same and only change the normalization in the first code, Let's see what happens

In [86]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class TrainableAvgPool2d(nn.Module):
    def __init__(self, kernel_size, stride=None):
        super(TrainableAvgPool2d, self).__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size, stride=stride)
        # Learnable weight and bias
        self.weight = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = self.avg_pool(x)
        return self.weight * x + self.bias

# Well guess what, RBFs are also extinct. So I gotta implement my own
class RBFLayer(nn.Module):
    def __init__(self, input_dim, output_dim, gamma=1.0):
        super(RBFLayer, self).__init__()
        # Learnable RBF centers with shape [output_dim, input_dim]
        self.centers = nn.Parameter(torch.randn(output_dim, input_dim))
        self.gamma = gamma

    def forward(self, x):
        # Compute squared Euclidean distance between input and RBF centers
        # x shape: [batch_size, input_dim], centers shape: [output_dim, input_dim]
        dists = torch.cdist(x.unsqueeze(1), self.centers.unsqueeze(0)) ** 2
        # Apply Gaussian function to get RBF output
        rbf_out = torch.exp(-self.gamma * dists.squeeze(1))
        return rbf_out

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = TrainableAvgPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = TrainableAvgPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.rbf = RBFLayer(84, 10)
        
        
    def forward(self, x):
        x = F.sigmoid(self.s2(self.c1(x)))
        x = self.s4(self.c3(x))
        x = self.c5(x)
        x = x.view(-1, 120)
        x = self.f6(x)
        x = self.rbf(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, one_hot(torch.tensor(label), num_classes=10).float()

def lr_lambda(epoch):
    if epoch < 2:
        return 1.0  # 0.0005 (initial LR)
    elif epoch < 5:
        return 0.4  # 0.0002
    elif epoch < 8:
        return 0.2  # 0.0001
    elif epoch < 12:
        return 0.1  # 0.00005
    else:
        return 0.02  # 0.00001

    
batch_size = 64

transforms = Compose([
    ToTensor()
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.0005)
scheduler = LambdaLR(optimizer, lr_lambda)
loss = nn.MSELoss()

def accuracy(y_pred, y_true):
    # Convert y_true from one-hot to class indices
    y_true_classes = y_true.argmax(dim=1)
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true_classes).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")
    scheduler.step()

Epoch 1: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 1: Test Accuracy = 0.1026, Train Accuracy = 0.1044
Epoch 2: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 2: Test Accuracy = 0.1026, Train Accuracy = 0.1044
Epoch 3: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 3: Test Accuracy = 0.1026, Train Accuracy = 0.1044
Epoch 4: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 4: Test Accuracy = 0.1026, Train Accuracy = 0.1044
Epoch 5: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 5: Test Accuracy = 0.1026, Train Accuracy = 0.1044


##### Maybe try increasing learning rate?

In [88]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class TrainableAvgPool2d(nn.Module):
    def __init__(self, kernel_size, stride=None):
        super(TrainableAvgPool2d, self).__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size, stride=stride)
        # Learnable weight and bias
        self.weight = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = self.avg_pool(x)
        return self.weight * x + self.bias

# Well guess what, RBFs are also extinct. So I gotta implement my own
class RBFLayer(nn.Module):
    def __init__(self, input_dim, output_dim, gamma=1.0):
        super(RBFLayer, self).__init__()
        # Learnable RBF centers with shape [output_dim, input_dim]
        self.centers = nn.Parameter(torch.randn(output_dim, input_dim))
        self.gamma = gamma

    def forward(self, x):
        # Compute squared Euclidean distance between input and RBF centers
        # x shape: [batch_size, input_dim], centers shape: [output_dim, input_dim]
        dists = torch.cdist(x.unsqueeze(1), self.centers.unsqueeze(0)) ** 2
        # Apply Gaussian function to get RBF output
        rbf_out = torch.exp(-self.gamma * dists.squeeze(1))
        return rbf_out

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = TrainableAvgPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = TrainableAvgPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.rbf = RBFLayer(84, 10)
        
        
    def forward(self, x):
        x = F.sigmoid(self.s2(self.c1(x)))
        x = self.s4(self.c3(x))
        x = self.c5(x)
        x = x.view(-1, 120)
        x = self.f6(x)
        x = self.rbf(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, one_hot(torch.tensor(label), num_classes=10).float()

def lr_lambda(epoch):
    if epoch < 2:
        return 1.0  # 0.0005 (initial LR)
    elif epoch < 5:
        return 0.4  # 0.0002
    elif epoch < 8:
        return 0.2  # 0.0001
    elif epoch < 12:
        return 0.1  # 0.00005
    else:
        return 0.02  # 0.00001

    
batch_size = 64

transforms = Compose([
    ToTensor()
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.05)
scheduler = LambdaLR(optimizer, lr_lambda)
loss = nn.MSELoss()

def accuracy(y_pred, y_true):
    # Convert y_true from one-hot to class indices
    y_true_classes = y_true.argmax(dim=1)
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true_classes).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")
    scheduler.step()

Epoch 1: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 1: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 2: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 2: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 3: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 3: Test Accuracy = 0.1136, Train Accuracy = 0.1124
Epoch 4: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 4: Test Accuracy = 0.1136, Train Accuracy = 0.1123
Epoch 5: Test Loss = 0.100000, Train Loss = 0.100000
Epoch 5: Test Accuracy = 0.1136, Train Accuracy = 0.1123


##### How about replace MSE Loss with Cross Entropy Loss?

In [89]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class TrainableAvgPool2d(nn.Module):
    def __init__(self, kernel_size, stride=None):
        super(TrainableAvgPool2d, self).__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size, stride=stride)
        # Learnable weight and bias
        self.weight = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = self.avg_pool(x)
        return self.weight * x + self.bias

# Well guess what, RBFs are also extinct. So I gotta implement my own
class RBFLayer(nn.Module):
    def __init__(self, input_dim, output_dim, gamma=1.0):
        super(RBFLayer, self).__init__()
        # Learnable RBF centers with shape [output_dim, input_dim]
        self.centers = nn.Parameter(torch.randn(output_dim, input_dim))
        self.gamma = gamma

    def forward(self, x):
        # Compute squared Euclidean distance between input and RBF centers
        # x shape: [batch_size, input_dim], centers shape: [output_dim, input_dim]
        dists = torch.cdist(x.unsqueeze(1), self.centers.unsqueeze(0)) ** 2
        # Apply Gaussian function to get RBF output
        rbf_out = torch.exp(-self.gamma * dists.squeeze(1))
        return rbf_out

class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = TrainableAvgPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = TrainableAvgPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.rbf = RBFLayer(84, 10)
        
        
    def forward(self, x):
        x = F.sigmoid(self.s2(self.c1(x)))
        x = self.s4(self.c3(x))
        x = self.c5(x)
        x = x.view(-1, 120)
        x = self.f6(x)
        x = self.rbf(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, label

def lr_lambda(epoch):
    if epoch < 2:
        return 1.0  # 0.0005 (initial LR)
    elif epoch < 5:
        return 0.4  # 0.0002
    elif epoch < 8:
        return 0.2  # 0.0001
    elif epoch < 12:
        return 0.1  # 0.00005
    else:
        return 0.02  # 0.00001

    
batch_size = 64

transforms = Compose([
    ToTensor()
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.05)
scheduler = LambdaLR(optimizer, lr_lambda)
loss = nn.CrossEntropyLoss()

def accuracy(y_pred, y_true):
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")
    scheduler.step()

Epoch 1: Test Loss = 2.302585, Train Loss = 2.302585
Epoch 1: Test Accuracy = 0.0894, Train Accuracy = 0.0904
Epoch 2: Test Loss = 2.302585, Train Loss = 2.302585
Epoch 2: Test Accuracy = 0.0894, Train Accuracy = 0.0903
Epoch 3: Test Loss = 2.302585, Train Loss = 2.302585
Epoch 3: Test Accuracy = 0.0894, Train Accuracy = 0.0904
Epoch 4: Test Loss = 2.302585, Train Loss = 2.302585
Epoch 4: Test Accuracy = 0.0894, Train Accuracy = 0.0904
Epoch 5: Test Loss = 2.302585, Train Loss = 2.302585
Epoch 5: Test Accuracy = 0.0894, Train Accuracy = 0.0903


##### Remioving the RBF Layer

In [90]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Lambda, Compose
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import SGD

class TrainableAvgPool2d(nn.Module):
    def __init__(self, kernel_size, stride=None):
        super(TrainableAvgPool2d, self).__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size, stride=stride)
        # Learnable weight and bias
        self.weight = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = self.avg_pool(x)
        return self.weight * x + self.bias


class Lenet5(nn.Module):
    def __init__(self):
        super(Lenet5, self).__init__()
        self.c1 = nn.Conv2d(1, 6, 5, stride=1, padding=2,) # 1, 28, 28 -> 6, 28, 28
        self.s2 = TrainableAvgPool2d(2, stride=2) # 6, 14, 14
        self.c3 = nn.Conv2d(6, 16, 5, stride=1, padding=0)
        self.s4 = TrainableAvgPool2d(2, stride=2)
        self.c5 = nn.Conv2d(16, 120, 5, stride=1, padding=0)
        self.f6 = nn.Linear(120, 84)
        self.fc7 = nn.Linear(84, 10)
        
        
    def forward(self, x):
        x = F.sigmoid(self.s2(self.c1(x)))
        x = self.s4(self.c3(x))
        x = self.c5(x)
        x = x.view(-1, 120)
        x = self.f6(x)
        x = self.fc7(x)
        return x
    

class MNISTDataset(Dataset):
    def __init__(self, mnist_dataset, transform=None):
        self.mnist_dataset = mnist_dataset
        self.transform = transform

    def __len__(self):
        return len(self.mnist_dataset)

    def __getitem__(self, idx):
        image, label = self.mnist_dataset[idx]
        if self.transform:
            image = self.transform(image)   
        return image, label

def lr_lambda(epoch):
    if epoch < 2:
        return 1.0  # 0.0005 (initial LR)
    elif epoch < 5:
        return 0.4  # 0.0002
    elif epoch < 8:
        return 0.2  # 0.0001
    elif epoch < 12:
        return 0.1  # 0.00005
    else:
        return 0.02  # 0.00001

    
batch_size = 64

transforms = Compose([
    ToTensor()
])

mnist_train = MNIST(root='./data', download=True, train=True)
mnist_test = MNIST(root='./data', download=True, train=False)


mnist_train_dataset = MNISTDataset(mnist_train, transforms)
mnist_test_dataset = MNISTDataset(mnist_test, transforms)

train_loader = DataLoader(mnist_train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(mnist_test_dataset, batch_size=batch_size, shuffle=False)


# Hyperparameters
model = Lenet5().cuda()
epochs = 5
optimizer = SGD(model.parameters(), lr=0.05)
scheduler = LambdaLR(optimizer, lr_lambda)
loss = nn.CrossEntropyLoss()

def accuracy(y_pred, y_true):
    y_pred_classes = y_pred.argmax(dim=1)
    return (y_pred_classes == y_true).float().mean()

for epoch in range(epochs):
    model.train()
    train_loss_val = 0.0
    train_acc_val = 0.0

    # Training loop
    for images, labels in train_loader:
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        train_loss = loss(outputs, labels)
        train_loss.backward()
        optimizer.step()
        
        train_loss_val += train_loss.item()
        train_acc_val += accuracy(outputs, labels).item()

    train_loss_val /= len(train_loader)
    train_acc_val /= len(train_loader)

    # Evaluation loop
    model.eval()
    test_loss_val = 0.0
    test_acc_val = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.cuda(), labels.cuda()
            outputs = model(images)
            test_loss_val += loss(outputs, labels).item()
            test_acc_val += accuracy(outputs, labels).item()

    test_loss_val /= len(test_loader)
    test_acc_val /= len(test_loader)

    print(f"Epoch {epoch + 1}: Test Loss = {test_loss_val:.6f}, Train Loss = {train_loss_val:.6f}")
    print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc_val:.4f}, Train Accuracy = {train_acc_val:.4f}")
    scheduler.step()

Epoch 1: Test Loss = 0.351698, Train Loss = 1.139313
Epoch 1: Test Accuracy = 0.8866, Train Accuracy = 0.6106
Epoch 2: Test Loss = 0.306960, Train Loss = 0.329711
Epoch 2: Test Accuracy = 0.9095, Train Accuracy = 0.9013
Epoch 3: Test Loss = 0.220299, Train Loss = 0.253536
Epoch 3: Test Accuracy = 0.9373, Train Accuracy = 0.9260
Epoch 4: Test Loss = 0.169878, Train Loss = 0.205451
Epoch 4: Test Accuracy = 0.9510, Train Accuracy = 0.9385
Epoch 5: Test Loss = 0.134443, Train Loss = 0.164429
Epoch 5: Test Accuracy = 0.9597, Train Accuracy = 0.9517


##### Yup. So the problem is the RBF Layer