In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class AlexNet(nn.Module):
    """AlexNet-style CNN scaled down for small images.

    The layer layout follows AlexNet (five convolutions, three max-pools, three
    fully connected layers) but uses 3x3 convolutions with stride 1 and 2x2
    pooling windows. An adaptive average pool fixes the feature map at 6x6
    before the classifier, so the classifier input size does not depend on the
    spatial size of the input image.

    Args:
        num_classes: Number of output classes (length of each logit vector).
        in_channels: Number of channels in the input images. Defaults to 1,
            i.e. single-channel images.

    Note:
        ``forward`` returns raw, unnormalised logits. ``nn.CrossEntropyLoss``
        applies log-softmax internally, so feeding it already-softmaxed values
        squashes the loss and weakens the gradients. Use
        ``F.softmax(logits, dim=1)`` at inference time if probabilities are
        needed; ``argmax`` over the logits gives the same predicted class.
    """

    def __init__(self, num_classes=10, in_channels=1):
        super().__init__()
        self.features = nn.Sequential(
            # in_channels is the channel count of the input image.
            # original has 96 kernels with stride = 4 and size 11
            nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=2),
            nn.ReLU(inplace=True),

            # Max pooling layer with a 2x2 kernel size and a default stride of 2
            # original has size 3x3 and stride 2
            nn.MaxPool2d(kernel_size=2),

            # original has 256 kernels with stride = 1 and size 5
            # 64 here is the output channel count of the previous conv layer
            nn.Conv2d(64, 192, kernel_size=3, padding=2),
            nn.ReLU(inplace=True),

            # original has size 3x3 and stride 2
            nn.MaxPool2d(kernel_size=2),

            # original has 384 kernels with stride = 1 and size 3
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            # original has 384 kernels with stride = 1 and size 3
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            # original has 256 kernels with stride = 1 and size 3
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            # original has size 3x3 and stride 2
            nn.MaxPool2d(kernel_size=2),
        )
        # Adaptively average-pool the feature map to a fixed 6x6 grid so the
        # classifier always receives 256 * 6 * 6 features.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        # Fully connected head: the original AlexNet has two hidden layers of
        # 4096 neurons each, which is kept here.
        self.classifier = nn.Sequential(
            # Dropout randomly zeroes inputs (default p=0.5) during training
            # as a regulariser against overfitting.
            nn.Dropout(),

            # 256 * 6 * 6 input features come from the flattened avgpool output.
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            # Final layer emits logits on purpose: no softmax here, because
            # nn.CrossEntropyLoss performs the (log-)softmax itself.
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Image batch accepted by the first ``nn.Conv2d`` layer, i.e. a
                4-D tensor whose channel dimension equals ``in_channels``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised
            logits (not probabilities).
        """
        x = self.features(x)
        x = self.avgpool(x)
        # Flatten everything except the batch dimension for the linear layers.
        x = torch.flatten(x, 1)
        return self.classifier(x)


In [None]:
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the network and move its parameters onto the chosen device.
model = AlexNet()
model = model.to(device)

# Multi-class cross-entropy objective, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Train the model
def train_model(num_epochs):
    """Fit the global ``model`` for ``num_epochs`` passes over ``train_loader``.

    Relies on the module-level ``model``, ``train_loader``, ``device``,
    ``criterion`` and ``optimizer``. The model is switched to training mode
    once up front, and the current batch loss is printed on every 100th step.

    Args:
        num_epochs: How many full passes over ``train_loader`` to run.
    """
    model.train()
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)

            # Clear gradients left over from the previous step, then compute
            # the loss for this batch.
            optimizer.zero_grad()
            loss = criterion(model(images), labels)

            # Back-propagate and apply one optimiser update.
            loss.backward()
            optimizer.step()

            # Only every 100th step is reported.
            if (i + 1) % 100 != 0:
                continue
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Number of full passes over the training data.
num_epochs = 10
train_model(num_epochs=num_epochs)


In [None]:
def evaluate_model():
    """Measure top-1 accuracy of the global ``model`` on ``test_loader``.

    Relies on the module-level ``model``, ``test_loader`` and ``device``.
    The model is put in eval mode and every batch is run without gradient
    tracking. The accuracy is printed and also returned so callers can reuse
    the number.

    Returns:
        float: Accuracy in percent, or NaN when ``test_loader`` yields no
        samples (accuracy is undefined in that case).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # Predicted class = index of the largest score in each row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader instead of dividing by zero.
    if total == 0:
        print('No test samples were provided; accuracy is undefined.')
        return float('nan')

    accuracy = 100 * correct / total
    # Report the number of samples actually evaluated rather than a fixed count.
    print(f'Accuracy of the model on the {total} test images: {accuracy}%')
    return accuracy

# Run the evaluation pass over test_loader and print the resulting accuracy.
evaluate_model()
