In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
# Select proper device to do the training!

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS is available")
else:
    device = torch.device("cpu")
    print("CUDA is not available")

MPS is available


# 1. Data Preparation

In [4]:
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

In [5]:
basic_transforms = transforms.Compose([
    transforms.ToTensor(),  # Convert PIL image to tensor
    transforms.Lambda(lambda x: x / 255.0)  # Normalize pixel values to [0, 1]
])

# It is not necessary to flatten the image. (Wow, because we are going to use CNN ไงง)
# Now it accepts 28x28x1 (grayscale image)
batch_size = 64

train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=basic_transforms,
    download=True
)

# Split the train_dataset into train and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size]
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=basic_transforms,
    download=True
)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100.0%


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100.0%


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100.0%


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100.0%

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






# 2. Convolutional Neural Network
## 2.1 LeNet

In [7]:
# This is the original LeNet-5 architecture.
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5, padding=2) # (number of input channels, number of out channels = # kernels, kernal_size = 5x5, SAME convolution)
        self.conv2 = nn.Conv2d(6, 16, 5, padding=0) # (, , , VALID convolution)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
    
    def forward(self, x): # x = 28*28*1
        x = self.conv1(x) # 28*28*6
        x = F.avg_pool2d(x, 2) # 14*14*6
        x = self.conv2(x) # (14-5+1)*(14-5+1)*16 = 10*10*16
        x = F.avg_pool2d(x, 2) # 5*5*16
        x = self.flatten(x)
        x = self.fc1(x) # 120
        x = F.relu(x)
        x = self.fc2(x) # 84
        x = F.relu(x)
        x = self.fc3(x) # 10
        return x

### Training Loop with Early Stopping

In [8]:
def train_model(model, train_loader, val_loader, loss_fn,
                optimizer, epochs, device, scheduler=None,
                early_stop=True, patience=5, min_delta=0.001,
                model_name='best_model.pth'):
    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(epochs):
        model.train()
        train_loss, train_steps = 0.0, 0
        
        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            
            # Forward pass
            y_pred = model(x_batch)
            
            # Compute loss
            loss = loss_fn(y_pred, y_batch)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            
            # Update weights
            optimizer.step()
            
            if scheduler is not None:
                scheduler.step()
            
            # Accumulate loss
            train_loss += loss.item()
            train_steps += 1
        
        train_loss_avg = train_loss / train_steps
        train_losses.append(train_loss_avg)
        
        # Evaluate on validation set
        model.eval()
        val_loss, val_steps = 0.0, 0
        
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = model(x_batch)
                loss = loss_fn(y_pred, y_batch)
                val_loss += loss.item()
                val_steps += 1
        
        val_loss_avg = val_loss / val_steps
        val_losses.append(val_loss_avg)
        
        # Early stopping logic
        if early_stop:
            if val_loss_avg < best_val_loss - min_delta:
                best_val_loss = val_loss_avg
                patience_counter = 0
                # Save the best model state
                best_model_state = model.state_dict().copy()
                print(f"Epoch {epoch + 1:3d}/{epochs},",
                      f"Loss: {train_loss_avg:.4f},",
                      f"Validation Loss: {val_loss_avg:.4f} (Best)")
            else:
                patience_counter += 1
                print(f"Epoch {epoch + 1:3d}/{epochs},",
                      f"Loss: {train_loss_avg:.4f},",
                      f"Validation Loss: {val_loss_avg:.4f}",
                      f"(Patience: {patience_counter}/{patience})")
                
                # Check for early stopping
                if patience_counter >= patience:
                    print(f"\nEarly stopping triggered at epoch {epoch + 1}")
                    print(f"Best validation loss: {best_val_loss:.4f}")
                    break
        else:
            print(f"Epoch {epoch + 1:3d}/{epochs},",
                  f"Loss: {train_loss_avg:.4f},",
                  f"Validation Loss: {val_loss_avg:.4f}")
    
    # Restore the best model state
    if best_model_state is not None:
        # Save the best model state to a file
        torch.save(best_model_state, model_name)
        model.load_state_dict(best_model_state)
        print("Restored best model state")
    
    return train_losses, val_losses

In [9]:
def evaluate_model(model, test_loader, loss_fn, device):
    model.eval()
    test_loss, test_steps = 0.0, 0
    correct_predictions = 0
    
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            test_loss += loss.item()
            test_steps += 1
            
            preds = y_pred.argmax(dim=1)
            correct_predictions += (preds == y_batch).sum().item()
    
    test_loss_avg = test_loss / test_steps
    test_accuracy = correct_predictions / len(test_loader.dataset)
    print(f"Loss: {test_loss_avg:.4f}, Accuracy: {test_accuracy:.4f}")

### Training LeNet

In [10]:
lenet = LeNet().to(device)
epochs = 100
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(lenet.parameters(), lr=0.001, weight_decay=0.0001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

train_losses, val_losses = train_model(
    lenet, train_loader, val_loader,
    loss_fn, optimizer, epochs, device, 
    scheduler=scheduler,
    early_stop=True, 
    model_name='best_lenet.pth'
)

print("Evaluating on training set...")
evaluate_model(lenet, train_loader, loss_fn, device)
print("Evaluating on validation set...")
evaluate_model(lenet, val_loader, loss_fn, device)
print("Evaluating on test set...")
evaluate_model(lenet, test_loader, loss_fn, device)

Epoch   1/100, Loss: 2.1861, Validation Loss: 1.3758 (Best)
Epoch   2/100, Loss: 0.6724, Validation Loss: 0.5001 (Best)
Epoch   3/100, Loss: 0.4319, Validation Loss: 0.3764 (Best)
Epoch   4/100, Loss: 0.3347, Validation Loss: 0.3102 (Best)
Epoch   5/100, Loss: 0.2715, Validation Loss: 0.2549 (Best)
Epoch   6/100, Loss: 0.2302, Validation Loss: 0.2230 (Best)
Epoch   7/100, Loss: 0.2036, Validation Loss: 0.2046 (Best)
Epoch   8/100, Loss: 0.1858, Validation Loss: 0.1953 (Best)
Epoch   9/100, Loss: 0.1712, Validation Loss: 0.1739 (Best)
Epoch  10/100, Loss: 0.1610, Validation Loss: 0.1646 (Best)
Epoch  11/100, Loss: 0.1511, Validation Loss: 0.1603 (Best)
Epoch  12/100, Loss: 0.1429, Validation Loss: 0.1674 (Patience: 1/5)
Epoch  13/100, Loss: 0.1362, Validation Loss: 0.1476 (Best)
Epoch  14/100, Loss: 0.1319, Validation Loss: 0.1374 (Best)
Epoch  15/100, Loss: 0.1246, Validation Loss: 0.1433 (Patience: 1/5)
Epoch  16/100, Loss: 0.1195, Validation Loss: 0.1434 (Patience: 2/5)
Epoch  17/100

## 2.2 Custom CNN

In [11]:
class CustomCNN(nn.Module):
    def __init__(self, dropout_rate=0.25):
        super(CustomCNN, self).__init__()
        
        # Define the Convolutional Blocks
        self.features = nn.Sequential(
            # Conv Block 1: 28x28x1 -> 26x26x32 (from 28-3+1 = 26) -> 13x13x32
            nn.Conv2d(1, 32, kernel_size=3, padding=0),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),
            
            # Conv Block 2: 13x13x32 -> 11x11x64 -> 5x5x64
            nn.Conv2d(32, 64, kernel_size=3, padding=0),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),
            
            # Conv Block 3: 5x5x64 -> 3x3x128
            nn.Conv2d(64, 128, kernel_size=3, padding=0),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Dropout(0.25)
            # Note: Final output size after all conv/pool layers is 3x3x128
        )
        
        # Define the Fully-Connected (Dense) Layers
        self.classifier = nn.Sequential(
            # Input size: 3*3*128 = 1152 features
            nn.Flatten(),
            nn.Dropout(dropout_rate),
            nn.Linear(3 * 3 * 128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 10)  # Output layer for 10 classes (digits 0-9)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

### Training Custom CNN

In [12]:
custom_cnn = CustomCNN().to(device)
epochs = 100
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(custom_cnn.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

train_losses, val_losses = train_model(
    custom_cnn, train_loader, val_loader,
    loss_fn, optimizer, epochs, device, 
    scheduler=scheduler,
    early_stop=True, 
    model_name='best_custom_cnn.pth'
)

print("Evaluating on training set...")
evaluate_model(custom_cnn, train_loader, loss_fn, device)
print("Evaluating on validation set...")
evaluate_model(custom_cnn, val_loader, loss_fn, device)
print("Evaluating on test set...")
evaluate_model(custom_cnn, test_loader, loss_fn, device)

Epoch   1/100, Loss: 0.9967, Validation Loss: 0.3954 (Best)
Epoch   2/100, Loss: 0.3575, Validation Loss: 0.1906 (Best)
Epoch   3/100, Loss: 0.2232, Validation Loss: 0.1316 (Best)
Epoch   4/100, Loss: 0.1661, Validation Loss: 0.1714 (Patience: 1/5)
Epoch   5/100, Loss: 0.1325, Validation Loss: 0.0751 (Best)
Epoch   6/100, Loss: 0.1158, Validation Loss: 0.0622 (Best)
Epoch   7/100, Loss: 0.1006, Validation Loss: 0.0812 (Patience: 1/5)
Epoch   8/100, Loss: 0.0914, Validation Loss: 0.0705 (Patience: 2/5)
Epoch   9/100, Loss: 0.0840, Validation Loss: 0.0466 (Best)
Epoch  10/100, Loss: 0.0783, Validation Loss: 0.0441 (Best)
Epoch  11/100, Loss: 0.0730, Validation Loss: 0.0417 (Best)
Epoch  12/100, Loss: 0.0683, Validation Loss: 0.0439 (Patience: 1/5)
Epoch  13/100, Loss: 0.0663, Validation Loss: 0.0388 (Best)
Epoch  14/100, Loss: 0.0620, Validation Loss: 0.0364 (Best)
Epoch  15/100, Loss: 0.0593, Validation Loss: 0.0405 (Patience: 1/5)
Epoch  16/100, Loss: 0.0554, Validation Loss: 0.0366 (P

# 3. Transfer Learning

In [17]:
from torchvision import models
from torch.utils.data import TensorDataset, random_split
import matplotlib.pyplot as plt

# Cat-Dog Classification
X = torch.load('catdog_X.pt', weights_only=True)
y = torch.load('catdog_y.pt', weights_only=True)
D = TensorDataset(X, y)

catdog_train_set, catdog_val_set, catdog_test_set = random_split(
    D, [0.7, 0.15, 0.15]
)

catdog_train_loader = DataLoader(catdog_train_set, batch_size=64, shuffle=True)
catdog_val_loader = DataLoader(catdog_val_set, batch_size=64, shuffle=False)
catdog_test_loader = DataLoader(catdog_test_set, batch_size=64, shuffle=False)

# Show some images
img_idx = [0, 1, 2, 3, 4, 12500, 12501, 12502, 12503, 12504]
plt.figure(figsize=(10, 5))
for i in range(10):
    plt.subplot(2, 5, i+1)
    plt.imshow(X[img_idx[i]].permute(1, 2, 0))
    plt.title(f"Label: {y[img_idx[i]]}")
    plt.axis('off')

FileNotFoundError: [Errno 2] No such file or directory: 'catdog_X.pt'

### Transfer Learning with ResNet18 (Frozen Layers)

In [None]:
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
model.fc = nn.Linear(model.fc.in_features, 2)

# Freeze all parameters except the last layer
for param in model.parameters():
    param.requires_grad = False

for param in model.fc.parameters():
    param.requires_grad = True

model.to(device)

epochs = 100
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

train_losses, val_losses = train_model(
    model, catdog_train_loader, catdog_val_loader,
    loss_fn, optimizer, epochs, device, 
    scheduler=scheduler,
    early_stop=True, 
    model_name='best_resnet18_catdog.pth'
)

print("Evaluating on training set...")
evaluate_model(model, catdog_train_loader, loss_fn, device)
print("Evaluating on validation set...")
evaluate_model(model, catdog_val_loader, loss_fn, device)
print("Evaluating on test set...")
evaluate_model(model, catdog_test_loader, loss_fn, device)

### Fine-tuning (Unfrozen Layers)

In [None]:
# Unfreeze all parameters
for param in model.parameters():
    param.requires_grad = True

optimizer = optim.Adam([
    {'params': (p for n, p in model.named_parameters() if "fc" not in n),
     'lr': 1e-5},
    {'params': (p for n, p in model.named_parameters() if "fc" in n),
     'lr': 1e-4}
], weight_decay=0.001)

scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

train_losses, val_losses = train_model(
    model, catdog_train_loader, catdog_val_loader,
    loss_fn, optimizer, epochs, device, 
    scheduler=scheduler,
    early_stop=True, 
    model_name='best_resnet18_catdog.pth'
)

print("Evaluating on training set...")
evaluate_model(model, catdog_train_loader, loss_fn, device)
print("Evaluating on validation set...")
evaluate_model(model, catdog_val_loader, loss_fn, device)
print("Evaluating on test set...")
evaluate_model(model, catdog_test_loader, loss_fn, device)