<a href="https://colab.research.google.com/github/myllanes/Introduction-to-Deep-Learning/blob/main/HW6_2_Swin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Michael Yllanes
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import SwinForImageClassification, AutoImageProcessor
from tqdm import tqdm
import time
from datetime import timedelta

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
num_epochs = 2
batch_size = 32
learning_rate = 2e-5
image_size = 224
num_classes = 100
hidden_dim = 768  # hidden dimension

# Data preprocessing
processor = AutoImageProcessor.from_pretrained("microsoft/swin-small-patch4-window7-224") # Change with microsoft/swin-tiny-patch4-window7-224 or microsoft/swin-small-patch4-window7-224
transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

# Load CIFAR-100
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Classifier
class classifierH(nn.Module):
    def __init__(self, hidden_dim, num_classes):
        super().__init__()

        self.conv = nn.Conv2d(hidden_dim, hidden_dim//2, kernel_size=1)
        self.bn = nn.BatchNorm2d(hidden_dim//2)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(hidden_dim//2, num_classes)
        # Zero-padding
        self.pad = nn.ZeroPad2d((0, 0, 0, max(0, 100 - num_classes)))

    def forward(self, x):
        x = x.unsqueeze(-1).unsqueeze(-1)
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.pool(x).flatten(1)
        x = self.fc(x)
        return self.pad(x)

# Load model with custom head
model = SwinForImageClassification.from_pretrained(
    "microsoft/swin-tiny-patch4-window7-224",
    num_labels=num_classes,
    ignore_mismatched_sizes=True
).to(device)

# Replace classifier
model.classifier = classifierH(hidden_dim, num_classes).to(device)

# Freeze backbone, train only head for fine tune
for param in model.parameters():
    param.requires_grad = False
for param in model.classifier.parameters():
    param.requires_grad = True

print("\nTrainable Parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"\t{name}")

# Optimizer and loss
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=learning_rate, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

def train_epoch(epoch):
    model.train()
    epoch_start = time.time()
    total_loss = 0
    correct = 0
    total = 0

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images).logits
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metrics
        total_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        progress_bar.set_postfix({
            'loss': total_loss/(progress_bar.n+1),
            'acc': f'{100.*correct/total:.1f}%'
        })

    epoch_time = time.time() - epoch_start
    epoch_acc = 100 * correct / total
    avg_loss = total_loss / len(train_loader)

    scheduler.step()

    return epoch_time, epoch_acc, avg_loss

def evaluate():
    model.eval()
    start_time = time.time()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc='Testing'):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images).logits
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_time = time.time() - start_time
    accuracy = 100 * correct / total
    return test_time, accuracy

if __name__ == '__main__':
    print("=== Training with Conv-Padded Classifier Head ===")
    print(f"Using device: {device}\n")

    # Training loop
    train_times = []
    train_accs = []

    for epoch in range(num_epochs):
        epoch_time, epoch_acc, epoch_loss = train_epoch(epoch)
        train_times.append(epoch_time)
        train_accs.append(epoch_acc)

        print(f"\nEpoch {epoch+1} Summary:")
        print(f"Time: {timedelta(seconds=epoch_time)}")
        print(f"Train Accuracy: {epoch_acc:.2f}%")
        print(f"Avg Loss: {epoch_loss:.4f}\n")

    # Final evaluation
    test_time, test_acc = evaluate()

    # Results summary
    print("\n=== Final Results ===")
    print(f"Total Training Time: {timedelta(seconds=sum(train_times))}")
    print(f"Average Epoch Time: {timedelta(seconds=sum(train_times)/num_epochs)}")
    print("\nTraining Accuracies by Epoch:")
    for i, acc in enumerate(train_accs):
        print(f"Epoch {i+1}: {acc:.2f}% (Time: {timedelta(seconds=train_times[i])})")


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Trainable Parameters:
	classifier.conv.weight
	classifier.conv.bias
	classifier.bn.weight
	classifier.bn.bias
	classifier.fc.weight
	classifier.fc.bias
=== Training with Conv-Padded Classifier Head ===
Using device: cuda



Epoch 1/2: 100%|██████████| 1563/1563 [02:13<00:00, 11.69it/s, loss=3.6, acc=36.0%]



Epoch 1 Summary:
Time: 0:02:13.685298
Train Accuracy: 35.97%
Avg Loss: 3.5953



Epoch 2/2: 100%|██████████| 1563/1563 [02:13<00:00, 11.71it/s, loss=2.48, acc=59.4%]



Epoch 2 Summary:
Time: 0:02:13.461502
Train Accuracy: 59.39%
Avg Loss: 2.4824



Testing: 100%|██████████| 313/313 [00:26<00:00, 11.82it/s]


=== Final Results ===
Total Training Time: 0:04:27.146800
Average Epoch Time: 0:02:13.573400

Training Accuracies by Epoch:
Epoch 1: 35.97% (Time: 0:02:13.685298)
Epoch 2: 59.39% (Time: 0:02:13.461502)



