In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


import numpy as np
import time
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split
import requests

#including runtime measurements, accuracy metrics, and model size calculations (Homework 6)
#For training the models with different layers and heads
from itertools import product

import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchinfo import summary
import math
import time
from collections import OrderedDict

#Importing the Swin Transformer model from Hugging Face Transformers library for Problem 3
import transformers 
from transformers import SwinForImageClassification, SwinConfig, AutoImageProcessor
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

#Check the GPU name and number
'''
devNumber = torch.cuda.current_device()
devName = torch.cuda.get_device_name(devNumber)

print(f"Current device number is: {devNumber}")
print(f"GPU name is: {devName}")'''

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


'\ndevNumber = torch.cuda.current_device()\ndevName = torch.cuda.get_device_name(devNumber)\n\nprint(f"Current device number is: {devNumber}")\nprint(f"GPU name is: {devName}")'

In [18]:
'''Problem 1
'''
# Hyperparameters
batch_size = 64
num_classes = 100
learning_rate = 0.001
num_epochs = 20

# Data loading and preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))  # CIFAR-100 stats
])

train_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Vision Transformer (ViT) implementation
class PatchEmbedding(nn.Module):
    def __init__(self, img_size=32, patch_size=4, in_channels=3, embed_dim=256):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2
        
        self.proj = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size
        )
        
        # Learnable position embeddings
        self.pos_embed = nn.Parameter(torch.zeros(1, self.n_patches + 1, embed_dim))
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        
        nn.init.trunc_normal_(self.pos_embed, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
    
    def forward(self, x):
        B, C, H, W = x.shape
        assert H == W == self.img_size, f"Input image size ({H}*{W}) doesn't match model ({self.img_size}*{self.img_size})"
        
        # Create patches
        x = self.proj(x)  # (B, embed_dim, n_patches_h, n_patches_w)
        x = x.flatten(2)  # (B, embed_dim, n_patches)
        x = x.transpose(1, 2)  # (B, n_patches, embed_dim)
        
        # Add class token
        cls_token = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_token, x), dim=1)
        
        # Add position embeddings
        x = x + self.pos_embed
        
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        
        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
        
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.scale = self.head_dim ** -0.5
    
    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        
        return x

class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features * 4
        
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_ratio=4., drop=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadAttention(embed_dim, num_heads)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = MLP(in_features=embed_dim, hidden_features=int(embed_dim * mlp_ratio))
    
    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    def __init__(self, img_size=32, patch_size=4, in_channels=3, num_classes=100,
                 embed_dim=256, depth=4, num_heads=4, mlp_ratio=4.):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        self.blocks = nn.Sequential(*[
            TransformerBlock(embed_dim, num_heads, mlp_ratio)
            for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)
        
        self.apply(self._init_weights)
    
    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
    
    def forward(self, x):
        x = self.patch_embed(x)
        x = self.blocks(x)
        x = self.norm(x)
        x = x[:, 0]  # Class token
        x = self.head(x)
        return x

# Training function
def train_model(model, criterion, optimizer, num_epochs=20):
    model.train()
    total_step = len(train_loader)
    train_times = []
    
    for epoch in range(num_epochs):
        start_time = time.time()
        for i, (images, labels) in enumerate(train_loader):
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')
        
        epoch_time = time.time() - start_time
        train_times.append(epoch_time)
        print(f'Epoch {epoch+1} completed in {epoch_time:.2f} seconds')
    
    return train_times

# Evaluation function
def evaluate_model(model):
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        accuracy = 100 * correct / total
        print(f'Test Accuracy: {accuracy:.2f}%')
        return accuracy

# Model configurations to test
configs = [
    {'name': 'ViT-Tiny', 'patch_size': 4, 'embed_dim': 256, 'depth': 4, 'num_heads': 2, 'mlp_ratio': 2},
    {'name': 'ViT-Small', 'patch_size': 8, 'embed_dim': 256, 'depth': 8, 'num_heads': 2, 'mlp_ratio': 2},
    {'name': 'ViT-Medium', 'patch_size': 4, 'embed_dim': 512, 'depth': 4, 'num_heads': 4, 'mlp_ratio': 4},
    {'name': 'ViT-Large', 'patch_size': 8, 'embed_dim': 512, 'depth': 8, 'num_heads': 4, 'mlp_ratio': 4},
]
results = []

for config in configs:
    print(f"\nTraining {config['name']} configuration...")
    print(f"Patch size: {config['patch_size']}, Embed dim: {config['embed_dim']}, "
          f"Depth: {config['depth']}, Heads: {config['num_heads']}, MLP ratio: {config['mlp_ratio']}")
    
    # Create model
    model = VisionTransformer(
        img_size=32,
        patch_size=config['patch_size'],
        embed_dim=config['embed_dim'],
        depth=config['depth'],
        num_heads=config['num_heads'],
        mlp_ratio=config['mlp_ratio'],
        num_classes=num_classes
    ).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # Print model summary
    print("\nModel Summary:")
    summary(model, input_size=(batch_size, 3, 32, 32))
    
    # Train and evaluate
    train_times = train_model(model, criterion, optimizer, num_epochs)
    accuracy = evaluate_model(model)
    
    # Calculate parameters and FLOPs
    total_params = sum(p.numel() for p in model.parameters())
    flops = sum(p.numel() for p in model.parameters() if p.requires_grad) * 2 * 32 * 32  # Approximate
    
    results.append({
        'name': config['name'],
        'patch_size': config['patch_size'],
        'embed_dim': config['embed_dim'],
        'depth': config['depth'],
        'num_heads': config['num_heads'],
        'mlp_ratio': config['mlp_ratio'],
        'params': total_params,
        'flops': flops,
        'avg_train_time': sum(train_times)/len(train_times),
        'accuracy': accuracy
    })

# ResNet-18 baseline
print("\nTraining ResNet-18 baseline...")
resnet = torchvision.models.resnet18(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet.parameters(), lr=learning_rate)

print("\nResNet-18 Summary:")
summary(resnet, input_size=(batch_size, 3, 32, 32))

resnet_train_times = train_model(resnet, criterion, optimizer, 10)  # Train for 10 epochs
resnet_accuracy = evaluate_model(resnet)

resnet_params = sum(p.numel() for p in resnet.parameters())
resnet_flops = sum(p.numel() for p in resnet.parameters() if p.requires_grad) * 2 * 32 * 32  # Approximate

results.append({
    'name': 'ResNet-18',
    'patch_size': 'N/A',
    'embed_dim': 'N/A',
    'depth': 18,
    'num_heads': 'N/A',
    'mlp_ratio': 'N/A',
    'params': resnet_params,
    'flops': resnet_flops,
    'avg_train_time': sum(resnet_train_times)/len(resnet_train_times),
    'accuracy': resnet_accuracy
})

# Print results table
print("\nResults Summary:")
print("="*120)
print(f"{'Model':<15}{'Patch':<8}{'Embed':<8}{'Depth':<8}{'Heads':<8}{'MLP':<8}{'Params':<15}{'FLOPs':<15}{'Time/Epoch':<15}{'Accuracy':<10}")
print("-"*120)
for r in results:
    print(f"{r['name']:<15}{r['patch_size']:<8}{r['embed_dim']:<8}{r['depth']:<8}{r['num_heads']:<8}"
          f"{r['mlp_ratio']:<8}{r['params']/1e6:.2f}M{'':<5}{r['flops']/1e9:.2f}G{'':<5}"
          f"{r['avg_train_time']:.2f}s{'':<7}{r['accuracy']:.2f}%")
print("="*120)

100.0%



Training ViT-Tiny configuration...
Patch size: 4, Embed dim: 256, Depth: 4, Heads: 2, MLP ratio: 2

Model Summary:
Epoch [1/20], Step [100/782], Loss: 4.1893
Epoch [1/20], Step [200/782], Loss: 3.8480
Epoch [1/20], Step [300/782], Loss: 4.0729
Epoch [1/20], Step [400/782], Loss: 3.8686
Epoch [1/20], Step [500/782], Loss: 3.8387
Epoch [1/20], Step [600/782], Loss: 3.9619
Epoch [1/20], Step [700/782], Loss: 3.6748
Epoch 1 completed in 14.78 seconds
Epoch [2/20], Step [100/782], Loss: 3.7500
Epoch [2/20], Step [200/782], Loss: 3.6387
Epoch [2/20], Step [300/782], Loss: 3.5436
Epoch [2/20], Step [400/782], Loss: 3.8163
Epoch [2/20], Step [500/782], Loss: 3.4327
Epoch [2/20], Step [600/782], Loss: 3.4373
Epoch [2/20], Step [700/782], Loss: 3.6836
Epoch 2 completed in 14.90 seconds
Epoch [3/20], Step [100/782], Loss: 3.8262
Epoch [3/20], Step [200/782], Loss: 3.2702
Epoch [3/20], Step [300/782], Loss: 3.3842
Epoch [3/20], Step [400/782], Loss: 3.6353
Epoch [3/20], Step [500/782], Loss: 3.30

In [None]:
'''Problem 2 Fine-tuning pretrained Swin Transformer models from Hugging Face Transformers library on CIFAR-100 on
Tiny (microsoft/swin-tiny-patch4-window7-224) and Small (microsoft/swin-small-patch4-window7-224) variants - on CIFAR-100
'''
# Hyperparameters
num_epochs = 5
batch_size = 32
learning_rate = 2e-5
image_size = 224  # Swin expects 224x224 input

# Models to compare
model_variants = {
    'tiny': 'microsoft/swin-tiny-patch4-window7-224',
    'small': 'microsoft/swin-small-patch4-window7-224',
    'scratch': None
}

results = {}

# Data preparation
processor = AutoImageProcessor.from_pretrained(model_variants['tiny'])

transform = transforms.Compose([
    transforms.Resize((image_size, image_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

# CIFAR-100 dataset
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True,
                                           download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False,
                                          download=True, transform=transform)

for model_name, model_path in model_variants.items():
    print(f"\n=== Processing {model_name} model ===")
    
    # Scratch model
    if model_name == 'scratch':
        config = SwinConfig(
            image_size=image_size,
            patch_size=4,
            num_channels=3,
            embed_dim=96,
            depths=[2, 2, 6, 2],
            num_heads=[3, 6, 12, 24],
            window_size=7,
            num_labels=100
        )
        model = SwinForImageClassification(config).to(device)
    else:
        model = SwinForImageClassification.from_pretrained(
            model_path,
            num_labels=100,  # CIFAR-100 has 100 classes
            ignore_mismatched_sizes=True
        ).to(device)
        
        # Freeze backbone parameters for pretrained models
        for param in model.swin.parameters():
            param.requires_grad = False

    # Only train classifier head for pretrained models, all params for scratch
    trainable_params = []
    if model_name == 'scratch':
        trainable_params = model.parameters()
    else:
        trainable_params = model.classifier.parameters()
        for param in model.classifier.parameters():
            param.requires_grad = True

 
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

 
    model.train()
    epoch_times = []
    
    for epoch in range(num_epochs):
        start_time = time.time()
        progress_bar = tqdm(train_loader, desc=f'Epoch [{epoch+1}/{num_epochs}]')
        
        for images, labels in progress_bar:
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            progress_bar.set_postfix({'loss': loss.item()})
        
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)
        print(f"Epoch {epoch+1} time: {epoch_time:.2f}s")

    # Testing
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc='Testing'):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images).logits
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    avg_epoch_time = sum(epoch_times) / len(epoch_times)
    
    results[model_name] = {
        'accuracy': accuracy,
        'avg_epoch_time': avg_epoch_time
    }
    
    print(f"{model_name} Test Accuracy: {accuracy:.2f}%")
    print(f"{model_name} Avg Epoch Time: {avg_epoch_time:.2f}s")

# Print results table
print("\n=== Results ===")
print(f"{'Model':<10} | {'Accuracy (%)':<12} | {'Avg Epoch Time (s)':<18}")
print("-" * 40)    
for model_name, metrics in results.items():
    print(f"{model_name:<10} | {metrics['accuracy']:<12.2f} | {metrics['avg_epoch_time']:<18.2f}")


=== Processing tiny model ===


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-tiny-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch [1/5]: 100%|██████████| 1563/1563 [01:57<00:00, 13.31it/s, loss=3.37]


Epoch 1 time: 117.42s


Epoch [2/5]: 100%|██████████| 1563/1563 [01:53<00:00, 13.81it/s, loss=2.56]


Epoch 2 time: 113.22s


Epoch [3/5]: 100%|██████████| 1563/1563 [01:52<00:00, 13.84it/s, loss=2.13]


Epoch 3 time: 112.93s


Epoch [4/5]: 100%|██████████| 1563/1563 [01:52<00:00, 13.90it/s, loss=1.73]


Epoch 4 time: 112.42s


Epoch [5/5]: 100%|██████████| 1563/1563 [01:54<00:00, 13.62it/s, loss=1.46]


Epoch 5 time: 114.79s


Testing: 100%|██████████| 313/313 [00:23<00:00, 13.53it/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tiny Test Accuracy: 66.60%
tiny Avg Epoch Time: 114.15s

=== Processing small model ===


Some weights of SwinForImageClassification were not initialized from the model checkpoint at microsoft/swin-small-patch4-window7-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([100, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([100]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch [1/5]: 100%|██████████| 1563/1563 [02:42<00:00,  9.63it/s, loss=3.31]


Epoch 1 time: 162.35s


Epoch [2/5]: 100%|██████████| 1563/1563 [02:41<00:00,  9.69it/s, loss=2.73]


Epoch 2 time: 161.34s


Epoch [3/5]: 100%|██████████| 1563/1563 [02:36<00:00,  9.97it/s, loss=1.98]


Epoch 3 time: 156.80s


Epoch [4/5]: 100%|██████████| 1563/1563 [02:34<00:00, 10.11it/s, loss=1.7] 


Epoch 4 time: 154.65s


Epoch [5/5]: 100%|██████████| 1563/1563 [02:33<00:00, 10.21it/s, loss=1.35] 


Epoch 5 time: 153.11s


Testing: 100%|██████████| 313/313 [00:30<00:00, 10.35it/s]


small Test Accuracy: 70.36%
small Avg Epoch Time: 157.65s

=== Processing scratch model ===


Epoch [1/5]: 100%|██████████| 1563/1563 [04:04<00:00,  6.38it/s, loss=3.33]


Epoch 1 time: 244.95s


Epoch [2/5]: 100%|██████████| 1563/1563 [04:04<00:00,  6.38it/s, loss=3.13]


Epoch 2 time: 244.99s


Epoch [3/5]: 100%|██████████| 1563/1563 [04:05<00:00,  6.35it/s, loss=2.25]


Epoch 3 time: 246.00s


Epoch [4/5]: 100%|██████████| 1563/1563 [04:01<00:00,  6.48it/s, loss=2.54]


Epoch 4 time: 241.24s


Epoch [5/5]: 100%|██████████| 1563/1563 [03:57<00:00,  6.58it/s, loss=2.07]


Epoch 5 time: 237.52s


Testing: 100%|██████████| 313/313 [00:20<00:00, 14.96it/s]

scratch Test Accuracy: 37.23%
scratch Avg Epoch Time: 242.94s

=== Results ===
Model      | Accuracy (%) | Avg Epoch Time (s)
----------------------------------------
tiny       | 66.60        | 114.15            
small      | 70.36        | 157.65            
scratch    | 37.23        | 242.94            



