In [1]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

import gc
import os
import time

# Select the compute backend in order of preference: Apple-silicon MPS,
# then CUDA, and finally the CPU as the universal fallback.
device = torch.device(
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print("Device:", device)

Device: mps


In [2]:
# Per-dataset preprocessing: channel-wise normalisation statistics and the
# spatial size every image is resized to before it reaches the network.
_DATASET_SPECS = {
    "CIFAR10": {
        "mean": [0.4914, 0.4822, 0.4465],
        "std": [0.2023, 0.1994, 0.2010],
        "resize": (224, 224),  # Original CIFAR10 size is 32x32
    },
    "CIFAR100": {
        "mean": [0.5071, 0.4867, 0.4408],
        "std": [0.2675, 0.2565, 0.2761],
        "resize": (224, 224),  # Original CIFAR100 size is 32x32
    },
    "TinyImageNet": {
        "mean": [0.485, 0.456, 0.406],
        "std": [0.229, 0.224, 0.225],
        "resize": (64, 64),  # Original Tiny ImageNet size is 64x64
    },
    "StanfordDogs": {
        "mean": [0.485, 0.456, 0.406],
        "std": [0.229, 0.224, 0.225],
        "resize": (224, 224),  # Resized to 224x224 for ResNet-like models
    },
}

# Datasets provided (and downloaded on demand) by torchvision itself; every
# other supported dataset is read from an on-disk folder hierarchy.
_TORCHVISION_DATASETS = ("CIFAR10", "CIFAR100")


def data_loader(
    dataset_name,
    data_dir,
    batch_size,
    random_seed=42,
    valid_size=0.1,
    shuffle=True,
    test=False,
):
    """Build the data loaders for one of the supported image datasets.

    Args:
        dataset_name: One of "CIFAR10", "CIFAR100", "TinyImageNet" or
            "StanfordDogs".
        data_dir: Root directory of the data. CIFAR datasets are downloaded
            into it; the folder-based datasets are read from its "train"
            sub-directory, and from "test" (TinyImageNet) or "val"
            (StanfordDogs) in test mode.
        batch_size: Number of samples per batch.
        random_seed: Seed for the train/validation split permutation.
        valid_size: Fraction (0..1) of the training data held out for
            validation. Ignored in test mode.
        shuffle: In train mode, whether to permute the indices before
            splitting; in test mode, forwarded to the test ``DataLoader``.
        test: When True, return only the test loader.

    Returns:
        ``test_loader`` when ``test`` is True, otherwise the tuple
        ``(train_loader, valid_loader)``.

    Raises:
        ValueError: If ``dataset_name`` is not supported, or if ``valid_size``
            lies outside ``[0, 1]`` when building the train/validation split.
    """
    if dataset_name not in _DATASET_SPECS:
        raise ValueError(f"Dataset {dataset_name} not supported.")
    if not test and not 0 <= valid_size <= 1:
        raise ValueError(f"valid_size must be in [0, 1], got {valid_size}.")

    spec = _DATASET_SPECS[dataset_name]

    # Resize -> tensor -> normalise, identical for every split.
    transform = transforms.Compose(
        [
            transforms.Resize(spec["resize"]),
            transforms.ToTensor(),
            transforms.Normalize(mean=spec["mean"], std=spec["std"]),
        ]
    )

    is_builtin = dataset_name in _TORCHVISION_DATASETS

    def _load(train):
        """Instantiate the requested split of the selected dataset."""
        if is_builtin:
            dataset_cls = getattr(datasets, dataset_name)
            return dataset_cls(
                root=data_dir,
                train=train,
                download=True,
                transform=transform,
            )
        if train:
            split_dir = "train"
        else:
            split_dir = "test" if dataset_name == "TinyImageNet" else "val"
        return datasets.ImageFolder(
            root=os.path.join(data_dir, split_dir),
            transform=transform,
        )

    # Handle test mode
    if test:
        return DataLoader(_load(train=False), batch_size=batch_size, shuffle=shuffle)

    # The train and validation loaders use the same transform and differ only
    # in the indices they sample, so one dataset instance serves both.
    dataset = _load(train=True)

    # Split the train dataset into train and validation
    num_train = len(dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        # A private RandomState yields the same permutation as seeding the
        # global generator, without reseeding NumPy for the whole process.
        np.random.RandomState(random_seed).shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]

    # Each sampler draws (in random order) only from its own index subset.
    train_loader = DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx)
    )
    valid_loader = DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(valid_idx)
    )

    return (train_loader, valid_loader)

In [3]:
from torch.utils.tensorboard import SummaryWriter
import time

# TensorBoard writer; the timestamp suffix gives every run its own log directory
log_dir = f"./runs/resnet_experiment_{time.strftime('%Y%m%d-%H%M%S')}"
writer = SummaryWriter(log_dir=log_dir)


# Forward hook that records a module's output activations
def hook_fn(module, input, output):
    """Log a histogram of ``output`` to TensorBoard.

    The signature (module, input, output) is the one used where this hook is
    registered via ``register_forward_hook``. The histogram tag is derived
    from the module's class name, so all modules of one class share a tag.
    """
    tag = f"{type(module).__name__}_activations"
    writer.add_histogram(tag, output)

In [4]:
class ResidualBlock_A(nn.Module):
    """Basic residual block: two 3x3 conv+BN stages plus a skip connection.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        stride: Stride of the first 3x3 convolution (and of the shortcut
            projection, when one is needed).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        # No ReLU here: the activation is applied after the skip connection.
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                out_channels,
                out_channels,
                kernel_size=3,
                stride=1,
                padding=1,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels),
        )

        # A 1x1 projection on the shortcut is created only when the main
        # branch changes the spatial size or the channel count.
        # NOTE: this conv keeps its default bias so that the parameter layout
        # (state_dict keys) stays compatible with previously saved checkpoints.
        self.downsample = None
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )

        self.relu = nn.ReLU()

        # Register hooks on conv1 and conv2 to visualize intermediate activations
        # self.conv1[0].register_forward_hook(hook_fn)
        # self.conv2[0].register_forward_hook(hook_fn)

    def forward(self, x):
        """Return ``relu(conv2(conv1(x)) + shortcut(x))``."""
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        # Explicit None check (same form as ResidualBlock_B) instead of
        # relying on the truthiness of an nn.Sequential, which is its length.
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out

In [5]:
class ResidualBlock_B(nn.Module):
    """Bottleneck residual block: 1x1 reduce -> 3x3 -> 1x1 expand, plus skip.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        bottleneck_rate: The inner width is ``out_channels // bottleneck_rate``.
        stride: Stride of the first 1x1 convolution and of the shortcut
            projection.
    """

    def __init__(self, in_channels, out_channels, bottleneck_rate=4, stride=1):
        super().__init__()

        # Width of the narrow middle section of the bottleneck
        width = out_channels // bottleneck_rate

        # 1x1 reduce, 3x3 spatial, 1x1 expand (the last one has no ReLU
        # because the activation comes after the skip connection is added)
        self.conv1 = self._conv_bn(in_channels, width, 1, stride=stride, relu=True)
        self.conv2 = self._conv_bn(width, width, 3, padding=1, relu=True)
        self.conv3 = self._conv_bn(width, out_channels, 1)

        # The shortcut is projected only when its shape would otherwise not
        # match the output of the main branch.
        needs_projection = stride != 1 or in_channels != out_channels
        self.downsample = (
            self._conv_bn(in_channels, out_channels, 1, stride=stride)
            if needs_projection
            else None
        )

        self.relu = nn.ReLU()

        # Register hooks on conv1 and conv2 to visualize intermediate activations
        # self.conv1[0].register_forward_hook(hook_fn)
        # self.conv2[0].register_forward_hook(hook_fn)
        # self.conv3[0].register_forward_hook(hook_fn)

    @staticmethod
    def _conv_bn(c_in, c_out, kernel_size, stride=1, padding=0, relu=False):
        """Return Sequential(bias-free Conv2d, BatchNorm2d[, ReLU])."""
        stages = [
            nn.Conv2d(
                c_in,
                c_out,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                bias=False,
            ),
            nn.BatchNorm2d(c_out),
        ]
        if relu:
            stages.append(nn.ReLU())
        return nn.Sequential(*stages)

    def forward(self, x):
        """Return ``relu(conv3(conv2(conv1(x))) + shortcut(x))``."""
        # Main branch
        out = self.conv3(self.conv2(self.conv1(x)))

        # Shortcut branch: identity, or the 1x1 projection when present
        shortcut = x if self.downsample is None else self.downsample(x)

        # Skip connection followed by the final activation
        out += shortcut
        return self.relu(out)

In [6]:
class ResNet34(nn.Module):
    """ResNet-34 style network: a 7x7 stem followed by 3/4/6/3 residual blocks.

    Args:
        residual_block: Block class, constructed as
            ``residual_block(in_channels, out_channels, stride=...)``. The
            classifier expects the last stage to output 512 channels.
        num_classes: Size of the output layer.
    """

    def __init__(self, residual_block, num_classes=10):
        super().__init__()

        # Stem: 7x7/2 convolution followed by a 3x3/2 max-pool
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer0 = self._make_layer(residual_block, 64, 64, 3)
        self.layer1 = self._make_layer(residual_block, 64, 128, 4, init_stride=2)
        self.layer2 = self._make_layer(residual_block, 128, 256, 6, init_stride=2)
        self.layer3 = self._make_layer(residual_block, 256, 512, 3, init_stride=2)
        # Adaptive pooling gives a 1x1 map for any input resolution. For a
        # 224x224 input (7x7 final map) it equals the former AvgPool2d(7),
        # and it also works for smaller inputs such as 64x64 images.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(
        self, residual_block, in_channels, out_channels, num_blocks, init_stride=1
    ):
        """Stack ``num_blocks`` blocks; only the first one changes shape.

        The stride is passed by keyword because block classes may take other
        parameters before it (ResidualBlock_B has ``bottleneck_rate`` third).
        """
        layers = [residual_block(in_channels, out_channels, stride=init_stride)]
        layers.extend(
            residual_block(out_channels, out_channels) for _ in range(1, num_blocks)
        )
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits, shape ``(batch, num_classes)``."""
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

In [7]:
class ResNet50(nn.Module):
    """ResNet-50 style network: a 7x7 stem followed by 3/4/6/3 residual blocks.

    Args:
        residual_block: Block class, constructed as
            ``residual_block(in_channels, out_channels, stride=...)``. The
            classifier expects the last stage to output 2048 channels.
        num_classes: Size of the output layer.
    """

    def __init__(self, residual_block, num_classes=10):
        super().__init__()

        # Initial Convolutional Layer (same as ResNet-34)
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # ResNet-50 Specific Layer Configurations
        self.layer1 = self._make_layer(residual_block, 64, 256, 3)  # 3 blocks
        self.layer2 = self._make_layer(
            residual_block, 256, 512, 4, stride=2
        )  # 4 blocks
        self.layer3 = self._make_layer(
            residual_block, 512, 1024, 6, stride=2
        )  # 6 blocks
        self.layer4 = self._make_layer(
            residual_block, 1024, 2048, 3, stride=2
        )  # 3 blocks

        # AdaptiveAvgPool2d for any input size
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, num_classes)

    def _make_layer(
        self, residual_block, in_channels, out_channels, num_blocks, stride=1
    ):
        """Stack ``num_blocks`` blocks; only the first one changes shape.

        The stride MUST be passed by keyword: ResidualBlock_B's third
        positional parameter is ``bottleneck_rate``, so a positional stride
        would silently change the bottleneck width and leave every stage
        running at stride 1 (no spatial downsampling).
        """
        layers = [residual_block(in_channels, out_channels, stride=stride)]
        layers.extend(
            residual_block(out_channels, out_channels) for _ in range(1, num_blocks)
        )
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits, shape ``(batch, num_classes)``."""
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

In [8]:
def eval(data_loader, model, device="cpu"):
    """Count correct top-1 predictions of ``model`` over ``data_loader``.

    Note: this function shadows Python's built-in ``eval`` in this notebook.

    Args:
        data_loader: Iterable yielding ``(images, labels)`` batches.
        model: Module mapping an image batch to per-class scores.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(total, correct)``: number of samples seen and number of
        samples whose highest-scoring class equals the label.
    """
    correct = 0
    total = 0
    was_training = model.training  # remembered so the caller's mode is restored
    model.eval()  # Set the model to evaluation mode
    try:
        with torch.no_grad():
            for images, labels in data_loader:
                images = images.to(device)
                labels = labels.to(device)
                predicted = model(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                # No per-batch torch.cuda.empty_cache(): batch tensors are
                # released when rebound on the next iteration, and emptying
                # the allocator cache only forces slower re-allocations.
    finally:
        # Restore whichever mode the model was in, even if a batch failed,
        # rather than unconditionally forcing training mode.
        model.train(was_training)
    return total, correct

In [9]:

def create_checkpoint_dir(checkpoint_dir):
    """Create ``checkpoint_dir`` (including parents) if it does not exist.

    ``exist_ok=True`` makes the call idempotent and avoids the race between
    a separate existence check and the directory creation.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)


def train(
    num_epochs,
    train_loader,
    valid_loader,
    model,
    criterion,
    optimizer,
    device,
    writer,
    checkpoint_dir="./checkpoints",
    scheduler=None,
):
    """Train ``model``, validating and checkpointing after every epoch.

    Args:
        num_epochs: Number of passes over ``train_loader``.
        train_loader: Iterable of ``(images, labels)`` training batches.
        valid_loader: Iterable of ``(images, labels)`` validation batches.
        model: Network to optimise (already on ``device``).
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimiser updating ``model``'s parameters.
        device: Device every batch is moved to.
        writer: TensorBoard-style writer providing ``add_scalar``/``flush``.
        checkpoint_dir: Base directory; checkpoints are written to a
            sub-directory named after the model's class.
        scheduler: Optional learning-rate scheduler, stepped once at the end
            of every epoch. ``None`` (the default) leaves the LR untouched.

    Side effects:
        Logs "Latency/iteration", "Loss/train", "Latency/epoch" and
        "Accuracy/validation" scalars, prints progress, and saves
        ``model_epoch_<n>.pth`` (the model ``state_dict``) after each epoch.
    """
    # Checkpoints are grouped per model class
    model_class_name = model.__class__.__name__
    checkpoint_dir = os.path.join(checkpoint_dir, model_class_name)
    create_checkpoint_dir(checkpoint_dir)

    steps_per_epoch = len(train_loader)

    # Train the model
    for epoch in range(num_epochs):
        # Make the mode explicit: the caller (or tracing/evaluation code run
        # before this function) may have left the model in evaluation mode.
        model.train()
        epoch_start_time = time.time()

        for batch_idx, (images, labels) in enumerate(train_loader):
            iter_start = time.time()

            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # .item() copies the loss to the host, which waits for the queued
            # device work. Reading it BEFORE stopping the clock makes the
            # latency cover the forward and backward pass on asynchronous
            # backends instead of just the time needed to enqueue kernels.
            loss_value = loss.item()
            iter_latency = time.time() - iter_start

            global_step = epoch * steps_per_epoch + batch_idx
            writer.add_scalar("Latency/iteration", iter_latency, global_step)
            writer.add_scalar("Loss/train", loss_value, global_step)

            # Drop references so the batch memory can be reused immediately.
            # The former per-iteration torch.cuda.empty_cache()/gc.collect()
            # calls were removed: they free nothing PyTorch could not already
            # reuse and add avoidable overhead to every step.
            del images, labels, outputs, loss

        if scheduler is not None:
            scheduler.step()

        # Log epoch latency
        epoch_latency = time.time() - epoch_start_time
        writer.add_scalar("Latency/epoch", epoch_latency, epoch)
        print(
            f"Epoch [{epoch+1}/{num_epochs}] finished in {epoch_latency:.2f} seconds."
        )

        # Validation
        total, correct = eval(valid_loader, model, device)
        if total:
            accuracy = 100 * correct / total
            print(
                f"Accuracy on validation images after epoch {epoch+1}: {accuracy:.2f}%"
            )
            writer.add_scalar("Accuracy/validation", accuracy, epoch)
        else:
            # An empty validation split must not abort a finished epoch.
            print(f"No validation images available after epoch {epoch+1}.")

        # Save model checkpoint
        checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch+1}.pth")
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Model checkpoint saved at {checkpoint_path}")

    writer.flush()  # Flush all pending events to disk

In [10]:
# Hyperparameters
# num_classes must match the chosen dataset (100 classes for CIFAR100).
dataset_name = "CIFAR100"
num_classes = 100
batch_size = 64
num_epochs = 50
learning_rate = 0.01


# Build the train/validation loaders (valid_size keeps its default of 0.1,
# i.e. 10% of the training data is held out for validation).
train_loader, valid_loader = data_loader(
    dataset_name=dataset_name, data_dir="./dataset", batch_size=batch_size
)

# model = ResNet34(ResidualBlock_A, num_classes=num_classes).to(device)
model = ResNet50(ResidualBlock_B, num_classes=num_classes).to(device)

# Define a dummy input to pass through the model; 224x224 matches the size
# data_loader resizes CIFAR images to.
dummy_input = torch.randn(1, 3, 224, 224).to(
    device
)  # Example input of shape (batch_size, channels, height, width)

# Log the model graph to TensorBoard
writer.add_graph(model, dummy_input)
writer.flush()


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(
#     model.parameters(), lr=learning_rate, weight_decay=0.001, momentum=0.9
# )
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.001)

# Learning rate scheduler (multiply the LR by 0.1 every 20 epochs)
# NOTE(review): the scheduler is not passed to train() below and
# scheduler.step() is not called in this cell, so unless it is stepped
# elsewhere the learning rate stays constant for the whole run.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

# Run training; checkpoints are written below ./checkpoints after each epoch.
train(
    num_epochs=num_epochs,
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    writer=writer,
    checkpoint_dir="./checkpoints",
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./dataset/cifar-100-python.tar.gz


  1%|          | 1343488/169001437 [00:29<3:08:32, 14820.61it/s]

In [None]:
# Load the test dataset (test=True makes data_loader return a single loader)
test_loader = data_loader(
    dataset_name=dataset_name, data_dir="./dataset", batch_size=batch_size, test=True
)

# Evaluate on the test dataset; eval returns (samples seen, correct predictions)
total, correct = eval(test_loader, model, device)

# Calculate accuracy as a percentage.
# NOTE: raises ZeroDivisionError if the test loader yields no samples.
accuracy = 100 * correct / total
print(f"Accuracy of the network on the {total} test images: {accuracy:.2f} %")

In [None]:
# Close the TensorBoard writer once all logging for this run is done.
writer.close()