In [1]:
import matplotlib.pyplot as plt
import numpy as np
import math
from torch import nn
import torchvision
import os
import torch
from torch.utils.data import DataLoader, Subset, random_split
from torchvision.datasets import CIFAR10
from torchvision import transforms

class CIFAR10Dataset:
    IMAGE_SIZE = 32
    MEAN = (0.4914, 0.4822, 0.4465)
    STD = (0.2470, 0.2435, 0.2616)

    def __init__(self, batch_size=64, num_workers=min(2, os.cpu_count()), train_size=None, test_size=None):
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.train_size = train_size
        self.test_size = test_size
        self.data_path = os.path.expanduser("~/.torchvision")

        self.class_labels = ('plane', 'car', 'bird', 'cat', 'deer',
                             'dog', 'frog', 'horse', 'ship', 'truck')

    def get_transforms(self, is_training=True):
        """Returns preprocessing transformations."""
        if is_training:
            return transforms.Compose([
                transforms.ToTensor(),
                transforms.Resize((self.IMAGE_SIZE, self.IMAGE_SIZE)),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomResizedCrop((self.IMAGE_SIZE, self.IMAGE_SIZE), scale=(0.8, 1.0),
                                             ratio=(0.75, 1.33), interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.Normalize(self.MEAN, self.STD)
            ])
        else:
            return transforms.Compose([
                transforms.ToTensor(),
                transforms.Resize((self.IMAGE_SIZE, self.IMAGE_SIZE)),
                transforms.Normalize(self.MEAN, self.STD)
            ])

    def load_dataset(self, is_training=True):
        """Loads CIFAR-10 dataset with transformations."""
        dataset = CIFAR10(root=self.data_path, train=is_training, download=True,
                          transform=self.get_transforms(is_training))

        if is_training and self.train_size is not None:
            indices = torch.randperm(len(dataset))[:self.train_size].tolist()
            dataset = Subset(dataset, indices)

        if not is_training and self.test_size is not None:
            indices = torch.randperm(len(dataset))[:self.test_size].tolist()
            dataset = Subset(dataset, indices)

        return dataset

    def get_dataloaders(self):
        """Returns train, validation, and test DataLoaders."""
        train_dataset = self.load_dataset(is_training=True)
        test_dataset = self.load_dataset(is_training=False)

        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

        train_size = int(0.8 * len(train_dataset))
        val_size = len(train_dataset) - train_size
        train_set, val_set = random_split(train_dataset, [train_size, val_size])

        train_loader = DataLoader(train_set, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)
        val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

        return train_loader, test_loader, val_loader, self.class_labels

# Example usage:
data_loader = CIFAR10Dataset(batch_size=128, train_size=5000, test_size=1000)
train_loader, test_loader, val_loader, class_labels = data_loader.get_dataloaders()

In [None]:
import torch
import torch.nn as nn
import math


class ViTConfig:
    def __init__(self, config_dict):
        for key, value in config_dict.items():
            setattr(self, key, value)

    def display_config(self):
        for key, value in self.__dict__.items():
            print(f"{key}: {value}")


class PatchEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.num_channels = config.num_channels
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.hidden_size = config.hidden_size
        
        self.projection = nn.Conv2d(
            self.num_channels,
            self.hidden_size,
            kernel_size=self.patch_size,
            stride=self.patch_size
        )

    def forward(self, x):
        x = self.projection(x)  # (batch_size, hidden_size, num_patches_w, num_patches_h)
        x = x.flatten(2)  # (batch_size, hidden_size, num_patches)
        x = x.transpose(1, 2)  # (batch_size, num_patches, hidden_size)
        return x


class LinearEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.patch_embeddings = PatchEmbeddings(config)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.position_embeddings = nn.Parameter(
            torch.zeros(1, self.patch_embeddings.num_patches + 1, config.hidden_size)
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        batch_size = x.shape[0]
        x = self.patch_embeddings(x)  
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  
        x = torch.cat((cls_tokens, x), dim=1) 
        x = x + self.position_embeddings
        x = self.dropout(x)
        return x
    
    
import torch
import torch.nn as nn
import math

class SingleAttentionHead(nn.Module):
    def __init__(self, model_dim, head_dim, dropout_rate, use_bias=True):
        super().__init__()
        self.model_dim = model_dim
        self.head_dim = head_dim
        self.query_layer = nn.Linear(model_dim, head_dim, bias=use_bias)
        self.key_layer = nn.Linear(model_dim, head_dim, bias=use_bias)
        self.value_layer = nn.Linear(model_dim, head_dim, bias=use_bias)
        self.attn_dropout_layer = nn.Dropout(dropout_rate)

    def forward(self, input_tensor):
        query_tensor = self.query_layer(input_tensor)
        key_tensor = self.key_layer(input_tensor)
        value_tensor = self.value_layer(input_tensor)
        attn_scores = torch.matmul(query_tensor, key_tensor.transpose(-1, -2)) / math.sqrt(self.head_dim)
        attn_probs = nn.functional.softmax(attn_scores, dim=-1)
        attn_probs = self.attn_dropout_layer(attn_probs)

        attn_output = torch.matmul(attn_probs, value_tensor)
        return attn_output, attn_probs


class MultiHeadedAttention(nn.Module):
    def __init__(self, model_config):
        super().__init__()
        self.num_heads = model_config.num_attention_heads
        self.head_dim = model_config.hidden_size // self.num_heads
        self.total_head_dim = self.num_heads * self.head_dim
        self.use_bias = model_config.qkv_bias
        # Single projection for QKV
        self.qkv_projection_layer = nn.Linear(model_config.hidden_size, self.total_head_dim * 3, bias=self.use_bias)
        self.attn_dropout_layer = nn.Dropout(model_config.attention_probs_dropout_prob)
        # Output projection
        self.output_projection_layer = nn.Linear(self.total_head_dim,model_config.hidden_size)
        self.output_dropout_layer = nn.Dropout(model_config.hidden_dropout_prob)

    def forward(self, input_tensor, return_attn_probs=False):
        qkv_tensor = self.qkv_projection_layer(input_tensor)
        query_tensor, key_tensor, value_tensor = torch.chunk(qkv_tensor, 3, dim=-1)

        # Reshape for multi-head attention
        batch_size, seq_length, _ = query_tensor.size()
        query_tensor = query_tensor.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        key_tensor = key_tensor.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        value_tensor = value_tensor.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # Compute scaled dot-product attention
        attn_scores = torch.matmul(query_tensor, key_tensor.transpose(-1, -2)) / math.sqrt(self.head_dim)
        attn_probs = nn.functional.softmax(attn_scores, dim=-1)
        attn_probs = self.attn_dropout_layer(attn_probs)
        attn_output = torch.matmul(attn_probs, value_tensor)

        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_length, self.total_head_dim)
        attn_output = self.output_projection_layer(attn_output)
        attn_output = self.output_dropout_layer(attn_output)
        return (attn_output, attn_probs) if return_attn_probs else (attn_output, None)


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.sqrt_2_over_pi = math.sqrt(2 / math.pi)

    def forward(self, x):
        x = self.fc1(x)
        x = self.gelu(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x
    
    def gelu(self,x):
        return 0.5 * x * (1 + torch.tanh(self.sqrt_2_over_pi * (x + 0.044715 * x ** 3)))


import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, model_config):
        super().__init__()
        self.attention_layer = MultiHeadedAttention(model_config)
        self.pre_norm_layer = nn.LayerNorm(model_config.hidden_size)
        self.post_norm_layer = nn.LayerNorm(model_config.hidden_size)
        self.feed_forward_layer = MLP(model_config)

    def forward(self, input_tensor, return_attn_probs=False):
        normalized_tensor = self.pre_norm_layer(input_tensor)
        attention_output = self.attention_layer(normalized_tensor, output_attentions=return_attn_probs)

        if return_attn_probs:
            attention_values, attention_maps = attention_output
        else:
            attention_values = attention_output[0]
        residual_tensor = input_tensor + attention_values
        normalized_tensor = self.post_norm_layer(residual_tensor)
        feed_forward_output = self.feed_forward_layer(normalized_tensor)
        final_output = residual_tensor + feed_forward_output
        return (final_output, attention_maps) if return_attn_probs else final_output


class TransformerEncoder(nn.Module):
    def __init__(self, model_config):
        super().__init__()
        self.encoder_layers = nn.ModuleList([TransformerBlock(model_config) for _ in range(model_config.num_hidden_layers)])
        
    def forward(self, input_tensor, return_attn_probs=False):
        attention_maps = [] if return_attn_probs else None
        for encoder_layer in self.encoder_layers:
            if return_attn_probs:
                input_tensor, attn_probs = encoder_layer(input_tensor, output_attentions=True)
                attention_maps.append(attn_probs)
            else:
                input_tensor = encoder_layer(input_tensor, output_attentions=False)
        return (input_tensor, attention_maps) if return_attn_probs else input_tensor


class VisionTransformer(nn.Module):
    def __init__(self, model_config):
        super().__init__()
        self.model_config = model_config
        self.embedding_layer = LinearEmbeddings(model_config)
        self.transformer_encoder = TransformerEncoder(model_config)
        self.output_layer = nn.Linear(model_config.hidden_size, model_config.num_classes)
        
        # Initialize weights
        self.apply(self._initialize_weights)

    def forward(self, input_tensor, return_attentions=False):
        embedded_features = self.embedding_layer(input_tensor)
        
        if return_attentions:
            encoded_representation, attention_maps = self.transformer_encoder(
                embedded_features, 
                output_attentions=return_attentions
            )
        else:
            encoded_representation = self.transformer_encoder(embedded_features)

        cls_token = encoded_representation[:, 0, :]
        model_output = self.output_layer(cls_token)
        
        if return_attentions:
            return model_output, attention_maps
        return model_output

    def _initialize_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, LinearEmbeddings):
            # Initialize classification token and position embeddings
            nn.init.trunc_normal_(module.cls_token, std=self.model_config.initializer_range)
            nn.init.trunc_normal_(module.position_embeddings, std=self.model_config.initializer_range)

In [None]:
from torch import nn, optim
import json, os
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.nn import functional as F
import torchvision
import torchvision.transforms as transforms


def save_experiment(experiment_name, config, model, train_losses, test_losses, accuracies, base_dir="experiments"):
    outdir = os.path.join(base_dir, experiment_name)
    os.makedirs(outdir, exist_ok=True)

    # Save the config
    configfile = os.path.join(outdir, 'config.json')
    with open(configfile, 'w') as f:
        json.dump(config, f, sort_keys=True, indent=4)

    # Save the metrics
    jsonfile = os.path.join(outdir, 'metrics.json')
    with open(jsonfile, 'w') as f:
        data = {
            'train_losses': train_losses,
            'test_losses': test_losses,
            'accuracies': accuracies,
        }
        json.dump(data, f, sort_keys=True, indent=4)

    # Save the model
    save_checkpoint(experiment_name, model, "final", base_dir=base_dir)


def save_checkpoint(experiment_name, model, epoch, base_dir="experiments"):
    outdir = os.path.join(base_dir, experiment_name)
    os.makedirs(outdir, exist_ok=True)
    cpfile = os.path.join(outdir, f'model_{epoch}.pt')
    torch.save(model.state_dict(), cpfile)


def load_experiment(experiment_name, checkpoint_name="model_final.pt", base_dir="experiments"):
    outdir = os.path.join(base_dir, experiment_name)
    # Load the config
    configfile = os.path.join(outdir, 'config.json')
    with open(configfile, 'r') as f:
        config = json.load(f)
    # Load the metrics
    jsonfile = os.path.join(outdir, 'metrics.json')
    with open(jsonfile, 'r') as f:
        data = json.load(f)
    train_losses = data['train_losses']
    test_losses = data['test_losses']
    accuracies = data['accuracies']
    # Load the model
    model = ViTForClassfication(config)
    cpfile = os.path.join(outdir, checkpoint_name)
    model.load_state_dict(torch.load(cpfile))
    return config, model, train_losses, test_losses, accuracies


def visualize_images():
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True)
    classes = ('plane', 'car', 'bird', 'cat',
            'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
    # Pick 30 samples randomly
    indices = torch.randperm(len(trainset))[:30]
    images = [np.asarray(trainset[i][0]) for i in indices]
    labels = [trainset[i][1] for i in indices]
    # Visualize the images using matplotlib
    fig = plt.figure(figsize=(10, 10))
    for i in range(30):
        ax = fig.add_subplot(6, 5, i+1, xticks=[], yticks=[])
        ax.imshow(images[i])
        ax.set_title(classes[labels[i]])

import torch
from torch.utils.tensorboard import SummaryWriter
import os
import time
from datetime import datetime


class Trainer:
    """
    Enhanced trainer class for Vision Transformer with additional features:
    - TensorBoard logging
    - Model checkpointing
    - Learning rate scheduling
    - Mixed precision training
    - Gradient accumulation
    """
    
    def __init__(self, model, optimizer, loss_fn, exp_name, device, config=None):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.exp_name = exp_name
        self.device = device
        self.config = config
        
        # Create experiment directory
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.exp_dir = os.path.join("experiments", f"{exp_name}_{self.timestamp}")
        os.makedirs(self.exp_dir, exist_ok=True)
        
        # Initialize TensorBoard writer
        self.writer = SummaryWriter(log_dir=os.path.join(self.exp_dir, "logs"))
        
        # Training metrics
        self.best_accuracy = 0.0
        self.train_losses = []
        self.test_losses = []
        self.accuracies = []
        
        # Automatic mixed precision
        self.scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))

    def train(self, trainloader, testloader, epochs, 
              save_model_every_n_epochs=0, grad_accum_steps=1,
              scheduler=None, early_stopping_patience=None):
        """
        Train the model for the specified number of epochs.
        
        Args:
            trainloader: DataLoader for training data
            testloader: DataLoader for validation data
            epochs: Number of training epochs
            save_model_every_n_epochs: Save checkpoint every n epochs (0 to disable)
            grad_accum_steps: Number of gradient accumulation steps
            scheduler: Learning rate scheduler
            early_stopping_patience: Stop training if validation accuracy doesn't improve for n epochs
        """
        early_stop_counter = 0
        
        for epoch in range(1, epochs + 1):
            start_time = time.time()
            
            # Train for one epoch
            train_loss = self.train_epoch(trainloader, grad_accum_steps)
            
            # Evaluate on validation set
            accuracy, test_loss = self.evaluate(testloader)
            
            # Update learning rate scheduler if provided
            if scheduler:
                if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(test_loss)
                else:
                    scheduler.step()
            
            # Record metrics
            self.train_losses.append(train_loss)
            self.test_losses.append(test_loss)
            self.accuracies.append(accuracy)
            
            # Log to TensorBoard
            self.writer.add_scalar('Loss/train', train_loss, epoch)
            self.writer.add_scalar('Loss/test', test_loss, epoch)
            self.writer.add_scalar('Accuracy/test', accuracy, epoch)
            
            # Print epoch summary
            epoch_time = time.time() - start_time
            lr = self.optimizer.param_groups[0]['lr']
            print(f"Epoch: {epoch}/{epochs} | Time: {epoch_time:.2f}s | LR: {lr:.2e} | "
                  f"Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f} | "
                  f"Accuracy: {accuracy:.4f}")
            
            # Save checkpoint
            if save_model_every_n_epochs > 0 and epoch % save_model_every_n_epochs == 0:
                self._save_checkpoint(epoch, is_best=False)
            
            # Save best model
            if accuracy > self.best_accuracy:
                self.best_accuracy = accuracy
                self._save_checkpoint(epoch, is_best=True)
                early_stop_counter = 0
            else:
                early_stop_counter += 1
            
            # Early stopping
            if early_stopping_patience and early_stop_counter >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch} as accuracy didn't improve for {early_stopping_patience} epochs")
                break
        
        # Final save
        self._save_checkpoint(epochs, is_best=False)
        self._save_experiment()
        self.writer.close()

    def train_epoch(self, trainloader, grad_accum_steps=1):
        """
        Train the model for one epoch with optional gradient accumulation.
        """
        self.model.train()
        total_loss = 0.0
        total_samples = 0
        
        for step, batch in enumerate(trainloader, 1):
            # Move batch to device
            images, labels = batch
            images, labels = images.to(self.device), labels.to(self.device)
            
            # Forward pass with mixed precision
            with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):
                outputs = self.model(images)
                loss = self.loss_fn(outputs, labels) / grad_accum_steps
            
            # Backward pass with gradient scaling
            self.scaler.scale(loss).backward()
            
            # Gradient accumulation
            if step % grad_accum_steps == 0:
                # Update weights
                self.scaler.step(self.optimizer)
                self.scaler.update()
                self.optimizer.zero_grad()
            
            # Update metrics
            batch_size = images.size(0)
            total_loss += loss.item() * batch_size * grad_accum_steps
            total_samples += batch_size
        
        return total_loss / total_samples

    @torch.no_grad()
    def evaluate(self, testloader):
        """
        Evaluate the model on the test/validation set.
        """
        self.model.eval()
        total_loss = 0.0
        correct = 0
        total_samples = 0
        
        for batch in testloader:
            images, labels = batch
            images, labels = images.to(self.device), labels.to(self.device)
            
            # Forward pass
            with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):
                outputs = self.model(images)
                loss = self.loss_fn(outputs, labels)
            
            # Update metrics
            batch_size = images.size(0)
            total_loss += loss.item() * batch_size
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total_samples += batch_size
        
        accuracy = correct / total_samples
        avg_loss = total_loss / total_samples
        return accuracy, avg_loss

    def _save_checkpoint(self, epoch, is_best=False):
        """Save model checkpoint."""
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'train_losses': self.train_losses,
            'test_losses': self.test_losses,
            'accuracies': self.accuracies,
            'best_accuracy': self.best_accuracy,
            'config': self.config
        }
        
        if is_best:
            filename = os.path.join(self.exp_dir, "best_model.pth")
        else:
            filename = os.path.join(self.exp_dir, f"checkpoint_epoch_{epoch}.pth")
        
        torch.save(checkpoint, filename)
        print(f"Saved {'best ' if is_best else ''}checkpoint to {filename}")

    def _save_experiment(self):
        """Save experiment results and configuration."""
        experiment = {
            'config': self.config,
            'train_losses': self.train_losses,
            'test_losses': self.test_losses,
            'accuracies': self.accuracies,
            'best_accuracy': self.best_accuracy,
            'timestamp': self.timestamp
        }
        
        filename = os.path.join(self.exp_dir, "experiment_results.pth")
        torch.save(experiment, filename)
        print(f"Saved experiment results to {filename}")



In [14]:
config = {
    # Architecture (Lightweight)
    "patch_size": 8,           # 32x32 → 4x4 patches (16 total) - reduces sequence length
    "hidden_size": 96,         # Smaller than GPU config (originally 128)
    "num_hidden_layers": 4,    # Reduced from 6 for faster CPU training
    "num_attention_heads": 3,  # Must divide hidden_size (96/3=32)
    "intermediate_size": 384,  # 4*hidden_size

      
    # Regularization (Critical for CPU training)
    "hidden_dropout_prob": 0.05,
    "attention_probs_dropout_prob": 0.05,
    
    # Training
    "initializer_range": 0.02,
    "image_size": 32,
    "num_classes": 10,
    "num_channels": 3,
    "qkv_bias": True
}

from torch.optim.lr_scheduler import SequentialLR,LinearLR,CosineAnnealingLR
# Create model
model = ViTClassification(ViTConfig(config))

# Create optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

# Create trainer
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    exp_name="vit_cifar10_896",
    device="cuda" if torch.cuda.is_available() else "cpu",
    config=config
)

# warmup_epochs = 10
# scheduler = SequentialLR(
#     optimizer,
#     schedulers=[
#         LinearLR(optimizer, start_factor=0.01, total_iters=warmup_epochs),
#         CosineAnnealingLR(optimizer, T_max=90, eta_min=1e-5)  # T_max = epochs - warmup
#     ],
#     milestones=[warmup_epochs]
# )

# Train the model
trainer.train(
    trainloader=train_loader,
    testloader=test_loader,
    epochs=50,
    save_model_every_n_epochs=10,
    grad_accum_steps=2,
    scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50),
    early_stopping_patience=8
)

  self.scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
  with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):
  with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):


Epoch: 1/50 | Time: 33.83s | LR: 3.00e-04 | Train Loss: 2.2097 | Test Loss: 2.1430 | Accuracy: 0.1800
Saved best checkpoint to experiments\vit_cifar10_896_20250402_202045\best_model.pth
Epoch: 2/50 | Time: 35.60s | LR: 2.99e-04 | Train Loss: 2.0728 | Test Loss: 2.0547 | Accuracy: 0.2500
Saved best checkpoint to experiments\vit_cifar10_896_20250402_202045\best_model.pth
Epoch: 3/50 | Time: 30.65s | LR: 2.97e-04 | Train Loss: 2.0078 | Test Loss: 1.9927 | Accuracy: 0.2700
Saved best checkpoint to experiments\vit_cifar10_896_20250402_202045\best_model.pth
Epoch: 4/50 | Time: 33.87s | LR: 2.95e-04 | Train Loss: 1.9481 | Test Loss: 1.9755 | Accuracy: 0.2670
Epoch: 5/50 | Time: 34.03s | LR: 2.93e-04 | Train Loss: 1.9221 | Test Loss: 1.9043 | Accuracy: 0.3150
Saved best checkpoint to experiments\vit_cifar10_896_20250402_202045\best_model.pth
Epoch: 6/50 | Time: 33.69s | LR: 2.89e-04 | Train Loss: 1.8734 | Test Loss: 1.8901 | Accuracy: 0.3320
Saved best checkpoint to experiments\vit_cifar10_896

In [None]:

config = {
    # Architecture (Lightweight)
    "patch_size": 8,           # 32x32 → 4x4 patches (16 total) - reduces sequence length
    "hidden_size": 96,         # Smaller than GPU config (originally 128)
    "num_hidden_layers": 4,    # Reduced from 6 for faster CPU training
    "num_attention_heads": 3,  # Must divide hidden_size (96/3=32)
    "intermediate_size": 384,  # 4*hidden_size

      
    # Regularization (Critical for CPU training)
    "hidden_dropout_prob": 0.0,
    "attention_probs_dropout_prob": 0.0,
    
    # Training
    "initializer_range": 0.02,
    "image_size": 32,
    "num_classes": 10,
    "num_channels": 3,
    "qkv_bias": True
}

from torch.optim.lr_scheduler import SequentialLR,LinearLR,CosineAnnealingLR
# Create model
model = ViTClassification(ViTConfig(config))
#print(torchinfo.summary(model, (1, 3, 32, 32)))
# Create optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01,betas=(0.9,0.999))
loss_fn = torch.nn.CrossEntropyLoss()

# Create trainer
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    exp_name="vit_cifar10_896",
    device="cuda" if torch.cuda.is_available() else "cpu",
    config=config
)

# warmup_epochs = 10
# scheduler = SequentialLR(
#     optimizer,
#     schedulers=[
#         LinearLR(optimizer, start_factor=0.01, total_iters=warmup_epochs),
#         CosineAnnealingLR(optimizer, T_max=90, eta_min=1e-5)  # T_max = epochs - warmup
#     ],
#     milestones=[warmup_epochs]
# )

# Train the model
trainer.train(
    trainloader=train_loader,
    testloader=test_loader,
    epochs=100,
    save_model_every_n_epochs=10,
    grad_accum_steps=2,
    scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50),
    early_stopping_patience=8
)

  self.scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
  with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):
  with torch.cuda.amp.autocast(enabled=(self.device == 'cuda')):


Epoch: 1/100 | Time: 29.33s | LR: 3.00e-04 | Train Loss: 2.1129 | Test Loss: 2.0503 | Accuracy: 0.2490
Saved best checkpoint to experiments\vit_cifar10_896_20250402_194930\best_model.pth
Epoch: 2/100 | Time: 25.08s | LR: 2.99e-04 | Train Loss: 2.0216 | Test Loss: 1.9860 | Accuracy: 0.2970
Saved best checkpoint to experiments\vit_cifar10_896_20250402_194930\best_model.pth
Epoch: 3/100 | Time: 30.01s | LR: 2.97e-04 | Train Loss: 1.9616 | Test Loss: 1.9251 | Accuracy: 0.3350
Saved best checkpoint to experiments\vit_cifar10_896_20250402_194930\best_model.pth
Epoch: 4/100 | Time: 30.38s | LR: 2.95e-04 | Train Loss: 1.9213 | Test Loss: 1.9029 | Accuracy: 0.3230
Epoch: 5/100 | Time: 29.33s | LR: 2.93e-04 | Train Loss: 1.8665 | Test Loss: 1.8450 | Accuracy: 0.3590
Saved best checkpoint to experiments\vit_cifar10_896_20250402_194930\best_model.pth
Epoch: 6/100 | Time: 24.62s | LR: 2.89e-04 | Train Loss: 1.8250 | Test Loss: 1.8232 | Accuracy: 0.3570
Epoch: 7/100 | Time: 26.91s | LR: 2.86e-04 | T