In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from typing import List, Tuple, Optional
from dataclasses import dataclass
import math

In [2]:
# --------------------------------------------------------------------------------
# 1. Custom Layer Implementations
# --------------------------------------------------------------------------------
class CustomReLU:
    """Custom ReLU activation"""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.maximum(x, torch.tensor(0.))

In [3]:
class CustomConv2d:
    def __init__(self, in_channels: int, out_channels: int, kernel_size: int,
                 stride: int = 1, padding: int = 0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.device = torch.device('cpu')

        # Initialize weights and bias using Kaiming initialization
        # k = 1 / (in_channels * kernel_size * kernel_size)
        # self.weight = torch.randn(out_channels, in_channels, kernel_size, kernel_size) * math.sqrt(2./k)
        fan_in = in_channels * kernel_size * kernel_size
        bound = math.sqrt(2. / fan_in)
        self.weight = torch.randn(out_channels, in_channels, kernel_size, kernel_size) * bound
        self.weight.requires_grad = True
        self.bias = torch.zeros(out_channels)
        self.bias.requires_grad = True

    def to(self, device):
        self.device = device
        self.weight = self.weight.to(device)
        self.bias = self.bias.to(device)
        return self

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Ensure input and weights are on the same device
        if x.device != self.weight.device:
            x = x.to(self.weight.device)

        batch_size, in_channels, height, width = x.shape

        # Add padding if needed
        if self.padding > 0:
            x = torch.nn.functional.pad(x, (self.padding, self.padding,
                                          self.padding, self.padding))

        # Calculate output dimensions
        out_height = (height + 2 * self.padding - self.kernel_size) // self.stride + 1
        out_width = (width + 2 * self.padding - self.kernel_size) // self.stride + 1

        # Extract patches
        patches = x.unfold(2, self.kernel_size, self.stride)  # unfold height
        patches = patches.unfold(3, self.kernel_size, self.stride)  # unfold width

        # Reshape patches to [batch_size, out_height * out_width, in_channels * kernel_size * kernel_size]
        patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
        patches = patches.reshape(batch_size, out_height * out_width, -1)

        # Reshape weight to [out_channels, in_channels * kernel_size * kernel_size]
        weight = self.weight.reshape(self.out_channels, -1)

        # Perform convolution using batch matrix multiplication
        output = torch.matmul(patches, weight.t())  # [batch_size, out_height * out_width, out_channels]

        # Reshape output and add bias
        output = output.reshape(batch_size, out_height, out_width, self.out_channels)
        output = output.permute(0, 3, 1, 2).contiguous()  # [batch_size, out_channels, out_height, out_width]
        output = output + self.bias.view(1, -1, 1, 1)

        return output

class CustomMaxPool2d:
    def __init__(self, kernel_size: int, stride: Optional[int] = None):
        self.kernel_size = kernel_size
        self.stride = stride if stride is not None else kernel_size
        self.device = torch.device('cpu')

    def to(self, device):
        self.device = device
        return self

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Extract patches
        patches = x.unfold(2, self.kernel_size, self.stride)  # unfold height
        patches = patches.unfold(3, self.kernel_size, self.stride)  # unfold width

        # Get max values
        pooled = patches.max(dim=4)[0].max(dim=4)[0]
        return pooled

In [4]:
class CustomLinear:
    def __init__(self, in_features: int, out_features: int):
        self.in_features = in_features
        self.out_features = out_features
        self.device = torch.device('cpu')

        # Initialize weights and bias
        k = 1 / in_features
        self.weight = torch.randn(out_features, in_features) * math.sqrt(k)
        self.weight.requires_grad = True
        self.bias = torch.zeros(out_features)
        self.bias.requires_grad = True

    def to(self, device):
        self.device = device
        self.weight = self.weight.to(device)
        self.bias = self.bias.to(device)
        return self

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Ensure input and weights are on the same device
        if x.device != self.weight.device:
            x = x.to(self.weight.device)
        return torch.matmul(x, self.weight.t()) + self.bias

In [5]:
# --------------------------------------------------------------------------------
# 2. Custom CNN Model
# --------------------------------------------------------------------------------

class CustomCNN:
    """Custom CNN implementation for MNIST"""

    def __init__(self):
        # Define layers with better architecture
        self.conv1 = CustomConv2d(1, 16, kernel_size=3, padding=1)  # Changed to 16 filters
        self.conv2 = CustomConv2d(16, 32, kernel_size=3, padding=1)  # Changed to 32 filters
        self.conv3 = CustomConv2d(32, 64, kernel_size=3, padding=1)  # Added third conv layer
        self.pool = CustomMaxPool2d(kernel_size=2)
        self.relu = CustomReLU()

        # Calculate input size for first FC layer
        self.fc1 = CustomLinear(64 * 3 * 3, 128)  # Adjusted size due to three pooling layers
        self.fc2 = CustomLinear(128, 10)
        self.training = True

    def to(self, device):
        """Move model to specified device"""
        # Move weights and biases to device
        for attr in self.__dict__.values():
            if hasattr(attr, 'weight'):
                attr.weight = attr.weight.to(device)
            if hasattr(attr, 'bias'):
                attr.bias = attr.bias.to(device)
        return self

    def train(self):
        """Set the model to training mode"""
        self.training = True

    def eval(self):
        """Set the model to evaluation mode"""
        self.training = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # First conv block
        x = self.conv1.forward(x)
        x = self.relu.forward(x)
        x = self.pool.forward(x)  # 28x28 -> 14x14

        # Second conv block
        x = self.conv2.forward(x)
        x = self.relu.forward(x)
        x = self.pool.forward(x)  # 14x14 -> 7x7

        # Third conv block
        x = self.conv3.forward(x)
        x = self.relu.forward(x)
        x = self.pool.forward(x)  # 7x7 -> 3x3

        # Flatten and fully connected layers
        x = x.reshape(x.size(0), -1)  # Flatten
        x = self.fc1.forward(x)
        x = self.relu.forward(x)
        x = self.fc2.forward(x)

        return x

In [6]:
# --------------------------------------------------------------------------------
# 3. Custom Transformer Components
# --------------------------------------------------------------------------------
class CustomLayerNorm:
    """Custom Layer Normalization implementation"""

    def __init__(self, normalized_shape: int, eps: float = 1e-5):
        self.normalized_shape = normalized_shape
        self.eps = eps

        # Learnable parameters
        self.weight = torch.ones(normalized_shape)  # gamma
        self.bias = torch.zeros(normalized_shape)   # beta
        self.weight.requires_grad = True
        self.bias.requires_grad = True
        self.device = torch.device('cpu')

    def to(self, device):
        """Move layer to device"""
        self.device = device
        self.weight = self.weight.to(device)
        self.bias = self.bias.to(device)
        return self

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Make the class callable, equivalent to forward"""
        return self.forward(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input tensor of shape [..., normalized_shape]
        Returns:
            Normalized tensor of the same shape
        """
        if x.device != self.weight.device:
            x = x.to(self.weight.device)

        # Calculate mean and variance along the last dimension
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        # Normalize
        x_norm = (x - mean) / torch.sqrt(var + self.eps)

        # Scale and shift
        return self.weight * x_norm + self.bias

In [7]:
class CustomTransformerEncoder:
    """Custom Transformer Encoder implementation"""

    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int):
        self.attention = CustomMultiHeadAttention(embed_dim, num_heads)
        self.ff1 = CustomLinear(embed_dim, ff_dim)
        self.ff2 = CustomLinear(ff_dim, embed_dim)
        self.relu = CustomReLU()
        self.training = True

        # Replace PyTorch LayerNorm with custom implementation
        self.norm1 = CustomLayerNorm(embed_dim)
        self.norm2 = CustomLayerNorm(embed_dim)

    def train(self):
        """Set encoder to training mode"""
        self.training = True
        if hasattr(self.attention, 'train'):
            self.attention.train()

    def eval(self):
        """Set encoder to evaluation mode"""
        self.training = False
        if hasattr(self.attention, 'eval'):
            self.attention.eval()

    def to(self, device):
        """Move encoder to specified device"""
        self.attention = self.attention.to(device)
        self.ff1 = self.ff1.to(device)
        self.ff2 = self.ff2.to(device)
        self.norm1 = self.norm1.to(device)
        self.norm2 = self.norm2.to(device)
        return self

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of transformer encoder
        Args:
            x: Input tensor of shape [B, seq_len, embed_dim]
        Returns:
            Output tensor of shape [B, seq_len, embed_dim]
        """
        # Self-attention block
        residual = x
        x = self.norm1(x)
        x = self.attention.forward(x)
        x = x + residual  # Residual connection

        # Feed-forward block
        residual = x
        x = self.norm2(x)
        x = self.ff1.forward(x)
        x = self.relu.forward(x)
        x = self.ff2.forward(x)
        x = x + residual  # Residual connection

        return x

In [8]:
class CustomMultiHeadAttention:
    """Custom Multi-Head Attention implementation"""

    def __init__(self, embed_dim: int, num_heads: int):
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Initialize Q, K, V projections
        self.q_proj = CustomLinear(embed_dim, embed_dim)
        self.k_proj = CustomLinear(embed_dim, embed_dim)
        self.v_proj = CustomLinear(embed_dim, embed_dim)
        self.out_proj = CustomLinear(embed_dim, embed_dim)
        self.training = True

    def to(self, device):
        """Move attention module to device"""
        # Move all projection layers to device
        self.q_proj = self.q_proj.to(device)
        self.k_proj = self.k_proj.to(device)
        self.v_proj = self.v_proj.to(device)
        self.out_proj = self.out_proj.to(device)
        return self

    def train(self):
        """Set attention to training mode"""
        self.training = True

    def eval(self):
        """Set attention to evaluation mode"""
        self.training = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, _ = x.shape

        # Project to Q, K, V
        q = self.q_proj.forward(x)
        k = self.k_proj.forward(x)
        v = self.v_proj.forward(x)

        # Reshape for multi-head attention
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim)

        # Transpose for attention computation
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # Compute attention scores
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attn = torch.softmax(scores, dim=-1)

        # Apply attention to values
        out = torch.matmul(attn, v)

        # Reshape and project output
        out = out.transpose(1, 2).contiguous()
        out = out.view(batch_size, seq_len, self.embed_dim)
        out = self.out_proj.forward(out)

        return out

In [9]:
# --------------------------------------------------------------------------------
# 4. Custom Transformer Model for MNIST
# --------------------------------------------------------------------------------
class CustomTransformerMNIST:
    """Custom Transformer implementation for MNIST"""

    def __init__(self, patch_size: int = 7):
        self.patch_size = patch_size
        self.num_patches = (28 // patch_size) ** 2  # For MNIST images (28x28)
        self.patch_dim = patch_size * patch_size

        # Embedding layers
        self.patch_embed = CustomLinear(self.patch_dim, 256)
        self.pos_embed = torch.randn(1, self.num_patches, 256)

        # Transformer layers
        self.encoder1 = CustomTransformerEncoder(256, 8, 512)
        self.encoder2 = CustomTransformerEncoder(256, 8, 512)

        # Classification head
        self.classifier = CustomLinear(256, 10)
        self.training = True

    def to_patches(self, x: torch.Tensor) -> torch.Tensor:
        """Convert images to patches
        Args:
            x: Input tensor of shape [B, C, H, W]
        Returns:
            Tensor of shape [B, num_patches, patch_dim]
        """
        B, C, H, W = x.shape
        assert H == W == 28, f"Expected 28x28 images, got {H}x{W}"

        # Unfold into patches
        patches = x.unfold(2, self.patch_size, self.patch_size)  # Unfold H
        patches = patches.unfold(3, self.patch_size, self.patch_size)  # Unfold W

        # Reshape to [B, num_patches, patch_dim]
        patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
        patches = patches.view(B, self.num_patches, -1)

        return patches

    def train(self):
        """Set model to training mode"""
        self.training = True
        if hasattr(self.encoder1, 'train'):
            self.encoder1.train()
        if hasattr(self.encoder2, 'train'):
            self.encoder2.train()

    def eval(self):
        """Set model to evaluation mode"""
        self.training = False
        if hasattr(self.encoder1, 'eval'):
            self.encoder1.eval()
        if hasattr(self.encoder2, 'eval'):
            self.encoder2.eval()

    def to(self, device):
        """Move model to specified device"""
        # Move weights, biases and embeddings to device
        self.pos_embed = self.pos_embed.to(device)
        self.patch_embed = self.patch_embed.to(device)

        # Move transformer encoders
        self.encoder1 = self.encoder1.to(device)
        self.encoder2 = self.encoder2.to(device)

        # Move classifier
        self.classifier = self.classifier.to(device)

        return self

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Convert to patches and embed
        x = self.to_patches(x)  # [B, num_patches, patch_dim]
        x = self.patch_embed.forward(x)  # [B, num_patches, embed_dim]

        # Add positional embeddings
        x = x + self.pos_embed

        # Apply transformer layers
        x = self.encoder1.forward(x)
        x = self.encoder2.forward(x)

        # Global average pooling over patches
        x = x.mean(dim=1)  # [B, embed_dim]

        # Classification
        x = self.classifier.forward(x)  # [B, num_classes]

        return x

In [10]:
# --------------------------------------------------------------------------------
# 5. Training Functions
# --------------------------------------------------------------------------------

from tqdm import tqdm
import time

def train_epoch(model, train_loader, optimizer, criterion, epoch, device):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    start_time = time.time()

    # Create progress bar
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader),
                       desc=f'Epoch {epoch}', ncols=100)

    for batch_idx, (data, target) in progress_bar:
        # Move data to device
        data, target = data.to(device), target.to(device)

        # Forward pass
        output = model.forward(data)
        loss = criterion(output, target)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Statistics
        total_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        batch_correct = pred.eq(target.view_as(pred)).sum().item()
        correct += batch_correct
        total += target.size(0)

        # Update progress bar
        avg_loss = total_loss / (batch_idx + 1)
        accuracy = 100. * correct / total
        progress_bar.set_postfix({
            'loss': f'{avg_loss:.4f}',
            'acc': f'{accuracy:.2f}%'
        })
        progress_bar.update()

    progress_bar.close()

    # Final epoch statistics
    epoch_time = time.time() - start_time
    avg_loss = total_loss / len(train_loader)
    accuracy = 100. * correct / total

    print(f'\nEpoch {epoch} Summary:')
    print(f'Training Loss: {avg_loss:.4f}')
    print(f'Training Accuracy: {accuracy:.2f}% ({correct}/{total})')
    print(f'Time: {epoch_time:.2f}s')

    return avg_loss, accuracy

def evaluate(model, test_loader, criterion, epoch, device):
    """Evaluate the model"""
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    start_time = time.time()

    # Create progress bar for evaluation
    progress_bar = tqdm(enumerate(test_loader), total=len(test_loader),
                       desc='Evaluation', ncols=100)

    with torch.no_grad():
        for batch_idx, (data, target) in progress_bar:
            # Move data to device
            data, target = data.to(device), target.to(device)

            output = model.forward(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            batch_correct = pred.eq(target.view_as(pred)).sum().item()
            correct += batch_correct
            total += target.size(0)

            # Update progress bar
            avg_loss = test_loss / (batch_idx + 1)
            accuracy = 100. * correct / total
            progress_bar.set_postfix({
                'loss': f'{avg_loss:.4f}',
                'acc': f'{accuracy:.2f}%'
            })
            progress_bar.update()

    progress_bar.close()

    # Final evaluation statistics
    eval_time = time.time() - start_time
    avg_loss = test_loss / len(test_loader)
    accuracy = 100. * correct / total

    print(f'\nEvaluation Summary:')
    print(f'Test Loss: {avg_loss:.4f}')
    print(f'Test Accuracy: {accuracy:.2f}% ({correct}/{total})')
    print(f'Time: {eval_time:.2f}s')

    return avg_loss, accuracy



In [11]:
def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\nUsing device: {device}")

    # Load MNIST dataset
    print("\nLoading MNIST dataset...")
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    train_dataset = datasets.MNIST('./data', train=True, download=True,
                                 transform=transform)
    test_dataset = datasets.MNIST('./data', train=False, transform=transform)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=32,  # Reduced batch size
        shuffle=True,
        num_workers=0,  # No multiprocessing for debugging
        pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=32,
        num_workers=0,
        pin_memory=True
    )

    # Initialize models and move to device
    print("\nInitializing models...")
    cnn_model = CustomCNN()
    transformer_model = CustomTransformerMNIST()


    # Get trainable parameters AFTER moving to device
    def get_trainable_params(model):
        params = []
        for attr in model.__dict__.values():
            if hasattr(attr, 'weight') and hasattr(attr, 'bias'):
                params.extend([attr.weight, attr.bias])
            elif isinstance(attr, (CustomTransformerEncoder, CustomMultiHeadAttention)):
                params.extend(get_trainable_params(attr))
        return [p for p in params if p.requires_grad]

    # Get parameters
    cnn_params = get_trainable_params(cnn_model)
    transformer_params = get_trainable_params(transformer_model)

    # Move models to device BEFORE creating optimizers
    cnn_model = cnn_model.to(device)
    transformer_model = transformer_model.to(device)

    # Create optimizers
    criterion = torch.nn.CrossEntropyLoss()
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=0.001, betas=(0.9, 0.999))
    transformer_optimizer = torch.optim.Adam(transformer_params, lr=0.001)

    # Training history
    history = {
        'cnn': {'train_loss': [], 'train_acc': [], 'test_loss': [], 'test_acc': []},
        'transformer': {'train_loss': [], 'train_acc': [], 'test_loss': [], 'test_acc': []}
    }

    # Train CNN
    print("\n" + "="*60)
    print("Training CNN Model")
    print("="*60)

    for epoch in range(1, 6):
        print(f"\n{'-'*20} Epoch {epoch}/5 {'-'*20}")
        train_loss, train_acc = train_epoch(cnn_model, train_loader, cnn_optimizer, criterion, epoch, device)
        test_loss, test_acc = evaluate(cnn_model, test_loader, criterion, epoch, device)

        # Save history
        history['cnn']['train_loss'].append(train_loss)
        history['cnn']['train_acc'].append(train_acc)
        history['cnn']['test_loss'].append(test_loss)
        history['cnn']['test_acc'].append(test_acc)

    # Train Transformer
    print("\n" + "="*60)
    print("Training Transformer Model")
    print("="*60)

    for epoch in range(1, 6):
        print(f"\n{'-'*20} Epoch {epoch}/5 {'-'*20}")
        train_loss, train_acc = train_epoch(transformer_model, train_loader,
                                          transformer_optimizer, criterion, epoch, device)
        test_loss, test_acc = evaluate(transformer_model, test_loader, criterion, epoch, device)

        # Save history
        history['transformer']['train_loss'].append(train_loss)
        history['transformer']['train_acc'].append(train_acc)
        history['transformer']['test_loss'].append(test_loss)
        history['transformer']['test_acc'].append(test_acc)

    # Plot training history
    plt.figure(figsize=(15, 5))

    # Plot loss
    plt.subplot(1, 2, 1)
    plt.plot(history['cnn']['train_loss'], label='CNN Train')
    plt.plot(history['cnn']['test_loss'], label='CNN Test')
    plt.plot(history['transformer']['train_loss'], label='Transformer Train')
    plt.plot(history['transformer']['test_loss'], label='Transformer Test')
    plt.title('Loss History')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Plot accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history['cnn']['train_acc'], label='CNN Train')
    plt.plot(history['cnn']['test_acc'], label='CNN Test')
    plt.plot(history['transformer']['train_acc'], label='Transformer Train')
    plt.plot(history['transformer']['test_acc'], label='Transformer Test')
    plt.title('Accuracy History')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.legend()

    plt.tight_layout()
    plt.savefig('training_history.png')
    plt.close()

    print("\nTraining complete! Results saved to 'training_history.png'")

In [12]:
if __name__ == '__main__':
    main()


Using device: cuda

Loading MNIST dataset...
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 16.1MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 494kB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 3.84MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [Errno 110] Connection timed out>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 9.97MB/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw


Initializing models...

Training CNN Model

-------------------- Epoch 1/5 --------------------


Epoch 1: 100%|██████████████████████████| 1875/1875 [00:33<00:00, 55.36it/s, loss=3.4099, acc=9.81%]



Epoch 1 Summary:
Training Loss: 3.4099
Training Accuracy: 9.81% (5884/60000)
Time: 33.88s


Evaluation: 100%|████████████████████████| 313/313 [00:04<00:00, 77.96it/s, loss=3.4028, acc=10.22%]



Evaluation Summary:
Test Loss: 3.4028
Test Accuracy: 10.22% (1022/10000)
Time: 4.02s

-------------------- Epoch 2/5 --------------------


Epoch 2: 100%|██████████████████████████| 1875/1875 [00:32<00:00, 58.29it/s, loss=3.4099, acc=9.81%]



Epoch 2 Summary:
Training Loss: 3.4099
Training Accuracy: 9.81% (5884/60000)
Time: 32.18s


Evaluation: 100%|████████████████████████| 313/313 [00:04<00:00, 68.67it/s, loss=3.4028, acc=10.22%]



Evaluation Summary:
Test Loss: 3.4028
Test Accuracy: 10.22% (1022/10000)
Time: 4.57s

-------------------- Epoch 3/5 --------------------


Epoch 3: 100%|██████████████████████████| 1875/1875 [00:32<00:00, 58.46it/s, loss=3.4099, acc=9.81%]



Epoch 3 Summary:
Training Loss: 3.4099
Training Accuracy: 9.81% (5884/60000)
Time: 32.08s


Evaluation: 100%|████████████████████████| 313/313 [00:04<00:00, 71.11it/s, loss=3.4028, acc=10.22%]



Evaluation Summary:
Test Loss: 3.4028
Test Accuracy: 10.22% (1022/10000)
Time: 4.41s

-------------------- Epoch 4/5 --------------------


Epoch 4: 100%|██████████████████████████| 1875/1875 [00:31<00:00, 59.00it/s, loss=3.4099, acc=9.81%]



Epoch 4 Summary:
Training Loss: 3.4099
Training Accuracy: 9.81% (5884/60000)
Time: 31.78s


Evaluation: 100%|████████████████████████| 313/313 [00:03<00:00, 83.25it/s, loss=3.4028, acc=10.22%]



Evaluation Summary:
Test Loss: 3.4028
Test Accuracy: 10.22% (1022/10000)
Time: 3.77s

-------------------- Epoch 5/5 --------------------


Epoch 5: 100%|██████████████████████████| 1875/1875 [00:32<00:00, 58.38it/s, loss=3.4099, acc=9.81%]



Epoch 5 Summary:
Training Loss: 3.4099
Training Accuracy: 9.81% (5884/60000)
Time: 32.12s


Evaluation: 100%|████████████████████████| 313/313 [00:03<00:00, 83.34it/s, loss=3.4028, acc=10.22%]



Evaluation Summary:
Test Loss: 3.4028
Test Accuracy: 10.22% (1022/10000)
Time: 3.76s

Training Transformer Model

-------------------- Epoch 1/5 --------------------


Epoch 1: 100%|██████████████████████████| 1875/1875 [00:53<00:00, 34.85it/s, loss=2.6856, acc=5.92%]



Epoch 1 Summary:
Training Loss: 2.6856
Training Accuracy: 5.92% (3553/60000)
Time: 53.82s


Evaluation: 100%|█████████████████████████| 313/313 [00:04<00:00, 71.59it/s, loss=2.6935, acc=5.80%]



Evaluation Summary:
Test Loss: 2.6935
Test Accuracy: 5.80% (580/10000)
Time: 4.38s

-------------------- Epoch 2/5 --------------------


Epoch 2: 100%|██████████████████████████| 1875/1875 [00:52<00:00, 35.64it/s, loss=2.6856, acc=5.92%]



Epoch 2 Summary:
Training Loss: 2.6856
Training Accuracy: 5.92% (3553/60000)
Time: 52.62s


Evaluation: 100%|█████████████████████████| 313/313 [00:04<00:00, 78.10it/s, loss=2.6935, acc=5.80%]



Evaluation Summary:
Test Loss: 2.6935
Test Accuracy: 5.80% (580/10000)
Time: 4.01s

-------------------- Epoch 3/5 --------------------


Epoch 3: 100%|██████████████████████████| 1875/1875 [00:54<00:00, 34.67it/s, loss=2.6856, acc=5.92%]



Epoch 3 Summary:
Training Loss: 2.6856
Training Accuracy: 5.92% (3553/60000)
Time: 54.09s


Evaluation: 100%|█████████████████████████| 313/313 [00:04<00:00, 76.20it/s, loss=2.6935, acc=5.80%]



Evaluation Summary:
Test Loss: 2.6935
Test Accuracy: 5.80% (580/10000)
Time: 4.12s

-------------------- Epoch 4/5 --------------------


Epoch 4: 100%|██████████████████████████| 1875/1875 [00:53<00:00, 35.21it/s, loss=2.6856, acc=5.92%]



Epoch 4 Summary:
Training Loss: 2.6856
Training Accuracy: 5.92% (3553/60000)
Time: 53.26s


Evaluation: 100%|█████████████████████████| 313/313 [00:04<00:00, 66.53it/s, loss=2.6935, acc=5.80%]



Evaluation Summary:
Test Loss: 2.6935
Test Accuracy: 5.80% (580/10000)
Time: 4.71s

-------------------- Epoch 5/5 --------------------


Epoch 5: 100%|██████████████████████████| 1875/1875 [00:53<00:00, 35.20it/s, loss=2.6856, acc=5.92%]



Epoch 5 Summary:
Training Loss: 2.6856
Training Accuracy: 5.92% (3553/60000)
Time: 53.27s


Evaluation: 100%|█████████████████████████| 313/313 [00:04<00:00, 78.23it/s, loss=2.6935, acc=5.80%]



Evaluation Summary:
Test Loss: 2.6935
Test Accuracy: 5.80% (580/10000)
Time: 4.01s

Training complete! Results saved to 'training_history.png'
