# start

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import os
import numpy as np

import copy
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from collections import defaultdict

folder = "test_results"
os.makedirs(folder, exist_ok=True)

# track neff and sparsity

In [78]:
def mask_pc(module:nn.Module, beta=1.0, method='magnitude') -> torch.Tensor:
    x = module.weight.data
    output_size, input_size = x.shape
    if method == 'mean':
        x = x - x.mean(dim=0, keepdim=True)
    x_norm = torch.abs(x) / torch.sum(torch.abs(x), dim=0, keepdim=True)
    neff = 1/torch.sum((x_norm ** 2), dim=0, keepdim=True).squeeze(0)
    r_neff = torch.floor(beta * neff)
    
    _, indices = torch.sort(x_norm, dim=0, descending=True)
    range_tensor = torch.arange(output_size, device=x.device).unsqueeze(0).expand(input_size, -1).T
    sorted_mask = range_tensor < r_neff
    
    mask = torch.zeros_like(x, dtype=torch.bool)
    mask.scatter_(0, indices, sorted_mask)
    return mask, torch.floor(neff)

def model_pc(model, renormalize=False, beta=1.0, method='magnitude'):
    model = copy.deepcopy(model)
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            mask, neff = mask_pc(module, beta=beta, method=method)
            mask = mask.to(module.weight.device)
            with torch.no_grad():
                pre = module.weight.abs().sum(dim=0, keepdim=True)
                module.weight *= mask
                if renormalize:
                    post = module.weight.abs().sum(dim=0, keepdim=True)
                    module.weight.mul_(pre / post)
    return model, neff

def mask_pr(module:nn.Module, beta=1.0, method='magnitude') -> torch.Tensor:
    x = module.weight.data
    output_size, input_size = x.shape
    if method == 'mean':
        x = x - x.mean(dim=1, keepdim=True)
    x_norm = torch.abs(x) / torch.sum(torch.abs(x), dim=1, keepdim=True)
    neff = 1/torch.sum((x_norm ** 2), dim=1, keepdim=True).squeeze(0)
    r_neff = torch.floor(beta * neff)
    
    _, indices = torch.sort(x_norm, dim=1, descending=True)
    range_tensor = torch.arange(input_size, device=x.device).unsqueeze(0).expand(output_size, -1)
    sorted_mask = range_tensor < r_neff

    mask = torch.zeros_like(x, dtype=torch.bool)
    mask.scatter_(1, indices, sorted_mask)
    return mask, torch.floor(neff)

def model_pr(model, renormalize=False, beta=1.0, method='magnitude'):
    model = copy.deepcopy(model)
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            mask, neff = mask_pr(module, beta=beta, method=method)
            mask = mask.to(module.weight.device)
            with torch.no_grad():
                pre = module.weight.abs().sum(dim=1, keepdim=True)
                module.weight *= mask
                if renormalize:
                    post = module.weight.abs().sum(dim=1, keepdim=True)
                    module.weight.mul_(pre / post)
    return model, neff

def mask_block(module:nn.Module, beta=1.0, method='magnitude') -> torch.Tensor:
    x = module.weight.data
    x = x.view(-1)
    if method == 'mean':
        x = x - torch.mean(x)
    x_norm = torch.abs(x) / torch.sum(torch.abs(x))
    neff = 1/torch.sum((x_norm ** 2))
    r_neff = torch.floor(beta * neff)

    _, indices = torch.sort(x_norm, descending=True)
    range_tensor = torch.arange(len(x), device=x.device)
    sorted_mask = range_tensor < r_neff

    mask = torch.zeros_like(x, dtype=torch.bool)
    mask.scatter_(0, indices, sorted_mask)
    mask = mask.view_as(module.weight)
    return mask, torch.floor(neff)

def model_block(model, renormalize=False, beta=1.0, method='magnitude'):
    model = copy.deepcopy(model)
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            mask, neff = mask_block(module, beta=beta, method=method)
            mask = mask.to(module.weight.device)
            with torch.no_grad():
                pre = module.weight.abs().sum(dim=0, keepdim=True)
                module.weight *= mask
                if renormalize:
                    post = module.weight.abs().sum(dim=0, keepdim=True)
                    module.weight.mul_(pre / post)
    return model, neff

In [26]:
def model_sparsity(model):
    """Calculate the sparsity of the model"""
    total_params = 0
    zero_params = 0
    
    for name, param in model.named_parameters():
        if 'weight' in name:
            total_params += param.numel()
            zero_params += torch.sum(param == 0).item()
    
    sparsity = zero_params / total_params
    return sparsity

def per_layer_neff(model):
    """Calculate the effective parameters (Neff) per layer"""
    neff = {}
    for name, param in model.named_parameters():
        if 'weight' in name:
            layer_neff = torch.sum(param != 0).item()
            neff[name] = layer_neff
    return neff

In [79]:
# Dataset setup
batch_size = 64
test_batch_size = 1000
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model class with optional dropout
class LinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(LinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.relu(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))

# Training function
def train(model, device, train_loader, optimizer, epoch):
    """Train for one epoch"""
    model.train()
    train_loss = 0
    correct = 0
    
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        
        if batch_idx % 200 == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                  f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}'
                  f'accuracy: {100. * correct / len(train_loader.dataset):.2f}%')

    avg_loss = train_loss / len(train_loader)
    accuracy = 100. * correct / len(train_loader.dataset)
    return avg_loss, accuracy

# Testing function
def test(model, device, test_loader, times=1):
    """Evaluate model on test set"""
    model.eval()
    accuracy_list = []
    loss_list = []
    for _ in range(times):
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        accuracy = 100. * correct / len(test_loader.dataset)
        accuracy_list.append(accuracy)
        loss_list.append(test_loss)

    if times == 1:
        print(f'Test set: Average loss: {test_loss:.4f}, '
              f'Accuracy: {correct}/{len(test_loader.dataset)} ({accuracy:.2f}%)\n')

        return test_loss, accuracy
    
    else:
        return loss_list, accuracy_list, sum(accuracy_list)/times

Using device: cuda


In [None]:
# Model configurations
model_configs = {
    'Model_1_Underfit': {
        'hidden_size': [64, 32, 16],  # Very shallow, only 1 small hidden layer
        'lr': 1e-4,  # Lower learning rate
        'epochs': 5,  # Fewer epochs
        'dropout': 0.0,
        'description': 'Underfitted: Too simple (1 layer, 32 units)'
    },
    'Model_2_Slight_Underfit': {
        'hidden_size': [256, 128, 64],  # 2 small layers
        'lr': 5e-4,
        'epochs': 8,
        'dropout': 0.0,
        'description': 'Slightly underfitted: Simple architecture'
    },
    'Model_3_Well_Trained': {
        'hidden_size': [512, 256, 128],  # Moderate depth and width
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2,  # Some regularization
        'description': 'Well-trained: Balanced architecture with dropout'
    },
    'Model_4_Well_Trained_Deep': {
        'hidden_size': [1024, 512, 256],  # Deeper but with dropout
        'lr': 3e-4,
        'epochs': 20,
        'dropout': 0.3,  # More dropout for regularization
        'description': 'Well-trained: Deeper with good regularization'
    },
    'Model_5_Overfit': {
        'hidden_size': [2048, 1024, 1024],  # Very deep and wide
        'lr': 1e-3,  # Higher learning rate
        'epochs': 30,  # Many epochs
        'dropout': 0.0,  # No regularization
        'description': 'Overfitted: Very complex without regularization'
    },
    'Model_6_Extra_Overfit': {
        'hidden_size': [4096, 2048, 1024],  # Extremely deep and wide
        'lr': 1e-3,
        'epochs': 50,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    },
    'Model_7_Extra_Overfit': {
        'hidden_size': [8192, 4096, 2048],  # Extremely deep and wide
        'lr': 1e-3,
        'epochs': 100,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    }
}

# Train all models
all_results = {}

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=config['lr'])
    
    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
    model.save(f'models/MNIST_model/{model_name}.pth')



Using device: cuda

Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0

Training Model_3_Well_Trained: Well-trained: Balanced architecture with dropout
Architecture: Input(784) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Training Model_4_Well_Trained_Deep: Well-trained: Deeper with good regularization
Architecture: Input(784) -> 1024 -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3

Training Model_5_Overfit: Overfitted: Very complex without regularization
Architecture: Input(784) -> 2048 -> 1024 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0

Training Model_6_Extra_Overfit: Extra Overfitted: Very 

In [49]:
# Results storage
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
}

In [80]:
for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 53,018
model name: Model_1_Underfit, test accuracy: 93.61%
per column magnitude pruning test accuracy: 91.23%, sparsity: 0.3608
per row magnitude pruning test accuracy: 91.85%, sparsity: 0.3415
per block magnitude pruning test accuracy: 92.98%, sparsity: 0.3692
mean column mean pruning test accuracy: 87.77%, sparsity: 0.3666
mean row mean pruning test accuracy: 92.44%, sparsity: 0.3692
mean block mean pruning test accuracy: 89.83%, sparsity: 0.3692

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0
Total parameters: 242,762
model name: Model_2_Slight_Underfit, test accuracy: 97.99%
per column magnitude pruning test accuracy: 97.62%, sparsity: 0.4369
per row magnitud

# renormalize test

In [51]:
renormal_result = {
    'test_accuracy': [],
    'model_sparsity': [],
}

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

    test_loss, test_accuracy, model_test_accuracy = test(model, device, test_loader, times=5)
    renormal_result['test_accuracy'].append(model_test_accuracy)
    renormal_result['model_sparsity'].append(0.0)

    row_pruning_renormalize, renormalize_neff = model_pr(model, renormalize=True)
    row_pruning, neff = model_pr(model, renormalize=False)

    # Test the pruned models
    test_loss, test_accuracy, accuracy_mean = test(row_pruning_renormalize, device, test_loader, times=5)
    renormal_result['test_accuracy'].append(accuracy_mean)
    renormal_result['model_sparsity'].append(model_sparsity(row_pruning_renormalize))

    test_loss, test_accuracy, accuracy_mean = test(row_pruning, device, test_loader, times=5)
    renormal_result['test_accuracy'].append(accuracy_mean)
    renormal_result['model_sparsity'].append(model_sparsity(row_pruning))

    print(f"Model Name: {model_name}, Test Accuracy: {model_test_accuracy:.2f}%")
    print(f"Renormalized Model test accuracy: {renormal_result['test_accuracy'][-2]:.2f}%, Sparsity: {renormal_result['model_sparsity'][-2]:.2f}")
    print(f"Model test accuracy: {renormal_result['test_accuracy'][-1]:.2f}%, Sparsity: {renormal_result['model_sparsity'][-1]:.2f}")


Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 53,018
Model Name: Model_1_Underfit, Test Accuracy: 93.61%
Renormalized Model test accuracy: 91.83%, Sparsity: 0.35
Model test accuracy: 91.85%, Sparsity: 0.35

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0
Total parameters: 242,762
Model Name: Model_2_Slight_Underfit, Test Accuracy: 97.99%
Renormalized Model test accuracy: 97.88%, Sparsity: 0.42
Model test accuracy: 97.90%, Sparsity: 0.42

Training Model_3_Well_Trained: Well-trained: Balanced architecture with dropout
Architecture: Input(784) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 567,434
Model Name: Model_3_Well_Trained, Test Accuracy: 98.49%
Renorm

# Different activation function

In [59]:
# Model class with optional dropout
class geluLinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(geluLinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.gelu(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))
        
        
# Model class with optional dropout
class SigmoidLinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(SigmoidLinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.sigmoid(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))
        
        
        
# Model class with optional dropout
class tanhLinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(tanhLinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.tanh(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))


In [60]:
# Model configurations
model_configs = {
    'Model_1_Underfit': {
        'hidden_size': [64, 32, 16],  # Very shallow, only 1 small hidden layer
        'lr': 1e-4,  # Lower learning rate
        'epochs': 5,  # Fewer epochs
        'dropout': 0.0,
        'description': 'Underfitted: Too simple (1 layer, 32 units)'
    },
    'Model_2_Slight_Underfit': {
        'hidden_size': [256, 128, 64],  # 2 small layers
        'lr': 5e-4,
        'epochs': 8,
        'dropout': 0.0,
        'description': 'Slightly underfitted: Simple architecture'
    },
    'Model_3_Well_Trained': {
        'hidden_size': [512, 256, 128],  # Moderate depth and width
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2,  # Some regularization
        'description': 'Well-trained: Balanced architecture with dropout'
    },
    'Model_4_Well_Trained_Deep': {
        'hidden_size': [1024, 512, 256],  # Deeper but with dropout
        'lr': 3e-4,
        'epochs': 20,
        'dropout': 0.3,  # More dropout for regularization
        'description': 'Well-trained: Deeper with good regularization'
    },
    'Model_5_Overfit': {
        'hidden_size': [2048, 1024, 1024],  # Very deep and wide
        'lr': 1e-3,  # Higher learning rate
        'epochs': 30,  # Many epochs
        'dropout': 0.0,  # No regularization
        'description': 'Overfitted: Very complex without regularization'
    },
    'Model_6_Extra_Overfit': {
        'hidden_size': [4096, 2048, 1024],  # Extremely deep and wide
        'lr': 1e-3,
        'epochs': 50,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    },
    'Model_7_Extra_Overfit': {
        'hidden_size': [8192, 4096, 2048],  # Extremely deep and wide
        'lr': 1e-3,
        'epochs': 100,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    }
}

# Train all models
all_results = {}

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model1 = geluLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model1.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model1, device, train_loader, optimizer, epoch)
    model1.save(f'models/MNIST_model/gelu_{model_name}.pth')
    

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model2 = SigmoidLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model2.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model2, device, train_loader, optimizer, epoch)
    model2.save(f'models/MNIST_model/sigmoid_{model_name}.pth')
    
for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model3 = tanhLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model3.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model3, device, train_loader, optimizer, epoch)
    model3.save(f'models/MNIST_model/tanh_{model_name}.pth')



Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0

Training Model_3_Well_Trained: Well-trained: Balanced architecture with dropout
Architecture: Input(784) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Training Model_4_Well_Trained_Deep: Well-trained: Deeper with good regularization
Architecture: Input(784) -> 1024 -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3

Training Model_5_Overfit: Overfitted: Very complex without regularization
Architecture: Input(784) -> 2048 -> 1024 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0

Training Model_6_Extra_Overfit: Extra Overfitted: Very complex without reg

# GELU

In [81]:
for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = geluLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/gelu_{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 53,018
model name: Model_1_Underfit, test accuracy: 93.94%
per column magnitude pruning test accuracy: 91.42%, sparsity: 0.3632
per row magnitude pruning test accuracy: 91.05%, sparsity: 0.3424
per block magnitude pruning test accuracy: 92.45%, sparsity: 0.3702
mean column mean pruning test accuracy: 71.88%, sparsity: 0.3663
mean row mean pruning test accuracy: 81.79%, sparsity: 0.3702
mean block mean pruning test accuracy: 89.87%, sparsity: 0.3702

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0
Total parameters: 242,762
model name: Model_2_Slight_Underfit, test accuracy: 97.85%
per column magnitude pruning test accuracy: 97.83%, sparsity: 0.4287
per row magnitud

# SIGMOID

In [82]:
for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = SigmoidLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/sigmoid_{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 53,018
model name: Model_1_Underfit, test accuracy: 54.06%
per column magnitude pruning test accuracy: 43.62%, sparsity: 0.3973
per row magnitude pruning test accuracy: 48.35%, sparsity: 0.3054
per block magnitude pruning test accuracy: 48.04%, sparsity: 0.3974
mean column mean pruning test accuracy: 43.38%, sparsity: 0.3962
mean row mean pruning test accuracy: 50.95%, sparsity: 0.3974
mean block mean pruning test accuracy: 51.08%, sparsity: 0.3974

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0
Total parameters: 242,762
model name: Model_2_Slight_Underfit, test accuracy: 97.36%
per column magnitude pruning test accuracy: 97.18%, sparsity: 0.4310
per row magnitud

# TANH

In [83]:
for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = tanhLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/tanh_{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")



Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 53,018
model name: Model_1_Underfit, test accuracy: 94.62%
per column magnitude pruning test accuracy: 94.59%, sparsity: 0.3589
per row magnitude pruning test accuracy: 94.49%, sparsity: 0.3349
per block magnitude pruning test accuracy: 94.54%, sparsity: 0.3589
mean column mean pruning test accuracy: 94.42%, sparsity: 0.3594
mean row mean pruning test accuracy: 94.51%, sparsity: 0.3589
mean block mean pruning test accuracy: 94.49%, sparsity: 0.3589

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0
Total parameters: 242,762
model name: Model_2_Slight_Underfit, test accuracy: 97.71%
per column magnitude pruning test accuracy: 97.76%, sparsity: 0.4156
per row magnitud

# different datasets

In [35]:
# --- Dataset registry ---------------------------------------------------------
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

def get_loaders(dataset_name, batch_size=128, test_batch_size=1000, data_root='./data'):
    """
    Returns: train_loader, test_loader, input_size, num_classes, meta (dict)
    """
    name = dataset_name.lower()
    meta = {}

    # Generic normalizations (safe defaults). If you want canonical stats, compute them once.
    NORM_1C = transforms.Normalize((0.5,), (0.5,))
    NORM_3C = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if name == 'mnist':
        # (You already have this; included for completeness.)
        tfm = transforms.Compose([transforms.ToTensor(),
                                  transforms.Normalize((0.1307,), (0.3081,))])
        train = datasets.MNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.MNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'fashionmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.FashionMNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.FashionMNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'kmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.KMNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.KMNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name in ('emnist_balanced', 'emnist'):
        # EMNIST Balanced has 47 classes. If digits look rotated, add a Rotate(90) or permute.
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.EMNIST(data_root, split='balanced', train=True, download=True, transform=tfm)
        test  = datasets.EMNIST(data_root, split='balanced', train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 47
        meta['note'] = 'EMNIST images can appear rotated; for visualization add a 90-degree rotate.'

    elif name == 'qmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.QMNIST(data_root, what='train', download=True, transform=tfm)
        test  = datasets.QMNIST(data_root, what='test',  download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'svhn':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.SVHN(data_root, split='train', download=True, transform=tfm)
        test  = datasets.SVHN(data_root, split='test',  download=True, transform=tfm)
        inp, ncls = 32*32*3, 10

    elif name == 'cifar10':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.CIFAR10(data_root, train=True,  download=True, transform=tfm)
        test  = datasets.CIFAR10(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 32*32*3, 10

    elif name == 'cifar100':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.CIFAR100(data_root, train=True,  download=True, transform=tfm)
        test  = datasets.CIFAR100(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 32*32*3, 100

    elif name in ('stl10', 'stl10_32'):
        # Downsample to 32x32 to keep input dim manageable for MLPs.
        tfm = transforms.Compose([transforms.Resize((32,32)),
                                  transforms.ToTensor(), NORM_3C])
        train = datasets.STL10(data_root, split='train', download=True, transform=tfm)
        test  = datasets.STL10(data_root, split='test',  download=True, transform=tfm)
        inp, ncls = 32*32*3, 10
        meta['note'] = 'Original STL10 is 96x96; here we resize to 32x32 for MLPs.'

    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test,  batch_size=test_batch_size, shuffle=False, num_workers=2, pin_memory=True)
    return train_loader, test_loader, inp, ncls, meta


In [None]:
# Model configurations
model_configs = {
    'Model_1_Underfit': {
        'hidden_size': [64, 64, 32, 32, 16],  # Very shallow, only 1 small hidden layer
        'lr': 1e-4,  # Lower learning rate
        'epochs': 5,  # Fewer epochs
        'dropout': 0.0,
        'description': 'Underfitted: Too simple (1 layer, 32 units)'
    },
    'Model_2_Slight_Underfit': {
        'hidden_size': [256, 256, 128, 128, 64],  # 2 small layers
        'lr': 5e-4,
        'epochs': 10,
        'dropout': 0.0,
        'description': 'Slightly underfitted: Simple architecture'
    },
    'Model_3_Well_Trained': {
        'hidden_size': [512, 512, 256, 256, 128],  # Moderate depth and width
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2,  # Some regularization
        'description': 'Well-trained: Balanced architecture with dropout'
    },
    'Model_4_Well_Trained_Deep': {
        'hidden_size': [1024, 1024, 512, 512, 256],  # Deeper but with dropout
        'lr': 3e-4,
        'epochs': 20,
        'dropout': 0.3,  # More dropout for regularization
        'description': 'Well-trained: Deeper with good regularization'
    },
    'Model_5_Overfit': {
        'hidden_size': [2048, 2048, 1024, 1024, 512],  # Very deep and wide
        'lr': 1e-3,  # Higher learning rate
        'epochs': 30,  # Many epochs
        'dropout': 0.0,  # No regularization
        'description': 'Overfitted: Very complex without regularization'
    },
    'Model_6_Extra_Overfit': {
        'hidden_size': [4096, 4096, 2048, 2048, 1024],  # Extremely deep and wide
        'lr': 1e-3,
        'epochs': 50,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    },
    'Model_7_Extra_Overfit': {
        'hidden_size': [8192, 8192, 4096, 4096, 2048],  # Extremely deep and wide
        'lr': 1e-3,
        'epochs': 100,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    }
}

# Train all models
all_results = {}
dataset_name = 'cifar10'
train_loader, test_loader, input_size, num_classes, meta = get_loaders(dataset_name)

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size, 
        output_size=num_classes, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=config['lr'])
    
    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
    model.save(f'models/{dataset_name}/{model_name}.pth')


Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(784) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(784) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0

Training Model_3_Well_Trained: Well-trained: Balanced architecture with dropout
Architecture: Input(784) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Training Model_4_Well_Trained_Deep: Well-trained: Deeper with good regularization
Architecture: Input(784) -> 1024 -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3

Training Model_5_Overfit: Overfitted: Very complex without regularization
Architecture: Input(784) -> 2048 -> 1024 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0

Training Model_6_Extra_Overfit: Extra Overfitted: Very complex without reg

In [84]:
dataset_name = 'cifar10'
train_loader, test_loader, input_size, num_classes, meta = get_loaders(dataset_name)

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/{dataset_name}/{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")



Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(3072) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 199,450
model name: Model_1_Underfit, test accuracy: 44.58%
per column magnitude pruning test accuracy: 42.93%, sparsity: 0.3662
per row magnitude pruning test accuracy: 43.67%, sparsity: 0.3573
per block magnitude pruning test accuracy: 44.01%, sparsity: 0.3662
mean column mean pruning test accuracy: 40.26%, sparsity: 0.3594
mean row mean pruning test accuracy: 40.94%, sparsity: 0.3662
mean block mean pruning test accuracy: 42.95%, sparsity: 0.3662

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(3072) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0
Total parameters: 828,490
model name: Model_2_Slight_Underfit, test accuracy: 54.38%
per column magnitude pruning test accuracy: 53.99%, sparsity: 0.3867
per row magni

In [54]:
renormal_result = {
    'test_accuracy': [],
    'model_sparsity': [],
}

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/{dataset_name}/{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

    test_loss, test_accuracy, model_test_accuracy = test(model, device, test_loader, times=5)
    renormal_result['test_accuracy'].append(model_test_accuracy)
    renormal_result['model_sparsity'].append(0.0)

    row_pruning_renormalize, renormalize_neff = model_pr(model, renormalize=True)
    row_pruning, neff = model_pr(model, renormalize=False)

    # Test the pruned models
    test_loss, test_accuracy, accuracy_mean = test(row_pruning_renormalize, device, test_loader, times=5)
    renormal_result['test_accuracy'].append(accuracy_mean)
    renormal_result['model_sparsity'].append(model_sparsity(row_pruning_renormalize))

    test_loss, test_accuracy, accuracy_mean = test(row_pruning, device, test_loader, times=5)
    renormal_result['test_accuracy'].append(accuracy_mean)
    renormal_result['model_sparsity'].append(model_sparsity(row_pruning))

    print(f"Model Name: {model_name}, Test Accuracy: {model_test_accuracy:.2f}%")
    print(f"Renormalized Model test accuracy: {renormal_result['test_accuracy'][-2]:.2f}%, Sparsity: {renormal_result['model_sparsity'][-2]:.2f}")
    print(f"Model test accuracy: {renormal_result['test_accuracy'][-1]:.2f}%, Sparsity: {renormal_result['model_sparsity'][-1]:.2f}")


Training Model_1_Underfit: Underfitted: Too simple (1 layer, 32 units)
Architecture: Input(3072) -> 64 -> 32 -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 199,450
Model Name: Model_1_Underfit, Test Accuracy: 44.58%
Renormalized Model test accuracy: 43.75%, Sparsity: 0.36
Model test accuracy: 43.67%, Sparsity: 0.36

Training Model_2_Slight_Underfit: Slightly underfitted: Simple architecture
Architecture: Input(3072) -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0005, Epochs: 8, Dropout: 0.0
Total parameters: 828,490
Model Name: Model_2_Slight_Underfit, Test Accuracy: 54.38%
Renormalized Model test accuracy: 53.02%, Sparsity: 0.38
Model test accuracy: 53.78%, Sparsity: 0.38

Training Model_3_Well_Trained: Well-trained: Balanced architecture with dropout
Architecture: Input(3072) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 1,738,890
Model Name: Model_3_Well_Trained, Test Accuracy: 56.62%
