In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import os
import numpy as np

import copy
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from collections import defaultdict

folder = "test_results"
os.makedirs(folder, exist_ok=True)

In [2]:
def mask_pc(module:nn.Module, beta=1.0, method='magnitude') -> torch.Tensor:
    x = module.weight.data
    output_size, input_size = x.shape
    if method == 'mean':
        x = x - x.mean(dim=0, keepdim=True)
    x_norm = torch.abs(x) / torch.sum(torch.abs(x), dim=0, keepdim=True)
    neff = 1/torch.sum((x_norm ** 2), dim=0, keepdim=True).squeeze(0)
    r_neff = torch.floor(beta * neff)
    r_neff = r_neff.clamp(min=1, max=output_size-1)

    _, indices = torch.sort(x_norm, dim=0, descending=True)
    range_tensor = torch.arange(output_size, device=x.device).unsqueeze(0).expand(input_size, -1).T
    sorted_mask = range_tensor < r_neff
    
    mask = torch.zeros_like(x, dtype=torch.bool)
    mask.scatter_(0, indices, sorted_mask)
    return mask, torch.floor(neff)

def model_pc(model, renormalize=False, beta=1.0, method='magnitude'):
    model = copy.deepcopy(model)
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            mask, neff = mask_pc(module, beta=beta, method=method)
            mask = mask.to(module.weight.device)
            with torch.no_grad():
                pre = module.weight.abs().sum(dim=0, keepdim=True)
                module.weight *= mask
                if renormalize:
                    post = module.weight.abs().sum(dim=0, keepdim=True)
                    module.weight.mul_(pre / post)
    return model, neff

def mask_pr(module:nn.Module, beta=1.0, method='magnitude') -> torch.Tensor:
    x = module.weight.data
    output_size, input_size = x.shape
    if method == 'mean':
        x = x - x.mean(dim=1, keepdim=True)
    x_norm = torch.abs(x) / torch.sum(torch.abs(x), dim=1, keepdim=True)
    neff = 1/torch.sum((x_norm ** 2), dim=1, keepdim=True).squeeze(0)
    r_neff = torch.floor(beta * neff)
    r_neff = r_neff.clamp(min=1, max=input_size-1)

    _, indices = torch.sort(x_norm, dim=1, descending=True)
    range_tensor = torch.arange(input_size, device=x.device).unsqueeze(0).expand(output_size, -1)
    sorted_mask = range_tensor < r_neff

    mask = torch.zeros_like(x, dtype=torch.bool)
    mask.scatter_(1, indices, sorted_mask)
    return mask, torch.floor(neff)

def model_pr(model, renormalize=False, beta=1.0, method='magnitude'):
    model = copy.deepcopy(model)
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            mask, neff = mask_pr(module, beta=beta, method=method)
            mask = mask.to(module.weight.device)
            with torch.no_grad():
                pre = module.weight.abs().sum(dim=1, keepdim=True)
                module.weight *= mask
                if renormalize:
                    post = module.weight.abs().sum(dim=1, keepdim=True)
                    module.weight.mul_(pre / post)
    return model, neff

def mask_block(module:nn.Module, beta=1.0, method='magnitude') -> torch.Tensor:
    x = module.weight.data
    x = x.view(-1)
    if method == 'mean':
        x = x - torch.mean(x)
    x_norm = torch.abs(x) / torch.sum(torch.abs(x))
    neff = 1/torch.sum((x_norm ** 2))
    r_neff = torch.floor(beta * neff)
    r_neff = r_neff.clamp(min=1, max=len(x)-1)

    _, indices = torch.sort(x_norm, descending=True)
    range_tensor = torch.arange(len(x), device=x.device)
    sorted_mask = range_tensor < r_neff

    mask = torch.zeros_like(x, dtype=torch.bool)
    mask.scatter_(0, indices, sorted_mask)
    mask = mask.view_as(module.weight)
    return mask, torch.floor(neff)

def model_block(model, renormalize=False, beta=1.0, method='magnitude'):
    model = copy.deepcopy(model)
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            mask, neff = mask_block(module, beta=beta, method=method)
            mask = mask.to(module.weight.device)
            with torch.no_grad():
                pre = module.weight.abs().sum(dim=0, keepdim=True)
                module.weight *= mask
                if renormalize:
                    post = module.weight.abs().sum(dim=0, keepdim=True)
                    module.weight.mul_(pre / post)
    return model, neff

In [3]:
def model_sparsity(model):
    """Calculate the sparsity of the model"""
    total_params = 0
    zero_params = 0
    
    for name, param in model.named_parameters():
        if 'weight' in name:
            total_params += param.numel()
            zero_params += torch.sum(param == 0).item()
    
    sparsity = zero_params / total_params
    return sparsity

def per_layer_neff(model):
    """Calculate the effective parameters (Neff) per layer"""
    neff = {}
    for name, param in model.named_parameters():
        if 'weight' in name:
            layer_neff = torch.sum(param != 0).item()
            neff[name] = layer_neff
    return neff

# Model training

In [4]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model class with optional dropout
class LinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(LinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.relu(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))

# Training function
def train(model, device, train_loader, optimizer, epoch):
    """Train for one epoch"""
    model.train()
    train_loss = 0
    correct = 0
    
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        
        if batch_idx % 200 == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                  f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

    avg_loss = train_loss / len(train_loader)
    accuracy = 100. * correct / len(train_loader.dataset)
    return avg_loss, accuracy

# Testing function
def test(model, device, test_loader, times=1):
    """Evaluate model on test set"""
    model.eval()
    accuracy_list = []
    loss_list = []
    for _ in range(times):
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        accuracy = 100. * correct / len(test_loader.dataset)
        accuracy_list.append(accuracy)
        loss_list.append(test_loss)

    if times == 1:
        print(f'Test set: Average loss: {test_loss:.4f}, '
              f'Accuracy: {correct}/{len(test_loader.dataset)} ({accuracy:.2f}%)\n')

        return test_loss, accuracy
    
    else:
        return loss_list, accuracy_list, sum(accuracy_list)/times

Using device: cuda


In [5]:
# Model class with optional dropout
class geluLinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(geluLinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.gelu(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))
        
        
# Model class with optional dropout
class SigmoidLinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(SigmoidLinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.sigmoid(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))
        
        
        
# Model class with optional dropout
class tanhLinearModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=[512, 512, 512], dropout_rate=0.0):
        super(tanhLinearModel, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)
        
        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
            
        self.output = nn.Linear(prev_size, output_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten
        
        for layer in self.layers:
            x = F.tanh(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)
    
    def save(self, path):
        torch.save(self.state_dict(), path)

    def load(self, path):
        self.load_state_dict(torch.load(path))


In [6]:
model_configs = {
    # Underfit & brittle
    'Tiny_Underfit':
        {'hidden_size': [64, 32],
        'lr': 3e-4,
        'epochs': 10,
        'dropout': 0.0},
    # Deep-narrow (depth sensitivity)
    'Deep_Narrow':         
        {'hidden_size': [256, 256, 256, 256],
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2},
    # Well-trained baseline
    'Balanced':            
        {'hidden_size': [512, 256, 128],
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2},
    # Deep but still robust
    'Balanced_Deep':       
        {'hidden_size': [1024, 512, 256, 128],
        'lr': 3e-4,
        'epochs': 20,
        'dropout': 0.3},
    # Overparameterized
    'Wide':
        {'hidden_size': [2048, 1024, 512],
        'lr': 1e-3,
        'epochs': 30,
        'dropout': 0.0},
    # Very overparameterized (optional, keep one)
    'Very_Wide':          
        {'hidden_size': [4096, 2048, 1024],
        'lr': 1e-3,
        'epochs': 50,
        'dropout': 0.0},
}


In [7]:
# --- Dataset registry ---------------------------------------------------------
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

def get_loaders(dataset_name, batch_size=128, test_batch_size=1000, data_root='./data'):
    """
    Returns: train_loader, test_loader, input_size, num_classes, meta (dict)
    """
    name = dataset_name.lower()
    meta = {}

    # Generic normalizations (safe defaults). If you want canonical stats, compute them once.
    NORM_1C = transforms.Normalize((0.5,), (0.5,))
    NORM_3C = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if name == 'mnist':
        # (You already have this; included for completeness.)
        tfm = transforms.Compose([transforms.ToTensor(),
                                  transforms.Normalize((0.1307,), (0.3081,))])
        train = datasets.MNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.MNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'fashionmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.FashionMNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.FashionMNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'kmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.KMNIST(data_root, train=True, download=True, transform=tfm)
        test  = datasets.KMNIST(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name in ('emnist_balanced', 'emnist'):
        # EMNIST Balanced has 47 classes. If digits look rotated, add a Rotate(90) or permute.
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.EMNIST(data_root, split='balanced', train=True, download=True, transform=tfm)
        test  = datasets.EMNIST(data_root, split='balanced', train=False, download=True, transform=tfm)
        inp, ncls = 28*28, 47
        meta['note'] = 'EMNIST images can appear rotated; for visualization add a 90-degree rotate.'

    elif name == 'qmnist':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_1C])
        train = datasets.QMNIST(data_root, what='train', download=True, transform=tfm)
        test  = datasets.QMNIST(data_root, what='test',  download=True, transform=tfm)
        inp, ncls = 28*28, 10

    elif name == 'svhn':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.SVHN(data_root, split='train', download=True, transform=tfm)
        test  = datasets.SVHN(data_root, split='test',  download=True, transform=tfm)
        inp, ncls = 32*32*3, 10

    elif name == 'cifar10':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.CIFAR10(data_root, train=True,  download=True, transform=tfm)
        test  = datasets.CIFAR10(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 32*32*3, 10

    elif name == 'cifar100':
        tfm = transforms.Compose([transforms.ToTensor(), NORM_3C])
        train = datasets.CIFAR100(data_root, train=True,  download=True, transform=tfm)
        test  = datasets.CIFAR100(data_root, train=False, download=True, transform=tfm)
        inp, ncls = 32*32*3, 100

    elif name in ('stl10', 'stl10_32'):
        # Downsample to 32x32 to keep input dim manageable for MLPs.
        tfm = transforms.Compose([transforms.Resize((32,32)),
                                  transforms.ToTensor(), NORM_3C])
        train = datasets.STL10(data_root, split='train', download=True, transform=tfm)
        test  = datasets.STL10(data_root, split='test',  download=True, transform=tfm)
        inp, ncls = 32*32*3, 10
        meta['note'] = 'Original STL10 is 96x96; here we resize to 32x32 for MLPs.'

    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True,  num_workers=2, pin_memory=True)
    test_loader  = DataLoader(test,  batch_size=test_batch_size, shuffle=False, num_workers=2, pin_memory=True)
    return train_loader, test_loader, inp, ncls, meta


In [8]:
# Train all models
all_results = {}

datasets_name = 'mnist'

train_loader, test_loader, input_size, num_classes, meta = get_loaders(datasets_name, batch_size=128)


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size, 
        output_size=num_classes, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=config['lr'])
    
    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
    model.save(f'paper_{datasets_name}/{model_name}.pth')



Architecture: Input(784) -> 64 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0

Architecture: Input(784) -> 256 -> 256 -> 256 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 1024 -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3

Architecture: Input(784) -> 2048 -> 1024 -> 512 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0

Architecture: Input(784) -> 4096 -> 2048 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 50, Dropout: 0.0


In [10]:
for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model1 = geluLinearModel(
        input_size=input_size, 
        output_size=num_classes, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model1.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model1, device, train_loader, optimizer, epoch)
    model1.save(f'paper_{datasets_name}/gelu_{model_name}.pth')


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model2 = SigmoidLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model2.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model2, device, train_loader, optimizer, epoch)
    model2.save(f'paper_{datasets_name}/sigmoid_{model_name}.pth')

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model3 = tanhLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model3.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model3, device, train_loader, optimizer, epoch)
    model3.save(f'paper_{datasets_name}/tanh_{model_name}.pth')


Architecture: Input(784) -> 64 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0

Architecture: Input(784) -> 256 -> 256 -> 256 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 1024 -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3

Architecture: Input(784) -> 2048 -> 1024 -> 512 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0

Architecture: Input(784) -> 4096 -> 2048 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 50, Dropout: 0.0

Architecture: Input(784) -> 64 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0

Architecture: Input(784) -> 256 -> 256 -> 256 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architectur

# pruning

In [11]:
# Results storage
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
}

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_mnist/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 52,650
model name: Tiny_Underfit, test accuracy: 96.44%
per column magnitude pruning test accuracy: 95.87%, sparsity: 0.4184
per row magnitude pruning test accuracy: 95.67%, sparsity: 0.3591
per block magnitude pruning test accuracy: 96.05%, sparsity: 0.4209
mean column mean pruning test accuracy: 95.13%, sparsity: 0.4087
mean row mean pruning test accuracy: 94.83%, sparsity: 0.4209
mean block mean pruning test accuracy: 95.88%, sparsity: 0.4209

Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 400,906
model name: Deep_Narrow, test accuracy: 98.23%
per column magnitude pruning test accuracy: 98.20%, sparsity: 0.3647
per row magnitude pruning test accuracy: 98.26%, sparsity: 0.3334
per block magnitude pruning test accuracy: 98.25%, sparsity: 0.3644
mean column mean pruning test accuracy: 98.15%, sparsity: 0.3605
mean row mean pruning test accuracy: 98.24%, sparsity: 0.3644
mean block mean pruning test ac

# beta

In [12]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_mnist/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 52,650
	
model name: Tiny_Underfit, test accuracy: 96.44%, beta: 0.3678794503211975
per column magnitude pruning test accuracy: 50.68%, sparsity: 0.7860
per row magnitude pruning test accuracy: 63.63%, sparsity: 0.7696
per block magnitude pruning test accuracy: 70.32%, sparsity: 0.7870
mean column mean pruning test accuracy: 48.38%, sparsity: 0.7832
mean row mean pruning test accuracy: 59.36%, sparsity: 0.7870
mean block mean pruning test accuracy: 65.60%, sparsity: 0.7870
	
model name: Tiny_Underfit, test accuracy: 96.44%, beta: 0.4493289589881897
per column magnitude pruning test accuracy: 64.61%, sparsity: 0.7387
per row magnitude pruning test accuracy: 75.00%, sparsity: 0.7166
per block magnitude pruning test accuracy: 73.39%, sparsity: 0.7398
mean column mean pruning test accuracy: 66.65%, sparsity: 0.7349
mean row mean pruning test accuracy: 72.85%, sparsity: 0.73

In [13]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = geluLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_mnist/gelu_{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 52,650
	
model name: Tiny_Underfit, test accuracy: 96.96%, beta: 0.3678794503211975
per column magnitude pruning test accuracy: 36.08%, sparsity: 0.7866
per row magnitude pruning test accuracy: 48.63%, sparsity: 0.7711
per block magnitude pruning test accuracy: 42.92%, sparsity: 0.7874
mean column mean pruning test accuracy: 30.98%, sparsity: 0.7837
mean row mean pruning test accuracy: 49.05%, sparsity: 0.7874
mean block mean pruning test accuracy: 47.50%, sparsity: 0.7874
	
model name: Tiny_Underfit, test accuracy: 96.96%, beta: 0.4493289589881897
per column magnitude pruning test accuracy: 52.58%, sparsity: 0.7393
per row magnitude pruning test accuracy: 44.75%, sparsity: 0.7191
per block magnitude pruning test accuracy: 50.79%, sparsity: 0.7403
mean column mean pruning test accuracy: 56.39%, sparsity: 0.7357
mean row mean pruning test accuracy: 58.18%, sparsity: 0.74

In [14]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = SigmoidLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_mnist/sigmoid_{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 52,650
	
model name: Tiny_Underfit, test accuracy: 94.39%, beta: 0.3678794503211975
per column magnitude pruning test accuracy: 52.43%, sparsity: 0.7980
per row magnitude pruning test accuracy: 41.83%, sparsity: 0.7677
per block magnitude pruning test accuracy: 43.28%, sparsity: 0.7979
mean column mean pruning test accuracy: 69.56%, sparsity: 0.7997
mean row mean pruning test accuracy: 40.14%, sparsity: 0.7979
mean block mean pruning test accuracy: 57.68%, sparsity: 0.7979
	
model name: Tiny_Underfit, test accuracy: 94.39%, beta: 0.4493289589881897
per column magnitude pruning test accuracy: 73.93%, sparsity: 0.7533
per row magnitude pruning test accuracy: 59.08%, sparsity: 0.7144
per block magnitude pruning test accuracy: 74.90%, sparsity: 0.7531
mean column mean pruning test accuracy: 76.84%, sparsity: 0.7552
mean row mean pruning test accuracy: 56.00%, sparsity: 0.75

In [15]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = tanhLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_mnist/tanh_{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 52,650
	
model name: Tiny_Underfit, test accuracy: 97.09%, beta: 0.3678794503211975
per column magnitude pruning test accuracy: 70.07%, sparsity: 0.7842
per row magnitude pruning test accuracy: 79.24%, sparsity: 0.7694
per block magnitude pruning test accuracy: 80.31%, sparsity: 0.7842
mean column mean pruning test accuracy: 71.46%, sparsity: 0.7847
mean row mean pruning test accuracy: 73.38%, sparsity: 0.7842
mean block mean pruning test accuracy: 82.36%, sparsity: 0.7842
	
model name: Tiny_Underfit, test accuracy: 97.09%, beta: 0.4493289589881897
per column magnitude pruning test accuracy: 81.51%, sparsity: 0.7364
per row magnitude pruning test accuracy: 87.48%, sparsity: 0.7167
per block magnitude pruning test accuracy: 89.35%, sparsity: 0.7364
mean column mean pruning test accuracy: 77.80%, sparsity: 0.7369
mean row mean pruning test accuracy: 86.42%, sparsity: 0.73

# Fashion MINST

## model training

In [16]:
model_configs = {
    # Underfit & brittle
    'Tiny_Underfit':
        {'hidden_size': [64],
        'lr': 3e-4,
        'epochs': 10,
        'dropout': 0.0},
    # Deep-narrow (depth sensitivity)
    'Deep_Narrow':         
        {'hidden_size': [128, 128, 128, 128, 128, 128, 128, 128],
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2},
    # Well-trained baseline
    'Balanced':            
        {'hidden_size': [512, 256],
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2},
    # Deep but still robust
    'Balanced_Deep':       
        {'hidden_size': [512, 256, 128, 64],
        'lr': 3e-4,
        'epochs': 20,
        'dropout': 0.3},
    # Overparameterized
    'Wide':
        {'hidden_size': [2048, 1024],
        'lr': 1e-3,
        'epochs': 30,
        'dropout': 0.0},
    # Very overparameterized (optional, keep one)
    'Very_Wide':          
        {'hidden_size': [4096, 2048, 1024, 512],
        'lr': 1e-3,
        'epochs': 50,
        'dropout': 0.0},
}


In [18]:
# Train all models
all_results = {}

datasets_name = 'fashionmnist'

train_loader, test_loader, input_size, num_classes, meta = get_loaders(datasets_name, batch_size=128)


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size, 
        output_size=num_classes, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=config['lr'])
    
    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
    os.makedirs(f'paper_{datasets_name}', exist_ok=True)
    model.save(f'paper_{datasets_name}/{model_name}.pth')



Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0

Architecture: Input(784) -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3

Architecture: Input(784) -> 2048 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0

Architecture: Input(784) -> 4096 -> 2048 -> 1024 -> 512 -> Output(10)
Learning rate: 0.001, Epochs: 50, Dropout: 0.0


In [19]:
for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model1 = geluLinearModel(
        input_size=input_size, 
        output_size=num_classes, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model1.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model1, device, train_loader, optimizer, epoch)
    model1.save(f'paper_{datasets_name}/gelu_{model_name}.pth')


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model2 = SigmoidLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model2.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model2, device, train_loader, optimizer, epoch)
    model2.save(f'paper_{datasets_name}/sigmoid_{model_name}.pth')

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model3 = tanhLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    
    # Optimizer
    optimizer = optim.Adam(model3.parameters(), lr=config['lr'])

    # Training loop
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model3, device, train_loader, optimizer, epoch)
    model3.save(f'paper_{datasets_name}/tanh_{model_name}.pth')


Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0

Architecture: Input(784) -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3

Architecture: Input(784) -> 2048 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0

Architecture: Input(784) -> 4096 -> 2048 -> 1024 -> 512 -> Output(10)
Learning rate: 0.001, Epochs: 50, Dropout: 0.0

Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0

Architecture: Input(784) -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2

Architecture: Input(784) -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 1

# pruning

In [20]:
# Results storage
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
}

for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_{datasets_name}/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 50,890
model name: Tiny_Underfit, test accuracy: 86.24%
per column magnitude pruning test accuracy: 83.88%, sparsity: 0.3979
per row magnitude pruning test accuracy: 84.31%, sparsity: 0.3624
per block magnitude pruning test accuracy: 84.75%, sparsity: 0.3983
mean column mean pruning test accuracy: 83.21%, sparsity: 0.3790
mean row mean pruning test accuracy: 84.02%, sparsity: 0.3983
mean block mean pruning test accuracy: 85.12%, sparsity: 0.3983

Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 217,354
model name: Deep_Narrow, test accuracy: 87.06%
per column magnitude pruning test accuracy: 86.53%, sparsity: 0.3636
per row magnitude pruning test accuracy: 86.82%, sparsity: 0.3466
per block magnitude pruning test accuracy: 86.63%, sparsity: 0.3627
mean column mean pruning test accuracy: 83.53%, sparsity: 0.3563
mean row mean pruning test accuracy: 86.93%, sparsity: 0.3627
mean block mean pruning test ac

# BETA

In [21]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_{datasets_name}/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 50,890
	
model name: Tiny_Underfit, test accuracy: 86.24%, beta: 0.3678794503211975
per column magnitude pruning test accuracy: 60.83%, sparsity: 0.7785
per row magnitude pruning test accuracy: 64.52%, sparsity: 0.7710
per block magnitude pruning test accuracy: 62.61%, sparsity: 0.7787
mean column mean pruning test accuracy: 69.03%, sparsity: 0.7721
mean row mean pruning test accuracy: 62.52%, sparsity: 0.7787
mean block mean pruning test accuracy: 65.50%, sparsity: 0.7787
	
model name: Tiny_Underfit, test accuracy: 86.24%, beta: 0.4493289589881897
per column magnitude pruning test accuracy: 51.43%, sparsity: 0.7295
per row magnitude pruning test accuracy: 70.54%, sparsity: 0.7181
per block magnitude pruning test accuracy: 67.00%, sparsity: 0.7297
mean column mean pruning test accuracy: 67.14%, sparsity: 0.7214
mean row mean pruning test accuracy: 66.31%, sparsity: 0.7297
mea

In [22]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = geluLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_{datasets_name}/gelu_{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 50,890
	
model name: Tiny_Underfit, test accuracy: 86.49%, beta: 0.3678794503211975
per column magnitude pruning test accuracy: 46.37%, sparsity: 0.7753
per row magnitude pruning test accuracy: 61.09%, sparsity: 0.7682
per block magnitude pruning test accuracy: 67.97%, sparsity: 0.7754
mean column mean pruning test accuracy: 61.98%, sparsity: 0.7738
mean row mean pruning test accuracy: 62.11%, sparsity: 0.7754
mean block mean pruning test accuracy: 68.44%, sparsity: 0.7754
	
model name: Tiny_Underfit, test accuracy: 86.49%, beta: 0.4493289589881897
per column magnitude pruning test accuracy: 53.25%, sparsity: 0.7256
per row magnitude pruning test accuracy: 68.89%, sparsity: 0.7148
per block magnitude pruning test accuracy: 75.25%, sparsity: 0.7257
mean column mean pruning test accuracy: 75.06%, sparsity: 0.7236
mean row mean pruning test accuracy: 75.34%, sparsity: 0.7257
mea

In [23]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = SigmoidLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_{datasets_name}/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 50,890
	
model name: Tiny_Underfit, test accuracy: 60.40%, beta: 0.3678794503211975
per column magnitude pruning test accuracy: 18.10%, sparsity: 0.7785
per row magnitude pruning test accuracy: 26.55%, sparsity: 0.7710
per block magnitude pruning test accuracy: 33.76%, sparsity: 0.7787
mean column mean pruning test accuracy: 28.04%, sparsity: 0.7721
mean row mean pruning test accuracy: 55.87%, sparsity: 0.7787
mean block mean pruning test accuracy: 34.01%, sparsity: 0.7787
	
model name: Tiny_Underfit, test accuracy: 60.40%, beta: 0.4493289589881897
per column magnitude pruning test accuracy: 18.43%, sparsity: 0.7295
per row magnitude pruning test accuracy: 34.65%, sparsity: 0.7181
per block magnitude pruning test accuracy: 37.66%, sparsity: 0.7297
mean column mean pruning test accuracy: 34.54%, sparsity: 0.7214
mean row mean pruning test accuracy: 56.90%, sparsity: 0.7297
mea

KeyboardInterrupt: 

In [24]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = tanhLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_{datasets_name}/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    result['shadow_accuracy'].append(test_accuracy)
    
    coefficients = torch.linspace(-1, 1, 11)
    betas = torch.exp(coefficients).tolist()

    for beta in betas:
        # magnitude
        pc_model, pc_neff = model_pc(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        pr_model, pr_neff = model_pr(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        pb_model, pb_neff = model_block(model, renormalize=False, beta=beta, method='magnitude')
        test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_magnitude")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # mean
        mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pc_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pc_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pr_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pr_model))
        result['shadow_accuracy'].append(test_accuracy)

        mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=beta, method='mean')
        test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
        result['model_name'].append(f"{model_name}_pb_{beta}_mean")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(model_sparsity(mean_pb_model))
        result['shadow_accuracy'].append(test_accuracy)

        # summary
        print('\t')
        print('='*40)
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%, beta: {beta}")
        print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
        print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
        print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
        print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
        print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")


Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 50,890


KeyboardInterrupt: 

In [25]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = tanhLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_{datasets_name}/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    print(f"Test accuracy: {origin_test_accuracy:.2f}%")


Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 50,890
Test accuracy: 70.06%

Architecture: Input(784) -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 217,354
Test accuracy: 41.67%

Architecture: Input(784) -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 535,818
Test accuracy: 73.81%

Architecture: Input(784) -> 512 -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3
Total parameters: 575,050
Test accuracy: 66.39%

Architecture: Input(784) -> 2048 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0
Total parameters: 3,716,106
Test accuracy: 14.35%

Architecture: Input(784) -> 4096 -> 2048 -> 1024 -> 512 -> Output(10)
Learning rate: 0.001, Epochs: 50, Dropout: 0.0
Total parameters: 14,234,122
Test accuracy: 10.01%


In [26]:
result = {
    'model_name': [],
    'test_accuracy': [],
    'model_sparsity': [],
    'shadow_accuracy': []
}


for model_name, config in model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = SigmoidLinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'paper_{datasets_name}/{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    print(f"Test accuracy: {origin_test_accuracy:.2f}%")


Architecture: Input(784) -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 10, Dropout: 0.0
Total parameters: 50,890
Test accuracy: 60.40%

Architecture: Input(784) -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> 128 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 217,354
Test accuracy: 10.00%

Architecture: Input(784) -> 512 -> 256 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.2
Total parameters: 535,818
Test accuracy: 40.67%

Architecture: Input(784) -> 512 -> 256 -> 128 -> 64 -> Output(10)
Learning rate: 0.0003, Epochs: 20, Dropout: 0.3
Total parameters: 575,050
Test accuracy: 10.00%

Architecture: Input(784) -> 2048 -> 1024 -> Output(10)
Learning rate: 0.001, Epochs: 30, Dropout: 0.0
Total parameters: 3,716,106
Test accuracy: 66.87%

Architecture: Input(784) -> 4096 -> 2048 -> 1024 -> 512 -> Output(10)
Learning rate: 0.001, Epochs: 50, Dropout: 0.0
Total parameters: 14,234,122
Test accuracy: 16.04%
