# start

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import os
import numpy as np

import copy
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from collections import defaultdict

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# track neff and sparsity

In [7]:
def mask_block(module: nn.Module, beta=1.0, method='magnitude') -> tuple:
    """Build a keep-mask for ``module.weight`` from its effective parameter count.

    The absolute weights are normalised to sum to one, and the effective
    number of parameters is the inverse of the sum of their squares
    (``neff = 1 / sum(p_i ** 2)``). The ``floor(beta * neff)`` entries with
    the largest normalised magnitude are kept; the count is clamped so that
    at least one weight is always kept and, for layers with more than one
    weight, at least one is always pruned.

    Args:
        module: Module exposing a ``weight`` tensor (e.g. ``nn.Linear``).
        beta: Multiplier applied to ``neff`` before flooring.
        method: ``'mean'`` subtracts the mean weight before ranking; any
            other value ranks by the raw absolute weight.

    Returns:
        ``(mask, r_neff)``: a boolean tensor shaped like ``module.weight``
        that is True for kept entries, and a 0-dim tensor holding the number
        of kept entries.
    """
    # reshape (not view) so a non-contiguous weight does not raise.
    flat = module.weight.data.reshape(-1)
    if method == 'mean':
        flat = flat - torch.mean(flat)

    magnitude = torch.abs(flat)
    total = torch.sum(magnitude)
    if total == 0:
        # Every (centred) weight is zero: normalising would divide by zero and
        # produce a NaN neff, which would poison the weights if the caller
        # renormalises with it. Fall back to keeping a single entry.
        x_norm = magnitude
        neff = torch.ones_like(total)
    else:
        x_norm = magnitude / total
        neff = 1 / torch.sum(x_norm ** 2)

    # clamp() returns `max` when min > max, so a one-element weight would be
    # pruned entirely with max=len-1; never let the upper bound drop below 1.
    upper = max(flat.numel() - 1, 1)
    r_neff = torch.floor(beta * neff).clamp(min=1, max=upper)

    # Rank entries by normalised magnitude and mark the first r_neff ranks,
    # then scatter those marks back to the original positions.
    _, indices = torch.sort(x_norm, descending=True)
    sorted_mask = torch.arange(flat.numel(), device=flat.device) < r_neff

    mask = torch.zeros_like(flat, dtype=torch.bool)
    mask.scatter_(0, indices, sorted_mask)
    return mask.view_as(module.weight), r_neff

def model_block(model, renormalize=False, beta=1.0, method='magnitude'):
    """Return a pruned deep copy of ``model``; the input model is not modified.

    Every ``nn.Linear`` in the copy has its weight multiplied by the mask
    produced by ``mask_block``. Biases are left untouched.

    Args:
        model: Module to copy and prune.
        renormalize: When True, the surviving weights of each layer are
            additionally scaled by ``numel / neff`` for that layer.
        beta: Forwarded to ``mask_block``.
        method: Forwarded to ``mask_block``.

    Returns:
        ``(pruned_model, neff)`` where ``neff`` is the kept-weight count that
        ``mask_block`` reported for the last ``nn.Linear`` visited, or
        ``None`` when the model contains no ``nn.Linear`` layer.
    """
    model = copy.deepcopy(model)
    # Initialised up front so a model without nn.Linear layers returns
    # (model, None) instead of raising UnboundLocalError.
    neff = None
    for module in model.modules():
        if not isinstance(module, nn.Linear):
            continue
        mask, neff = mask_block(module, beta=beta, method=method)
        mask = mask.to(module.weight.device)
        n = module.weight.data.numel()
        with torch.no_grad():
            module.weight *= mask
            if renormalize:
                module.weight.mul_(n / neff)
    return model, neff

In [8]:
def model_sparsity(model):
    """Return the fraction of exactly-zero entries among weight parameters.

    Only parameters whose name contains ``'weight'`` are counted, so biases
    are ignored. A model with no such parameters has nothing to prune and is
    reported as ``0.0`` rather than raising ``ZeroDivisionError``.
    """
    total_params = 0
    zero_params = 0

    for name, param in model.named_parameters():
        if 'weight' not in name:
            continue
        total_params += param.numel()
        zero_params += torch.sum(param == 0).item()

    if total_params == 0:
        return 0.0
    return zero_params / total_params

def per_layer_neff(model):
    """Map each weight parameter's name to its count of non-zero entries.

    Parameters whose name does not contain ``'weight'`` (e.g. biases) are
    left out of the returned dict.
    """
    return {
        pname: (tensor != 0).sum().item()
        for pname, tensor in model.named_parameters()
        if 'weight' in pname
    }

In [9]:
class LinearModel(nn.Module):
    """Fully connected ReLU classifier that returns per-class log-probabilities.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of classes.
        hidden_size: Iterable of hidden-layer widths, one ``nn.Linear`` each.
        dropout_rate: Dropout probability applied after every hidden activation.
    """

    # The default is a tuple so the shared default object can never be mutated.
    def __init__(self, input_size, output_size, hidden_size=(512, 512, 512), dropout_rate=0.0):
        super().__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)

        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size

        self.output = nn.Linear(prev_size, output_size)

    def forward(self, x):
        """Flatten ``x`` per sample, run the hidden stack, return log-softmax over dim 1."""
        # reshape instead of view: also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)

        for layer in self.layers:
            x = F.relu(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)

    def save(self, path):
        """Write this model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` saved by :meth:`save` into this model.

        NOTE: ``torch.load`` unpickles its input; only load trusted files.
        """
        # Deserialise onto the CPU so a checkpoint written on a GPU machine
        # still loads on a CPU-only host; load_state_dict then copies the
        # values into the parameters on whatever device they already live on.
        self.load_state_dict(torch.load(path, map_location="cpu"))

# Training function
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over ``train_loader``.

    ``epoch`` is accepted but not used inside this function.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch NLL losses, and
        the percentage of correct predictions relative to
        ``len(train_loader.dataset)``.
    """
    model.train()
    running_loss = 0
    n_correct = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        log_probs = model(inputs)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Predictions are taken from the pre-step forward pass of this batch.
        predicted = log_probs.argmax(dim=1)
        n_correct += (predicted == labels).sum().item()

    mean_loss = running_loss / len(train_loader)
    accuracy = 100. * n_correct / len(train_loader.dataset)
    return mean_loss, accuracy

# Testing function
def test(model, device, test_loader, times=1):
    """Score ``model`` on ``test_loader`` in eval mode, ``times`` times over.

    Returns:
        When ``times == 1``: ``(loss, accuracy)`` for the single pass, where
        loss is the summed NLL divided by ``len(test_loader.dataset)`` and
        accuracy is a percentage of that same dataset length.
        Otherwise: ``(loss_list, accuracy_list, mean_accuracy)`` with one
        list entry per pass.
    """
    model.eval()
    losses, accuracies = [], []

    for _ in range(times):
        summed_loss, hits = 0, 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                log_probs = model(inputs)
                summed_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
                hits += (log_probs.argmax(dim=1) == labels).sum().item()

        n_samples = len(test_loader.dataset)
        losses.append(summed_loss / n_samples)
        accuracies.append(100. * hits / n_samples)

    if times != 1:
        return losses, accuracies, sum(accuracies) / times
    return losses[0], accuracies[0]

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

def get_loaders(dataset_name, batch_size=128, test_batch_size=1000, data_root='./data'):
    """
    Build train/test DataLoaders for one of the supported torchvision datasets.

    ``dataset_name`` is matched case-insensitively against 'mnist',
    'fashionmnist', 'cifar10' and 'cifar100'; anything else raises ValueError.
    Datasets are downloaded into ``data_root`` when missing.

    Returns: train_loader, test_loader, input_size, num_classes, meta (dict)
    (``meta`` is currently always an empty dict.)
    """
    key = dataset_name.lower()
    meta = {}

    # Generic 0.5/0.5 normalisations for 1- and 3-channel images.
    gray_norm = transforms.Normalize((0.5,), (0.5,))
    rgb_norm = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    # key -> (dataset class, normalisation, flattened input size, class count).
    # MNIST is the only entry with its own normalisation constants.
    catalog = {
        'mnist': (datasets.MNIST, transforms.Normalize((0.1307,), (0.3081,)), 28*28, 10),
        'fashionmnist': (datasets.FashionMNIST, gray_norm, 28*28, 10),
        'cifar10': (datasets.CIFAR10, rgb_norm, 32*32*3, 10),
        'cifar100': (datasets.CIFAR100, rgb_norm, 32*32*3, 100),
    }
    if key not in catalog:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    dataset_cls, normalize, inp, ncls = catalog[key]
    # The same transform pipeline is shared by the train and test splits.
    tfm = transforms.Compose([transforms.ToTensor(), normalize])
    train_set = dataset_cls(data_root, train=True, download=True, transform=tfm)
    test_set = dataset_cls(data_root, train=False, download=True, transform=tfm)

    # Only the training loader shuffles.
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_set, batch_size=test_batch_size, shuffle=False, num_workers=2, pin_memory=True)
    return train_loader, test_loader, inp, ncls, meta


## Model Training

In [None]:
# Model configurations: one row per model as (name, hidden widths, lr, epochs).
# Dropout is 0.0 for every MNIST model.
MNIST_model_configs = {
    name: {'hidden_size': hidden, 'lr': lr, 'epochs': epochs, 'dropout': 0.0}
    for name, hidden, lr, epochs in (
        ('Model_1', [16], 1e-4, 5),
        ('Model_2', [512, 128, 32], 3e-4, 15),
        ('Model_3', [1024, 512, 256, 128, 32], 3e-4, 25),
    )
}


In [None]:

# Dataset setup
batch_size = 64
test_batch_size = 1000

dataset_name = 'mnist'

train_loader, test_loader, input_size, num_classes, meta = get_loaders(dataset_name, batch_size, test_batch_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Every checkpoint of this dataset goes to the same directory; create it once.
os.makedirs(f'models/{dataset_name}', exist_ok=True)

# Train all models.
# all_results maps model name -> list of per-epoch metric dicts.
all_results = {}

for model_name, config in MNIST_model_configs.items():
    print(f"\n{'='*60}")
    # Report the sizes actually returned by get_loaders rather than literals.
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")

    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config['lr'])

    # Training loop: one train pass followed by one evaluation per epoch.
    history = []
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
        test_loss, test_accuracy = test(model, device, test_loader)
        print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
        history.append({
            'epoch': epoch,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
        })

    all_results[model_name] = history
    model.save(f'models/{dataset_name}/{model_name}.pth')



Using device: cuda

Architecture: Input(784) -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Epoch 1: Train Loss: 1.0237, Train Accuracy: 73.47%, Test Loss: 0.5058, Test Accuracy: 87.42%
Epoch 2: Train Loss: 0.4370, Train Accuracy: 88.33%, Test Loss: 0.3679, Test Accuracy: 90.12%
Epoch 3: Train Loss: 0.3558, Train Accuracy: 90.05%, Test Loss: 0.3238, Test Accuracy: 90.88%
Epoch 4: Train Loss: 0.3221, Train Accuracy: 90.88%, Test Loss: 0.3024, Test Accuracy: 91.41%
Epoch 5: Train Loss: 0.3028, Train Accuracy: 91.32%, Test Loss: 0.2873, Test Accuracy: 91.76%

Architecture: Input(784) -> 512 -> 128 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.0
Epoch 1: Train Loss: 0.3360, Train Accuracy: 90.08%, Test Loss: 0.1486, Test Accuracy: 95.62%
Epoch 2: Train Loss: 0.1264, Train Accuracy: 96.24%, Test Loss: 0.0995, Test Accuracy: 96.93%
Epoch 3: Train Loss: 0.0845, Train Accuracy: 97.43%, Test Loss: 0.0853, Test Accuracy: 97.34%
Epoch 4: Train Loss: 0.0610,

In [14]:
# Model configurations: one row per model as (name, hidden widths, lr, epochs).
# Dropout is 0.0 for every FashionMNIST model.
fashion_mnist_model_configs = {
    name: {'hidden_size': hidden, 'lr': lr, 'epochs': epochs, 'dropout': 0.0}
    for name, hidden, lr, epochs in (
        ('Model_1', [32], 1e-4, 5),
        ('Model_2', [512, 128, 32], 3e-4, 15),
        ('Model_3', [1024, 512, 256, 128, 32], 3e-4, 25),
    )
}

In [15]:
# Dataset setup
batch_size = 64
test_batch_size = 1000

dataset_name = 'fashionmnist'

train_loader, test_loader, input_size, num_classes, meta = get_loaders(dataset_name, batch_size, test_batch_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Every checkpoint of this dataset goes to the same directory; create it once.
os.makedirs(f'models/{dataset_name}', exist_ok=True)

# Train all models.
# all_results maps model name -> list of per-epoch metric dicts.
all_results = {}

for model_name, config in fashion_mnist_model_configs.items():
    print(f"\n{'='*60}")
    # Report the sizes actually returned by get_loaders rather than literals.
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")

    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config['lr'])

    # Training loop: one train pass followed by one evaluation per epoch.
    history = []
    for epoch in range(1, config['epochs'] + 1):
        train_loss, train_accuracy = train(model, device, train_loader, optimizer, epoch)
        test_loss, test_accuracy = test(model, device, test_loader)
        print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
        history.append({
            'epoch': epoch,
            'train_loss': train_loss,
            'train_accuracy': train_accuracy,
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
        })

    all_results[model_name] = history
    model.save(f'models/{dataset_name}/{model_name}.pth')



Using device: cuda

Architecture: Input(784) -> 32 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Epoch 1: Train Loss: 0.9275, Train Accuracy: 70.41%, Test Loss: 0.6359, Test Accuracy: 78.09%
Epoch 2: Train Loss: 0.5641, Train Accuracy: 80.59%, Test Loss: 0.5504, Test Accuracy: 80.47%
Epoch 3: Train Loss: 0.5043, Train Accuracy: 82.55%, Test Loss: 0.5129, Test Accuracy: 81.69%
Epoch 4: Train Loss: 0.4735, Train Accuracy: 83.59%, Test Loss: 0.4946, Test Accuracy: 82.21%
Epoch 5: Train Loss: 0.4542, Train Accuracy: 84.17%, Test Loss: 0.4790, Test Accuracy: 83.13%

Architecture: Input(784) -> 512 -> 128 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.0
Epoch 1: Train Loss: 0.5816, Train Accuracy: 79.49%, Test Loss: 0.4531, Test Accuracy: 83.49%
Epoch 2: Train Loss: 0.3988, Train Accuracy: 85.62%, Test Loss: 0.4021, Test Accuracy: 85.48%
Epoch 3: Train Loss: 0.3553, Train Accuracy: 87.03%, Test Loss: 0.3781, Test Accuracy: 86.53%
Epoch 4: Train Loss: 0.3277,

## pruning

In [17]:
# Results storage: three parallel lists, one entry per evaluated model variant.
result = {column: [] for column in ('model_name', 'test_accuracy', 'model_sparsity')}

In [None]:
# This was previously spelled `datasets_name`, which left `dataset_name` at
# whatever an earlier cell had set and loaded checkpoints from that directory.
dataset_name = 'mnist'

# Rebuild the test loader for this dataset so the evaluation below cannot
# silently reuse a loader left over from another dataset's cell.
_train_loader, test_loader, input_size, num_classes, _meta = get_loaders(dataset_name)

# (result-name suffix, pruning method, renormalize flag, summary label)
pruning_variants = [
    ('magnitude', 'magnitude', False, 'pb magnitude pruning'),
    ('magnitude_renorm', 'magnitude', True, 'pb magnitude renormalized pruning'),
    ('mean', 'mean', False, 'mean block mean pruning'),
    ('mean_renorm', 'mean', True, 'mean block mean renormalized pruning'),
]

for model_name, config in MNIST_model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")

    # Create model and restore the trained checkpoint
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/{dataset_name}/{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

    # Unpruned baseline
    loss_runs, accuracy_runs, origin_test_accuracy = test(model, device, test_loader, times=5)

    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)

    # Evaluate each pruning variant on a fresh pruned copy of the model.
    summary_lines = []
    for suffix, method, renormalize, label in pruning_variants:
        pruned_model, pruned_neff = model_block(model, renormalize=renormalize, beta=1.0, method=method)
        loss_runs, accuracy_runs, accuracy_mean = test(pruned_model, device, test_loader, times=5)
        sparsity = model_sparsity(pruned_model)

        result['model_name'].append(f"{model_name}_pb_1_{suffix}")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(sparsity)
        summary_lines.append(f"{label} test accuracy: {accuracy_mean:.2f}%, sparsity: {sparsity:.4f}")

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    for line in summary_lines:
        print(line)


Architecture: Input(784) -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 12,730
model name: Model_1, test accuracy: 91.76%
pb magnitude pruning test accuracy: 89.04%, sparsity: 0.3664
pb magnitude renormalized pruning test accuracy: 89.08%, sparsity: 0.3664
mean block mean pruning test accuracy: 89.47%, sparsity: 0.3841
mean block mean renormalized pruning test accuracy: 89.37%, sparsity: 0.3841

Architecture: Input(784) -> 512 -> 128 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.0
Total parameters: 472,042
model name: Model_2, test accuracy: 97.82%
pb magnitude pruning test accuracy: 97.58%, sparsity: 0.4238
pb magnitude renormalized pruning test accuracy: 97.59%, sparsity: 0.4238
mean block mean pruning test accuracy: 97.60%, sparsity: 0.4244
mean block mean renormalized pruning test accuracy: 97.56%, sparsity: 0.4244

Architecture: Input(784) -> 1024 -> 512 -> 256 -> 128 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 25, 

In [16]:
# Results storage: three parallel lists, one entry per evaluated model variant.
result = {column: [] for column in ('model_name', 'test_accuracy', 'model_sparsity')}

In [17]:
dataset_name = 'fashionmnist'

# Rebuild the test loader for this dataset so the evaluation below cannot
# silently reuse a loader left over from another dataset's cell.
_train_loader, test_loader, input_size, num_classes, _meta = get_loaders(dataset_name)

# (result-name suffix, pruning method, renormalize flag, summary label)
pruning_variants = [
    ('magnitude', 'magnitude', False, 'pb magnitude pruning'),
    ('magnitude_renorm', 'magnitude', True, 'pb magnitude renormalized pruning'),
    ('mean', 'mean', False, 'mean block mean pruning'),
    ('mean_renorm', 'mean', True, 'mean block mean renormalized pruning'),
]

for model_name, config in fashion_mnist_model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")

    # Create model and restore the trained checkpoint
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/{dataset_name}/{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

    # Unpruned baseline
    loss_runs, accuracy_runs, origin_test_accuracy = test(model, device, test_loader, times=5)

    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)

    # Evaluate each pruning variant on a fresh pruned copy of the model.
    summary_lines = []
    for suffix, method, renormalize, label in pruning_variants:
        pruned_model, pruned_neff = model_block(model, renormalize=renormalize, beta=1.0, method=method)
        loss_runs, accuracy_runs, accuracy_mean = test(pruned_model, device, test_loader, times=5)
        sparsity = model_sparsity(pruned_model)

        result['model_name'].append(f"{model_name}_pb_1_{suffix}")
        result['test_accuracy'].append(accuracy_mean)
        result['model_sparsity'].append(sparsity)
        summary_lines.append(f"{label} test accuracy: {accuracy_mean:.2f}%, sparsity: {sparsity:.4f}")

    # summary
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    for line in summary_lines:
        print(line)


Architecture: Input(784) -> 32 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 25,450
model name: Model_1, test accuracy: 83.13%
pb magnitude pruning test accuracy: 81.59%, sparsity: 0.3765
pb magnitude renormalized pruning test accuracy: 81.64%, sparsity: 0.3765
mean block mean pruning test accuracy: 82.15%, sparsity: 0.3785
mean block mean renormalized pruning test accuracy: 82.21%, sparsity: 0.3785

Architecture: Input(784) -> 512 -> 128 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 15, Dropout: 0.0
Total parameters: 472,042
model name: Model_2, test accuracy: 88.37%
pb magnitude pruning test accuracy: 87.59%, sparsity: 0.4128
pb magnitude renormalized pruning test accuracy: 87.37%, sparsity: 0.4128
mean block mean pruning test accuracy: 87.22%, sparsity: 0.4130
mean block mean renormalized pruning test accuracy: 87.22%, sparsity: 0.4130

Architecture: Input(784) -> 1024 -> 512 -> 256 -> 128 -> 32 -> Output(10)
Learning rate: 0.0003, Epochs: 25, 

## beta sweep

In [19]:
beta_values = [0.5, 0.75, 1.0, 1.25, 1.5]

# Pin the dataset and its loader explicitly; this cell iterates the MNIST
# configs and must not depend on which dataset cell happened to run last.
dataset_name = 'mnist'
_train_loader, test_loader, input_size, num_classes, _meta = get_loaders(dataset_name)

# beta -> results dict; entries from every model accumulate under each beta
# (previously each model overwrote the previous model's entry).
beta_results = {}
# Unpruned reference accuracies, kept apart from the per-beta tables
# (previously they were appended to whatever `result` dict was left over).
baseline_results = {'model_name': [], 'test_accuracy': [], 'model_sparsity': []}

# (result-name suffix, pruning method, renormalize flag, summary label)
pruning_variants = [
    ('magnitude', 'magnitude', False, 'pb magnitude pruning'),
    ('magnitude_renorm', 'magnitude', True, 'pb magnitude renormalized pruning'),
    ('mean', 'mean', False, 'mean block mean pruning'),
    ('mean_renorm', 'mean', True, 'mean block mean renormalized pruning'),
]

for model_name, config in MNIST_model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")

    # Create model and restore the trained checkpoint
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/{dataset_name}/{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

    # Unpruned baseline
    loss_runs, accuracy_runs, origin_test_accuracy = test(model, device, test_loader, times=5)

    baseline_results['model_name'].append(model_name)
    baseline_results['test_accuracy'].append(origin_test_accuracy)
    baseline_results['model_sparsity'].append(0.0)

    for beta in beta_values:
        print(f"\n--- Beta: {beta} ---")
        # Results storage for this beta, shared across models.
        result = beta_results.setdefault(beta, {
            'model_name': [],
            'test_accuracy': [],
            'model_sparsity': [],
        })

        # Evaluate each pruning variant on a fresh pruned copy of the model.
        summary_lines = []
        for suffix, method, renormalize, label in pruning_variants:
            pruned_model, pruned_neff = model_block(model, renormalize=renormalize, beta=beta, method=method)
            loss_runs, accuracy_runs, accuracy_mean = test(pruned_model, device, test_loader, times=5)
            sparsity = model_sparsity(pruned_model)

            result['model_name'].append(f"{model_name}_pb_{beta}_{suffix}")
            result['test_accuracy'].append(accuracy_mean)
            result['model_sparsity'].append(sparsity)
            summary_lines.append(f"{label} test accuracy: {accuracy_mean:.2f}%, sparsity: {sparsity:.4f}")

        # summary
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
        for line in summary_lines:
            print(line)


Architecture: Input(784) -> 16 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 12,730

--- Beta: 0.5 ---
model name: Model_1, test accuracy: 91.76%
pb magnitude pruning test accuracy: 68.07%, sparsity: 0.6832
pb magnitude renormalized pruning test accuracy: 68.30%, sparsity: 0.6832
mean block mean pruning test accuracy: 66.13%, sparsity: 0.6921
mean block mean renormalized pruning test accuracy: 65.29%, sparsity: 0.6921

--- Beta: 0.75 ---
model name: Model_1, test accuracy: 91.76%
pb magnitude pruning test accuracy: 77.29%, sparsity: 0.5248
pb magnitude renormalized pruning test accuracy: 77.09%, sparsity: 0.5248
mean block mean pruning test accuracy: 75.74%, sparsity: 0.5382
mean block mean renormalized pruning test accuracy: 75.93%, sparsity: 0.5382

--- Beta: 1.0 ---
model name: Model_1, test accuracy: 91.76%
pb magnitude pruning test accuracy: 89.04%, sparsity: 0.3664
pb magnitude renormalized pruning test accuracy: 89.08%, sparsity: 0.3664
mean blo

In [18]:
beta_values = [0.5, 0.75, 1.0, 1.25, 1.5]
dataset_name = 'fashionmnist'

# Rebuild the test loader for this dataset so the evaluation below cannot
# silently reuse a loader left over from another dataset's cell.
_train_loader, test_loader, input_size, num_classes, _meta = get_loaders(dataset_name)

# beta -> results dict; entries from every model accumulate under each beta
# (previously each model overwrote the previous model's entry).
beta_results = {}
# Unpruned reference accuracies, kept apart from the per-beta tables
# (previously they were appended to whatever `result` dict was left over).
baseline_results = {'model_name': [], 'test_accuracy': [], 'model_sparsity': []}

# (result-name suffix, pruning method, renormalize flag, summary label)
pruning_variants = [
    ('magnitude', 'magnitude', False, 'pb magnitude pruning'),
    ('magnitude_renorm', 'magnitude', True, 'pb magnitude renormalized pruning'),
    ('mean', 'mean', False, 'mean block mean pruning'),
    ('mean_renorm', 'mean', True, 'mean block mean renormalized pruning'),
]

for model_name, config in fashion_mnist_model_configs.items():
    print(f"\n{'='*60}")
    print(f"Architecture: Input({input_size}) -> {' -> '.join(map(str, config['hidden_size']))} -> Output({num_classes})")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")

    # Create model and restore the trained checkpoint
    model = LinearModel(
        input_size=input_size,
        output_size=num_classes,
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/{dataset_name}/{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

    # Unpruned baseline
    loss_runs, accuracy_runs, origin_test_accuracy = test(model, device, test_loader, times=5)

    baseline_results['model_name'].append(model_name)
    baseline_results['test_accuracy'].append(origin_test_accuracy)
    baseline_results['model_sparsity'].append(0.0)

    for beta in beta_values:
        print(f"\n--- Beta: {beta} ---")
        # Results storage for this beta, shared across models.
        result = beta_results.setdefault(beta, {
            'model_name': [],
            'test_accuracy': [],
            'model_sparsity': [],
        })

        # Evaluate each pruning variant on a fresh pruned copy of the model.
        summary_lines = []
        for suffix, method, renormalize, label in pruning_variants:
            pruned_model, pruned_neff = model_block(model, renormalize=renormalize, beta=beta, method=method)
            loss_runs, accuracy_runs, accuracy_mean = test(pruned_model, device, test_loader, times=5)
            sparsity = model_sparsity(pruned_model)

            result['model_name'].append(f"{model_name}_pb_{beta}_{suffix}")
            result['test_accuracy'].append(accuracy_mean)
            result['model_sparsity'].append(sparsity)
            summary_lines.append(f"{label} test accuracy: {accuracy_mean:.2f}%, sparsity: {sparsity:.4f}")

        # summary
        print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
        for line in summary_lines:
            print(line)


Architecture: Input(784) -> 32 -> Output(10)
Learning rate: 0.0001, Epochs: 5, Dropout: 0.0
Total parameters: 25,450

--- Beta: 0.5 ---
model name: Model_1, test accuracy: 83.13%
pb magnitude pruning test accuracy: 61.73%, sparsity: 0.6882
pb magnitude renormalized pruning test accuracy: 61.90%, sparsity: 0.6882
mean block mean pruning test accuracy: 66.96%, sparsity: 0.6893
mean block mean renormalized pruning test accuracy: 67.19%, sparsity: 0.6893

--- Beta: 0.75 ---
model name: Model_1, test accuracy: 83.13%
pb magnitude pruning test accuracy: 80.85%, sparsity: 0.5324
pb magnitude renormalized pruning test accuracy: 81.08%, sparsity: 0.5324
mean block mean pruning test accuracy: 77.21%, sparsity: 0.5339
mean block mean renormalized pruning test accuracy: 76.92%, sparsity: 0.5339

--- Beta: 1.0 ---
model name: Model_1, test accuracy: 83.13%
pb magnitude pruning test accuracy: 81.59%, sparsity: 0.3765
pb magnitude renormalized pruning test accuracy: 81.64%, sparsity: 0.3765
mean blo

# not tested yet

# Different activation function

In [None]:
# Model class with optional dropout
class geluLinearModel(nn.Module):
    """Fully connected GELU classifier that returns per-class log-probabilities.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of classes.
        hidden_size: Iterable of hidden-layer widths, one ``nn.Linear`` each.
        dropout_rate: Dropout probability applied after every hidden activation.
    """

    # The default is a tuple so the shared default object can never be mutated.
    def __init__(self, input_size, output_size, hidden_size=(512, 512, 512), dropout_rate=0.0):
        super().__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)

        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size

        self.output = nn.Linear(prev_size, output_size)

    def forward(self, x):
        """Flatten ``x`` per sample, run the hidden stack, return log-softmax over dim 1."""
        # reshape instead of view: also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)

        for layer in self.layers:
            x = F.gelu(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)

    def save(self, path):
        """Write this model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` saved by :meth:`save` into this model.

        NOTE: ``torch.load`` unpickles its input; only load trusted files.
        """
        # Deserialise onto the CPU so a checkpoint written on a GPU machine
        # still loads on a CPU-only host; load_state_dict then copies the
        # values into the parameters on whatever device they already live on.
        self.load_state_dict(torch.load(path, map_location="cpu"))
        
        
# Model class with optional dropout
class SigmoidLinearModel(nn.Module):
    """Fully connected classifier with sigmoid activations and optional dropout.

    The network is a stack of ``nn.Linear`` layers (one per entry in
    ``hidden_size``), each followed by a sigmoid and dropout, and a final
    linear layer whose output is passed through ``log_softmax``.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of output classes.
        hidden_size: Iterable with the width of each hidden layer.
        dropout_rate: Probability passed to ``nn.Dropout``; ``0.0`` disables it.
    """

    # The default is a tuple so that no mutable object is shared between calls.
    def __init__(self, input_size, output_size, hidden_size=(512, 512, 512), dropout_rate=0.0):
        super().__init__()
        # Attribute names and creation order are kept stable so that state_dict
        # keys (and therefore existing checkpoints) remain compatible.
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)

        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size

        self.output = nn.Linear(prev_size, output_size)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(x.size(0), output_size)``."""
        # reshape (rather than view) also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)  # Flatten everything except the first dim

        for layer in self.layers:
            # torch.sigmoid computes the same function as F.sigmoid.
            x = torch.sigmoid(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)

    def save(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` previously written by :meth:`save`.

        The checkpoint is deserialized onto the CPU first, so a file saved from
        a CUDA model can be read on a machine without a GPU;
        ``load_state_dict`` then copies the values into this module's existing
        parameters on whatever device they live on.
        """
        state = torch.load(path, map_location='cpu')
        self.load_state_dict(state)
        
        
        
# Model class with optional dropout
class tanhLinearModel(nn.Module):
    """Fully connected classifier with tanh activations and optional dropout.

    The network is a stack of ``nn.Linear`` layers (one per entry in
    ``hidden_size``), each followed by tanh and dropout, and a final linear
    layer whose output is passed through ``log_softmax``.

    Args:
        input_size: Number of features per sample after flattening.
        output_size: Number of output classes.
        hidden_size: Iterable with the width of each hidden layer.
        dropout_rate: Probability passed to ``nn.Dropout``; ``0.0`` disables it.
    """

    # The default is a tuple so that no mutable object is shared between calls.
    def __init__(self, input_size, output_size, hidden_size=(512, 512, 512), dropout_rate=0.0):
        super().__init__()
        # Attribute names and creation order are kept stable so that state_dict
        # keys (and therefore existing checkpoints) remain compatible.
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout_rate)

        prev_size = input_size
        for size in hidden_size:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size

        self.output = nn.Linear(prev_size, output_size)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(x.size(0), output_size)``."""
        # reshape (rather than view) also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)  # Flatten everything except the first dim

        for layer in self.layers:
            # torch.tanh computes the same function as F.tanh.
            x = torch.tanh(layer(x))
            x = self.dropout(x)  # Apply dropout after activation
        x = self.output(x)
        return F.log_softmax(x, dim=1)

    def save(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` previously written by :meth:`save`.

        The checkpoint is deserialized onto the CPU first, so a file saved from
        a CUDA model can be read on a machine without a GPU;
        ``load_state_dict`` then copies the values into this module's existing
        parameters on whatever device they live on.
        """
        state = torch.load(path, map_location='cpu')
        self.load_state_dict(state)


In [None]:
# Model configurations
# Maps a configuration name to its hyperparameters. Every configuration has
# three hidden layers; they differ in layer width, learning rate, number of
# epochs, and dropout rate. 'description' is only used in printed banners.
model_configs = {
    'Model_1_Underfit': {
        'hidden_size': [64, 32, 16],  # Three narrow hidden layers
        'lr': 1e-4,  # Lower learning rate
        'epochs': 5,  # Fewer epochs
        'dropout': 0.0,
        # The previous text ("1 layer, 32 units") no longer matched hidden_size.
        'description': 'Underfitted: Too simple (3 layers, 64/32/16 units)'
    },
    'Model_2_Slight_Underfit': {
        'hidden_size': [256, 128, 64],  # Three small layers
        'lr': 5e-4,
        'epochs': 8,
        'dropout': 0.0,
        'description': 'Slightly underfitted: Simple architecture'
    },
    'Model_3_Well_Trained': {
        'hidden_size': [512, 256, 128],  # Moderate width
        'lr': 3e-4,
        'epochs': 15,
        'dropout': 0.2,  # Some regularization
        'description': 'Well-trained: Balanced architecture with dropout'
    },
    'Model_4_Well_Trained_Deep': {
        'hidden_size': [1024, 512, 256],  # Same depth as the others, wider, with dropout
        'lr': 3e-4,
        'epochs': 20,
        'dropout': 0.3,  # More dropout for regularization
        'description': 'Well-trained: Deeper with good regularization'
    },
    'Model_5_Overfit': {
        'hidden_size': [2048, 1024, 1024],  # Very wide
        'lr': 1e-3,  # Higher learning rate
        'epochs': 30,  # Many epochs
        'dropout': 0.0,  # No regularization
        'description': 'Overfitted: Very complex without regularization'
    },
    'Model_6_Extra_Overfit': {
        'hidden_size': [4096, 2048, 1024],  # Extremely wide
        'lr': 1e-3,
        'epochs': 50,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    },
    'Model_7_Extra_Overfit': {
        'hidden_size': [8192, 4096, 2048],  # Widest configuration, longest training
        'lr': 1e-3,
        'epochs': 100,
        'dropout': 0.0,
        'description': 'Extra Overfitted: Very complex without regularization'
    }
}

# Train all models
all_results = {}

# Create the checkpoint directory up front so a missing folder cannot make
# the save fail only after a (possibly very long) training run has finished.
checkpoint_dir = 'models/MNIST_model'
os.makedirs(checkpoint_dir, exist_ok=True)

# (checkpoint file prefix, model class) for each activation under study.
# The order matches the original three loops: all GELU models are trained
# first, then all sigmoid models, then all tanh models.
activation_variants = [
    ('gelu', geluLinearModel),
    ('sigmoid', SigmoidLinearModel),
    ('tanh', tanhLinearModel),
]

# Last model trained for each activation, keyed by checkpoint prefix.
last_trained = {}

for activation_name, model_class in activation_variants:
    for model_name, config in model_configs.items():
        print(f"\n{'='*60}")
        print(f"Training {model_name}: {config['description']}")
        print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
        print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
        print(f"{'='*60}")

        # Create model
        net = model_class(
            input_size=28*28,
            output_size=10,
            hidden_size=config['hidden_size'],
            dropout_rate=config['dropout']
        ).to(device)

        # Optimizer
        optimizer = optim.Adam(net.parameters(), lr=config['lr'])

        # Training loop
        for epoch in range(1, config['epochs'] + 1):
            train_loss, train_accuracy = train(net, device, train_loader, optimizer, epoch)

        # One checkpoint per (activation, configuration) pair.
        net.save(f'{checkpoint_dir}/{activation_name}_{model_name}.pth')
        last_trained[activation_name] = net

# Keep the names the three separate loops used to leave bound (each one holds
# the last model trained for its activation) in case other cells use them.
model1 = last_trained['gelu']
model2 = last_trained['sigmoid']
model3 = last_trained['tanh']


# GELU

In [None]:
# Evaluate every saved GELU checkpoint: the unpruned model first, then six
# pruned copies (column / row / block pruning, each with the 'magnitude' and
# the 'mean' method). Each evaluation appends one entry to result['model_name'],
# result['test_accuracy'] and result['model_sparsity'].
for model_name, config in model_configs.items():
    # NOTE: the banner text says "Training", but this cell only reloads the
    # saved checkpoint and evaluates it.
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = geluLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/gelu_{model_name}.pth')
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    # Baseline: the third value returned by test() is the accuracy recorded
    # below (presumably averaged over `times` runs -- confirm against test()).
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    # model_pc / model_pr are defined elsewhere; going by the summary labels
    # below they prune per column / per row (TODO confirm).
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    # model_block prunes a deep copy, so `model` itself is left untouched.
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    # The last six entries of each result list are the six pruned variants in
    # the order appended above, so accuracy and sparsity must use the SAME
    # index on every line (the sparsity indices used to be -4, -3, -1, -2,
    # -1, -1, which reported the wrong variant's sparsity on most lines).
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-6]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-5]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")

# SIGMOID

In [None]:
# Evaluate every saved sigmoid checkpoint: the unpruned model first, then six
# pruned copies (column / row / block pruning, each with the 'magnitude' and
# the 'mean' method). Each evaluation appends one entry to result['model_name'],
# result['test_accuracy'] and result['model_sparsity'].
for model_name, config in model_configs.items():
    # NOTE: the banner text says "Training", but this cell only reloads the
    # saved checkpoint and evaluates it.
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = SigmoidLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/sigmoid_{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    # Baseline: the third value returned by test() is the accuracy recorded
    # below (presumably averaged over `times` runs -- confirm against test()).
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    # model_pc / model_pr are defined elsewhere; going by the summary labels
    # below they prune per column / per row (TODO confirm).
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    # model_block prunes a deep copy, so `model` itself is left untouched.
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    # The last six entries of each result list are the six pruned variants in
    # the order appended above, so accuracy and sparsity must use the SAME
    # index on every line (the sparsity indices used to be -4, -3, -1, -2,
    # -1, -1, which reported the wrong variant's sparsity on most lines).
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-6]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-5]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")

# TANH

In [None]:
# Evaluate every saved tanh checkpoint: the unpruned model first, then six
# pruned copies (column / row / block pruning, each with the 'magnitude' and
# the 'mean' method). Each evaluation appends one entry to result['model_name'],
# result['test_accuracy'] and result['model_sparsity'].
for model_name, config in model_configs.items():
    # NOTE: the banner text says "Training", but this cell only reloads the
    # saved checkpoint and evaluates it.
    print(f"\n{'='*60}")
    print(f"Training {model_name}: {config['description']}")
    print(f"Architecture: Input(784) -> {' -> '.join(map(str, config['hidden_size']))} -> Output(10)")
    print(f"Learning rate: {config['lr']}, Epochs: {config['epochs']}, Dropout: {config['dropout']}")
    print(f"{'='*60}")
    
    # Create model
    model = tanhLinearModel(
        input_size=28*28, 
        output_size=10, 
        hidden_size=config['hidden_size'],
        dropout_rate=config['dropout']
    ).to(device)
    model.load(f'models/MNIST_model/tanh_{model_name}.pth')

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    
    # Baseline: the third value returned by test() is the accuracy recorded
    # below (presumably averaged over `times` runs -- confirm against test()).
    test_loss, test_accuracy, origin_test_accuracy = test(model, device, test_loader, times=5)
    
    result['model_name'].append(model_name)
    result['test_accuracy'].append(origin_test_accuracy)
    result['model_sparsity'].append(0.0)
    
    # magnitude
    # model_pc / model_pr are defined elsewhere; going by the summary labels
    # below they prune per column / per row (TODO confirm).
    pc_model, pc_neff = model_pc(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pc_model))
    
    pr_model, pr_neff = model_pr(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pr_model))
    
    # model_block prunes a deep copy, so `model` itself is left untouched.
    pb_model, pb_neff = model_block(model, renormalize=False, beta=1.0, method='magnitude')
    test_loss, test_accuracy, accuracy_mean = test(pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_magnitude")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(pb_model))

    # mean
    mean_pc_model, mean_pc_neff = model_pc(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pc_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pc_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pc_model))

    mean_pr_model, mean_pr_neff = model_pr(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pr_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pr_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pr_model))

    mean_pb_model, mean_pb_neff = model_block(model, renormalize=False, beta=1.0, method='mean')
    test_loss, test_accuracy, accuracy_mean = test(mean_pb_model, device, test_loader, times=5)
    result['model_name'].append(f"{model_name}_pb_1_mean")
    result['test_accuracy'].append(accuracy_mean)
    result['model_sparsity'].append(model_sparsity(mean_pb_model))

    # summary
    # The last six entries of each result list are the six pruned variants in
    # the order appended above, so accuracy and sparsity must use the SAME
    # index on every line (the sparsity indices used to be -4, -3, -1, -2,
    # -1, -1, which reported the wrong variant's sparsity on most lines).
    print(f"model name: {model_name}, test accuracy: {origin_test_accuracy:.2f}%")
    print(f"per column magnitude pruning test accuracy: {result['test_accuracy'][-6]:.2f}%, sparsity: {result['model_sparsity'][-6]:.4f}")
    print(f"per row magnitude pruning test accuracy: {result['test_accuracy'][-5]:.2f}%, sparsity: {result['model_sparsity'][-5]:.4f}")
    print(f"per block magnitude pruning test accuracy: {result['test_accuracy'][-4]:.2f}%, sparsity: {result['model_sparsity'][-4]:.4f}")
    print(f"mean column mean pruning test accuracy: {result['test_accuracy'][-3]:.2f}%, sparsity: {result['model_sparsity'][-3]:.4f}")
    print(f"mean row mean pruning test accuracy: {result['test_accuracy'][-2]:.2f}%, sparsity: {result['model_sparsity'][-2]:.4f}")
    print(f"mean block mean pruning test accuracy: {result['test_accuracy'][-1]:.2f}%, sparsity: {result['model_sparsity'][-1]:.4f}")
