In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.environ['HOME_DIR'] = 'drive/MyDrive/hidden-networks'
# !pip install -r $HOME_DIR/requirements.txt

import sys
sys.path.append(os.path.join('/content', os.environ['HOME_DIR']))

!pip install optuna
import optuna

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.autograd as autograd
import collections

from supermask_pruning import GetSubnet, SupermaskConv, SupermaskLinear
from supermask_pruning import train, test

class ArgClass:
    """Attribute-style view of a settings dictionary.

    Every key of the mapping passed to the constructor becomes an attribute
    on the instance, so ``ArgClass({"seed": 1}).seed == 1``.
    """

    def __init__(self, args):
        # Keys must be strings because they are forwarded as keyword arguments.
        self.setattrs(**args)

    def setattrs(self, **kwargs):
        """Set (or overwrite) one attribute per keyword argument."""
        for key in kwargs:
            setattr(self, key, kwargs[key])

In [4]:
class Net(nn.Module):
    """Small supermask network: conv1 -> conv2 -> fc1 -> fc2 -> fc3.

    The layers are registered in exactly that order. ``args`` must provide
    ``bias`` and ``init``; an optional ``sparsity`` attribute supplies one
    dict of extra keyword arguments per layer (five in total). ``args`` is
    kept on the module and exposed as its extra state.
    """

    def __init__(self, args, input_channels, image_size, num_labels):
        super().__init__()

        # One kwargs dict per layer; when args has no "sparsity" attribute
        # every layer is built with sparsity=1.0.
        per_layer = getattr(args, "sparsity", [{"sparsity": 1.0} for _ in range(5)])

        def layer_kwargs(index):
            # Shared settings first, then whatever is specific to this layer.
            return dict(bias=args.bias, init=args.init, **per_layer[index])

        self.conv1 = SupermaskConv(input_channels, 64, 3, 1, **layer_kwargs(0))
        self.conv2 = SupermaskConv(64, 64, 3, 1, **layer_kwargs(1))

        # Number of features fed to fc1: 64 channels over a side that is 4
        # pixels shorter than the input, divided by 4.
        # NOTE(review): presumably two unpadded 3x3 convs followed by the
        # 2x2 max-pool in forward() -- confirm against SupermaskConv.
        side = image_size - 4
        flat_features = side * side * 64 // 4

        self.fc1 = SupermaskLinear(flat_features, 256, **layer_kwargs(2))
        self.fc2 = SupermaskLinear(256, 256, **layer_kwargs(3))
        self.fc3 = SupermaskLinear(256, num_labels, **layer_kwargs(4))
        self.args = args

    def forward(self, x):
        """Return log-softmax scores over the labels (dim 1) for input ``x``."""
        hidden = F.relu(self.conv1(x))
        # No ReLU is applied between conv2 and the pooling step.
        hidden = F.max_pool2d(self.conv2(hidden), 2)
        hidden = torch.flatten(hidden, 1)
        hidden = F.relu(self.fc1(hidden))
        hidden = F.relu(self.fc2(hidden))
        return F.log_softmax(self.fc3(hidden), dim=1)

    def get_extra_state(self):
        """Expose the construction arguments as the module's extra state."""
        return self.args

    def set_extra_state(self, state):
        """Restore the construction arguments from the module's extra state."""
        self.args = state

In [None]:
# Smoke test: build a Net for 3-channel 32x32 inputs with 10 labels and print
# the weight tensor of each of its layers.
# No "sparsity" entry is supplied, so Net falls back to {"sparsity": 1.0} per layer.
args = {"init": "signed_constant", "bias": False}
for i in Net(ArgClass(args), 3, 32, 10).children():
  print(i.weight)

In [12]:
# Module-level slot that main() fills with the model being trained when
# train() raises, so the model can be inspected afterwards.
failed_model = None

In [13]:
# The main function runs the full training loop on a dataset of your choice
def main(model_args, train_args, base_model=None, trial=None):
    """Train a supermask ``Net`` on MNIST or CIFAR10.

    Args:
        model_args (dict): Settings that define the model and its optimisation.
            Keys read here: ``dataset`` ("MNIST" or "CIFAR10"), ``no_cuda``,
            ``seed``, ``batch_size``, ``optimizer`` (name of a class in
            ``torch.optim``), ``optim_kwargs``, ``scheduler`` (truthy enables
            ``CosineAnnealingLR``), ``epochs`` and, optionally, ``copy_layers``
            and ``save_name``. The wrapped settings object is also passed to
            ``Net``.
            ``epochs`` is either an int (number of epochs) or a list with one
            entry per child layer of the model: training runs for
            ``max(epochs)`` epochs and a layer whose entry equals ``e`` has
            ``freeze()`` called on it right before epoch ``e + 1``.
        train_args (dict): Settings that do not affect the model. Keys read
            here: ``data`` (dataset root folder), ``test_batch_size``,
            ``log_interval``, ``train_eval_interval``, ``test_eval_interval``
            and ``eval_on_last``.
        base_model: Optional model whose layers named in ``copy_layers`` are
            copied into the new model.
        trial: Optional Optuna trial. When given, evaluation metrics are stored
            as user attributes, test accuracy is reported, and the run is
            aborted if the trial should be pruned.

    Returns:
        tuple: ``(model, device, train_loader, test_loader, criterion)``, where
        ``train_loader`` is the loader over the training split that uses the
        non-augmented transform.

    Raises:
        ValueError: For an unsupported dataset, or when exactly one of
            ``copy_layers`` (non-empty) and ``base_model`` is provided.
        optuna.exceptions.TrialPruned: When ``trial`` reports that it should
            be pruned.
    """
    global failed_model

    args = ArgClass(model_args)
    train_args = ArgClass(train_args)
    dataset = args.dataset

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Using device {device}")

    if dataset == "MNIST":
        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.1307,), (0.3081,))
                                        ])
        # MNIST is trained without augmentation.
        train_transform = transform
        input_channels, image_size, num_labels = 1, 28, 10
    elif dataset == "CIFAR10":
        train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4),
                                              transforms.RandomHorizontalFlip(),
                                              transforms.ToTensor(),
                                              transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                                              ])
        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                                        ])
        input_channels, image_size, num_labels = 3, 32, 10
    else:
        raise ValueError("Only supported datasets are CIFAR10 and MNIST currently.")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # Training split with the plain transform: used only for evaluation.
    train_loader = torch.utils.data.DataLoader(
        getattr(datasets, dataset)(os.path.join(train_args.data, dataset),
                                   train=True, download=True, transform=transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    # Training split with the (possibly augmenting) transform: used for training.
    train_augmented_loader = torch.utils.data.DataLoader(
        getattr(datasets, dataset)(os.path.join(train_args.data, dataset),
                                   train=True, transform=train_transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        getattr(datasets, dataset)(os.path.join(train_args.data, dataset),
                                   train=False, transform=transform),
        batch_size=train_args.test_batch_size, shuffle=True, **kwargs)

    model = Net(args, input_channels, image_size, num_labels).to(device)

    if getattr(args, "copy_layers", None) is not None:
        # A non-empty copy_layers and a base_model must be given together.
        if (bool(args.copy_layers) ^ (base_model is not None)):
            raise ValueError("copy_layers arg must be None or [] if base_model is not specified")
        if base_model is not None and args.copy_layers:
            for layer in args.copy_layers:
                # strict=False: only the keys of this one layer are present.
                model.load_state_dict(getattr(base_model, layer).state_dict(prefix=f"{layer}."), strict=False)

    # NOTE: only pass the parameters where p.requires_grad == True to the optimizer! Important!
    optimizer = getattr(optim, args.optimizer)(
        [p for p in model.parameters() if p.requires_grad],
        **args.optim_kwargs,
    )
    assert isinstance(args.epochs, (list, int))
    if isinstance(args.epochs, int):
        num_epochs, check_freeze = args.epochs, False
    else:
        num_epochs, check_freeze = max(args.epochs), True
    criterion = nn.CrossEntropyLoss().to(device)
    scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs) if args.scheduler else None

    for epoch in range(1, num_epochs + 1):
        if check_freeze:
            for freeze_at_epoch, child in zip(args.epochs, model.children()):
                if freeze_at_epoch == epoch - 1:
                    child.freeze()
                    print(f"Freezing {child} before epoch {epoch}")
        try:
            train(model, train_args.log_interval, device, train_augmented_loader, optimizer, criterion, epoch)
        except Exception:
            # Keep the model around for inspection, then re-raise unchanged.
            failed_model = model
            raise

        # Compare against num_epochs rather than args.epochs: the latter may be
        # a list of freeze epochs, which never equals an epoch number.
        is_last_epoch = epoch == num_epochs

        if (train_args.train_eval_interval and epoch % train_args.train_eval_interval == 0) or (train_args.eval_on_last and is_last_epoch):
            train_acc, train_loss = test(model, device, criterion, train_loader, name="Train")
            if trial:
                # Metrics are stored as {epoch: value} dicts that grow over the run.
                trial.set_user_attr('train_acc', {**trial.user_attrs.get('train_acc', {}), **{epoch: train_acc}})
                trial.set_user_attr('train_loss', {**trial.user_attrs.get('train_loss', {}), **{epoch: train_loss}})
        if (train_args.test_eval_interval and epoch % train_args.test_eval_interval == 0) or (train_args.eval_on_last and is_last_epoch):
            test_acc, test_loss = test(model, device, criterion, test_loader, name="Test")
            if trial:
                trial.set_user_attr('test_acc', {**trial.user_attrs.get('test_acc', {}), **{epoch: test_acc}})
                trial.set_user_attr('test_loss', {**trial.user_attrs.get('test_loss', {}), **{epoch: test_loss}})
                # Steps reported to the trial are zero-based.
                trial.report(test_acc, epoch-1)
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()

        if scheduler:
            scheduler.step()

    save_name = getattr(args, "save_name", None)
    if save_name is not None:
        save_dir = os.path.join(os.environ['HOME_DIR'], "trained_networks")
        # Create the folder first so a finished run is not lost to a missing directory.
        os.makedirs(save_dir, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(save_dir, save_name))

    return model, device, train_loader, test_loader, criterion

def get_prune_mask(layer, sparsity):
    """Apply ``GetSubnet`` to the absolute values of ``layer.scores``.

    The computation runs with gradient tracking disabled; the result of
    ``GetSubnet.apply`` is returned as is.
    """
    with torch.no_grad():
        score_magnitudes = layer.scores.abs()
        mask = GetSubnet.apply(score_magnitudes, sparsity)
    return mask

In [None]:
# # Arguments that do not affect model at all
# Threshold sweep: train one network per threshold value and collect the final
# train/test accuracy and loss of each run.
train_args = {
    "test_batch_size": 1000, # input batch size for testing (default: 1000)
    'data': '../data', # Location to store data (e.g. MNIST)
    'log_interval': 500000, # how many batches to wait before logging training status
    'train_eval_interval': 10, # epoch interval at which to print training accuracy
    'test_eval_interval': 10, # epoch interval at which to print test accuracy
    'eval_on_last': True
}

# Model/optimisation settings; 'seed', 'sparsity' and 'save_name' are
# overwritten below before any training starts.
args = {
  "dataset": "CIFAR10",
  "init": "signed_constant",
  "batch_size": 128, # input batch size for training (default: 64)
  "epochs": 100, # number of epochs to train (default: 14)
  "optimizer": "SGD",
  "optim_kwargs": {"lr": 0.1, "momentum": 0.9, "weight_decay": 0.0001},
  "scheduler": True, # False for Adam, True for SGD, does CosineAnnealing
  'no_cuda': False, # disables CUDA training
  'seed': 1000, # random seed (default: 1)
  'save_name': None, #"conv2_frozen_sp50_rs1000", # "simple20_rs2", # For Saving the current Model, None if not saving
  'sparsity': [{"threshold": 0.2, "min_sparsity": 0.05} for i in range(5)], # 'how sparse is each layer'
  'copy_layers': [], # ['conv1', 'conv2', 'fc2'],
  'bias': False
}

# trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
name_of_experiment = 'threshold-100epoch'
# One (threshold, accuracy, loss) tuple is appended per run.
train_results = []
test_results = []
# Every run of the sweep uses this seed (replaces the 1000 set in the dict above).
args['seed'] = 282
for threshold in [0.02, 0.05, 0.1, 0.2, 0.5, 0.01]:
    # The same threshold/min_sparsity pair is used for all five layers.
    args['sparsity'] = [{"threshold": threshold, "min_sparsity": 0.05} for i in range(5)]
    # main() stores the trained state dict under trained_networks/<save_name>.
    args['save_name'] = f'FINAL_thresholds_{threshold}'
    print(f"----{args['seed']}----{args}----")
    trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
    # Final evaluation of the trained model on both splits.
    train_acc, train_loss = test(trained_model, device, criterion, train_loader, name="Train")
    test_acc, test_loss = test(trained_model, device, criterion, test_loader)
    train_results.append((threshold, train_acc, train_loss))
    test_results.append((threshold, test_acc, test_loss))
    # The same file is overwritten after every threshold with the results
    # accumulated so far.
    torch.save((train_args, args, train_results, test_results), \
               os.path.join(os.environ["HOME_DIR"], "results", f"{name_of_experiment}_{args['dataset']}.pt"))

# # thresholds = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]
# # train_results = {x: [] for x in thresholds}
# # test_results = {x: [] for x in thresholds}
# # name_of_experiment = "fc1_thresholds"
    
# # for fc1_threshold in thresholds:
# #     for rs in range(70, 73):
# #         args["seed"] = rs
# #         args['sparsity'][2]['threshold'] = fc1_threshold
# #         print(f"----{rs}----{fc1_threshold}----")
# #         print(args['sparsity'])
# #         # args["save_name"] = f"{args['dataset']}_{rs}_{name_of_experiment}_{fc1_threshold}.pt"
# #         trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
# #         train_acc, train_loss = test(trained_model, device, criterion, train_loader)
# #         test_acc, test_loss = test(trained_model, device, criterion, test_loader)
# #         train_results[fc1_threshold].append((train_acc, train_loss))
# #         test_results[fc1_threshold].append((test_acc, test_loss))
# #         torch.save((train_args, args, train_results, test_results), \
# #                    os.path.join(os.environ["HOME_DIR"], "results", f"{args['dataset']}-{name_of_experiment}.pt"))

----282----{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 128, 'epochs': 100, 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0001}, 'scheduler': True, 'no_cuda': False, 'seed': 282, 'save_name': 'FINAL_thresholds_0.02', 'sparsity': [{'threshold': 0.02, 'min_sparsity': 0.05}, {'threshold': 0.02, 'min_sparsity': 0.05}, {'threshold': 0.02, 'min_sparsity': 0.05}, {'threshold': 0.02, 'min_sparsity': 0.05}, {'threshold': 0.02, 'min_sparsity': 0.05}], 'copy_layers': [], 'bias': False}----
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0130, Accuracy: 21167/50000 (42%)


Test set: Average loss: 0.0017, Accuracy: 4228/10000 (42%)


Train set: Average loss: 0.0129, Accuracy: 21500/50000 (43%)


Test set: Average loss: 0.0016, Accuracy: 4310/10000 (43%)


Train set: Average loss: 0.0128, Accuracy: 21774/50000 (44%)


Test set: Average loss: 0.0016, Accuracy: 4361/10000 (44%)


Train set: Average loss: 0.0128

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 4066/10000 (41%)


Train set: Average loss: 0.0131, Accuracy: 20183/50000 (40%)


Test set: Average loss: 0.0017, Accuracy: 4085/10000 (41%)


Train set: Average loss: 0.0131, Accuracy: 20104/50000 (40%)


Test set: Average loss: 0.0017, Accuracy: 4077/10000 (41%)


Train set: Average loss: 0.0131, Accuracy: 20104/50000 (40%)


Test set: Average loss: 0.0017, Accuracy: 4077/10000 (41%)

----282----{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 128, 'epochs': 100, 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0001}, 'scheduler': True, 'no_cuda': False, 'seed': 282, 'save_name': 'FINAL_thresholds_0.5', 'sparsity': [{'threshold': 0.5, 'min_sparsity': 0.05}, {'threshold': 0.5, 'min_sparsity': 0.05}, {'threshold': 0.5, 'min_sparsity': 0.05}, {'threshold': 0.5, 'min_sparsity': 0.05}, {'threshold': 0.5, 'min_sparsity': 0.05}], 'copy_layers': [], 'bias': False}----
Using device cuda
Files already d

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0018, Accuracy: 3790/10000 (38%)


Train set: Average loss: 0.0136, Accuracy: 19006/50000 (38%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 3875/10000 (39%)


Train set: Average loss: 0.0134, Accuracy: 19483/50000 (39%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 3936/10000 (39%)


Train set: Average loss: 0.0134, Accuracy: 19495/50000 (39%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 3956/10000 (40%)


Train set: Average loss: 0.0133, Accuracy: 19703/50000 (39%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 3995/10000 (40%)


Train set: Average loss: 0.0133, Accuracy: 19660/50000 (39%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 3963/10000 (40%)


Train set: Average loss: 0.0133, Accuracy: 19592/50000 (39%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 3983/10000 (40%)


Train set: Average loss: 0.0133, Accuracy: 19782/50000 (40%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 3996/10000 (40%)


Train set: Average loss: 0.0133, Accuracy: 19810/50000 (40%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 4006/10000 (40%)


Train set: Average loss: 0.0133, Accuracy: 19810/50000 (40%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0017, Accuracy: 4006/10000 (40%)

----282----{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 128, 'epochs': 100, 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0001}, 'scheduler': True, 'no_cuda': False, 'seed': 282, 'save_name': 'FINAL_thresholds_0.01', 'sparsity': [{'threshold': 0.01, 'min_sparsity': 0.05}, {'threshold': 0.01, 'min_sparsity': 0.05}, {'threshold': 0.01, 'min_sparsity': 0.05}, {'threshold': 0.01, 'min_sparsity': 0.05}, {'threshold': 0.01, 'min_sparsity': 0.05}], 'copy_layers': [], 'bias': False}----
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0123, Accuracy: 22689/50000 (45%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0016, Accuracy: 4489/10000 (45%)


Train set: Average loss: 0.0122, Accuracy: 22900/50000 (46%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0016, Accuracy: 4579/10000 (46%)


Train set: Average loss: 0.0121, Accuracy: 23135/50000 (46%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0015, Accuracy: 4633/10000 (46%)


Train set: Average loss: 0.0121, Accuracy: 23251/50000 (47%)



Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f81359f8e60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process



Test set: Average loss: 0.0015, Accuracy: 4631/10000 (46%)



RuntimeError: ignored

In [7]:
import pickle
def save_study(study, filename):
    """Pickle ``study`` to ``$HOME_DIR/results/studies/<filename>.pickle``.

    Args:
        study: Any picklable object (here, an Optuna study).
        filename (str): File name without the ``.pickle`` extension.

    Raises:
        KeyError: If the ``HOME_DIR`` environment variable is not set.
    """
    studies_dir = os.path.join(os.environ["HOME_DIR"], "results", "studies")
    # Create the folder on first use so that a finished trial is not lost to
    # a FileNotFoundError.
    os.makedirs(studies_dir, exist_ok=True)
    with open(os.path.join(studies_dir, f"{filename}.pickle"), "wb") as f:
        pickle.dump(study, f)

In [15]:
def objective(trial):
    """Optuna objective: train one configuration and return its test accuracy.

    Samples a freeze epoch per layer, the SGD hyper-parameters and a sparsity
    per layer from ``trial``, trains on CIFAR10 through ``main`` and evaluates
    the resulting model on the train and test loaders. The seed drawn for the
    run is recorded on the trial as the ``seed`` user attribute.
    """
    generated_seed = torch.randint(100, (1,)).item()
    trial.set_user_attr('seed', generated_seed)

    train_args = dict(
        test_batch_size=1000,      # input batch size for testing
        data='../data',            # location to store data
        log_interval=1000000,      # batches to wait before logging training status
        train_eval_interval=20,    # epoch interval for training-set evaluation
        test_eval_interval=20,     # epoch interval for test-set evaluation
        eval_on_last=True,
    )

    # The suggest_* calls are kept in their original order:
    # freeze epochs, then optimiser settings, then sparsities.
    freeze_epochs = [trial.suggest_int(f'layer{i}_epochs', 20, 160) for i in range(5)]
    optim_kwargs = {
        "lr": trial.suggest_float('learning_rate', 0.01, 0.5),
        "momentum": trial.suggest_float('momentum', 0.2, 0.95),
        "weight_decay": trial.suggest_float('weight_decay', 0.00005, 0.001),
    }
    layer_names = ('conv1', 'conv2', 'fc1', 'fc2', 'fc3')
    layer_sparsities = [
        {"sparsity": trial.suggest_float(f'sparsity_{layer_name}', 0.1, 0.95)}
        for layer_name in layer_names
    ]

    args = {
        "dataset": "CIFAR10",
        "init": "signed_constant",
        "batch_size": 64,              # input batch size for training
        "epochs": freeze_epochs,       # one freeze epoch per layer
        "optimizer": "SGD",
        "optim_kwargs": optim_kwargs,
        "scheduler": True,             # enables CosineAnnealing
        'no_cuda': False,              # disables CUDA training
        'seed': generated_seed,        # random seed
        'save_name': None,             # None: do not save the model
        'sparsity': layer_sparsities,  # one sparsity per layer
        'copy_layers': [],
        'bias': False,
    }

    print(args)

    trained_model, device, train_loader, test_loader, criterion = main(args, train_args, trial=trial)
    # The training-set evaluation is run but its values are not used.
    _train_acc, _train_loss = test(trained_model, device, criterion, train_loader)
    test_acc, _test_loss = test(trained_model, device, criterion, test_loader)

    return test_acc

In [None]:
# Uncomment the two lines below to load a previously pickled study instead of
# creating a new one.
# with open(os.path.join(os.environ["HOME_DIR"], "results", "studies", "hp_search_study.pickle"), "rb") as f:
#     study = pickle.load(f)
# New in-memory study that maximises the value returned by objective().
# The pruner is a MedianPruner wrapped in a PatientPruner with patience=40.
study = optuna.create_study(direction='maximize', 
                            pruner=optuna.pruners.PatientPruner(
                                optuna.pruners.MedianPruner(n_startup_trials=15, n_warmup_steps=50, interval_steps=1), 
                                patience=40
                                )
                            )
# Run 100 trials one optimize() call at a time, pickling the study after each
# call so that completed trials are on disk if the session stops.
for _ in range(100):
    study.optimize(objective, n_trials=1, show_progress_bar=True)
    save_study(study, "result__conv2_sparsity_per_layer_freezing")

  
[32m[I 2022-05-03 05:30:26,633][0m A new study created in memory with name: no-name-3aab3e0c-e1ac-40c2-ab8d-80ad9f460c48[0m
  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [62, 156, 42, 143, 119], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.348635484985551, 'momentum': 0.7454563459246135, 'weight_decay': 0.0009874760015541415}, 'scheduler': True, 'no_cuda': False, 'seed': 97, 'save_name': None, 'sparsity': [{'sparsity': 0.10893508969416346}, {'sparsity': 0.38954459790126017}, {'sparsity': 0.7752736002325623}, {'sparsity': 0.4099074970417489}, {'sparsity': 0.857791243083931}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0297, Accuracy: 14804/50000 (30%)


Test set: Average loss: 0.0019, Accuracy: 3023/10000 (30%)


Train set: Average loss: 0.0299, Accuracy: 15371/50000 (31%)


Test set: Average loss: 0.0019, Accuracy: 3110/10000 (31%)

Freezing SupermaskLinear(in_features=12544, out_features=256, bias=False, sparsity=0.7752736002325623) before epoch 43

Train set: Average loss: 0.0301, Accuracy: 15231/500

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [25, 91, 139, 59, 52], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.3018664869575255, 'momentum': 0.8939040071981936, 'weight_decay': 9.052329253251526e-05}, 'scheduler': True, 'no_cuda': False, 'seed': 62, 'save_name': None, 'sparsity': [{'sparsity': 0.15286391309311764}, {'sparsity': 0.2709690200056263}, {'sparsity': 0.5302371530682538}, {'sparsity': 0.6598669045075236}, {'sparsity': 0.8439083400946588}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0205, Accuracy: 27008/50000 (54%)


Test set: Average loss: 0.0013, Accuracy: 5345/10000 (53%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.15286391309311764) before epoch 26

Train set: Average loss: 0.0169, Accuracy: 31305/50000 (63%)


Test set: Average loss: 0.0011, Accuracy: 6195/10000 (62%)

Freezing SupermaskLinear(in_features=256, out_feat

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [20, 133, 152, 154, 114], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.29940624492021767, 'momentum': 0.8654401774459712, 'weight_decay': 0.0005734004116529172}, 'scheduler': True, 'no_cuda': False, 'seed': 59, 'save_name': None, 'sparsity': [{'sparsity': 0.4542739995411129}, {'sparsity': 0.7663425737544275}, {'sparsity': 0.5496625235931911}, {'sparsity': 0.8711018792882713}, {'sparsity': 0.890979686078824}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0247, Accuracy: 22122/50000 (44%)


Test set: Average loss: 0.0016, Accuracy: 4477/10000 (45%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.4542739995411129) before epoch 21

Train set: Average loss: 0.0247, Accuracy: 21978/50000 (44%)


Test set: Average loss: 0.0016, Accuracy: 4402/10000 (44%)


Train set: Average loss: 0.0244, Accuracy: 22007

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [130, 25, 123, 118, 25], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.03782379871729465, 'momentum': 0.624225423996158, 'weight_decay': 0.0006164164601894036}, 'scheduler': True, 'no_cuda': False, 'seed': 39, 'save_name': None, 'sparsity': [{'sparsity': 0.8686416426820391}, {'sparsity': 0.8734246519566199}, {'sparsity': 0.33034645351529046}, {'sparsity': 0.36629286461245125}, {'sparsity': 0.32224153785064313}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0230, Accuracy: 24169/50000 (48%)


Test set: Average loss: 0.0015, Accuracy: 4817/10000 (48%)

Freezing SupermaskConv(64, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.8734246519566199) before epoch 26
Freezing SupermaskLinear(in_features=256, out_features=10, bias=False, sparsity=0.32224153785064313) before epoch 26

Train set: Average loss: 0.0183, Accuracy: 29882/500

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [51, 159, 159, 153, 139], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.06830851822644912, 'momentum': 0.7674790723204057, 'weight_decay': 0.0005663579022042212}, 'scheduler': True, 'no_cuda': False, 'seed': 88, 'save_name': None, 'sparsity': [{'sparsity': 0.15519833201630315}, {'sparsity': 0.7503790181092227}, {'sparsity': 0.7214603150325328}, {'sparsity': 0.4919494413783082}, {'sparsity': 0.8540865203348432}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0193, Accuracy: 28230/50000 (56%)


Test set: Average loss: 0.0013, Accuracy: 5570/10000 (56%)


Train set: Average loss: 0.0219, Accuracy: 25715/50000 (51%)


Test set: Average loss: 0.0014, Accuracy: 5116/10000 (51%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.15519833201630315) before epoch 52

Train set: Average loss: 0.0201, Accuracy: 27

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [102, 119, 50, 52, 48], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.09557364259900528, 'momentum': 0.36424959602093104, 'weight_decay': 0.00020249477914977267}, 'scheduler': True, 'no_cuda': False, 'seed': 81, 'save_name': None, 'sparsity': [{'sparsity': 0.12657168576212915}, {'sparsity': 0.23485557295621123}, {'sparsity': 0.7148293519997253}, {'sparsity': 0.8645045601204174}, {'sparsity': 0.22705021247488902}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0222, Accuracy: 25039/50000 (50%)


Test set: Average loss: 0.0014, Accuracy: 4928/10000 (49%)


Train set: Average loss: 0.0199, Accuracy: 27473/50000 (55%)


Test set: Average loss: 0.0013, Accuracy: 5425/10000 (54%)

Freezing SupermaskLinear(in_features=256, out_features=10, bias=False, sparsity=0.22705021247488902) before epoch 49
Freezing SupermaskLinear(in_features=12544, out_fe

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [141, 114, 143, 40, 66], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.36840490513408164, 'momentum': 0.8624480879391365, 'weight_decay': 0.0009467859115382575}, 'scheduler': True, 'no_cuda': False, 'seed': 76, 'save_name': None, 'sparsity': [{'sparsity': 0.8922844951705377}, {'sparsity': 0.5964283914785219}, {'sparsity': 0.31494696919346665}, {'sparsity': 0.7229531766892339}, {'sparsity': 0.779042801673619}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0274, Accuracy: 18776/50000 (38%)


Test set: Average loss: 0.0017, Accuracy: 3783/10000 (38%)


Train set: Average loss: 0.0267, Accuracy: 19429/50000 (39%)


Test set: Average loss: 0.0017, Accuracy: 3919/10000 (39%)

Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.7229531766892339) before epoch 41

Train set: Average loss: 0.0276, Accuracy: 17861/5000

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [35, 116, 128, 37, 36], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.4636953998091417, 'momentum': 0.5222486842645486, 'weight_decay': 0.00027298070118682137}, 'scheduler': True, 'no_cuda': False, 'seed': 49, 'save_name': None, 'sparsity': [{'sparsity': 0.3958929962332278}, {'sparsity': 0.2022121398046281}, {'sparsity': 0.4455021198693059}, {'sparsity': 0.48357312242937256}, {'sparsity': 0.15562921005094427}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0197, Accuracy: 27554/50000 (55%)


Test set: Average loss: 0.0013, Accuracy: 5433/10000 (54%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.3958929962332278) before epoch 36
Freezing SupermaskLinear(in_features=256, out_features=10, bias=False, sparsity=0.15562921005094427) before epoch 37
Freezing SupermaskLinear(in_features=256, out_features=

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [101, 154, 44, 142, 77], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.1061806719862119, 'momentum': 0.2625008391577436, 'weight_decay': 0.00020403351214056138}, 'scheduler': True, 'no_cuda': False, 'seed': 26, 'save_name': None, 'sparsity': [{'sparsity': 0.8414417206947493}, {'sparsity': 0.246887151208699}, {'sparsity': 0.7208584293075296}, {'sparsity': 0.5817365192721367}, {'sparsity': 0.8198311899249479}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0190, Accuracy: 28933/50000 (58%)


Test set: Average loss: 0.0012, Accuracy: 5688/10000 (57%)


Train set: Average loss: 0.0160, Accuracy: 31873/50000 (64%)


Test set: Average loss: 0.0011, Accuracy: 6267/10000 (63%)

Freezing SupermaskLinear(in_features=12544, out_features=256, bias=False, sparsity=0.7208584293075296) before epoch 45

Train set: Average loss: 0.0166, Accuracy: 31424/500

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [145, 69, 66, 70, 45], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.11767248210086553, 'momentum': 0.4387472344820691, 'weight_decay': 0.00047038168741304437}, 'scheduler': True, 'no_cuda': False, 'seed': 19, 'save_name': None, 'sparsity': [{'sparsity': 0.6094110377692759}, {'sparsity': 0.6755044855384573}, {'sparsity': 0.6651977164892446}, {'sparsity': 0.4490100998733555}, {'sparsity': 0.30518591449831634}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0159, Accuracy: 32326/50000 (65%)


Test set: Average loss: 0.0011, Accuracy: 6394/10000 (64%)


Train set: Average loss: 0.0164, Accuracy: 31648/50000 (63%)


Test set: Average loss: 0.0011, Accuracy: 6240/10000 (62%)

Freezing SupermaskLinear(in_features=256, out_features=10, bias=False, sparsity=0.30518591449831634) before epoch 46

Train set: Average loss: 0.0189, Accuracy: 28567/5000

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [70, 71, 87, 91, 95], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.2040757395169503, 'momentum': 0.9444088836518465, 'weight_decay': 5.426122422834515e-05}, 'scheduler': True, 'no_cuda': False, 'seed': 13, 'save_name': None, 'sparsity': [{'sparsity': 0.33129574112185}, {'sparsity': 0.44407652320483115}, {'sparsity': 0.13311800643507832}, {'sparsity': 0.2502310330693142}, {'sparsity': 0.5814818017433981}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0151, Accuracy: 33568/50000 (67%)


Test set: Average loss: 0.0010, Accuracy: 6594/10000 (66%)


Train set: Average loss: 0.0141, Accuracy: 34780/50000 (70%)


Test set: Average loss: 0.0009, Accuracy: 6870/10000 (69%)


Train set: Average loss: 0.0132, Accuracy: 36072/50000 (72%)


Test set: Average loss: 0.0009, Accuracy: 7020/10000 (70%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), s

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [75, 74, 92, 92, 97], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.21533909823795389, 'momentum': 0.9468825529708406, 'weight_decay': 7.148338819735524e-05}, 'scheduler': True, 'no_cuda': False, 'seed': 11, 'save_name': None, 'sparsity': [{'sparsity': 0.31047755464947735}, {'sparsity': 0.4166979186673385}, {'sparsity': 0.13307071379929597}, {'sparsity': 0.11003454734564597}, {'sparsity': 0.6121844955878875}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0168, Accuracy: 31608/50000 (63%)


Test set: Average loss: 0.0011, Accuracy: 6245/10000 (62%)


Train set: Average loss: 0.0163, Accuracy: 31574/50000 (63%)


Test set: Average loss: 0.0011, Accuracy: 6278/10000 (63%)


Train set: Average loss: 0.0148, Accuracy: 34058/50000 (68%)


Test set: Average loss: 0.0010, Accuracy: 6691/10000 (67%)

Freezing SupermaskConv(64, 64, kernel_size=(3, 

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [81, 49, 94, 90, 86], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.2526075900726818, 'momentum': 0.6834612511304782, 'weight_decay': 5.2519053675210174e-05}, 'scheduler': True, 'no_cuda': False, 'seed': 40, 'save_name': None, 'sparsity': [{'sparsity': 0.2714825640144255}, {'sparsity': 0.4369494264560504}, {'sparsity': 0.9171719105844887}, {'sparsity': 0.2173751645834186}, {'sparsity': 0.598005071122161}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0199, Accuracy: 27137/50000 (54%)


Test set: Average loss: 0.0013, Accuracy: 5377/10000 (54%)


Train set: Average loss: 0.0206, Accuracy: 26187/50000 (52%)


Test set: Average loss: 0.0013, Accuracy: 5194/10000 (52%)

Freezing SupermaskConv(64, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.4369494264560504) before epoch 50

Train set: Average loss: 0.0199, Accuracy: 27564/50

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [44, 90, 111, 74, 64], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.2077880610529832, 'momentum': 0.8963094708743591, 'weight_decay': 0.0003773282902490654}, 'scheduler': True, 'no_cuda': False, 'seed': 99, 'save_name': None, 'sparsity': [{'sparsity': 0.5977175414748166}, {'sparsity': 0.13917346851993587}, {'sparsity': 0.14055238089845024}, {'sparsity': 0.6822608457593697}, {'sparsity': 0.5037729624667787}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0191, Accuracy: 28289/50000 (57%)


Test set: Average loss: 0.0012, Accuracy: 5684/10000 (57%)


Train set: Average loss: 0.0183, Accuracy: 29338/50000 (59%)


Test set: Average loss: 0.0012, Accuracy: 5849/10000 (58%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.5977175414748166) before epoch 45

Train set: Average loss: 0.0169, Accuracy: 31245/

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [20, 56, 71, 20, 152], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.16082854809471597, 'momentum': 0.7839033074843035, 'weight_decay': 0.0008036506477207177}, 'scheduler': True, 'no_cuda': False, 'seed': 48, 'save_name': None, 'sparsity': [{'sparsity': 0.29229087613930976}, {'sparsity': 0.32818050215312994}, {'sparsity': 0.48642775834710994}, {'sparsity': 0.2645432809108955}, {'sparsity': 0.444000392218604}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0176, Accuracy: 30517/50000 (61%)


Test set: Average loss: 0.0011, Accuracy: 6066/10000 (61%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.29229087613930976) before epoch 21
Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.2645432809108955) before epoch 21

Train set: Average loss: 0.0196, Accuracy: 28490/50000

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [122, 94, 74, 109, 110], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.4378482730419161, 'momentum': 0.5684939263853008, 'weight_decay': 0.00032412190108172694}, 'scheduler': True, 'no_cuda': False, 'seed': 14, 'save_name': None, 'sparsity': [{'sparsity': 0.2270526498506446}, {'sparsity': 0.5348200592712145}, {'sparsity': 0.33208744812484325}, {'sparsity': 0.6285479045513305}, {'sparsity': 0.7259956297901271}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0252, Accuracy: 22981/50000 (46%)


Test set: Average loss: 0.0016, Accuracy: 4593/10000 (46%)


Train set: Average loss: 0.0223, Accuracy: 26280/50000 (53%)


Test set: Average loss: 0.0014, Accuracy: 5217/10000 (52%)


Train set: Average loss: 0.0200, Accuracy: 27581/50000 (55%)


Test set: Average loss: 0.0013, Accuracy: 5503/10000 (55%)

Freezing SupermaskLinear(in_features=12544, ou

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [64, 90, 113, 68, 62], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.30719370225891074, 'momentum': 0.9456201913112906, 'weight_decay': 0.00014497725539784352}, 'scheduler': True, 'no_cuda': False, 'seed': 63, 'save_name': None, 'sparsity': [{'sparsity': 0.4055029133048076}, {'sparsity': 0.10019364560308675}, {'sparsity': 0.5635606840720598}, {'sparsity': 0.2989494689364002}, {'sparsity': 0.702549789065469}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0196, Accuracy: 28146/50000 (56%)


Test set: Average loss: 0.0013, Accuracy: 5628/10000 (56%)


Train set: Average loss: 0.0193, Accuracy: 28549/50000 (57%)


Test set: Average loss: 0.0012, Accuracy: 5697/10000 (57%)


Train set: Average loss: 0.0174, Accuracy: 30587/50000 (61%)


Test set: Average loss: 0.0011, Accuracy: 6031/10000 (60%)

Freezing SupermaskLinear(in_features=256, out_fe

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [159, 32, 102, 106, 93], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.18354086482341822, 'momentum': 0.8276771889054555, 'weight_decay': 0.00041335943718010546}, 'scheduler': True, 'no_cuda': False, 'seed': 88, 'save_name': None, 'sparsity': [{'sparsity': 0.5479657468523615}, {'sparsity': 0.3373913623628495}, {'sparsity': 0.40736073181806237}, {'sparsity': 0.7981095272217296}, {'sparsity': 0.6504673979560648}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0186, Accuracy: 29349/50000 (59%)


Test set: Average loss: 0.0012, Accuracy: 5790/10000 (58%)

Freezing SupermaskConv(64, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.3373913623628495) before epoch 33

Train set: Average loss: 0.0192, Accuracy: 28081/50000 (56%)


Test set: Average loss: 0.0012, Accuracy: 5552/10000 (56%)


Train set: Average loss: 0.0181, Accuracy: 29

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [95, 66, 134, 56, 131], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.37584424314622866, 'momentum': 0.6686527021115451, 'weight_decay': 0.0007501902175116965}, 'scheduler': True, 'no_cuda': False, 'seed': 81, 'save_name': None, 'sparsity': [{'sparsity': 0.7091811281239576}, {'sparsity': 0.5399896165542328}, {'sparsity': 0.2012779266512471}, {'sparsity': 0.15004034974758873}, {'sparsity': 0.43680436046533455}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0243, Accuracy: 23088/50000 (46%)


Test set: Average loss: 0.0016, Accuracy: 4647/10000 (46%)


Train set: Average loss: 0.0233, Accuracy: 23867/50000 (48%)


Test set: Average loss: 0.0015, Accuracy: 4766/10000 (48%)

Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.15004034974758873) before epoch 57

Train set: Average loss: 0.0221, Accuracy: 25766/50

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [36, 41, 75, 85, 22], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.25788574205001535, 'momentum': 0.7219597997266833, 'weight_decay': 0.00015166804269003574}, 'scheduler': True, 'no_cuda': False, 'seed': 71, 'save_name': None, 'sparsity': [{'sparsity': 0.2137907749795781}, {'sparsity': 0.46160508205737594}, {'sparsity': 0.2237384309127669}, {'sparsity': 0.5720825803832542}, {'sparsity': 0.9225076143962971}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0161, Accuracy: 32290/50000 (65%)


Test set: Average loss: 0.0011, Accuracy: 6353/10000 (64%)

Freezing SupermaskLinear(in_features=256, out_features=10, bias=False, sparsity=0.9225076143962971) before epoch 23
Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.2137907749795781) before epoch 37

Train set: Average loss: 0.0141, Accuracy: 34952/50000 (7

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [56, 41, 29, 90, 27], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.2501725805248084, 'momentum': 0.7040432132078915, 'weight_decay': 0.00023589727393858028}, 'scheduler': True, 'no_cuda': False, 'seed': 43, 'save_name': None, 'sparsity': [{'sparsity': 0.36276458802633155}, {'sparsity': 0.49120751220062564}, {'sparsity': 0.23049953959880837}, {'sparsity': 0.5600075707950388}, {'sparsity': 0.916091807356358}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0151, Accuracy: 33572/50000 (67%)


Test set: Average loss: 0.0010, Accuracy: 6593/10000 (66%)

Freezing SupermaskLinear(in_features=256, out_features=10, bias=False, sparsity=0.916091807356358) before epoch 28
Freezing SupermaskLinear(in_features=12544, out_features=256, bias=False, sparsity=0.23049953959880837) before epoch 30

Train set: Average loss: 0.0144, Accuracy: 34185/50000 (68%)

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [36, 81, 77, 76, 50], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.28988426788024646, 'momentum': 0.8252401070829678, 'weight_decay': 0.00012579318487214742}, 'scheduler': True, 'no_cuda': False, 'seed': 12, 'save_name': None, 'sparsity': [{'sparsity': 0.22511730438031813}, {'sparsity': 0.34366771906847315}, {'sparsity': 0.10270113839497857}, {'sparsity': 0.7061548455014418}, {'sparsity': 0.9383823631836791}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0164, Accuracy: 31883/50000 (64%)


Test set: Average loss: 0.0011, Accuracy: 6288/10000 (63%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.22511730438031813) before epoch 37

Train set: Average loss: 0.0160, Accuracy: 32516/50000 (65%)


Test set: Average loss: 0.0011, Accuracy: 6379/10000 (64%)

Freezing SupermaskLinear(in_features=256, out_f

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [33, 55, 60, 104, 23], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.16181720599326627, 'momentum': 0.9065644137960044, 'weight_decay': 5.032871525494373e-05}, 'scheduler': True, 'no_cuda': False, 'seed': 61, 'save_name': None, 'sparsity': [{'sparsity': 0.19222327827940086}, {'sparsity': 0.6210614690925981}, {'sparsity': 0.2465511559035769}, {'sparsity': 0.6198738120668983}, {'sparsity': 0.7828447166600658}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0153, Accuracy: 33407/50000 (67%)


Test set: Average loss: 0.0010, Accuracy: 6594/10000 (66%)

Freezing SupermaskLinear(in_features=256, out_features=10, bias=False, sparsity=0.7828447166600658) before epoch 24
Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.19222327827940086) before epoch 34

Train set: Average loss: 0.0153, Accuracy: 33313/50000 (

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [73, 106, 88, 55, 79], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.33112761459497575, 'momentum': 0.801587540516298, 'weight_decay': 0.00014889518600170917}, 'scheduler': True, 'no_cuda': False, 'seed': 31, 'save_name': None, 'sparsity': [{'sparsity': 0.4718350070854878}, {'sparsity': 0.2800775888489645}, {'sparsity': 0.5991084526799294}, {'sparsity': 0.7696091645985127}, {'sparsity': 0.722739332322094}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0172, Accuracy: 30810/50000 (62%)


Test set: Average loss: 0.0011, Accuracy: 6147/10000 (61%)


Train set: Average loss: 0.0160, Accuracy: 32447/50000 (65%)


Test set: Average loss: 0.0010, Accuracy: 6388/10000 (64%)

Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.7696091645985127) before epoch 56

Train set: Average loss: 0.0156, Accuracy: 32681/50000 (

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [77, 105, 86, 122, 101], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.4156499012476844, 'momentum': 0.7787960624831716, 'weight_decay': 0.0003010860087848866}, 'scheduler': True, 'no_cuda': False, 'seed': 91, 'save_name': None, 'sparsity': [{'sparsity': 0.4686763774859147}, {'sparsity': 0.44847256818632186}, {'sparsity': 0.3873456877255287}, {'sparsity': 0.9332709029767265}, {'sparsity': 0.5450800886322062}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0224, Accuracy: 25359/50000 (51%)


Test set: Average loss: 0.0015, Accuracy: 5083/10000 (51%)


Train set: Average loss: 0.0174, Accuracy: 30707/50000 (61%)


Test set: Average loss: 0.0011, Accuracy: 6063/10000 (61%)


Train set: Average loss: 0.0163, Accuracy: 31644/50000 (63%)


Test set: Average loss: 0.0011, Accuracy: 6271/10000 (63%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [87, 135, 83, 82, 84], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.3444076242048116, 'momentum': 0.6366448875071483, 'weight_decay': 0.00016711843284053371}, 'scheduler': True, 'no_cuda': False, 'seed': 83, 'save_name': None, 'sparsity': [{'sparsity': 0.34427272203231746}, {'sparsity': 0.481276867988332}, {'sparsity': 0.6411751303243818}, {'sparsity': 0.7910133200434124}, {'sparsity': 0.6990811073076154}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0193, Accuracy: 28279/50000 (57%)


Test set: Average loss: 0.0012, Accuracy: 5686/10000 (57%)


Train set: Average loss: 0.0165, Accuracy: 31809/50000 (64%)


Test set: Average loss: 0.0011, Accuracy: 6232/10000 (62%)


Train set: Average loss: 0.0163, Accuracy: 31885/50000 (64%)


Test set: Average loss: 0.0011, Accuracy: 6314/10000 (63%)


Train set: Average loss: 0.0165, Accuracy: 31842

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [68, 40, 57, 44, 75], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.23095642424421986, 'momentum': 0.7372297025721701, 'weight_decay': 0.0003576227833462401}, 'scheduler': True, 'no_cuda': False, 'seed': 54, 'save_name': None, 'sparsity': [{'sparsity': 0.45484660636749963}, {'sparsity': 0.30184420224906516}, {'sparsity': 0.17550237096432666}, {'sparsity': 0.37123844647697923}, {'sparsity': 0.5244799184795615}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0161, Accuracy: 31627/50000 (63%)


Test set: Average loss: 0.0011, Accuracy: 6299/10000 (63%)


Train set: Average loss: 0.0137, Accuracy: 35282/50000 (71%)


Test set: Average loss: 0.0009, Accuracy: 6934/10000 (69%)

Freezing SupermaskConv(64, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.30184420224906516) before epoch 41
Freezing SupermaskLinear(in_features=256, out_

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [116, 102, 101, 23, 160], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.27123888595067225, 'momentum': 0.8328204301587417, 'weight_decay': 0.0002487789615262853}, 'scheduler': True, 'no_cuda': False, 'seed': 37, 'save_name': None, 'sparsity': [{'sparsity': 0.6933319945894458}, {'sparsity': 0.17434975486015625}, {'sparsity': 0.86518542746182}, {'sparsity': 0.7704561183258478}, {'sparsity': 0.7568124350410412}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0244, Accuracy: 22737/50000 (45%)


Test set: Average loss: 0.0016, Accuracy: 4491/10000 (45%)

Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.7704561183258478) before epoch 24

Train set: Average loss: 0.0220, Accuracy: 25304/50000 (51%)


Test set: Average loss: 0.0014, Accuracy: 5015/10000 (50%)


Train set: Average loss: 0.0221, Accuracy: 25083/5000

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [47, 61, 110, 97, 76], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.4023069355452865, 'momentum': 0.527424894300121, 'weight_decay': 0.0004415282740011097}, 'scheduler': True, 'no_cuda': False, 'seed': 72, 'save_name': None, 'sparsity': [{'sparsity': 0.5239411168311081}, {'sparsity': 0.39118031935317765}, {'sparsity': 0.2843807081144209}, {'sparsity': 0.5478290242662871}, {'sparsity': 0.6421412813093555}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0168, Accuracy: 30799/50000 (62%)


Test set: Average loss: 0.0011, Accuracy: 6047/10000 (60%)


Train set: Average loss: 0.0165, Accuracy: 31804/50000 (64%)


Test set: Average loss: 0.0011, Accuracy: 6255/10000 (63%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.5239411168311081) before epoch 48

Train set: Average loss: 0.0135, Accuracy: 35294/50

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [46, 59, 113, 129, 129], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.49199443045813007, 'momentum': 0.4803403090169498, 'weight_decay': 0.0004501350069090617}, 'scheduler': True, 'no_cuda': False, 'seed': 38, 'save_name': None, 'sparsity': [{'sparsity': 0.5414398518242778}, {'sparsity': 0.36917027267672736}, {'sparsity': 0.26610864224289454}, {'sparsity': 0.5198418588346952}, {'sparsity': 0.4242308494163336}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0187, Accuracy: 28888/50000 (58%)


Test set: Average loss: 0.0012, Accuracy: 5674/10000 (57%)


Train set: Average loss: 0.0212, Accuracy: 27857/50000 (56%)


Test set: Average loss: 0.0014, Accuracy: 5543/10000 (55%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.5414398518242778) before epoch 47
Freezing SupermaskConv(64, 64, kernel_size=(3, 

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [48, 24, 115, 134, 126], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.4048351589221216, 'momentum': 0.47885585634203987, 'weight_decay': 0.000432813297043943}, 'scheduler': True, 'no_cuda': False, 'seed': 87, 'save_name': None, 'sparsity': [{'sparsity': 0.7883126900036797}, {'sparsity': 0.38271725131914586}, {'sparsity': 0.3044165190354116}, {'sparsity': 0.517700743708566}, {'sparsity': 0.38063061496931944}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0194, Accuracy: 28027/50000 (56%)


Test set: Average loss: 0.0013, Accuracy: 5573/10000 (56%)

Freezing SupermaskConv(64, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.38271725131914586) before epoch 25

Train set: Average loss: 0.0177, Accuracy: 30232/50000 (60%)


Test set: Average loss: 0.0012, Accuracy: 5970/10000 (60%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [41, 60, 100, 100, 139], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.499647933387417, 'momentum': 0.4017029505495383, 'weight_decay': 0.000678328918814147}, 'scheduler': True, 'no_cuda': False, 'seed': 1, 'save_name': None, 'sparsity': [{'sparsity': 0.5388472467732387}, {'sparsity': 0.38877342036674284}, {'sparsity': 0.26204196923455186}, {'sparsity': 0.43243638353287356}, {'sparsity': 0.48329048360940663}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0175, Accuracy: 30252/50000 (61%)


Test set: Average loss: 0.0011, Accuracy: 6038/10000 (60%)


Train set: Average loss: 0.0183, Accuracy: 29521/50000 (59%)


Test set: Average loss: 0.0012, Accuracy: 5784/10000 (58%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.5388472467732387) before epoch 42

Train set: Average loss: 0.0167, Accuracy: 31284