In [1]:
# Mount Google Drive at /content/drive. This cell needs the Colab runtime
# (the `google.colab` package) and does nothing useful elsewhere.
from google.colab import drive
# force_remount=True is passed so that re-running the cell requests a fresh mount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os
import sys

# Project folder on the mounted Drive, stored relative to /content.
# Later cells read it back through os.environ['HOME_DIR'].
os.environ['HOME_DIR'] = 'drive/MyDrive/hidden-networks'
# !pip install -r $HOME_DIR/requirements.txt

# Put the project folder on the import path so its modules can be imported below.
sys.path.append(os.path.join('/content', os.environ['HOME_DIR']))

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.autograd as autograd
import collections

from supermask_pruning import GetSubnet, SupermaskConv, SupermaskLinear
from supermask_pruning import train, test

class ArgClass:
    """Attribute-style view of a settings dict.

    Every key of the mapping passed to the constructor becomes an instance
    attribute, so ``ArgClass({"seed": 1}).seed == 1``.
    """

    def __init__(self, args):
        # `args` must be a mapping with string keys; it is unpacked as keywords.
        self.setattrs(**args)

    def setattrs(self, **kwargs):
        """Set (or overwrite) one attribute per keyword argument."""
        for key in kwargs:
            setattr(self, key, kwargs[key])

In [4]:
class Net(nn.Module):
    """Two supermask conv layers followed by three supermask linear layers.

    Args:
        args: settings object; ``args.bias`` and ``args.init`` are forwarded to
            every layer. An optional ``args.sparsity`` is a list of five dicts
            (one per layer, in the order conv1, conv2, fc1, fc2, fc3) that are
            unpacked as extra keyword arguments of the layer constructors.
            When absent, every layer gets ``sparsity=1.0``.
        input_channels: number of channels of the input images.
        image_size: height/width of the (square) input images.
        num_labels: number of output classes.
    """

    def __init__(self, args, input_channels, image_size, num_labels):
        super().__init__()

        default_sparsities = [{"sparsity": 1.0} for _ in range(5)]
        sparsities = getattr(args, "sparsity", default_sparsities)
        shared = {"bias": args.bias, "init": args.init}

        self.conv1 = SupermaskConv(input_channels, 64, 3, 1, **shared, **sparsities[0])
        self.conv2 = SupermaskConv(64, 64, 3, 1, **shared, **sparsities[1])

        # Each 3x3 convolution is built without padding, so the two of them
        # shrink the side by 4; max_pool2d(2) then halves it with floor
        # division. Flooring before squaring keeps fc1 correctly sized when
        # (image_size - 4) is odd; for even sizes (28, 32) the value equals the
        # previous (image_size - 4) ** 2 * 64 // 4.
        pooled_side = (image_size - 4) // 2
        s = pooled_side * pooled_side * 64

        self.fc1 = SupermaskLinear(s, 256, **shared, **sparsities[2])
        self.fc2 = SupermaskLinear(256, 256, **shared, **sparsities[3])
        self.fc3 = SupermaskLinear(256, num_labels, **shared, **sparsities[4])
        # NOTE(review): flag consumed by the Supermask layer implementation,
        # which is not visible in this notebook — confirm its meaning there.
        self.fc1.calculate_subscores = True
        self.args = args

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        output = F.log_softmax(x, dim=1)
        return output

    def get_extra_state(self):
        """Store the construction settings alongside the module's state_dict."""
        return self.args

    def set_extra_state(self, state):
        """Restore the settings object saved by ``get_extra_state``."""
        self.args = state

In [5]:
# Sanity check of the "signed_constant" initialisation: build a throwaway
# network (3 input channels, 32x32 images, 10 labels) and print the weight
# tensor of each of its layers.
args = dict(init="signed_constant", bias=False)
for layer in Net(ArgClass(args), 3, 32, 10).children():
    print(layer.weight)

Parameter containing:
tensor([[[[ 0.1925,  0.1925,  0.1925],
          [ 0.1925,  0.1925,  0.1925],
          [ 0.1925,  0.1925, -0.1925]],

         [[ 0.1925, -0.1925, -0.1925],
          [ 0.1925,  0.1925,  0.1925],
          [-0.1925,  0.1925, -0.1925]],

         [[-0.1925, -0.1925, -0.1925],
          [ 0.1925, -0.1925,  0.1925],
          [ 0.1925,  0.1925,  0.1925]]],


        [[[ 0.1925, -0.1925,  0.1925],
          [ 0.1925,  0.1925,  0.1925],
          [-0.1925,  0.1925, -0.1925]],

         [[ 0.1925,  0.1925,  0.1925],
          [ 0.1925, -0.1925,  0.1925],
          [-0.1925, -0.1925,  0.1925]],

         [[-0.1925,  0.1925,  0.1925],
          [ 0.1925,  0.1925, -0.1925],
          [ 0.1925,  0.1925,  0.1925]]],


        [[[ 0.1925,  0.1925,  0.1925],
          [ 0.1925,  0.1925, -0.1925],
          [ 0.1925, -0.1925,  0.1925]],

         [[ 0.1925,  0.1925, -0.1925],
          [-0.1925, -0.1925, -0.1925],
          [ 0.1925,  0.1925,  0.1925]],

         [[ 0.1925, -0

In [6]:
# The main function runs the full training loop on a dataset of your choice
def main(model_args, train_args, base_model=None, trial=None):
    """Build a ``Net``, train it on MNIST or CIFAR10 and return it with its loaders.

    Args:
        model_args: dict of model/optimisation settings. Keys read here:
            ``dataset`` ("MNIST" or "CIFAR10"), ``no_cuda``, ``seed``,
            ``batch_size``, ``optimizer`` (name of a class in ``torch.optim``),
            ``optim_kwargs``, ``epochs``, ``scheduler``, ``save_name``, plus the
            optional ``copy_layers`` and ``score_penalty``. ``Net`` additionally
            reads ``bias``, ``init`` and the optional ``sparsity``.
            ``epochs`` is either an int (number of epochs) or a list of
            per-layer freeze epochs; with a list, training runs for
            ``max(epochs)`` epochs and the i-th child of the model is frozen
            right before epoch ``epochs[i] + 1``.
        train_args: dict with ``data``, ``test_batch_size``, ``log_interval``,
            ``train_eval_interval``, ``test_eval_interval`` and ``eval_on_last``.
        base_model: optional model whose layers named in ``copy_layers`` are
            copied into the new model.
        trial: optional Optuna trial; when given, metrics are recorded as user
            attributes and the test accuracy is reported for pruning.

    Returns:
        Tuple ``(model, device, train_loader, test_loader, criterion)`` where
        ``train_loader`` is the non-augmented training loader.

    Raises:
        ValueError: for an unsupported dataset, or when exactly one of
            ``copy_layers`` / ``base_model`` is provided.
    """
    args = ArgClass(model_args)
    train_args = ArgClass(train_args)
    dataset = args.dataset

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Using device {device}")

    # `transform` is the evaluation pipeline; `train_transform` is the one used
    # for the loader that actually feeds the optimiser (augmented for CIFAR10).
    transform = None
    if dataset == "MNIST":
        transform = transforms.Compose([transforms.ToTensor(), 
                                        transforms.Normalize((0.1307,), (0.3081,))
                                        ])
        train_transform = transform
        input_channels, image_size, num_labels = 1, 28, 10
    elif dataset == "CIFAR10":
        train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4),
                                              transforms.RandomHorizontalFlip(),
                                              transforms.ToTensor(),
                                              transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                                              ])
        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                                        ])
        input_channels, image_size, num_labels = 3, 32, 10
    else:
        raise ValueError("Only supported datasets are CIFAR10 and MNIST currently.")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    data_root = os.path.join(train_args.data, dataset)
    dataset_cls = getattr(datasets, dataset)
    # Only this first loader passes download=True; the other two reuse the files.
    train_loader = torch.utils.data.DataLoader(
        dataset_cls(data_root, train=True, download=True, transform=transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    train_augmented_loader = torch.utils.data.DataLoader(
        dataset_cls(data_root, train=True, transform=train_transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        dataset_cls(data_root, train=False, transform=transform),
        batch_size=train_args.test_batch_size, shuffle=True, **kwargs)

    model = Net(args, input_channels, image_size, num_labels).to(device)

    if getattr(args, "copy_layers", None) is not None:
        # copy_layers and base_model must be given together or not at all.
        if (bool(args.copy_layers) ^ (base_model is not None)):
            raise ValueError("copy_layers arg must be None or [] if base_model is not specified")
        if base_model is not None and args.copy_layers:
            for layer in args.copy_layers:
                # Prefixing the keys with the layer name lets the sub-module's
                # state load into the matching attribute of the full model.
                model.load_state_dict(getattr(base_model, layer).state_dict(prefix=f"{layer}."), strict=False)
            
    # NOTE: only pass the parameters where p.requires_grad == True to the optimizer! Important!
    optimizer = getattr(optim, args.optimizer)(
        [p for p in model.parameters() if p.requires_grad],
        **args.optim_kwargs,
    )
    assert isinstance(args.epochs, list) or isinstance(args.epochs, int)
    num_epochs, check_freeze = (args.epochs, False) if isinstance(args.epochs, int) else (max(args.epochs), True)
    criterion = nn.CrossEntropyLoss().to(device)
    scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs) if args.scheduler else None 

    # Configs without a 'score_penalty' entry used to crash with KeyError on
    # the first epoch. 0 is assumed to mean "no penalty" — TODO confirm
    # against the train() implementation in use.
    score_penalty = getattr(args, "score_penalty", 0)

    for epoch in range(1, num_epochs + 1):
        if check_freeze:
            # zip() stops at the shorter sequence, so children without a
            # matching entry in args.epochs are never frozen.
            for freeze_at_epoch, child in zip(args.epochs, model.children()):
                if freeze_at_epoch == epoch - 1:
                    child.freeze()
                    print(f"Freezing {child} before epoch {epoch}")

        print(model.fc1.scores)
        train(model, train_args.log_interval, device, train_augmented_loader, optimizer, criterion, epoch, penalty=score_penalty)
        # The last-epoch check compares against num_epochs: args.epochs may be
        # a list, in which case `epoch == args.epochs` was never true and
        # eval_on_last was silently ignored.
        if (train_args.train_eval_interval and epoch % train_args.train_eval_interval == 0) or (train_args.eval_on_last and epoch == num_epochs):
            train_acc, train_loss = test(model, device, criterion, train_loader, name="Train")
            if trial:
                trial.set_user_attr('train_acc', {**trial.user_attrs.get('train_acc', {}), **{epoch: train_acc}})
                trial.set_user_attr('train_loss', {**trial.user_attrs.get('train_loss', {}), **{epoch: train_loss}})
        if (train_args.test_eval_interval and epoch % train_args.test_eval_interval == 0) or (train_args.eval_on_last and epoch == num_epochs):
            test_acc, test_loss = test(model, device, criterion, test_loader, name="Test")
            if trial:
                trial.set_user_attr('test_acc', {**trial.user_attrs.get('test_acc', {}), **{epoch: test_acc}})
                trial.set_user_attr('test_loss', {**trial.user_attrs.get('test_loss', {}), **{epoch: test_loss}})
                trial.report(test_acc, epoch-1)
                if trial.should_prune():
                    # NOTE(review): `optuna` is not imported in any visible
                    # cell of this notebook; it must be in scope here.
                    raise optuna.exceptions.TrialPruned()

        if scheduler:
            scheduler.step()

    if args.save_name is not None:
        torch.save(model.state_dict(), os.path.join(os.environ['HOME_DIR'], \
                                                    "trained_networks", args.save_name))
    
    return model, device, train_loader, test_loader, criterion

def get_prune_mask(layer, sparsity):
    """Apply ``GetSubnet`` to the absolute scores of ``layer`` without tracking gradients.

    Args:
        layer: object exposing a ``scores`` tensor.
        sparsity: value forwarded unchanged to ``GetSubnet.apply``.

    Returns:
        Whatever ``GetSubnet.apply`` returns for the given scores and sparsity.
    """
    with torch.no_grad():
        score_magnitudes = layer.scores.abs()
        mask = GetSubnet.apply(score_magnitudes, sparsity)
    return mask

In [None]:
from directly_penalized_supermask_pruning import GetSubnet, SupermaskConv, SupermaskLinear
from directly_penalized_supermask_pruning import train, test

# # Arguments that do not affect model at all
train_args = dict(
    test_batch_size=1000,    # batch size of the test DataLoader
    data='../data',          # data root; main() appends the dataset name to it
    log_interval=500,        # forwarded to train() as its logging interval
    train_eval_interval=10,  # evaluate on the training set every N epochs
    test_eval_interval=2,    # evaluate on the test set every N epochs
    eval_on_last=True,       # also ask for an evaluation after the last epoch
)

args = dict(
    dataset="CIFAR10",
    init="signed_constant",
    batch_size=16,  # batch size of both training DataLoaders
    # A list is read by main() as per-layer freeze epochs and training runs for
    # max(list) epochs; with every entry equal to 160 no layer is frozen.
    epochs=[160] * 5,
    optimizer="SGD",  # looked up by name in torch.optim
    optim_kwargs=dict(lr=0.1, momentum=0.9, weight_decay=0.0001),
    scheduler=True,  # True enables CosineAnnealingLR (False for Adam, True for SGD)
    no_cuda=False,  # True forces CPU training
    seed=1000,  # passed to torch.manual_seed
    save_name=None,  # e.g. "conv2_frozen_sp50_rs1000"; None skips saving the model
    sparsity=[{"sparsity": 0.5} for _ in range(5)],  # one entry per layer
    copy_layers=[],  # e.g. ['conv1', 'conv2', 'fc2'] together with a base_model
    bias=False,
    score_penalty=0,  # forwarded to train() as `penalty`
)

trained_model, device, train_loader, test_loader, criterion = main(args, train_args)

Using device cuda
Files already downloaded and verified
Parameter containing:
tensor([[-0.0049, -0.0035,  0.0062,  ...,  0.0010, -0.0041,  0.0045],
        [ 0.0081,  0.0013,  0.0028,  ..., -0.0011,  0.0061,  0.0004],
        [ 0.0011,  0.0025, -0.0089,  ...,  0.0065,  0.0069, -0.0043],
        ...,
        [-0.0026,  0.0084,  0.0071,  ...,  0.0011,  0.0012,  0.0010],
        [ 0.0054, -0.0066, -0.0025,  ...,  0.0075, -0.0074, -0.0059],
        [-0.0077, -0.0006, -0.0019,  ..., -0.0049,  0.0072, -0.0077]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[-0.0033, -0.0032,  0.0067,  ...,  0.0015, -0.0036,  0.0025],
        [ 0.0023, -0.0006,  0.0063,  ...,  0.0010,  0.0031,  0.0004],
        [-0.0008,  0.0033, -0.0042,  ...,  0.0063,  0.0039, -0.0033],
        ...,
        [-0.0027,  0.0069,  0.0037,  ...,  0.0005,  0.0017, -0.0002],
        [ 0.0040, -0.0048, -0.0039,  ...,  0.0051, -0.0051, -0.0042],
        [-0.0028, -0.0010, -0.0036,  ..., -0.0052,  0.0065,

In [7]:
from directly_penalized_supermask_pruning import GetSubnet, SupermaskConv, SupermaskLinear
from directly_penalized_supermask_pruning import train, test

# # Arguments that do not affect model at all
train_args = dict(
    test_batch_size=1000,    # batch size of the test DataLoader
    data='../data',          # data root; main() appends the dataset name to it
    log_interval=500,        # forwarded to train() as its logging interval
    train_eval_interval=10,  # evaluate on the training set every N epochs
    test_eval_interval=2,    # evaluate on the test set every N epochs
    eval_on_last=True,       # also ask for an evaluation after the last epoch
)

args = dict(
    dataset="CIFAR10",
    init="signed_constant",
    batch_size=16,  # batch size of both training DataLoaders
    epochs=160,  # an int means: train for this many epochs, never freeze a layer
    optimizer="SGD",  # looked up by name in torch.optim
    optim_kwargs=dict(lr=0.1, momentum=0.9, weight_decay=0.0001),
    scheduler=True,  # True enables CosineAnnealingLR (False for Adam, True for SGD)
    no_cuda=False,  # True forces CPU training
    seed=1000,  # passed to torch.manual_seed
    save_name=None,  # e.g. "conv2_frozen_sp50_rs1000"; None skips saving the model
    sparsity=[{"sparsity": 0.5} for _ in range(5)],  # one entry per layer
    copy_layers=[],  # e.g. ['conv1', 'conv2', 'fc2'] together with a base_model
    bias=False,
    score_penalty=10,  # forwarded to train() as `penalty`
)

trained_model, device, train_loader, test_loader, criterion = main(args, train_args)

Using device cuda
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/CIFAR10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ../data/CIFAR10/cifar-10-python.tar.gz to ../data/CIFAR10
Parameter containing:
tensor([[-0.0049, -0.0035,  0.0062,  ...,  0.0010, -0.0041,  0.0045],
        [ 0.0081,  0.0013,  0.0028,  ..., -0.0011,  0.0061,  0.0004],
        [ 0.0011,  0.0025, -0.0089,  ...,  0.0065,  0.0069, -0.0043],
        ...,
        [-0.0026,  0.0084,  0.0071,  ...,  0.0011,  0.0012,  0.0010],
        [ 0.0054, -0.0066, -0.0025,  ...,  0.0075, -0.0074, -0.0059],
        [-0.0077, -0.0006, -0.0019,  ..., -0.0049,  0.0072, -0.0077]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[-1.2357, -1.2380,  1.2382,  ...,  1.2283, -1.2313,  1.2412],
        [ 1.0258,  1.0208,  1.0223,  ..., -1.0204,  1.0240,  1.0199],
        [ 1.0171,  1.0181, -1.0228,  ...,  1.0206,  1.0217, -1.0189],
        ...,
        [-0.9959,  1.0002,  0.9985,  ...,  0.9943,  0.9948,  0.9942],
        [ 1.2840, -1.2714, -1.2906,  ...,  1.2789, -1.2825, -1.2851],
        [-1.2546, -1.2527, -1.2590,  ..., -1.2

KeyboardInterrupt: ignored

In [None]:
# # Arguments that do not affect model at all
train_args = {
    "test_batch_size": 1000, # batch size of the test DataLoader
    'data': '../data', # data root; main() appends the dataset name to it
    'log_interval': 500, # forwarded to train() as its logging interval
    'train_eval_interval': 10, # evaluate on the training set every N epochs
    'test_eval_interval': 2, # evaluate on the test set every N epochs
    'eval_on_last': True
}

args = {
  "dataset": "CIFAR10",
  "init": "signed_constant",
  "batch_size": 16, # batch size of both training DataLoaders
  # Per-layer freeze epochs in the order conv1, conv2, fc1, fc2, fc3;
  # main() trains for max(list) = 160 epochs.
  "epochs": [80, 100, 160, 140, 120],
  "optimizer": "SGD",
  "optim_kwargs": {"lr": 0.1, "momentum": 0.9, "weight_decay": 0.0001},
  "scheduler": True, # False for Adam, True for SGD, does CosineAnnealing
  'no_cuda': False, # disables CUDA training
  'seed': 1000, # passed to torch.manual_seed
  'save_name': None, #"conv2_frozen_sp50_rs1000", # "simple20_rs2", # For Saving the current Model, None if not saving
  'sparsity': [{"sparsity": 0.5}, {"sparsity": 0.5}, {"sparsity": 0.5}, {"sparsity": 0.5}, {"sparsity": 0.5}], # 'how sparse is each layer'
  'copy_layers': [], # ['conv1', 'conv2', 'fc2'],
  'bias': False,
  # main() forwards model_args['score_penalty'] to train(); without this key
  # the call raised KeyError. 0 presumably disables the penalty — TODO confirm.
  'score_penalty': 0
}

trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
# # name_of_experiment = 'threshold-100epoch'
# # train_results = []
# # test_results = []
# # for rs in range(100, 105):
# #     args["seed"] = rs
# #     trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
# #     train_acc, train_loss = test(trained_model, device, criterion, train_loader, name="Train")
# #     test_acc, test_loss = test(trained_model, device, criterion, test_loader)
# #     train_results.append((train_acc, train_loss))
# #     test_results.append((test_acc, test_loss))
# #     torch.save((train_args, args, train_results, test_results), \
# #                os.path.join(os.environ["HOME_DIR"], "results", f"{name_of_experiment}_{args['dataset']}.pt"))

# # thresholds = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]
# # train_results = {x: [] for x in thresholds}
# # test_results = {x: [] for x in thresholds}
# # name_of_experiment = "fc1_thresholds"
    
# # for fc1_threshold in thresholds:
# #     for rs in range(70, 73):
# #         args["seed"] = rs
# #         args['sparsity'][2]['threshold'] = fc1_threshold
# #         print(f"----{rs}----{fc1_threshold}----")
# #         print(args['sparsity'])
# #         # args["save_name"] = f"{args['dataset']}_{rs}_{name_of_experiment}_{fc1_threshold}.pt"
# #         trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
# #         train_acc, train_loss = test(trained_model, device, criterion, train_loader)
# #         test_acc, test_loss = test(trained_model, device, criterion, test_loader)
# #         train_results[fc1_threshold].append((train_acc, train_loss))
# #         test_results[fc1_threshold].append((test_acc, test_loss))
# #         torch.save((train_args, args, train_results, test_results), \
# #                    os.path.join(os.environ["HOME_DIR"], "results", f"{args['dataset']}-{name_of_experiment}.pt"))

Using device cuda
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/CIFAR10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ../data/CIFAR10/cifar-10-python.tar.gz to ../data/CIFAR10

Test set: Average loss: 0.0014, Accuracy: 4971/10000 (50%)


Test set: Average loss: 0.0013, Accuracy: 5355/10000 (54%)


Test set: Average loss: 0.0012, Accuracy: 5900/10000 (59%)


Test set: Average loss: 0.0012, Accuracy: 5762/10000 (58%)


Train set: Average loss: 0.0732, Accuracy: 29485/50000 (59%)


Test set: Average loss: 0.0012, Accuracy: 5860/10000 (59%)


Test set: Average loss: 0.0013, Accuracy: 5541/10000 (55%)


Test set: Average loss: 0.0011, Accuracy: 6001/10000 (60%)


Test set: Average loss: 0.0012, Accuracy: 5952/10000 (60%)


Test set: Average loss: 0.0011, Accuracy: 5944/10000 (59%)


Train set: Average loss: 0.0682, Accuracy: 30860/50000 (62%)


Test set: Average loss: 0.0011, Accuracy: 6084/10000 (61%)


Test set: Average loss: 0.0012, Accuracy: 6004/10000 (60%)


Test set: Average loss: 0.0012, Accuracy: 6047/10000 (60%)


Test set: Average loss: 0.0011, Accuracy: 6207/10000 (62%)


Test set: A

In [None]:
import pickle
def save_study(study, filename):
    """Pickle ``study`` to ``$HOME_DIR/results/studies/<filename>.pickle``.

    Args:
        study: any picklable object (used below for an Optuna study).
        filename: file name without the ``.pickle`` extension.

    The target directory is created when missing; an existing file with the
    same name is overwritten.
    """
    studies_dir = os.path.join(os.environ["HOME_DIR"], "results", "studies")
    # open(..., "wb") does not create parent folders, so a fresh project
    # directory used to fail with FileNotFoundError on the first save.
    os.makedirs(studies_dir, exist_ok=True)
    with open(os.path.join(studies_dir, f"{filename}.pickle"), "wb") as f:
        pickle.dump(study, f)

In [None]:
def objective(trial):
    """Optuna objective: train one CIFAR10 model and return its test accuracy.

    Sampled per trial: four freeze epochs, the SGD learning rate, momentum and
    weight decay, and one sparsity value per layer.

    Args:
        trial: Optuna trial used for sampling; it is also handed to ``main`` so
            intermediate test accuracies can be reported for pruning.

    Returns:
        Test accuracy of the trained model, as returned by ``test``.
    """
    train_args = {
      "test_batch_size": 1000, # batch size of the test DataLoader
      'data': '../data', # data root; main() appends the dataset name to it
      'log_interval': 1000000, # forwarded to train() as its logging interval
      'train_eval_interval': 20, # evaluate on the training set every N epochs
      'test_eval_interval': 20, # evaluate on the test set every N epochs
      'eval_on_last': True
    }

    args = {
      "dataset": "CIFAR10",
      "init": "signed_constant",
      "batch_size": 64, # batch size of both training DataLoaders
      # Only four freeze epochs are sampled for a five-layer model; main() zips
      # them with model.children(), so the last layer (fc3) is never frozen.
      "epochs": [trial.suggest_int(f'layer{i}_epochs', 20, 160) for i in range(4)],
      "optimizer": "SGD",
      "optim_kwargs": {"lr": trial.suggest_float('learning_rate', 0.01, 0.5), 
                       "momentum": trial.suggest_float('momentum', 0.2, 0.95),
                       "weight_decay": trial.suggest_float('weight_decay', 0.0001, 0.001)},
      "scheduler": True, # False for Adam, True for SGD, does CosineAnnealing
      'no_cuda': False, # disables CUDA training
      'seed': 500, # passed to torch.manual_seed
      'save_name': None, # "simple20_rs2", # For Saving the current Model, None if not saving
      'sparsity': [
                   {"sparsity": trial.suggest_float('sparsity_conv1', 0.1, 0.95)}, 
                   {"sparsity": trial.suggest_float('sparsity_conv2', 0.1, 0.95)}, 
                   {"sparsity": trial.suggest_float('sparsity_fc1', 0.1, 0.95)}, 
                   {"sparsity": trial.suggest_float('sparsity_fc2', 0.1, 0.95)},
                   {"sparsity": trial.suggest_float('sparsity_fc3', 0.1, 0.95)}
                  ], # 'how sparse is each layer'
      'copy_layers': [], # ['conv1', 'conv2', 'fc2'],
      'bias': False,
      # main() forwards model_args['score_penalty'] to train(); without this
      # key every trial raised KeyError. 0 presumably disables the penalty —
      # TODO confirm.
      'score_penalty': 0
    }

    print(args)

    trained_model, device, train_loader, test_loader, criterion = main(args, train_args, trial=trial)
    # Final evaluation on both splits; only the test accuracy is optimised.
    train_acc, train_loss = test(trained_model, device, criterion, train_loader)
    test_acc, test_loss = test(trained_model, device, criterion, test_loader)

    return test_acc

In [None]:
# with open(os.path.join(os.environ["HOME_DIR"], "results", "studies", "hp_search_study.pickle"), "rb") as f:
#     study = pickle.load(f)
# NOTE(review): `optuna` is not imported in any visible cell of this notebook;
# it must already be in scope for this cell to run.
# A median pruner wrapped in a patience window of 40.
median_pruner = optuna.pruners.MedianPruner(
    n_startup_trials=15, n_warmup_steps=50, interval_steps=1
)
study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.PatientPruner(median_pruner, patience=40),
)

# Run the trials one at a time so the study is pickled after every trial.
for _trial_index in range(100):
    study.optimize(objective, n_trials=1, show_progress_bar=True)
    save_study(study, "conv2_augmented_search_rs_500")

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [156, 141, 100, 108], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.45958742350989223, 'momentum': 0.20586289441261235, 'weight_decay': 0.000940641581032237}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.5765381764615303}, {'sparsity': 0.14134903253384878}, {'sparsity': 0.7820934128266148}, {'sparsity': 0.5602424184243817}, {'sparsity': 0.3638287789179808}], 'copy_layers': [], 'bias': False}
Using device cuda
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/CIFAR10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ../data/CIFAR10/cifar-10-python.tar.gz to ../data/CIFAR10

Train set: Average loss: 0.0264, Accuracy: 20378/50000 (41%)


Test set: Average loss: 0.0017, Accuracy: 4010/10000 (40%)


Train set: Average loss: 0.0242, Accuracy: 23013/50000 (46%)


Test set: Average loss: 0.0015, Accuracy: 4583/10000 (46%)


Train set: Average loss: 0.0253, Accuracy: 21574/50000 (43%)


Test set: Average loss: 0.0016, Accuracy: 4227/10000 (42%)


Train set: Average loss: 0.0208, Accuracy: 26005/50000 (52%)


Test set: Average loss: 0.0013, Accuracy: 5148/10000 (51%)


Train set: Average loss: 0.0203, Accuracy: 26668/50000 (53%)


Test set: Average loss: 0.0013, Accuracy: 5347/10000 (53%)

Freezing SupermaskLinear(in_features=12544, out_features=256, bias=False, sparsity=0.7820934128266148) before epoch 101
Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.5602424184243817) before epoch 109

Train set: Average loss: 0.0175, Accuracy: 30368/50000 (61%)


Test set:

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [48, 153, 116, 77], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.19392713319908203, 'momentum': 0.5440907568578053, 'weight_decay': 0.0008990447248441982}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.7083618112585519}, {'sparsity': 0.4177374344237944}, {'sparsity': 0.6901192267053294}, {'sparsity': 0.3375777103605044}, {'sparsity': 0.5422276725501263}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0213, Accuracy: 26108/50000 (52%)


Test set: Average loss: 0.0014, Accuracy: 5166/10000 (52%)


Train set: Average loss: 0.0197, Accuracy: 27639/50000 (55%)


Test set: Average loss: 0.0013, Accuracy: 5527/10000 (55%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.7083618112585519) before epoch 49

Train set: Average loss: 0.0170, Accuracy: 30748/500

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [27, 40, 88, 119], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.1399020037275351, 'momentum': 0.3888026279025666, 'weight_decay': 0.0005905809990740046}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.31454342404694857}, {'sparsity': 0.2072698755731019}, {'sparsity': 0.6154815646358571}, {'sparsity': 0.10747298097678944}, {'sparsity': 0.3127737631716025}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0194, Accuracy: 28289/50000 (57%)


Test set: Average loss: 0.0013, Accuracy: 5608/10000 (56%)

Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.31454342404694857) before epoch 28

Train set: Average loss: 0.0171, Accuracy: 30611/50000 (61%)


Test set: Average loss: 0.0011, Accuracy: 6033/10000 (60%)

Freezing SupermaskConv(64, 64, kernel_size=(3, 3), s

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [109, 120, 39, 107], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.396271070297724, 'momentum': 0.20034597630563944, 'weight_decay': 0.0004426238750702677}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.525614171145173}, {'sparsity': 0.3487365468694312}, {'sparsity': 0.9313112995497825}, {'sparsity': 0.25504685488449785}, {'sparsity': 0.6010673282663183}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0253, Accuracy: 20503/50000 (41%)


Test set: Average loss: 0.0016, Accuracy: 4080/10000 (41%)

