In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project folder configured in the
# next cell lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# HOME_DIR is a path *relative* to /content. Later cells join it directly with
# "results"/"trained_networks", so those paths resolve against the current
# working directory (presumably /content on Colab — TODO confirm).
os.environ['HOME_DIR'] = 'drive/MyDrive/hidden-networks'
# !pip install -r $HOME_DIR/requirements.txt

import sys
# Make the project folder importable so `supermask_pruning` can be imported below.
sys.path.append(os.path.join('/content', os.environ['HOME_DIR']))

!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 5.2 MB/s 
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 9.4 MB/s 
[?25hCollecting alembic
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 34.1 MB/s 
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading Mako-1.2.0-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 5.4 MB/s 
Collecting stevedore>=2.0.1
  Downloading stevedore-3.5.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 5.8 MB/s 
Collecting autopage>=0.4.0
  Downloading autopage-0.5.0-py3-none-any.whl (29 kB)
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.8.1-py2.py3-none-any.whl (113 kB)
[K     |██████

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.autograd as autograd
import collections

from supermask_pruning import GetSubnet, SupermaskConv, SupermaskLinear
from supermask_pruning import train, test

class ArgClass:
    """Attribute-style view over a plain dict of settings.

    ``ArgClass({"lr": 0.1}).lr`` evaluates to ``0.1``. Keys must be strings
    because they are forwarded as keyword arguments.
    """

    def __init__(self, args):
        """Copy every ``key: value`` pair of the mapping ``args`` onto this instance."""
        self.setattrs(**args)

    def setattrs(self, **kwargs):
        """Set (or overwrite) one attribute on this instance per keyword argument."""
        for key in kwargs:
            setattr(self, key, kwargs[key])

In [4]:
class Net(nn.Module):
    """Small CNN built from supermask layers: two convolutions, then three linear layers.

    Args:
        args: settings object exposing ``bias`` and ``init`` and, optionally,
            ``sparsity`` — a list of five dicts of extra keyword arguments, one
            per layer in the order conv1, conv2, fc1, fc2, fc3. When ``sparsity``
            is absent every layer receives ``{"sparsity": 1.0}``.
        input_channels: number of channels of the input images.
        image_size: side length of the (square) input images.
        num_labels: number of output classes.
    """

    def __init__(self, args, input_channels, image_size, num_labels):
        super().__init__()

        layer_kwargs = getattr(args, "sparsity", [{"sparsity": 1.0} for _ in range(5)])
        shared = dict(bias=args.bias, init=args.init)

        self.conv1 = SupermaskConv(input_channels, 64, 3, 1, **shared, **layer_kwargs[0])
        self.conv2 = SupermaskConv(64, 64, 3, 1, **shared, **layer_kwargs[1])

        # Two unpadded 3x3 convolutions trim 4 pixels off each side length; the
        # 2x2 max-pool in forward() then quarters the number of spatial positions.
        side = image_size - 4
        flat_features = side * side * 64 // 4

        self.fc1 = SupermaskLinear(flat_features, 256, **shared, **layer_kwargs[2])
        self.fc2 = SupermaskLinear(256, 256, **shared, **layer_kwargs[3])
        self.fc3 = SupermaskLinear(256, num_labels, **shared, **layer_kwargs[4])
        self.args = args

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for the batch ``x``."""
        hidden = F.relu(self.conv1(x))
        hidden = F.max_pool2d(self.conv2(hidden), 2)
        hidden = torch.flatten(hidden, 1)
        hidden = F.relu(self.fc1(hidden))
        hidden = F.relu(self.fc2(hidden))
        return F.log_softmax(self.fc3(hidden), dim=1)

    def get_extra_state(self):
        """Expose the construction settings so they are stored alongside the state dict."""
        return self.args

    def set_extra_state(self, state):
        """Restore the construction settings from a loaded state dict."""
        self.args = state

In [5]:
# The main function runs the full training loop on a dataset of your choice
def main(model_args, train_args, base_model=None, trial=None):
    """Build a ``Net``, train it on MNIST or CIFAR10, and return it with its loaders.

    Args:
        model_args: dict of model/optimisation settings (wrapped in ``ArgClass``).
            Keys read here: ``dataset``, ``no_cuda``, ``seed``, ``batch_size``,
            ``optimizer`` (name of a class in ``torch.optim``), ``optim_kwargs``,
            ``epochs``, ``scheduler``, ``save_name`` and, optionally,
            ``copy_layers``. ``Net`` additionally reads ``bias``, ``init`` and the
            optional ``sparsity``.
            ``epochs`` is either an int (number of epochs) or a list of ints.
            In the list form, entry ``i`` is the number of epochs after which the
            ``i``-th child module of the model has ``freeze()`` called on it, and
            training runs for ``max(epochs)`` epochs.
        train_args: dict of settings that do not affect the model. Keys read:
            ``data``, ``test_batch_size``, ``log_interval``,
            ``train_eval_interval``, ``test_eval_interval``, ``eval_on_last``.
        base_model: optional model whose layers named in ``copy_layers`` are
            copied into the new model before training.
        trial: optional Optuna trial. When given, evaluation metrics are stored
            as user attributes, test accuracy is reported, and pruning is honoured.

    Returns:
        Tuple ``(model, device, train_loader, test_loader, criterion)`` where
        ``train_loader`` is the un-augmented training loader.

    Raises:
        ValueError: for an unsupported dataset, or when exactly one of
            ``copy_layers`` (non-empty) and ``base_model`` is supplied.
        optuna.exceptions.TrialPruned: when ``trial`` asks for pruning.
    """
    args = ArgClass(model_args)
    train_args = ArgClass(train_args)
    dataset = args.dataset

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Using device {device}")

    # `transform` is the plain evaluation pipeline; `train_transform` may add
    # augmentation and is only used by the loader that feeds train().
    transform = None
    if dataset == "MNIST":
        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.1307,), (0.3081,))
                                        ])
        train_transform = transform
        input_channels, image_size, num_labels = 1, 28, 10
    elif dataset == "CIFAR10":
        train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4),
                                              transforms.RandomHorizontalFlip(),
                                              transforms.ToTensor(),
                                              transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                                              ])
        transform = transforms.Compose([transforms.ToTensor(),
                                        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                                        ])
        input_channels, image_size, num_labels = 3, 32, 10
    else:
        raise ValueError("Only supported datasets are CIFAR10 and MNIST currently.")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    data_root = os.path.join(train_args.data, dataset)
    dataset_cls = getattr(datasets, dataset)
    # Only this first dataset passes download=True; the two below reuse the files.
    train_loader = torch.utils.data.DataLoader(
        dataset_cls(data_root, train=True, download=True, transform=transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    train_augmented_loader = torch.utils.data.DataLoader(
        dataset_cls(data_root, train=True, transform=train_transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        dataset_cls(data_root, train=False, transform=transform),
        batch_size=train_args.test_batch_size, shuffle=True, **kwargs)

    model = Net(args, input_channels, image_size, num_labels).to(device)

    if getattr(args, "copy_layers", None) is not None:
        # Exactly one of (non-empty copy_layers, base_model) being set is an error.
        if (bool(args.copy_layers) ^ (base_model is not None)):
            raise ValueError("copy_layers arg must be None or [] if base_model is not specified")
        if base_model is not None and args.copy_layers:
            for layer in args.copy_layers:
                # strict=False: each call only carries the keys of a single layer.
                model.load_state_dict(getattr(base_model, layer).state_dict(prefix=f"{layer}."), strict=False)

    # NOTE: only pass the parameters where p.requires_grad == True to the optimizer! Important!
    optimizer = getattr(optim, args.optimizer)(
        [p for p in model.parameters() if p.requires_grad],
        **args.optim_kwargs,
    )
    assert isinstance(args.epochs, list) or isinstance(args.epochs, int)
    num_epochs, check_freeze = (args.epochs, False) if isinstance(args.epochs, int) else (max(args.epochs), True)
    criterion = nn.CrossEntropyLoss().to(device)
    scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs) if args.scheduler else None

    for epoch in range(1, num_epochs + 1):
        if check_freeze:
            # zip() stops at the shorter sequence, so children beyond
            # len(args.epochs) are never frozen.
            for freeze_at_epoch, child in zip(args.epochs, model.children()):
                if freeze_at_epoch == epoch - 1:
                    child.freeze()
                    print(f"Freezing {child} before epoch {epoch}")

        train(model, train_args.log_interval, device, train_augmented_loader, optimizer, criterion, epoch)

        # Compare against num_epochs, not args.epochs: args.epochs may be a list,
        # in which case `epoch == args.epochs` is never true and the final-epoch
        # evaluation would silently be skipped.
        eval_last = train_args.eval_on_last and epoch == num_epochs

        if (train_args.train_eval_interval and epoch % train_args.train_eval_interval == 0) or eval_last:
            train_acc, train_loss = test(model, device, criterion, train_loader, name="Train")
            if trial:
                trial.set_user_attr('train_acc', {**trial.user_attrs.get('train_acc', {}), **{epoch: train_acc}})
                trial.set_user_attr('train_loss', {**trial.user_attrs.get('train_loss', {}), **{epoch: train_loss}})
        if (train_args.test_eval_interval and epoch % train_args.test_eval_interval == 0) or eval_last:
            test_acc, test_loss = test(model, device, criterion, test_loader, name="Test")
            if trial:
                trial.set_user_attr('test_acc', {**trial.user_attrs.get('test_acc', {}), **{epoch: test_acc}})
                trial.set_user_attr('test_loss', {**trial.user_attrs.get('test_loss', {}), **{epoch: test_loss}})
                # Optuna steps are reported 0-based.
                trial.report(test_acc, epoch-1)
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()

        if scheduler:
            scheduler.step()

    if args.save_name is not None:
        torch.save(model.state_dict(), os.path.join(os.environ['HOME_DIR'],
                                                    "trained_networks", args.save_name))

    return model, device, train_loader, test_loader, criterion

def get_prune_mask(layer, sparsity):
    """Return ``GetSubnet.apply`` evaluated on the absolute values of ``layer.scores``.

    The computation runs under ``torch.no_grad()``, so no autograd graph is
    recorded for it.

    Args:
        layer: object exposing a ``scores`` tensor (presumably a supermask
            layer — verify against callers).
        sparsity: forwarded unchanged as the second argument of ``GetSubnet.apply``.
    """
    with torch.no_grad():
        score_magnitudes = torch.abs(layer.scores)
        mask = GetSubnet.apply(score_magnitudes, sparsity)
    return mask

In [6]:
# # Arguments that do not affect model at all
# train_args = {
#     "test_batch_size": 1000, # input batch size for testing (default: 1000)
#     'data': '../data', # Location to store data (e.g. MNIST)
#     'log_interval': 500, # how many batches to wait before logging training status
#     'train_eval_interval': 5, # epoch interval at which to print training accuracy
#     'test_eval_interval': 5, # epoch interval at which to print test accuracy
#     'eval_on_last': False
# }

# args = {
#   "dataset": "CIFAR10",
#   "init": "signed_constant",
#   "batch_size": 64, # input batch size for training (default: 64)
#   "epochs": 40, # number of epochs to train (default: 14)
#   "optimizer": "SGD",
#   "optim_kwargs": {"lr": 0.1, "momentum": 0.9, "weight_decay": 0.0005},
#   "scheduler": True, # False for Adam, True for SGD, does CosineAnnealing
#   'no_cuda': False, # disables CUDA training
#   'seed': 1000, # random seed (default: 1)
#   'save_name': None, # "simple20_rs2", # For Saving the current Model, None if not saving
#   'sparsity': [{"sparsity": 0.5}, {"sparsity": 0.5}, {"sparsity": 0.5}, {"sparsity": 0.5}, {"sparsity": 0.5}], # 'how sparse is each layer'
#   'copy_layers': [], # ['conv1', 'conv2', 'fc2'],
#   'bias': False
# }

# trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
# # name_of_experiment = 'threshold-100epoch'
# # train_results = []
# # test_results = []
# # for rs in range(100, 105):
# #     args["seed"] = rs
# #     trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
# #     train_acc, train_loss = test(trained_model, device, criterion, train_loader, name="Train")
# #     test_acc, test_loss = test(trained_model, device, criterion, test_loader)
# #     train_results.append((train_acc, train_loss))
# #     test_results.append((test_acc, test_loss))
# #     torch.save((train_args, args, train_results, test_results), \
# #                os.path.join(os.environ["HOME_DIR"], "results", f"{name_of_experiment}_{args['dataset']}.pt"))

# # thresholds = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]
# # train_results = {x: [] for x in thresholds}
# # test_results = {x: [] for x in thresholds}
# # name_of_experiment = "fc1_thresholds"
    
# # for fc1_threshold in thresholds:
# #     for rs in range(70, 73):
# #         args["seed"] = rs
# #         args['sparsity'][2]['threshold'] = fc1_threshold
# #         print(f"----{rs}----{fc1_threshold}----")
# #         print(args['sparsity'])
# #         # args["save_name"] = f"{args['dataset']}_{rs}_{name_of_experiment}_{fc1_threshold}.pt"
# #         trained_model, device, train_loader, test_loader, criterion = main(args, train_args)
# #         train_acc, train_loss = test(trained_model, device, criterion, train_loader)
# #         test_acc, test_loss = test(trained_model, device, criterion, test_loader)
# #         train_results[fc1_threshold].append((train_acc, train_loss))
# #         test_results[fc1_threshold].append((test_acc, test_loss))
# #         torch.save((train_args, args, train_results, test_results), \
# #                    os.path.join(os.environ["HOME_DIR"], "results", f"{args['dataset']}-{name_of_experiment}.pt"))

In [7]:
import pickle
def save_study(study, filename):
    """Pickle ``study`` to ``$HOME_DIR/results/studies/<filename>.pickle``.

    The target directory is created when missing, so a finished trial is not
    lost to a ``FileNotFoundError`` on the first save. An existing file with
    the same name is overwritten.

    Args:
        study: any picklable object (here an Optuna study).
        filename: file name without the ``.pickle`` extension.
    """
    studies_dir = os.path.join(os.environ["HOME_DIR"], "results", "studies")
    os.makedirs(studies_dir, exist_ok=True)
    with open(os.path.join(studies_dir, f"{filename}.pickle"), "wb") as f:
        pickle.dump(study, f)

In [8]:
def objective(trial):
    """Optuna objective: sample hyperparameters, train on CIFAR10, return test accuracy.

    Sampled from ``trial``, in this order: four per-layer epoch counts
    (``layer{i}_epochs``), the SGD learning rate, momentum and weight decay,
    and one sparsity value per layer (conv1, conv2, fc1, fc2, fc3).

    Args:
        trial: the Optuna trial providing the ``suggest_*`` methods; it is also
            forwarded to ``main``.

    Returns:
        The accuracy reported by ``test`` on the test loader after training.
    """
    # Arguments that do not affect the model itself.
    train_args = dict(
        test_batch_size=1000,      # input batch size for testing
        data='../data',            # location to store data
        log_interval=1000000,      # how many batches to wait before logging training status
        train_eval_interval=20,    # epoch interval at which to print training accuracy
        test_eval_interval=20,     # epoch interval at which to print test accuracy
        eval_on_last=True,
    )

    # Hyperparameters are drawn in the same order in which they appear in the
    # args dict below.
    layer_epochs = []
    for i in range(4):
        layer_epochs.append(trial.suggest_int(f'layer{i}_epochs', 20, 160))

    lr = trial.suggest_float('learning_rate', 0.01, 0.5)
    momentum = trial.suggest_float('momentum', 0.2, 0.95)
    weight_decay = trial.suggest_float('weight_decay', 0.0001, 0.001)

    layer_sparsities = []
    for param_name in ('sparsity_conv1', 'sparsity_conv2', 'sparsity_fc1',
                       'sparsity_fc2', 'sparsity_fc3'):
        layer_sparsities.append({"sparsity": trial.suggest_float(param_name, 0.1, 0.95)})

    args = {
        "dataset": "CIFAR10",
        "init": "signed_constant",
        "batch_size": 64,            # input batch size for training
        "epochs": layer_epochs,      # list form: one entry per layer, see main()
        "optimizer": "SGD",
        "optim_kwargs": {"lr": lr, "momentum": momentum, "weight_decay": weight_decay},
        "scheduler": True,           # False for Adam, True for SGD, does CosineAnnealing
        'no_cuda': False,            # disables CUDA training
        'seed': 500,                 # random seed
        'save_name': None,           # file name for saving the model, None if not saving
        'sparsity': layer_sparsities,
        'copy_layers': [],           # e.g. ['conv1', 'conv2', 'fc2']
        'bias': False,
    }

    print(args)

    trained_model, device, train_loader, test_loader, criterion = main(args, train_args, trial=trial)

    # The training-set evaluation result is not used for the return value.
    train_acc, train_loss = test(trained_model, device, criterion, train_loader)
    test_acc, test_loss = test(trained_model, device, criterion, test_loader)
    return test_acc

In [None]:
# Resume the search from a study pickled by save_study().
# NOTE(review): pickle.load can execute arbitrary code — only load study files
# you created yourself.
with open(os.path.join(os.environ["HOME_DIR"], "results", "studies", "conv2_augmented_search_rs_500.pickle"), "rb") as f:
    study = pickle.load(f)
# Commented-out alternative: create a fresh study instead of resuming one.
# study = optuna.create_study(direction='maximize', 
#                             pruner=optuna.pruners.PatientPruner(
#                                 optuna.pruners.MedianPruner(n_startup_trials=15, n_warmup_steps=50, interval_steps=1), 
#                                 patience=40
#                                 )
#                             )
# Run up to 100 trials one at a time, writing the study back to disk after each
# trial so the file on disk always includes the most recently finished trial.
for _ in range(100):
    study.optimize(objective, n_trials=1, show_progress_bar=True)
    save_study(study, "conv2_augmented_search_rs_500")

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [109, 120, 39, 107], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.396271070297724, 'momentum': 0.20034597630563944, 'weight_decay': 0.0004426238750702677}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.525614171145173}, {'sparsity': 0.3487365468694312}, {'sparsity': 0.9313112995497825}, {'sparsity': 0.25504685488449785}, {'sparsity': 0.6010673282663183}], 'copy_layers': [], 'bias': False}
Using device cuda
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data/CIFAR10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ../data/CIFAR10/cifar-10-python.tar.gz to ../data/CIFAR10

Train set: Average loss: 0.0246, Accuracy: 21432/50000 (43%)


Test set: Average loss: 0.0016, Accuracy: 4211/10000 (42%)

Freezing SupermaskLinear(in_features=12544, out_features=256, bias=False, sparsity=0.9313112995497825) before epoch 40

Train set: Average loss: 0.0194, Accuracy: 27978/50000 (56%)


Test set: Average loss: 0.0012, Accuracy: 5630/10000 (56%)


Train set: Average loss: 0.0192, Accuracy: 28158/50000 (56%)


Test set: Average loss: 0.0012, Accuracy: 5594/10000 (56%)


Train set: Average loss: 0.0185, Accuracy: 28986/50000 (58%)


Test set: Average loss: 0.0012, Accuracy: 5832/10000 (58%)


Train set: Average loss: 0.0181, Accuracy: 29598/50000 (59%)


Test set: Average loss: 0.0012, Accuracy: 5910/10000 (59%)

Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.25504685488449785) before epoch 108
Freezing SupermaskConv(3, 64, kernel_size=(3, 3), stride=(1, 1), bias=Fal

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [129, 38, 66, 31], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.050155182527061686, 'momentum': 0.3724449929581699, 'weight_decay': 0.0004064172200979968}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.3930152916929157}, {'sparsity': 0.4176712279819478}, {'sparsity': 0.7815619556267753}, {'sparsity': 0.3786923023060079}, {'sparsity': 0.7247892523217693}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0198, Accuracy: 27543/50000 (55%)


Test set: Average loss: 0.0013, Accuracy: 5405/10000 (54%)

Freezing SupermaskLinear(in_features=256, out_features=256, bias=False, sparsity=0.3786923023060079) before epoch 32
Freezing SupermaskConv(64, 64, kernel_size=(3, 3), stride=(1, 1), bias=False, sparsity=0.4176712279819478) before epoch 39

Train set: Average loss: 0.0163, Accuracy: 31873/50000 (64

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [87, 101, 25, 114], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.2953074015686102, 'momentum': 0.26695095873566516, 'weight_decay': 0.00021563869433279795}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.47096792587084246}, {'sparsity': 0.5960650775135269}, {'sparsity': 0.8850761407310832}, {'sparsity': 0.21388370448342367}, {'sparsity': 0.6397382569637672}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0196, Accuracy: 27949/50000 (56%)


Test set: Average loss: 0.0013, Accuracy: 5531/10000 (55%)

Freezing SupermaskLinear(in_features=12544, out_features=256, bias=False, sparsity=0.8850761407310832) before epoch 26

Train set: Average loss: 0.0179, Accuracy: 29697/50000 (59%)


Test set: Average loss: 0.0012, Accuracy: 5936/10000 (59%)


Train set: Average loss: 0.0196, Accuracy: 27733/500

  self._init_valid()


  0%|          | 0/1 [00:00<?, ?it/s]

{'dataset': 'CIFAR10', 'init': 'signed_constant', 'batch_size': 64, 'epochs': [52, 76, 56, 57], 'optimizer': 'SGD', 'optim_kwargs': {'lr': 0.3490578356374213, 'momentum': 0.6107788651546929, 'weight_decay': 0.00011775031737854439}, 'scheduler': True, 'no_cuda': False, 'seed': 500, 'save_name': None, 'sparsity': [{'sparsity': 0.9268817653484971}, {'sparsity': 0.9336765980273081}, {'sparsity': 0.10574738922176485}, {'sparsity': 0.6761214324548408}, {'sparsity': 0.11600309578778711}], 'copy_layers': [], 'bias': False}
Using device cuda
Files already downloaded and verified

Train set: Average loss: 0.0229, Accuracy: 24267/50000 (49%)


Test set: Average loss: 0.0015, Accuracy: 4839/10000 (48%)

