<a href="https://colab.research.google.com/github/marsggbo/AutoMLDemos/blob/master/ch7/RandomNAS_MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies and Import Modules

You should restart the runtime after running the following pip commands so that the newly installed package versions are picked up.

In [2]:
!pip install hyperbox==1.3.1
!pip install pytorch-lightning==1.8.6

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hyperbox
  Downloading hyperbox-1.3.0-py3-none-any.whl (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 KB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hydra-core>=1.2
  Downloading hydra_core-1.3.1-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 KB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning>=1.5
  Downloading pytorch_lightning-1.8.6-py3-none-any.whl (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.3/800.3 KB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
Collecting omegaconf<2.4,>=2.2
  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting antlr4-python3-runtime==4.9.*
  Downloading antlr4-p

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rich
  Downloading rich-13.1.0-py3-none-any.whl (238 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 KB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.13.9-py2.py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting loguru
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 KB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ..

In [3]:
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

from hyperbox.mutables import spaces, ops
from hyperbox.mutator import RandomMutator
from hyperbox.networks.base_nas_network import BaseNASNetwork


# Arguments

In [4]:
def get_args(argv=None):
    """Build the configuration namespace for the MNIST NAS demo.

    Args:
        argv: Optional sequence of command-line style tokens, e.g.
            ``['--epochs', '5']``. ``None`` (the default) parses an empty
            list, so every option takes its default value and the kernel's
            own ``sys.argv`` is never consulted.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    # Help text previously advertised 14 although the default is 2.
    parser.add_argument('--epochs', type=int, default=2, metavar='N',
                        help='number of epochs to train (default: 2)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    # An explicit list keeps argparse away from the notebook kernel's sys.argv.
    args = parser.parse_args(args=[] if argv is None else list(argv))
    return args

# Parse the default configuration once; later cells read `args` as shared state.
args = get_args()
print(args)

Namespace(batch_size=64, dry_run=False, epochs=2, gamma=0.7, log_interval=100, lr=1.0, no_cuda=False, no_mps=False, save_model=False, seed=1, test_batch_size=1000)


# Neural Architecture

In [5]:
class Net(BaseNASNetwork):
    """MNIST classifier whose two convolution layers are searchable.

    ``conv1`` and ``conv2`` are each an ``OperationSpace`` offering a 3x3, 5x5
    and 7x7 convolution; the remaining layers (dropout and two linear layers)
    are fixed. ``forward`` returns log-probabilities over the 10 classes.

    Args:
        mask: Optional architecture mask forwarded to ``BaseNASNetwork`` and to
            every ``OperationSpace``.
    """

    def __init__(self, mask=None):
        super().__init__(mask)
        kernel_sizes = (3, 5, 7)

        # Stride 1 with padding k // 2 keeps the spatial size unchanged for
        # every candidate, so the candidates are interchangeable.
        self.conv1 = spaces.OperationSpace(
            candidates=[nn.Conv2d(1, 32, k, 1, k // 2) for k in kernel_sizes],
            key='conv1',
            mask=self.mask,
        )
        self.conv2 = spaces.OperationSpace(
            candidates=[nn.Conv2d(32, 64, k, 1, k // 2) for k in kernel_sizes],
            key='conv2',
            mask=self.mask,
        )

        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 12544 = 64 channels * 14 * 14, i.e. a 28x28 input after one 2x2 max-pool.
        self.fc1 = nn.Linear(12544, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images ``x``."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))
        hidden = F.relu(self.fc1(torch.flatten(features, 1)))
        logits = self.fc2(self.dropout2(hidden))
        return F.log_softmax(logits, dim=1)

Test `Net`: sample a random architecture and run a forward pass on a dummy batch.

In [6]:
# Smoke test: push a dummy batch of two 1x28x28 images through a freshly built Net.
x = torch.rand(2,1,28,28)
net = Net()
rm = RandomMutator(net)
# presumably samples one candidate per OperationSpace — TODO confirm against hyperbox docs
rm.reset()
y = net(x)
# NOTE(review): `_cache` is a private attribute; the printed output shows it
# holding one boolean mask per key ('conv1', 'conv2').
arch = rm._cache
print(f"arch={arch}")
print(net)
print(y.shape)


arch={'conv1': tensor([False,  True, False]), 'conv2': tensor([ True, False, False])}
Net(
  (conv1): OperationSpace(
    (candidates): ModuleList(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
      (2): Conv2d(1, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
    )
  )
  (conv2): OperationSpace(
    (candidates): ModuleList(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
      (2): Conv2d(32, 64, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
    )
  )
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=12544, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)
torch.Size([2, 10])


# Prepare MNIST Dataset

In [17]:

# Convert images to tensors and normalise with the usual MNIST mean / std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

full_train_set = datasets.MNIST('./data', download=True, train=True, transform=transform)
# This cell runs before the global torch.manual_seed call, so the split gets its
# own seeded generator; otherwise the train/val partition differs on every run.
split_generator = torch.Generator().manual_seed(args.seed)
train_set, val_set = torch.utils.data.random_split(
    full_train_set, [50000, 10000], generator=split_generator)
test_set = datasets.MNIST('./data', download=True, train=False, transform=transform)
print(len(train_set))
print(len(val_set))
print(len(test_set))

50000
10000
10000


In [20]:
# Peek at the first training sample: image tensor type/shape and its label.
x,y = train_set[0]
print(type(x), x.shape)
print(y)

<class 'torch.Tensor'> torch.Size([1, 28, 28])
9


In [22]:
# Same check on the first test sample.
x,y = test_set[0]
print(type(x), x.shape)
print(y)

<class 'torch.Tensor'> torch.Size([1, 28, 28])
7


In [23]:
torch.manual_seed(args.seed)

# Always bind `device`: later cells call `.to(device)` whether or not a GPU exists.
# NOTE(review): args.no_mps is parsed but not consulted here; only CUDA and CPU
# are selected.
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

train_kwargs = {'batch_size': args.batch_size, 'shuffle': True}
test_kwargs = {'batch_size': args.test_batch_size, 'shuffle': False}
if use_cuda:
    # A worker process and pinned host memory are only enabled for CUDA runs.
    cuda_kwargs = {'num_workers': 1,
                   'pin_memory': True}
    train_kwargs.update(cuda_kwargs)
    test_kwargs.update(cuda_kwargs)

train_loader = torch.utils.data.DataLoader(train_set, **train_kwargs)
# Validation and test loaders share the non-shuffled evaluation settings.
val_loader = torch.utils.data.DataLoader(val_set, **test_kwargs)
test_loader = torch.utils.data.DataLoader(test_set, **test_kwargs)


## Test the dataloader

In [25]:
# Print the tensor shapes of the first three test batches, then stop.
for batch_id, (imgs, labels) in enumerate(test_loader):
    if batch_id > 2:
        break
    print(imgs.shape, labels.shape)

torch.Size([1000, 1, 28, 28]) torch.Size([1000])
torch.Size([1000, 1, 28, 28]) torch.Size([1000])
torch.Size([1000, 1, 28, 28]) torch.Size([1000])


# Train & Test functions

In [26]:
def train(args, model, mutator, device, train_loader, optimizer, epoch):
    """Train ``model`` for one epoch, optionally re-sampling the architecture per batch.

    Args:
        args: Namespace providing ``log_interval`` (batches between log lines)
            and ``dry_run`` (stop after the first batch).
        model: Network to optimise. Its output is fed to ``F.nll_loss``, so it
            must produce log-probabilities.
        mutator: Object whose ``reset()`` is called before every forward pass,
            or ``None`` to train the model as-is.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, target)`` batches; ``len()`` and
            ``.dataset`` are only needed when a log line is printed.
        optimizer: Optimizer stepping the model parameters.
        epoch: Epoch number, used only in the log line.
    """
    model.train()
    samples_seen = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad(set_to_none=True)
        if mutator is not None:
            mutator.reset()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        # Count completed work so the progress figures include the current batch
        # and stay correct when the final batch is smaller than the others.
        samples_seen += len(data)
        batches_done = batch_idx + 1
        if batches_done % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, samples_seen, len(train_loader.dataset),
                100. * batches_done / len(train_loader), loss.item()))
        # A dry run is a single pass: stop after the first batch regardless of
        # the logging interval.
        if args.dry_run:
            break


def test(model, device, test_loader, verbose=True):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)

    if verbose:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset), accuracy))
    return test_loss, accuracy


# Search

In [27]:
# build search space for evaluation
# Enumerate every (conv1, conv2) pairing as a pair of one-hot boolean masks,
# ordered with conv1 as the outer index and conv2 as the inner index.
search_space = []
for i in range(3):
    for j in range(3):
        arch = {
            key: F.one_hot(torch.tensor(choice), num_classes=3).view(-1).bool()
            for key, choice in (('conv1', i), ('conv2', j))
        }
        search_space.append(arch)
print(f"Search space includes {len(search_space)} candidate models.")

def mask_to_arch_str(mask: dict):
    """Render a one-hot architecture mask as a readable string.

    Args:
        mask: Mapping from layer key to a one-hot boolean sequence of length 3
            (tensor, array or list); position ``i`` selects ``conv_names[i]``.

    Returns:
        The selected operation names in mapping order, each followed by
        ``", "`` — e.g. ``"conv5x5, conv3x3, "``. An empty mapping gives ``""``.

    Raises:
        ValueError: If a mask contains no true entry.
    """
    # Plain-Python lookup: this notebook never imports NumPy, so indexing an
    # array through `np` would raise NameError.
    conv_names = ('conv3x3', 'conv5x5', 'conv7x7')
    arch = ''
    for one_hot_mask in mask.values():
        # bool() normalises tensor / array scalars before searching for the hot index.
        selected = [bool(flag) for flag in one_hot_mask].index(True)
        arch += f"{conv_names[selected]}, "
    return arch

Search space includes 9 candidate models.


In [28]:

# Build the searchable network and attach a random mutator to it.
model = Net().to(device)
rm = RandomMutator(model)
optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

# Learning rate is multiplied by gamma after every epoch (step_size=1).
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
# Maps architecture identifier -> its mask, per-epoch validation accuracies and the best one.
history = {}
for epoch in range(1, args.epochs + 1):
    # train() calls rm.reset() before every batch.
    train(args, model, rm, device, train_loader, optimizer, epoch)
    # After each epoch, score every candidate in the search space on the validation split.
    for mask in search_space:
        # presumably activates the sub-architecture described by `mask` — TODO confirm
        rm.sample_by_mask(mask)
        # NOTE(review): the printed output suggests `model.arch` is a string
        # describing the active mask; it is used as the history key.
        arch = model.arch
        val_loss, val_acc = test(model, device, val_loader, False)
        if arch not in history:
            history[arch] = {
                'mask': mask,
                'acc': [val_acc],
                'best_acc': val_acc
            }
        else:
            history[arch]['acc'].append(val_acc)
            history[arch]['best_acc'] = max(history[arch]['acc'])
        print(f"{arch} acc={val_acc} loss={val_loss}")
    scheduler.step()

# Optionally persist the model's state_dict once the search is finished.
if args.save_model:
    torch.save(model.state_dict(), "mnist_cnn_nas.pt")

conv1-conv1:tensor([ True, False, False])
conv2-conv2:tensor([ True, False, False])
 acc=96.88 loss=0.09886496887207032
conv1-conv1:tensor([ True, False, False])
conv2-conv2:tensor([False,  True, False])
 acc=95.59 loss=0.14767451248168945
conv1-conv1:tensor([ True, False, False])
conv2-conv2:tensor([False, False,  True])
 acc=96.16 loss=0.12815116500854493
conv1-conv1:tensor([False,  True, False])
conv2-conv2:tensor([ True, False, False])
 acc=96.86 loss=0.10230393753051757
conv1-conv1:tensor([False,  True, False])
conv2-conv2:tensor([False,  True, False])
 acc=96.56 loss=0.11312274322509766
conv1-conv1:tensor([False,  True, False])
conv2-conv2:tensor([False, False,  True])
 acc=95.85 loss=0.1292439224243164
conv1-conv1:tensor([False, False,  True])
conv2-conv2:tensor([ True, False, False])
 acc=96.94 loss=0.10101297073364258
conv1-conv1:tensor([False, False,  True])
conv2-conv2:tensor([False,  True, False])
 acc=96.92 loss=0.10152501525878906
conv1-conv1:tensor([False, False,  True])

# Export the best model

In [29]:
# Pick the mask with the highest recorded validation accuracy; the first entry
# wins ties, and `mask` stays None if no accuracy exceeds 0.
best_acc, mask = 0, None
for info in history.values():
    if info['best_acc'] > best_acc:
        best_acc, mask = info['best_acc'], info['mask']

print(f"The best arch is {mask} with acc {best_acc}.")

The best arch is {'conv1': tensor([False,  True, False]), 'conv2': tensor([False,  True, False])} with acc 97.95.


In [30]:
# Build a standalone network for the selected mask and score it on the test set.
subnet = model.build_subnet(mask).to(device)
# NOTE(review): these are test-set metrics, yet they overwrite the val_loss / val_acc
# names used during the search — consider renaming.
val_loss, val_acc = test(subnet, device, test_loader, True)


Test set: Average loss: 0.0528, Accuracy: 9832/10000 (98%)



In [31]:
# Show the exported sub-network's module structure.
print(subnet)

Net(
  (conv1): OperationSpace(key='conv1', value=Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)))
  (conv2): OperationSpace(key='conv2', value=Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2)))
  (dropout1): Dropout(p=0.25, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=12544, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)
