# Read Me

    This script trains batches of multi-layer perceptrons with user-specified structure and digitization options. In order to find the model most resilient to photon shot noise, the training hyperparameters were randomly sampled and a mixture of training strategies (e.g., data augmentation, random digitization etc.) were employed. The best model was selected by testing models against simulated photon shot noise, which is given an example of in ./model_evaluation_shot_noise_sim.ipynb. The script requires package ray and multiple GPUs for parallel training of multiple models, optuna package for parameter searching, and wandb package to log the training results. This is the original script that resulted in the model used in the experiment, which is ./RA_4bit_H2_100_100_lr_0.043_0.50_m_0.87_wep_6_randActDigi_v80_ep97.pt
    
    For the minimalist training script using only Pytorch (without hyperparameter searching or results logging), please see ./main_mnist_mlp_QAT.py

# Load Libraries

In [1]:
from __future__ import print_function
import os, sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import optuna

import wandb # logging training files, optional. If not available, set args.wandb = False
from torchvision import datasets, transforms

In [2]:
# Load functions for multiple GPU parallelization
exec(open("../ana_lib/gpu_par.py").read())

2020-11-17 12:33:40,059	INFO services.py:1166 -- View the Ray dashboard at http://127.0.0.1:8265


# Overall Training Structure

There are three nested loops in a neural architecture search project:

    loop around sets of hyperparamters:
        loop around epoches for training of a model of a particular set of hyperparameters:
            loop around mini-batches in an epoch:

In [3]:
""" Training and hyperparameter search configurations """

parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=100, metavar='N',
                    help='number of epochs to train (default: 100)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--gpus', default=0,
                    help='gpus used for training - e.g 0,1,3')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
parser.add_argument('--wandb', action='store_true', default=True, 
                    help='enables wandb logger')  
parser.add_argument('--csv', action='store_true', default=False, 
                    help='enables csv logger')  
args = parser.parse_args("")
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Set random seeds to reproduce results
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [4]:
""" Prepare data loaders """

def conv1d(NCHW_tensor):
    conv_ker = torch.tensor([[0.05, 0.1, 0.05], [0.1, 1, 0.1], [0.05, 0.1, 0.05]])
    conv_ker = conv_ker.view(1, 1, conv_ker.size(0), conv_ker.size(1))
    img_conv1d = F.conv1d(NCHW_tensor.unsqueeze(0), conv_ker, padding=1).squeeze(0)
    return img_conv1d/img_conv1d.max()

# Data Augmentation with random affine transformation and 2D convolution
transforms_distort = transforms.Compose([transforms.RandomAffine(5, translate=(0.04, 0.04), scale=(0.96, 1.04)), \
                                         transforms.ToTensor(), \
                                         transforms.Lambda(conv1d)])

kwargs = {'num_workers': 20, 'pin_memory': True} if args.cuda else {}

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./ML_data', train=True, download=True,
                   transform=transforms_distort),
    batch_size=args.batch_size, shuffle=True, **kwargs)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./ML_data', train=False, 
                   transform=transforms_distort),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)

In [5]:
""" Definition of digitized fully connected layers """

def Digitize(tensor, quant_mode='det', levels=16, min_val=None, max_val=None):
    if not min_val and not max_val:
        min_val, max_val = tensor.min(), tensor.max()
    tensor.clamp_(min_val, max_val).add_(-1*min_val).mul_(levels-1).div_(max_val-min_val)
    if quant_mode == "det": 
        tensor.round_()
    elif quant_mode == "rand":
        tensor.add_(torch.rand(tensor.size(), device=tensor.device).add_(-0.5)).round_()
    tensor.mul_(max_val-min_val).div_(levels-1).add_(min_val)
    return tensor

class DigitizeLinear(nn.Linear):

    def __init__(self,  *kargs, a_quant_mode="det", w_quant_mode="det", a_quant_levels=16, w_quant_levels=32, running_weight=0.001, **kwargs):
        super(DigitizeLinear, self).__init__(*kargs, **kwargs)
        self.act_quant_mode = a_quant_mode
        self.weight_quant_mode = w_quant_mode
        self.register_buffer("act_quant_levels", torch.tensor(a_quant_levels))
        self.register_buffer("weight_quant_levels", torch.tensor(w_quant_levels))
        self.register_buffer("running_weight", torch.tensor(running_weight)) 
        self.register_buffer("running_min", None)
        self.register_buffer("running_max", None)

    def forward(self, input):

        if not self.weight_quant_mode is None: # Set a flag to control weight digitization.
            if not hasattr(self.weight,'org'):
                self.weight.org=self.weight.data.clone()
            self.weight.data=Digitize(self.weight.data, quant_mode=self.weight_quant_mode, levels=self.weight_quant_levels)

        if not self.bias is None:
            self.bias.org=self.bias.data.clone()    
        out = nn.functional.linear(input, self.weight, bias=self.bias)

        if not self.act_quant_mode is None: # A flag to control output digitization. 
            if self.training: # Update the running average of min and max only during training
                with torch.no_grad():
                    if not self.running_min and not self.running_max:
                        self.running_min, self.running_max = out.min(), out.max()
                    self.running_min = (1-self.running_weight) * self.running_min + self.running_weight * out.min()
                    self.running_max = (1-self.running_weight) * self.running_max + self.running_weight * out.max()
            out.data=Digitize(out.data, quant_mode=self.act_quant_mode, levels=self.act_quant_levels, min_val=self.running_min, max_val=self.running_max)
    
        return out

In [6]:
""" Definition of QAT NN structure """

class Net(nn.Module):
    def __init__(self, Nunits, **kwargs):
        super().__init__()
        self.fcs = nn.ModuleList([DigitizeLinear(i,j,**kwargs) for i, j in zip(Nunits[:-1], Nunits[1:])])

    def forward(self, X):
        X = X.view(X.size(0), -1)
        for i, fc in enumerate(self.fcs):
            X = fc(X)
            if fc is not self.fcs[-1]:
                X = F.relu(X)
        return X
    
    def set_digitize_config(self, a_quant_mode, w_quant_mode, a_quant_levels, w_quant_levels):
        for fc in self.fcs:
            fc.act_quant_mode = a_quant_mode
            fc.weight_quant_mode = w_quant_mode
            fc.act_quant_levels = torch.tensor(a_quant_levels)
            fc.weight_quant_levels = torch.tensor(w_quant_levels)
        

In [7]:
 """ helper functions and classes """
    
# A manager for dynamical book-keeping of the top k accuracies and model checkpoints during training
class top_k_manager(object):
    def __init__(self, k=10):
        self.k_best = k
        self.top_k_metric =[0.0]*self.k_best
        self.top_k_paths = [""]*self.k_best
    
    # Compare the new_metric to the top k metrics in the past, and find its place.
    def update_rank(self, new_metric, path_keeping):
        for rank, record_metric in enumerate(self.top_k_metric):
            if record_metric <= new_metric:    
                if os.path.exists(self.top_k_paths[-1]):
                    os.remove(self.top_k_paths[-1])
                if rank < self.k_best - 1:
                    self.top_k_metric[rank+1:] = self.top_k_metric[rank:-1]
                    self.top_k_paths[rank+1:] = self.top_k_paths[rank:-1]  
                self.top_k_metric[rank] = new_metric
                self.top_k_paths[rank] = path_keeping
                return True # the top k list has been updated
        return False
    
# A simple hook class that returns the input and output of a layer during forward/backward pass
class Hook():
    def __init__(self, module, backward=False):
        if backward==False:
            self.hook = module.register_forward_hook(self.hook_fn)
        else:
            self.hook = module.register_backward_hook(self.hook_fn)
    def hook_fn(self, module, input, output):
        self.input = input
        self.output = output
    def close(self):
        self.hook.remove()

# Definition of Training and Testing Loops

    Explanation on quantization-aware training algorithm used in train():
    1. The activations are calculated with a forward passing, involving only quantized weights and activations. Meanwhile, the non-quantized version of the weights is still kept in memory for later use..
    2. The gradients are calculated with backprop based on the quantized activations and weights calucated in (1). 
    3. The non-quantized version of parameters (weights + biases) are updated with gradients, and saved without quantization. Quantizating parameters immediately after updating them can erase small updates.
    4. Quantization is only later performed on these parameters during the evaluation of activations in forward passing or errors in backprop. During these evaluation steps, a quantized copy of the non-quantized parameters are used. Meanwhile, the original non-quantized version stays unchanged until updated with the next batch of calculated gradients.
    PS: the clipping of the parameters represents the straight-through esimator across hard tanh nonlinear layers.


In [8]:
def train(epoch, model, optimizer, criterion):
    model.train()
    # Loop around mini-batches in an epoch
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        
        """ For an explanation of the parameter update below, see explanation above. """
        for p in list(model.parameters()):
            if hasattr(p,'org'):
                p.data.copy_(p.org)
        optimizer.step()
        for p in list(model.parameters()):
            if hasattr(p,'org'):
                p.org.copy_(p.data.clamp_(-1,1))
        """
        if batch_idx % args.log_interval == 0:
            print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)}"
            +f" ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")
        """
        if args.wandb:
            wandb.log({"train_loss": loss.item(), "batch": batch_idx}, step = epoch)

def test(epoch, model, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    fc_hks = [Hook(layer) for layer in model.fcs]
    # Loop around mini-batches in an epoch
    with torch.no_grad():
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            test_loss += criterion(output, target).item() # sum up batch loss
            pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    val_stats = {"val_loss": test_loss, "accuracy": accuracy}
    for i, hk in enumerate(fc_hks):
        val_stats[f"fc{i+1}"] = wandb.Histogram(fc_hks[i].output.cpu())      
            
    print(f"\nTest set: Epoch {epoch}, Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)}" 
          +f"({accuracy:.0f}%)\n")

    if args.wandb:
        wandb.log(val_stats, step=epoch)
    return test_loss, accuracy

In [10]:
""" The objective function runs a trial in a NAS study (a loop around epochs) """

def objective(trial, NAS_project_name):

    # Define the hyperparameter search space
    fc1_units = trial.suggest_categorical("fc1_units", [100, ])
    fc2_units = 200 - fc1_units # aka, the 1st and 2nd hidden layers both have 100 units.
    Nunits = [28**2, fc1_units, fc2_units, 10]
    learning_rate = trial.suggest_uniform("lr", 0.03, 0.05)
    momentum = trial.suggest_uniform("mm", 0.7, 1)
    lr_decay = trial.suggest_uniform("lr_decay", 0.3, 0.5)
    warmup_epochs = trial.suggest_categorical("warmup_eps", [6, 8, 10, 12])
    model_description = f"4bit_H2_{fc1_units}_{fc2_units}_lr_{learning_rate:.3f}_{lr_decay:.2f}" + f"_m_{momentum:.2f}" + f"_wep_{warmup_epochs}" + "_randActDigi" + f"_v_{trial.number}"

    # Instantiate a MLP model
    model = Net(Nunits, a_quant_mode=None, w_quant_mode=None, a_quant_levels=16, w_quant_levels=32)
    if args.cuda:
        gpu_id = get_gpu_id()
        print(f"cuda:{gpu_id} available")
        torch.cuda.set_device(gpu_id)
        model.cuda() # transfer the model from cpu to gpu
    
    # Set up logging if necessary
    if args.wandb:
        wandb.init(project=NAS_project_name, name=model_description, reinit=True)
        wandb.watch(model, log="all")

    # Configure loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    #optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    # Loop around epoches
    tpk_mngr = top_k_manager()
    ckpt_save_path = "./" + model_description
    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)
    for epoch in range(1, args.epochs + 1):
        if epoch > warmup_epochs:
            model.set_digitize_config("rand", "det", 16, 32)
        train(epoch, model, optimizer, criterion)
        loss, accu = test(epoch, model, criterion)
        # schedule learning rate decay
        if epoch%20==0:
            optimizer.param_groups[0]['lr']=optimizer.param_groups[0]['lr'] * (lr_decay)
        # Save the best models aftering the training gets more stable
        if epoch > 20:
            if tpk_mngr.update_rank(accu, ckpt_save_path+f"/ep{epoch}.pt"):
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss,
                }, ckpt_save_path+f"/ep{epoch}.pt")

    # Log the best models
    trial.set_user_attr('top 3 accuracy', torch.tensor(tpk_mngr.top_k_metric[:3]).mean().item()) # Save the best accuracy during the taining loop      
    if args.wandb:
        wandb.run.summary["top_k_accu"] = tpk_mngr.top_k_metric
        wandb.run.summary["top_k_paths"] = tpk_mngr.top_k_paths
    return torch.tensor(tpk_mngr.top_k_metric[:5]).mean().item() # return the average of top k accuracies to guide NAS

In [11]:
""" The main function conducts an optuna neural architecture search (NAS) study (loop around hyperparameters) """

def hyper_run(rseed):
    if args.wandb:
        wandb.config = args
    NAS_project_name = "ParamSearch_mlp_4bit_QAT_H2_200_NAS"
    sampler = optuna.samplers.TPESampler(seed=rseed) 
    storage = f'sqlite:///'+NAS_project_name+'.db' # way to specify an SQL database
    study = optuna.create_study(study_name=NAS_project_name, storage=storage, 
                                sampler=sampler, direction="maximize", load_if_exists=True) 
    study.optimize(lambda trial: objective(trial, NAS_project_name), n_trials=30)

In [None]:
# Start parallel parameter seaching in multiple GPUs
gpu_map(hyper_run, range(4))

(pid=32047) [I 2020-11-17 12:35:12,447] A new study created with name: ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS


(pid=32047) cuda:3 available


(pid=32071) [I 2020-11-17 12:35:14,012] Using an existing study with name 'ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS' instead of creating a new one.


(pid=32071) cuda:3 available


(pid=31949) [I 2020-11-17 12:35:15,123] Using an existing study with name 'ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS' instead of creating a new one.


(pid=31949) cuda:3 available


(pid=32094) [I 2020-11-17 12:35:15,652] Using an existing study with name 'ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS' instead of creating a new one.


(pid=32094) cuda:3 available


(pid=32047) wandb: Tracking run with wandb version 0.9.7
(pid=32071) wandb: Tracking run with wandb version 0.9.7
(pid=32047) wandb: Wandb version 0.10.10 is available!  To upgrade, please run:
(pid=32047) wandb:  $ pip install wandb --upgrade
(pid=32047) wandb: Run data is saved locally in wandb/run-20201117_123518-378fv7qm
(pid=32047) wandb: Syncing run 4bit_H2_168_12_lr_0.031_0.32_m_0.87_wep_8_randActDigi_v_0
(pid=32047) wandb: ⭐️ View project at https://app.wandb.ai/gangsterkitty/ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS
(pid=32047) wandb: 🚀 View run at https://app.wandb.ai/gangsterkitty/ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS/runs/378fv7qm
(pid=32047) wandb: Run `wandb off` to turn off syncing.
(pid=32047) 
(pid=32071) wandb: Wandb version 0.10.10 is available!  To upgrade, please run:
(pid=32071) wandb:  $ pip install wandb --upgrade
(pid=32071) wandb: Run data is saved locally in wandb/run-20201117_123519-f2626jk6
(pid=32094) wandb: Tracking run with wandb version 0.9.

(pid=32047) 
(pid=32047) Test set: Epoch 1, Average loss: 0.0002, Accuracy: 9436/10000(94%)
(pid=32047) 
(pid=32071) 
(pid=32071) Test set: Epoch 1, Average loss: 0.0002, Accuracy: 9477/10000(95%)
(pid=32071) 
(pid=31949) 
(pid=31949) Test set: Epoch 1, Average loss: 0.0002, Accuracy: 9310/10000(93%)
(pid=31949) 
(pid=32094) 
(pid=32094) Test set: Epoch 1, Average loss: 0.0002, Accuracy: 9387/10000(94%)
(pid=32094) 
(pid=32047) 
(pid=32047) Test set: Epoch 2, Average loss: 0.0001, Accuracy: 9550/10000(96%)
(pid=32047) 
(pid=32071) 
(pid=32071) Test set: Epoch 2, Average loss: 0.0001, Accuracy: 9583/10000(96%)
(pid=32071) 
(pid=32094) 
(pid=32094) Test set: Epoch 2, Average loss: 0.0001, Accuracy: 9668/10000(97%)
(pid=32094) 
(pid=31949) 
(pid=31949) Test set: Epoch 2, Average loss: 0.0001, Accuracy: 9555/10000(96%)
(pid=31949) 
(pid=32047) 
(pid=32047) Test set: Epoch 3, Average loss: 0.0001, Accuracy: 9642/10000(96%)
(pid=32047) 
(pid=32071) 
(pid=32071) Test set: Epoch 3, Average los

(pid=32071) Test set: Epoch 39, Average loss: 0.0007, Accuracy: 9496/10000(95%)
(pid=32071) 
(pid=32047) 
(pid=32047) Test set: Epoch 40, Average loss: 0.0002, Accuracy: 9608/10000(96%)
(pid=32047) 
(pid=32094) 
(pid=32094) Test set: Epoch 41, Average loss: 0.0002, Accuracy: 9666/10000(97%)
(pid=32094) 
(pid=31949) 
(pid=31949) Test set: Epoch 38, Average loss: 0.0003, Accuracy: 9567/10000(96%)
(pid=31949) 
(pid=32071) 
(pid=32071) Test set: Epoch 40, Average loss: 0.0007, Accuracy: 9588/10000(96%)
(pid=32071) 
(pid=32047) 
(pid=32047) Test set: Epoch 41, Average loss: 0.0001, Accuracy: 9682/10000(97%)
(pid=32047) 
(pid=32094) 
(pid=32094) Test set: Epoch 42, Average loss: 0.0002, Accuracy: 9595/10000(96%)
(pid=32094) 
(pid=32047) 
(pid=32047) Test set: Epoch 42, Average loss: 0.0002, Accuracy: 9634/10000(96%)
(pid=32047) 
(pid=31949) 
(pid=31949) Test set: Epoch 39, Average loss: 0.0003, Accuracy: 9624/10000(96%)
(pid=31949) 
(pid=32071) 
(pid=32071) Test set: Epoch 41, Average loss: 

(pid=32071) Test set: Epoch 74, Average loss: 0.0005, Accuracy: 9643/10000(96%)
(pid=32071) 
(pid=32094) 
(pid=32094) Test set: Epoch 82, Average loss: 0.0001, Accuracy: 9684/10000(97%)
(pid=32094) 
(pid=31949) 
(pid=31949) Test set: Epoch 73, Average loss: 0.0002, Accuracy: 9603/10000(96%)
(pid=31949) 
(pid=32047) 
(pid=32047) Test set: Epoch 83, Average loss: 0.0001, Accuracy: 9670/10000(97%)
(pid=32047) 
(pid=32071) 
(pid=32071) Test set: Epoch 75, Average loss: 0.0006, Accuracy: 9645/10000(96%)
(pid=32071) 
(pid=32094) 
(pid=32094) Test set: Epoch 83, Average loss: 0.0001, Accuracy: 9649/10000(96%)
(pid=32094) 
(pid=32047) 
(pid=32047) Test set: Epoch 84, Average loss: 0.0002, Accuracy: 9616/10000(96%)
(pid=32047) 
(pid=31949) 
(pid=31949) Test set: Epoch 74, Average loss: 0.0002, Accuracy: 9625/10000(96%)
(pid=31949) 
(pid=32094) 
(pid=32094) Test set: Epoch 84, Average loss: 0.0001, Accuracy: 9646/10000(96%)
(pid=32094) 
(pid=32071) 
(pid=32071) Test set: Epoch 76, Average loss: 

(pid=32047) [I 2020-11-17 12:59:13,328] Finished trial#0 with value: 96.72200012207031 with parameters: {'fc1_units': 168, 'lr': 0.03141449760021982, 'mm': 0.867989808754481, 'lr_decay': 0.3242657162410432, 'warmup_eps': 8}. Best is trial#0 with value: 96.72200012207031.


(pid=32047) cuda:3 available


(pid=32047) 
(pid=32047) wandb: Waiting for W&B process to finish, PID 42161
(pid=32047) wandb: Program ended successfully.
(pid=32047) wandb: Run summary:
(pid=32047) wandb:                     batch 937
(pid=32047) wandb:                  _runtime 1443.8451828956604
(pid=32047) wandb:                  val_loss 0.00016083306819200517
(pid=32047) wandb:                train_loss 0.0008957763202488422
(pid=32047) wandb:                _timestamp 1605635953.0751202
(pid=32047) wandb:                     _step 100
(pid=32047) wandb:                  accuracy 95.9800033569336
(pid=32047) wandb: Syncing 5 W&B file(s), 1 media file(s), 0 artifact file(s) and 0 other file(s)
wandb:                                                                                
(pid=32047) wandb: Synced 4bit_H2_168_12_lr_0.031_0.32_m_0.87_wep_8_randActDigi_v_0: https://app.wandb.ai/gangsterkitty/ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS/runs/378fv7qm
(pid=32047) wandb: Tracking run with wandb version 0.9.7

(pid=32071) 
(pid=32071) Test set: Epoch 89, Average loss: 0.0006, Accuracy: 9627/10000(96%)
(pid=32071) 
(pid=32094) 
(pid=32094) Test set: Epoch 100, Average loss: 0.0001, Accuracy: 9654/10000(97%)
(pid=32094) 


(pid=32094) [I 2020-11-17 12:59:25,275] Finished trial#3 with value: 96.83599853515625 with parameters: {'fc1_units': 164, 'lr': 0.033701641563464134, 'mm': 0.8863081731642224, 'lr_decay': 0.4895461223055756, 'warmup_eps': 12}. Best is trial#3 with value: 96.83599853515625.
(pid=32094) 
(pid=32094) wandb: Waiting for W&B process to finish, PID 42401
(pid=32094) wandb: Program ended successfully.


(pid=32094) cuda:3 available


(pid=32094) wandb: Run summary:
(pid=32094) wandb:                  accuracy 96.54000091552734
(pid=32094) wandb:                train_loss 0.06844605505466461
(pid=32094) wandb:                     _step 100
(pid=32094) wandb:                  val_loss 0.00014039584398269653
(pid=32094) wandb:                     batch 937
(pid=32094) wandb:                _timestamp 1605635965.0620441
(pid=32094) wandb:                  _runtime 1452.0894711017609
(pid=32094) wandb: Syncing 5 W&B file(s), 1 media file(s), 0 artifact file(s) and 0 other file(s)
wandb:                                                                                
(pid=32094) wandb: Synced 4bit_H2_164_16_lr_0.034_0.49_m_0.89_wep_12_randActDigi_v_3: https://app.wandb.ai/gangsterkitty/ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS/runs/37cimam6
(pid=32094) wandb: Tracking run with wandb version 0.9.7


(pid=31949) 
(pid=31949) Test set: Epoch 88, Average loss: 0.0002, Accuracy: 9651/10000(97%)
(pid=31949) 


(pid=32094) wandb: Wandb version 0.10.10 is available!  To upgrade, please run:
(pid=32094) wandb:  $ pip install wandb --upgrade
(pid=32094) wandb: Run data is saved locally in wandb/run-20201117_125929-2caehkfe
(pid=32094) wandb: Syncing run 4bit_H2_164_16_lr_0.036_0.44_m_0.73_wep_12_randActDigi_v_5
(pid=32094) wandb: ⭐️ View project at https://app.wandb.ai/gangsterkitty/ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS
(pid=32094) wandb: 🚀 View run at https://app.wandb.ai/gangsterkitty/ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS/runs/2caehkfe
(pid=32094) wandb: Run `wandb off` to turn off syncing.
(pid=32094) 


(pid=32047) 
(pid=32047) Test set: Epoch 1, Average loss: 0.0002, Accuracy: 9255/10000(93%)
(pid=32047) 
(pid=32071) 
(pid=32071) Test set: Epoch 90, Average loss: 0.0006, Accuracy: 9625/10000(96%)
(pid=32071) 
(pid=32047) 
(pid=32047) Test set: Epoch 2, Average loss: 0.0001, Accuracy: 9566/10000(96%)
(pid=32047) 
(pid=32094) 
(pid=32094) Test set: Epoch 1, Average loss: 0.0002, Accuracy: 9253/10000(93%)
(pid=32094) 
(pid=31949) 
(pid=31949) Test set: Epoch 89, Average loss: 0.0002, Accuracy: 9632/10000(96%)
(pid=31949) 
(pid=32094) 
(pid=32094) Test set: Epoch 2, Average loss: 0.0002, Accuracy: 9524/10000(95%)
(pid=32094) 
(pid=32047) 
(pid=32047) Test set: Epoch 3, Average loss: 0.0001, Accuracy: 9645/10000(96%)
(pid=32047) 
(pid=32071) 
(pid=32071) Test set: Epoch 91, Average loss: 0.0006, Accuracy: 9600/10000(96%)
(pid=32071) 
(pid=31949) 
(pid=31949) Test set: Epoch 90, Average loss: 0.0002, Accuracy: 9638/10000(96%)
(pid=31949) 
(pid=32094) 
(pid=32094) Test set: Epoch 3, Average

(pid=32071) [I 2020-11-17 13:03:10,474] Finished trial#1 with value: 96.46600341796875 with parameters: {'fc1_units': 156, 'lr': 0.049943696218777374, 'mm': 0.8865114718677318, 'lr_decay': 0.3256248895858713, 'warmup_eps': 8}. Best is trial#3 with value: 96.83599853515625.


(pid=32071) cuda:3 available


(pid=32071) 
(pid=32071) wandb: Waiting for W&B process to finish, PID 42256
(pid=32071) wandb: Program ended successfully.


(pid=32047) 
(pid=32047) Test set: Epoch 17, Average loss: 0.0002, Accuracy: 9599/10000(96%)
(pid=32047) 
(pid=32094) 
(pid=32094) Test set: Epoch 16, Average loss: 0.0002, Accuracy: 9525/10000(95%)
(pid=32094) 


(pid=32071) wandb: Run summary:
(pid=32071) wandb:                train_loss 0.021660849452018738
(pid=32071) wandb:                     _step 100
(pid=32071) wandb:                _timestamp 1605636190.2000477
(pid=32071) wandb:                  accuracy 96.05999755859375
(pid=32071) wandb:                     batch 937
(pid=32071) wandb:                  _runtime 1678.9753375053406
(pid=32071) wandb:                  val_loss 0.0005887947976589203
(pid=32071) wandb: Syncing 5 W&B file(s), 1 media file(s), 0 artifact file(s) and 0 other file(s)
wandb:                                                                                
(pid=32071) wandb: Synced 4bit_H2_156_24_lr_0.050_0.33_m_0.89_wep_8_randActDigi_v_1: https://app.wandb.ai/gangsterkitty/ParamSearch_mlp_randy_aug_4bit_QAT_H2_180_NAS/runs/f2626jk6
(pid=32071) wandb: Tracking run with wandb version 0.9.7
(pid=32071) wandb: Wandb version 0.10.10 is available!  To upgrade, please run:
(pid=32071) wandb:  $ pip install wandb --up

In [11]:
# Check the instances of training threads (each is an optuna project) in GPU
ray_objs

[ObjectRef(df5a1a828c9685d3ffffffff0100000001000000),
 ObjectRef(cb230a572350ff44ffffffff0100000001000000),
 ObjectRef(7bbd90284b71e599ffffffff0100000001000000),
 ObjectRef(bd37d2621480fc7dffffffff0100000001000000)]

In [12]:
# Terminate all current GPU threads. 
# Note: just stopping Jupyter notebook wont stop the threads.
kill_gpu_processes()