## Using neptune.ai with PyTorch to log information during model development

### by Michael Ruddy

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# PyTorch stuff
import torch, torchvision
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms

# Neptune
import neptune.new as neptune

Let's use the MNIST dataset to test out these features.

In [None]:
# MNIST images -> tensors, then shifted/scaled with mean 0.5 and std 0.5
trnsfm = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(.5, .5),
])

# train=True / train=False select the two official MNIST splits
ds_train = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=trnsfm,
)
ds_val = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=trnsfm,
)

batch_size = 4

# Several "runs" are logged from this notebook; these values are common to
# all of them and get written to each run's config.
global_hyperparam = dict(
    N_train=len(ds_train),
    N_val=len(ds_val),
    batch_size=batch_size,
)

# dataloaders: small shuffled batches for training ...
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=2)
# ... and the whole validation set served as one unshuffled batch
dl_val = DataLoader(ds_val, batch_size=len(ds_val), shuffle=False, num_workers=2)

And a very simple CNN architecture.

In [None]:
# simple CNN
class small_CNN(nn.Module):
    """Three 3x3 convolutions followed by a two-layer classifier head.

    With padding=1 every convolution keeps the spatial size, and the two
    2x2 max-pools halve it twice. The ``64*7*7`` input size of ``linear1``
    therefore matches the 28x28 MNIST images used in this notebook
    (28 -> 14 -> 7).
    """

    def __init__(self):
        super().__init__()

        # feature extractor: 1 -> 16 -> 32 -> 64 channels
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)

        # classifier head ending in 10 raw class scores
        self.linear1 = nn.Linear(64 * 7 * 7, 100)
        self.linear2 = nn.Linear(100, 10)

        # parameter-free layers, shared by every stage of forward()
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.relu = nn.ReLU()
        self.unroll = nn.Flatten()

    def forward(self, x):
        """Map a batch of single-channel images to raw scores of shape (batch, 10)."""
        # first two stages: conv -> ReLU -> 2x2 max-pool
        for conv in (self.conv1, self.conv2):
            x = self.pool(self.relu(conv(x)))

        # third stage has no pooling
        x = self.relu(self.conv3(x))

        # flatten and classify; no softmax is applied here
        hidden = self.relu(self.linear1(self.unroll(x)))
        return self.linear2(hidden)

Here we have some simple functions: one runs a single epoch (training or evaluation) and returns the average loss, and the other computes accuracy.

In [None]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False, log=None):
    """Run one epoch over ``dataloader`` and return the mean per-batch loss.

    Args:
        model: the ``nn.Module`` to run.
        dataloader: iterable yielding ``(x, y)`` batches.
        optimizer: optimizer stepped after every batch when ``backwards`` is
            truthy; not used otherwise.
        lossFun: callable ``lossFun(y_pred, y)`` returning a scalar loss tensor.
        backwards: if truthy, put the model in train mode and update its
            weights; otherwise put it in eval mode and only measure the loss.
        print_loss: if truthy, print the mean loss before returning it.
        log: optional Neptune field name. When given, each batch loss is
            appended to ``run[log]``, where ``run`` is the notebook-level
            Neptune run; it must exist before calling with ``log`` set.

    Returns:
        The unweighted mean of the per-batch losses as a float, or ``nan``
        if the dataloader yields no batches.
    """
    training = bool(backwards)
    if training:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    num_batches = 0
    for x, y in tqdm(dataloader):

        # Track gradients only when we are going to back-propagate. Without
        # this, an evaluation pass builds (and keeps alive) a full autograd
        # graph per batch, which is wasteful for large validation batches.
        with torch.set_grad_enabled(training):
            y_pred = model(x)
            loss = lossFun(y_pred, y)

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1

        # pass the key name to log the loss each batch
        if log:
            run[log].log(batch_loss)

        if training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Count the batches actually seen instead of calling len(dataloader), so
    # an empty loader gives nan rather than a ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else float("nan")

    if print_loss:
        print(avg_loss)

    return avg_loss


def one_pass_acc(model, dataloader, num_points):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: the ``nn.Module`` to evaluate; it is switched to eval mode.
        dataloader: iterable yielding ``(x, y)`` batches, where ``y`` holds
            integer class labels and ``model(x)`` has one score per class
            along dimension 1.
        num_points: total number of samples the dataloader yields, used as
            the denominator. Must be non-zero.

    Returns:
        The fraction of samples whose arg-max prediction equals the label.
    """
    model.eval()
    total_incorrect = 0

    # Pure evaluation: disable gradient tracking so no autograd graph is
    # built for the (potentially very large) batches.
    with torch.no_grad():
        for x, y in dataloader:
            # arg-max over raw scores; a (log-)softmax is monotonic per row,
            # so applying one first would not change the predicted class
            y_pred = torch.argmax(model(x), dim=1)
            total_incorrect += (y_pred != y).sum().item()

    acc = 1 - (total_incorrect / num_points)

    return acc

Now let's perform an experiment. We must first create a project using our account at neptune.ai and get the api_token and project name from there. We'll keep track of various hyperparameters, but also statistics about training such as the training/validation loss each epoch. Finally we can save the model parameters as well.

Some helpful tidbits:
- The choice of organizing the set-up into a `config` folder is arbitrary. I can organize this information however I please. Same thing with train and validation folders.
- What is helpful is to make sure that these have the same organization across runs, which makes comparison easy.

In [None]:
# initialize a run
# NOTE: the project and token strings below are placeholders. Rather than
# pasting a real token into a notebook that may be shared or committed,
# prefer exporting NEPTUNE_API_TOKEN (and NEPTUNE_PROJECT) in your environment.
run = neptune.init(
    project="your_project_name",
    api_token="your_api_key",
    name = "Small_CNN",
    tags = ["Scratch", "3 Downsamples"]
)

# try/finally guarantees the run is stopped even if training raises or the
# cell is interrupted part-way through
try:
    # set up model and training
    model = small_CNN()
    lossFun = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = 0.001)
    num_epochs = 10

    # log the set-up
    for key, value in global_hyperparam.items():
        run[f'config/{key}'] = value

    run['config/model'] = type(model).__name__
    run['config/criterion'] = type(lossFun).__name__
    run['config/optimizer'] = type(optimizer).__name__
    run['config/params'] = {"learning_rate": optimizer.param_groups[0]['lr'],
                            "epoch_nr" : num_epochs}

    for epoch in tqdm(range(num_epochs)):

        # the training pass also logs the loss of every batch
        train_loss = one_pass(model, dl_train, optimizer, lossFun, log="train/batch_loss")
        valid_loss = one_pass(model, dl_val, optimizer, lossFun, backwards=False)

        train_acc = one_pass_acc(model, dl_train, len(ds_train))
        valid_acc = one_pass_acc(model, dl_val, len(ds_val))

        # log the loss and accuracy each epoch
        run["train/loss"].log(train_loss)
        run["val/loss"].log(valid_loss)
        run["train/acc"].log(train_acc)
        run["val/acc"].log(valid_acc)

    # save your progress
    checkpoint = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict' :optimizer.state_dict()}
    torch.save(checkpoint, 'model_checkpoint.pt')

    # upload the model weights along with an architecture description
    run['model/model_checkpoint'].upload('model_checkpoint.pt')

    # save model architecture description; the context manager closes the
    # file even if the write fails
    with open("model_arch.txt", "w") as model_arch:
        model_arch.write(str(model))
    run['model/architecture'].upload("model_arch.txt")
finally:
    # stop logging this run
    run.stop()

Let's say I close the notebook and want to go back and keep logging the previous model.

In [None]:
# get back to that same run
# NOTE: the project and token strings below are placeholders. Rather than
# pasting a real token into a notebook that may be shared or committed,
# prefer exporting NEPTUNE_API_TOKEN (and NEPTUNE_PROJECT) in your environment.
run = neptune.init(
    project="your_project_name",
    api_token="your_api_key",
    run='NEP-1'
)

# try/finally guarantees the run is stopped even if something below raises
# or the cell is interrupted part-way through
try:
    # downloads the file with the same name (will overwrite if already there)
    run['model/model_checkpoint'].download()

    # set up model and training again
    model = small_CNN()
    lossFun = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = 0.001)
    num_epochs = 10

    # load up the previous checkpoint
    # model architecture must be the same!
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only load checkpoints downloaded from runs you trust.
    checkpoint = torch.load('model_checkpoint.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # re-write the config fields so they reflect this session's set-up
    for key, value in global_hyperparam.items():
        run[f'config/{key}'] = value

    run['config/model'] = type(model).__name__
    run['config/criterion'] = type(lossFun).__name__
    run['config/optimizer'] = type(optimizer).__name__
    run['config/params'] = {"learning_rate": optimizer.param_groups[0]['lr'],
                            "epoch_nr" : num_epochs}

    for epoch in tqdm(range(num_epochs)):

        # the training pass also logs the loss of every batch
        train_loss = one_pass(model, dl_train, optimizer, lossFun, log="train/batch_loss")
        valid_loss = one_pass(model, dl_val, optimizer, lossFun, backwards=False)

        train_acc = one_pass_acc(model, dl_train, len(ds_train))
        valid_acc = one_pass_acc(model, dl_val, len(ds_val))

        # continue to log the loss and accuracy each epoch
        run["train/loss"].log(train_loss)
        run["val/loss"].log(valid_loss)
        run["train/acc"].log(train_acc)
        run["val/acc"].log(valid_acc)

    # save your progress again
    checkpoint = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict' :optimizer.state_dict()}
    torch.save(checkpoint, 'model_checkpoint.pt')
    run['model/model_checkpoint'].upload('model_checkpoint.pt')
finally:
    # stop logging the run
    run.stop()

Let's compare to a different style of model. After running this, go to the Compare Runs tab in neptune.ai.

In [None]:
# simple CNN
class smaller_CNN(nn.Module):
    """Two 3x3 convolutions followed by a two-layer classifier head.

    With padding=1 both convolutions keep the spatial size, and the single
    2x2 max-pool halves it once. The ``32*14*14`` input size of ``linear1``
    therefore matches the 28x28 MNIST images used in this notebook
    (28 -> 14).
    """

    def __init__(self):
        super().__init__()

        # feature extractor: 1 -> 16 -> 32 channels
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)

        # classifier head ending in 10 raw class scores
        self.linear1 = nn.Linear(32 * 14 * 14, 100)
        self.linear2 = nn.Linear(100, 10)

        # parameter-free layers, shared by every stage of forward()
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.relu = nn.ReLU()
        self.unroll = nn.Flatten()

    def forward(self, x):
        """Map a batch of single-channel images to raw scores of shape (batch, 10)."""
        # first stage: conv -> ReLU -> 2x2 max-pool
        x = self.pool(self.relu(self.conv1(x)))

        # second stage has no pooling
        x = self.relu(self.conv2(x))

        # flatten and classify; no softmax is applied here
        hidden = self.relu(self.linear1(self.unroll(x)))
        return self.linear2(hidden)

In [None]:
# initialize a run
# NOTE: the project and token strings below are placeholders. Rather than
# pasting a real token into a notebook that may be shared or committed,
# prefer exporting NEPTUNE_API_TOKEN (and NEPTUNE_PROJECT) in your environment.
run = neptune.init(
    project="your_project_name",
    api_token="your_api_key",
    name = "Smaller_CNN",
    tags = ["Scratch", "2 Downsamples"]
)

# try/finally guarantees the run is stopped even if training raises or the
# cell is interrupted part-way through
try:
    # set up model and training
    model = smaller_CNN()
    lossFun = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = 0.001)
    num_epochs = 10

    # log the set-up
    for key, value in global_hyperparam.items():
        run[f'config/{key}'] = value

    run['config/model'] = type(model).__name__
    run['config/criterion'] = type(lossFun).__name__
    run['config/optimizer'] = type(optimizer).__name__
    run['config/params'] = {"learning_rate": optimizer.param_groups[0]['lr'],
                            "epoch_nr" : num_epochs}

    for epoch in tqdm(range(num_epochs)):

        # the training pass also logs the loss of every batch
        train_loss = one_pass(model, dl_train, optimizer, lossFun, log="train/batch_loss")
        valid_loss = one_pass(model, dl_val, optimizer, lossFun, backwards=False)

        train_acc = one_pass_acc(model, dl_train, len(ds_train))
        valid_acc = one_pass_acc(model, dl_val, len(ds_val))

        # log the loss and accuracy each epoch
        run["train/loss"].log(train_loss)
        run["val/loss"].log(valid_loss)
        run["train/acc"].log(train_acc)
        run["val/acc"].log(valid_acc)

    # save your progress
    checkpoint = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict' :optimizer.state_dict()}
    torch.save(checkpoint, 'model_checkpoint.pt')

    # upload the model weights along with an architecture description
    run['model/model_checkpoint'].upload('model_checkpoint.pt')

    # save model architecture description; the context manager closes the
    # file even if the write fails
    with open("model_arch.txt", "w") as model_arch:
        model_arch.write(str(model))
    run['model/architecture'].upload("model_arch.txt")
finally:
    # stop logging this run
    run.stop()