# Neptune + Pytorch
## Logging and visualizing debugging metrics in PyTorch

Introduction

See how Neptune Scale can be used for foundation model traning when you are required to track a large number of metrics across your transformers architecture.

This guide will show you how to:
- Initialize the Neptune Run object and log configuration parameters
- Log standard loss and accuracy metrics to Neptune
- Log debugging metrics during model training such as;
    * Activations per layer
    * Gradients (mean and std) per layer

## Before you start

  1. Create a Neptune Scale account. [Register &rarr;](https://neptune.ai/early-access)
  2. Create a Neptune project that you will use for tracking metadata. For instructions, see [Projects](https://docs-beta.neptune.ai/projects/) in the Neptune Scale docs.
  3. Install and configure Neptune Scale for logging metadata. For instructions, see [Get started](https://docs-beta.neptune.ai/setup) in the Neptune Scale docs.

### Install Neptune and Dependencies

In [None]:
# Install dependencies
! pip install -q -U neptune_scale torch torchvision

In [None]:
# TODO - update config to include model architecture
# TODO - Add more hyperparameters
# TODO - look at CNN layers
# TODO - output and log the model architecture
# TODO - check loss and accuracy calculations
# TODO - clean up the evaluation function to exclude tracking gradients
# TODO - do not use group tags
# TODO - track the input features
# TODO - clean the training loop of commented out code that is unused
# TODO - add batchnormalization and drop out layers to improve the model
# TODO - add HookManager class

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
import numpy as np

## Set Hyperparameters for Training

In [2]:
params = {
    "optimizer": "Adam",
    "batch_size": 512,
    "learning_rate": 0.01,
    "epochs": 5,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "input_features": 256,
    "n_classes": 10,
    "input_size": 28 * 28
}

## Download and transform the data for training
In this example, we will be using the MINST dataset as part of the PyTorch library for illustration. We create a train, validation and test dataset and apply a transformation.

In [3]:

# Transform to normalize the data and convert it to tensor
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # Normalizing the image to range [-1, 1]
])

# Download and load the MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
val_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)  # Use test set as validation
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# DataLoader for training, validation, and testing
train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=params["batch_size"], shuffle=False)

In [4]:

# Simple Convolutional Neural Network model for MNIST
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)  # Input channels = 1 (grayscale images)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # Flattened size of image after convolution layers
        self.fc2 = nn.Linear(128, 10)  # 10 output classes for digits 0-9

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)  # Pooling layer to downsample
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = x.view(-1, 64 * 7 * 7)  # Flatten the tensor for the fully connected layer
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        # Define layers (increase number of layers)
        self.fc1 = nn.Linear(params["input_size"], params["input_features"])
        self.fc2 = nn.Linear(params["input_features"], 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, params["n_classes"])      # Output layer (10 classes for MNIST)

    def forward(self, x):
        x = x.view(-1, params["input_size"])  # Flatten the input image (28x28)
        x = torch.relu(self.fc1(x))  # Apply ReLU activation
        x = torch.relu(self.fc2(x))  # Apply ReLU activation
        x = torch.relu(self.fc3(x))  # Apply ReLU activation
        x = torch.relu(self.fc4(x))  # Apply ReLU activation
        x = self.fc5(x)  # Output layer
        return x



In [5]:
# Function to evaluate the model (validation/test) with gradients tracked
def evaluate(model, data_loader, criterion, device, track_gradients=False):
    model.train() if track_gradients else model.eval()  # Ensure model is in training mode if tracking gradients
    correct_preds = 0
    total_preds = 0
    epoch_loss = 0
    with torch.no_grad():  # Disable gradient tracking during evaluation
        for data, target in data_loader:

            data, target = data.to(device), target.to(device)

            # Forward pass (with gradient tracking if specified)
            output = model(data)
            loss = criterion(output, target)  # Correct loss computation
            epoch_loss += loss.item()

            if track_gradients:
                # Track gradients (we will backpropagate but do not update model parameters)
                loss.backward()

            # Calculate accuracy
            _, predicted = torch.max(output.data, 1)
            total_preds += target.size(0)
            correct_preds += (predicted == target).sum().item()

    accuracy = correct_preds / total_preds
    return epoch_loss / len(data_loader), accuracy


## Neptune - Initialize Training Run and Log Configs

In [None]:
# Define Neptune parameters
from neptune_scale import Run
from uuid import uuid4

run = Run(
    project = "leo/pytorch-tutorial",
    run_id=f"pytorch-{uuid4()}"
    )

run.log_configs(
    {
        "config/learning_rate": params["learning_rate"],
        "config/optimizer": params["optimizer"],
        "config/batch_size": params["batch_size"],
        "config/epochs": params["epochs"],
        "config/input_size": params["input_size"]
    }
)

run.add_tags(tags=[params["optimizer"]], group_tags=True)
run.add_tags(tags=["Torch-MINST"])

## Neptune - Log Metrics while Training

In [14]:
## Setup distributed environment
def setup(rank, world_size, backend):

    import os

    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def clean_up():
    dist.destroy_process_group()

def setupModel(rank, params):
    # Instantiate the model, loss function, and optimizer
    # model = SimpleCNN()
    model = SimpleNN()
    device = torch.device(f'cuda:{rank}' if torch.cuda.is_available() else 'cpu')
    print(device)
    model.to(device)
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs")
        model = DDP(model, device_ids=[rank])

    # Select an optimizer
    if params["optimizer"] == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"])
        print(params["optimizer"])
    elif params["optimizer"] == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=params["learning_rate"])
        print(params["optimizer"])
    else:
        print("No optimizer selected")

    return model, optimizer, device


In [None]:
def train(rank: int, world_size: int, params, train_dataset):

    try:
        setup(rank, world_size, "gloo")
        model, optimizer, device = setupModel(rank, params)

        criterion = nn.CrossEntropyLoss()  # Loss function

        sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=True)
        train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], sampler=sampler)

        # Training loop
        num_epochs = params["epochs"]
        step_counter = 0
        for epoch in range(num_epochs):
            model.train()
            epoch_loss = 0
            correct_preds = 0
            total_preds = 0

            # Training step
            for batch_idx, (data, target) in enumerate(train_loader):
                step_counter += 1
                optimizer.zero_grad()

                data, target = data.to(device), target.to(device)

                # Forward pass
                output = model(data)

                # Compute the loss
                loss = criterion(output, target)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

                # Calculate accuracy
                _, predicted = torch.max(output.data, 1)
                total_preds += target.size(0)
                correct_preds += (predicted == target).sum().item()

                batch_accuracy = correct_preds / total_preds

                # Validation step per training step
                val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)  # Evaluate after each step

                if rank == 0:
                    print(f"Train loss: {loss.item()}")
                    print(f"Accuracy: {batch_accuracy}")
                    print(f"Validation loss: {val_loss}")

                dist.barrier() # synchonize processes before moving to next step

            clean_up() # Clean up processes from DDP training

    except Exception as e:
        print(f"Error during training process (Rank {rank}): {e}")

        ''''
        run.log_metrics(
            data = {
                "metrics/train/loss": loss.item(),
                "metrics/train/accuracy": batch_accuracy,
                "metrics/validation/loss": val_loss,
                "metrics/validation/accuracy": val_accuracy,
                "epoch_value": epoch
            },
            step = step_counter
        )

# Final Testing Step with gradient tracking
test_loss, test_accuracy = evaluate(model, test_loader, track_gradients=False)  # Track gradients during test
print(f"Testing complete. Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.2f}%")

run.log_configs(
        {
        "metrics/test/loss": test_loss,
        "metrics/test/accuracy": test_accuracy
    }
)'
'''

In [8]:
run.close()

neptune:INFO: Waiting for all operations to be processed
neptune:INFO: All operations were processed


In [21]:
# clean_up()

mp.set_start_method('spawn', force=True)

train(0, 1, params, train_dataset)

Error during training process (Rank 0): use_libuv was requested but PyTorch was build without libuv support


In [None]:
# Run DDP
# clean_up()
num_gpu = torch.cuda.device_count()
mp.set_start_method('spawn', force=True)
mp.spawn(train, args=(num_gpu, params, train_dataset), nprocs=num_gpu, join=True)