# Neptune + PyTorch

## Introduction

This guide will show you how to:
- Initialize the Neptune Run object and log configuration parameters
- Log standard loss and accuracy metrics to Neptune
- Log debugging metrics during model training, such as:
    * Activations per layer
    * Gradients (mean and std of weights and biases) per layer

## Before you start

  1. Create a Neptune Scale account. [Register &rarr;](https://neptune.ai/early-access)
  2. Create a Neptune project that you will use for tracking metadata. For instructions, see [Projects](https://docs-beta.neptune.ai/projects/) in the Neptune Scale docs.
  3. Install and configure Neptune Scale for logging metadata. For instructions, see [Get started](https://docs-beta.neptune.ai/setup) in the Neptune Scale docs.

### Install Neptune and Dependencies

In [None]:
# Install dependencies
! pip install -q -U neptune_scale torch torchvision

In [None]:
# TODO - update config to include model architecture
# TODO - Add more hyperparameters
# TODO - Add additional logging metrics (weights, gradients, activations, etc.)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np

## Set Hyperparameters for Training

In [6]:
# Hyperparameters and data dimensions read by the data-loading, model,
# optimizer, and logging cells.
params = dict(
    optimizer="Adam",  # the optimizer-selection cell handles "Adam" and "SGD"
    batch_size=512,
    learning_rate=0.05,
    epochs=5,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    input_features=256,  # output width of the first fully connected layer
    n_classes=10,
    input_size=28 * 28,  # flattened 28x28 image
)

In [5]:

# Preprocessing: convert images to tensors, then normalize with mean 0.5 and
# std 0.5 so pixel values land in the range [-1, 1].
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Arguments shared by every MNIST split (data is downloaded on first use).
mnist_kwargs = dict(root="./data", download=True, transform=transform)

train_dataset = datasets.MNIST(train=True, **mnist_kwargs)
# The test split doubles as the validation split in this example.
val_dataset = datasets.MNIST(train=False, **mnist_kwargs)
test_dataset = datasets.MNIST(train=False, **mnist_kwargs)

# Only the training loader is shuffled; validation and test keep dataset order.
batch_size = params["batch_size"]
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [10]:

# Small two-convolution classifier for single-channel 28x28 MNIST digits
class SimpleCNN(nn.Module):
    """Two conv + max-pool stages followed by two fully connected layers.

    Each 3x3 convolution uses padding=1 and is followed by a 2x2 max-pool,
    so a 28x28 input is reduced to 64 feature maps of size 7x7 before the
    fully connected head. ``forward`` returns raw logits for 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)  # 1 input channel (grayscale)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # input size = flattened conv output
        self.fc2 = nn.Linear(128, 10)  # one logit per digit 0-9

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        # Both stages share the same shape: conv -> ReLU -> 2x2 max-pool.
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), 2)
        flat = x.view(-1, 64 * 7 * 7)  # flatten for the fully connected head
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)
    
class SimpleNN(nn.Module):
    """Three-layer fully connected classifier that records layer outputs.

    Forward hooks on ``fc1``, ``fc2`` and ``fc3`` store the most recent
    output of each Linear layer (i.e. the values *before* ReLU is applied)
    in ``self.activations`` so they can be inspected or logged.

    Args:
        input_size: Number of features per flattened input sample.
            Defaults to ``params["input_size"]``.
        input_features: Output width of the first layer.
            Defaults to ``params["input_features"]``.
        n_classes: Number of output logits. Defaults to ``params["n_classes"]``.
        hidden_features: Output width of the second layer (default 64).
    """

    def __init__(self, input_size=None, input_features=None, n_classes=None, hidden_features=64):
        super().__init__()
        # Fall back to the notebook-level ``params`` dict when a size is not given,
        # so ``SimpleNN()`` keeps building the same network as before.
        self.input_size = params["input_size"] if input_size is None else input_size
        if input_features is None:
            input_features = params["input_features"]
        if n_classes is None:
            n_classes = params["n_classes"]

        # Must exist before the first forward pass: the hooks below write into it.
        self.activations = {}

        self.fc1 = nn.Linear(self.input_size, input_features)
        self.fc2 = nn.Linear(input_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, n_classes)  # output layer

        # Register one forward hook per layer; handles are kept in self.hooks.
        self.hooks = [
            layer.register_forward_hook(self.save_activation(name))
            for name, layer in (("fc1", self.fc1), ("fc2", self.fc2), ("fc3", self.fc3))
        ]

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_size)`` and return class logits."""
        x = x.view(-1, self.input_size)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

    def save_activation(self, name):
        """Return a forward hook that stores a layer's output under ``name``."""

        def hook(module, inputs, output):
            # Detach so the stored tensor does not keep the autograd graph alive.
            self.activations[name] = output.detach()

        return hook

    def get_activations(self):
        """Return the dict mapping layer name to its most recent output."""
        return self.activations

    def clear_activations(self):
        """Discard all stored activations."""
        self.activations = {}

# Instantiate the model, loss function, and optimizer
# model = SimpleCNN()
model = SimpleNN()
criterion = nn.CrossEntropyLoss()  # Loss function

# Select an optimizer by the name stored in params["optimizer"].
optimizer_classes = {
    "Adam": optim.Adam,
    "SGD": optim.SGD,
}
if params["optimizer"] not in optimizer_classes:
    # Fail here rather than with a NameError on `optimizer` in the training loop.
    raise ValueError(
        f"Unsupported optimizer {params['optimizer']!r}; "
        f"expected one of {sorted(optimizer_classes)}"
    )
optimizer = optimizer_classes[params["optimizer"]](
    model.parameters(), lr=params["learning_rate"]
)
print(params["optimizer"])


Adam


In [11]:
# Function to evaluate the model (validation/test), optionally computing gradients
def evaluate(model, data_loader, track_gradients=False, loss_fn=None):
    """Compute the mean batch loss and the accuracy of ``model`` on ``data_loader``.

    Args:
        model: Module mapping a batch of inputs to class logits.
        data_loader: Sized iterable yielding ``(data, target)`` batches.
        track_gradients: If True, run in training mode with autograd enabled
            and call ``loss.backward()`` on every batch. Parameter gradients
            are cleared first and then accumulate over all batches; parameters
            themselves are never updated. If False, run in eval mode with
            autograd disabled.
        loss_fn: Callable ``loss_fn(output, target)``. Defaults to the
            module-level ``criterion``.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the batch losses averaged over
        ``len(data_loader)``, and the percentage of correct predictions.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no samples.
    """
    if loss_fn is None:
        loss_fn = criterion

    # Remember the caller's mode so a validation pass in the middle of
    # training does not silently leave the model in eval mode.
    was_training = model.training
    model.train(track_gradients)
    if track_gradients:
        # Start from clean gradients so the result reflects only this pass.
        model.zero_grad()

    correct_preds = 0
    total_preds = 0
    epoch_loss = 0.0
    try:
        # backward() needs the autograd graph, so gradient recording can only
        # be disabled when track_gradients is False.
        with torch.set_grad_enabled(track_gradients):
            for data, target in data_loader:
                output = model(data)
                loss = loss_fn(output, target)
                epoch_loss += loss.item()

                if track_gradients:
                    # Backpropagate only; no optimizer step is taken here.
                    loss.backward()

                # Calculate accuracy
                predicted = output.detach().argmax(dim=1)
                total_preds += target.size(0)
                correct_preds += (predicted == target).sum().item()
    finally:
        model.train(was_training)

    accuracy = 100 * correct_preds / total_preds
    return epoch_loss / len(data_loader), accuracy


## Neptune - Initialize Training Run and Log Configs

In [None]:
# Define Neptune parameters
from neptune_scale import Run
from uuid import uuid4

# A random suffix keeps every run ID unique.
# NOTE(review): no project or API token is passed here; presumably they come
# from the environment configured in the setup step - confirm.
run = Run(run_id=f"pytorch-{uuid4()}")

# Log every entry of `params` under the "config/" namespace. The torch.device
# entry is converted to its string form so it can be stored as a plain value.
run.log_configs(
    {
        f"config/{key}": str(value) if isinstance(value, torch.device) else value
        for key, value in params.items()
    }
)

# Group runs by optimizer, and tag the run with the framework and dataset.
run.add_tags(tags=[params["optimizer"]], group_tags=True)
run.add_tags(tags=["Torch-MNIST"])

## Neptune - Log Metrics while Training

In [None]:
# Training loop
num_epochs = params["epochs"]
step_counter = 0  # global step across all epochs; x-axis for the per-step metrics
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct_preds = 0
    total_preds = 0

    # Reset activations for each epoch
    model.clear_activations()
    train_activations = {}

    # Training step
    for batch_idx, (data, target) in enumerate(train_loader):
        step_counter += 1
        optimizer.zero_grad()

        # Forward pass
        output = model(data)
        # Snapshot the activations of this training batch now: the validation
        # pass below runs the model again and overwrites the stored values.
        train_activations = dict(model.get_activations())

        # Compute the loss
        loss = criterion(output, target)
        epoch_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Calculate accuracy (running accuracy over the epoch so far)
        _, predicted = torch.max(output.data, 1)
        total_preds += target.size(0)
        correct_preds += (predicted == target).sum().item()

        batch_accuracy = 100 * correct_preds / total_preds
        print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {batch_accuracy:.2f}%")

        # Validation step per training step
        val_loss, val_accuracy = evaluate(model, val_loader)  # Evaluate after each step
        # evaluate() may leave the model in eval mode; switch back before the
        # next training batch.
        model.train()
        print(f"Validation at step [{batch_idx+1}/{len(train_loader)}] - Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2f}%")

        run.log_metrics(
            data={
                "metrics/train/loss": loss.item(),
                "metrics/train/accuracy": batch_accuracy,
                "metrics/validation/loss": val_loss,
                "metrics/validation/accuracy": val_accuracy,
                "epoch_value": epoch,
            },
            step=step_counter,
        )

    # Print loss and accuracy for the entire training epoch
    train_accuracy = 100 * correct_preds / total_preds
    print(f"Epoch [{epoch+1}/{num_epochs}] Training complete. Loss: {epoch_loss / len(train_loader):.4f}, Accuracy: {train_accuracy:.2f}%")

    # Per-layer activation statistics from the last training batch of the epoch
    activation_dict_mean = {
        f"layers/layer_{name}/activation_mean": activation.mean().item()
        for name, activation in train_activations.items()
    }
    activation_dict_std = {
        f"layers/layer_{name}/activation_std": activation.std().item()
        for name, activation in train_activations.items()
    }

    # Per-parameter statistics. The values are computed from the *gradients*
    # left by the last backward pass; the key names only say "weight"/"bias".
    # The name is split outside the f-string because reusing double quotes
    # inside a replacement field is a SyntaxError before Python 3.12.
    params_dict_mean = {}
    params_dict_std = {}
    for name, param in model.named_parameters():
        if param.grad is None:
            # Parameter did not take part in the last backward pass.
            continue
        name_parts = name.split(".")
        key_prefix = f"layers/layer_{name_parts[0]}/{name_parts[1]}"
        params_dict_mean[f"{key_prefix}_mean"] = param.grad.mean().item()
        params_dict_std[f"{key_prefix}_std"] = param.grad.std().item()

    layers_dict = {
        **activation_dict_mean,
        **activation_dict_std,
        **params_dict_mean,
        **params_dict_std,
    }
    print(layers_dict)

    # Layer statistics are logged once per epoch, indexed by epoch number.
    run.log_metrics(
        data=layers_dict,
        step=epoch,
    )
    
# Final evaluation on the test loader (gradient tracking is switched off)
test_loss, test_accuracy = evaluate(model, test_loader, track_gradients=False)
print(f"Testing complete. Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.2f}%")

# Each test result is a single value, sent through log_configs rather than
# as a stepped metric series.
test_results = {
    "metrics/test/loss": test_loss,
    "metrics/test/accuracy": test_accuracy,
}
run.log_configs(test_results)

In [9]:
# Close the run; per the cell output below, this waits until all pending
# operations have been processed.
run.close()

neptune:INFO: Waiting for all operations to be processed
neptune:INFO: All operations were processed
