# Using Neptune for logging PyTorch training jobs on SageMaker

<div class="alert alert-info">This notebook should be run from a SageMaker notebook.</div>

In [None]:
# Install (or upgrade, -U) the SageMaker Python SDK and the Neptune client; -q keeps pip quiet.
%pip install -q -U sagemaker neptune-client

In [None]:
# Python Built-Ins:
from datetime import datetime
import os
import json
import logging
from tempfile import TemporaryFile
import time

# External Dependencies:
import boto3
from botocore.exceptions import ClientError
import numpy as np
import sagemaker
from sagemaker.pytorch.estimator import PyTorch

# Neptune
import neptune.new as neptune

# Initialize the SageMaker session and look up the execution role used for training jobs
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Configuration:
bucket_name = sess.default_bucket()
prefix = "mnist/"
# Strip the trailing "/" explicitly instead of slicing off the last character, so the URI
# stays correct even if `prefix` is later edited to have no trailing slash.
output_path = f"s3://{bucket_name}/{prefix.rstrip('/')}"

## The example use case: MNIST

MNIST is a widely used dataset for handwritten digit classification. It consists of 70,000 labeled 28x28 pixel grayscale images of handwritten digits. The dataset is split into 60,000 training images and 10,000 test images.

In this example, we download the MNIST data from a public S3 bucket and upload it to your default SageMaker bucket as selected above.

In [None]:
def fetch_sample_data(
    to_bucket: str,
    to_prefix: str,
    from_bucket: str = "sagemaker-sample-files",
    from_prefix: str = "datasets/image/MNIST",
    dataset: str = "mnist-train",
):
    """Copy the files of one known MNIST sample dataset between S3 buckets.

    Every file belonging to ``dataset`` is downloaded from ``from_bucket``
    (under ``from_prefix``) into a temporary file and then uploaded to
    ``to_bucket`` (under ``to_prefix``) with the same file name.

    Args:
        to_bucket: Name of the destination bucket.
        to_prefix: Key prefix in the destination bucket; a trailing "/" is
            appended when it is non-empty and lacks one.
        from_bucket: Name of the source bucket.
        from_prefix: Key prefix in the source bucket; normalised the same way
            as ``to_prefix``.
        dataset: Which file set to copy: "mnist-train" or "mnist-test".

    Raises:
        ValueError: If ``dataset`` is not one of the known dataset names.
    """
    DATASETS = {
        "mnist-train": ["train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz"],
        "mnist-test": ["t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"],
    }

    # Validate before touching S3 so a typo fails fast
    if dataset not in DATASETS:
        raise ValueError(f"dataset '{dataset}' not in known set: {set(DATASETS.keys())}")

    def as_folder(path: str) -> str:
        # A non-empty prefix must end in "/" so it can be joined directly with a file name
        if len(path) > 0 and not path.endswith("/"):
            return path + "/"
        return path

    source_folder = as_folder(from_prefix)
    target_folder = as_folder(to_prefix)

    s3 = boto3.client("s3")
    for filename in DATASETS[dataset]:
        source_key = f"{source_folder}{filename}"
        target_key = f"{target_folder}{filename}"
        # If you're in the same region as the source bucket, you might consider copy_object() instead:
        with TemporaryFile() as buffer:
            s3.download_fileobj(from_bucket, source_key, buffer)
            # Rewind so the upload reads the file from its beginning
            buffer.seek(0)
            s3.upload_fileobj(buffer, to_bucket, target_key)


# Copy the MNIST training files into the default bucket and remember where they landed
train_prefix = f"{prefix}data/train"
fetch_sample_data(to_bucket=bucket_name, to_prefix=train_prefix, dataset="mnist-train")
train_s3uri = f"s3://{bucket_name}/{train_prefix}"
print(f"Uploaded training data to {train_s3uri}")

# Same for the MNIST test files (the message previously said "training data" here too)
test_prefix = f"{prefix}data/test"
fetch_sample_data(to_bucket=bucket_name, to_prefix=test_prefix, dataset="mnist-test")
test_s3uri = f"s3://{bucket_name}/{test_prefix}"
print(f"Uploaded test data to {test_s3uri}")

In [None]:
# List the objects under both S3 URIs to confirm the uploads.
# The `!` lines are IPython shell escapes; `$name` is replaced with the Python variable's value.
print("Training data:")
!aws s3 ls --recursive $train_s3uri
print("Test data:")
!aws s3 ls --recursive $test_s3uri

## Train 

We are going to use the [SageMaker PyTorch Estimator](https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/sagemaker.pytorch.html).
The estimator uses the code from the `code/` directory. The code is adapted from the [Amazon SageMaker Examples repository](https://github.com/aws/amazon-sagemaker-examples/tree/main/advanced_functionality/multi_model_pytorch).
The main change is that Neptune logging was added to the `code/train.py` script:

```diff
[...]

def train(args):
+   run = neptune.init_run(tags=["sagemaker"])
    
    [...]
    
+   run["training/args"] = args
+   run["training/model/loss_fn"] = type(loss_fn).__name__
+   run["training/model/model"] = type(net).__name__
+   run["training/model/optimizer"] = type(optimizer).__name__

    logger.info("Start training ...")
    for epoch in range(1, args.epochs + 1):
        net.train()
        for batch_idx, (imgs, labels) in enumerate(train_loader, 1):
            imgs, labels = imgs.to(device), labels.to(device)
            output = net(imgs)
            loss = loss_fn(output, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            [...]
            
+           run["training/train/batch/loss"].log(loss.item())

        # test the model
+       train_loss, train_acc = test(net, train_loader, device)
+       run["training/train/epoch/loss"].log(train_loss)
+       run["training/train/epoch/accuracy"].log(train_acc)
        
+       test_loss, test_acc = test(net, test_loader, device)
+       run["training/test/epoch/loss"].log(test_loss)
+       run["training/test/epoch/accuracy"].log(test_acc)

[...]
        
```

Another difference is that we need `neptune-client` as an additional dependency, so we add it to `code/requirements.txt`. The SageMaker Estimator by default installs all the dependencies defined there.

Because SageMaker does not use Python 3.7 or newer by default, we need a custom Docker image to train the model: Python 3.6 has already reached end-of-life and is not supported by Neptune. For this purpose, we use [one of the images provided by AWS](https://github.com/aws/deep-learning-containers/blob/master/available_images.md) as `image_uri`.
 
Moreover, we provide the Neptune API token and project name as environment variables to the training job via the `environment` argument.

In [None]:
# Hyperparameters passed to the training script
hyperparameters = {
    "batch-size": 128,
    "epochs": 5,
    "learning-rate": 1e-3,
    "log-interval": 100,
}

# Single source of truth for the instance type: it selects both the image variant and the hardware
instance_type = "ml.m5.large"

# Resolve the AWS-provided PyTorch 1.12.1 / Python 3.8 training image for the region of the
# current session instead of hard-coding the us-east-1 registry, so the notebook also works
# when it is run in another region.
image_uri = sagemaker.image_uris.retrieve(
    framework="pytorch",
    region=sess.boto_region_name,
    version="1.12.1",
    py_version="py38",
    instance_type=instance_type,
    image_scope="training",
)

estimator = PyTorch(
    entry_point="train.py",
    source_dir="code",  # directory of your training script
    role=role,
    image_uri=image_uri,
    instance_type=instance_type,
    instance_count=1,
    output_path=output_path,
    hyperparameters=hyperparameters,
    # Neptune credentials and target project are handed to the training job as environment variables
    environment={
        "NEPTUNE_API_TOKEN": neptune.ANONYMOUS_API_TOKEN,
        "NEPTUNE_PROJECT": "common/showroom",
    },
)

# Launch the training job with the two S3 locations as the "training" and "testing" channels
estimator.fit({"training": train_s3uri, "testing": test_s3uri})