# Distributed Training with SageMaker Studio

This demo walks you through using the Amazon SageMaker distributed training libraries within SageMaker Studio.

Before proceeding with the notebook, copy the `DefaultBucket` output value from the lab instructions window and paste it between the quotes in the following cell. Executing this cell ensures that SageMaker uses the lab-created bucket throughout the notebook.

In [None]:
# Paste the `DefaultBucket` value from the lab instructions between the quotes.
# This cell runs before any SageMaker session exists, so it must be a plain
# string assignment and must not reference `sagemaker_session`.
bucket_name = ""

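Set up the environment: import the SageMaker SDK, create a session, and look up the execution role, Region, and default bucket.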

In [2]:
import boto3
import os
import sagemaker
from sagemaker import get_execution_role, session
from sagemaker.s3 import S3Downloader, S3Uploader

# Reuse a bucket name assigned in an earlier cell when one exists; an empty or
# missing value lets the session fall back to its own default bucket.
requested_bucket = globals().get("bucket_name") or None

# Hand the bucket to the session through the public `default_bucket` argument
# instead of assigning the private `_default_bucket` attribute afterwards.
sagemaker_session = sagemaker.Session(default_bucket=requested_bucket)
bucket_name = sagemaker_session.default_bucket()
role = get_execution_role()
region = sagemaker_session.boto_session.region_name

# Show the resolved values so the reader can confirm them before training.
print("Designated SageMaker role: {}".format(role))
print("SageMaker default bucket: {}".format(bucket_name))

Designated SageMaker role: arn:aws:iam::656165796789:role/service-role/AmazonSageMaker-ExecutionRole-20220518T115741
SageMaker default bucket:
sagemaker-us-west-2-656165796789



Set up the PyTorch estimator:

In [14]:
from sagemaker.pytorch import PyTorch

# Training uses the SMDataParallel distributed training framework, which is
# switched on through the estimator's `distribution` argument.
smdataparallel_distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

estimator = PyTorch(
    entry_point="train_pytorch_smdataparallel_mnist.py",
    source_dir="./",
    base_job_name="pytorch-smdataparallel-mnist",
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version="1.8.1",
    py_version="py36",
    # Number of training instances; raise it (for example to 2) for multinode
    # distributed training.
    instance_count=1,
    # Alternatives: ml.p3dn.24xlarge (p3dn) or ml.p4d.24xlarge (p4d).
    # g4dn instance types cannot be used with smdataparallel.
    instance_type="ml.p3.16xlarge",
    distribution=smdataparallel_distribution,
    debugger_hook_config=False,
)



In [19]:
# Launch the training job configured on `estimator`. No input channels are
# passed to fit() here.
# NOTE(review): presumably the entry-point script fetches its own data - confirm.
estimator.fit()

2022-06-16 16:39:19 Starting - Starting the training job...ProfilerReport-1655397559: InProgress
...
2022-06-16 16:40:18 Starting - Preparing the instances for training............
2022-06-16 16:42:05 Downloading - Downloading input data
2022-06-16 16:42:05 Training - Downloading the training image..............................
2022-06-16 16:47:09 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-06-16 16:47:12,897 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-06-16 16:47:12,972 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-06-16 16:47:12,980 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel[0m
[34m2022-06-16 16:47:12,980 sagemaker_pytorch_container.training INFO     Invoking user training script

In [20]:
model_data = estimator.model_data
print("Storing {} as model_data".format(model_data))
# TODO refactor
%store model_data

Storing s3://sagemaker-us-west-2-656165796789/pytorch-smdataparallel-mnist-2022-06-16-16-39-19-191/output/model.tar.gz as model_data
Stored 'model_data' (str)


In [21]:
# Retrieve a saved model from a previous notebook run's stored variable
# TODO refactor
%store -r model_data

# If no model was found, set it manually here.
# model_data = 's3://sagemaker-us-west-2-XXX/pytorch-smdataparallel-mnist-2020-10-16-17-15-16-419/output/model.tar.gz'

print("Using this model: {}".format(model_data))

Using this model: s3://sagemaker-us-west-2-656165796789/pytorch-smdataparallel-mnist-2022-06-16-16-39-19-191/output/model.tar.gz


In [24]:
# Print the inference entry-point script (the same file passed as
# `entry_point` to PyTorchModel) with syntax highlighting.
!pygmentize ./inference.py

[37m# Licensed to the Apache Software Foundation (ASF) under one[39;49;00m
[37m# or more contributor license agreements.  See the NOTICE file[39;49;00m
[37m# distributed with this work for additional information[39;49;00m
[37m# regarding copyright ownership.  The ASF licenses this file[39;49;00m
[37m# to you under the Apache License, Version 2.0 (the[39;49;00m
[37m# "License"); you may not use this file except in compliance[39;49;00m
[37m# with the License.  You may obtain a copy of the License at[39;49;00m
[37m#[39;49;00m
[37m#   http://www.apache.org/licenses/LICENSE-2.0[39;49;00m
[37m#[39;49;00m
[37m# Unless required by applicable law or agreed to in writing,[39;49;00m
[37m# software distributed under the License is distributed on an[39;49;00m
[37m# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY[39;49;00m
[37m# KIND, either express or implied.  See the License for the[39;49;00m
[37m# specific language governing permissions and limitations[39;49;

In [25]:
from sagemaker.pytorch import PyTorchModel

# Serve the model with the same framework and Python versions that the training
# estimator uses (1.8.1 / py36) so the artifact is loaded by the PyTorch
# release that produced it, and reuse the notebook's SageMaker session so
# deployment goes through the same session and bucket as training.
model = PyTorchModel(
    model_data=model_data,
    source_dir="./",
    entry_point="inference.py",
    role=role,
    framework_version="1.8.1",
    py_version="py36",
    sagemaker_session=sagemaker_session,
)

In [27]:
# Deploy the model to an endpoint backed by a single ml.m4.xlarge instance and
# keep the returned predictor for the inference cells.
predictor = model.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [28]:
# Download the test set
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from packaging.version import Version

# Point torchvision's MNIST downloader at the SageMaker sample-files bucket.
MNIST_BASE_URL = "https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/"

# (archive name, checksum) pairs; the checksum lets the download be verified
# so corrupted data is detected.
MNIST_ARCHIVES = [
    ("train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
    ("train-labels-idx1-ubyte.gz", "d53e105ee54ea40749a09fcbcd1e9432"),
    ("t10k-images-idx3-ubyte.gz", "9fb629c4189551a2d022fa330f9573f3"),
    ("t10k-labels-idx1-ubyte.gz", "ec29112dd5afa0611ce80d1b7f02629c"),
]

# Version boundary that decides which MNIST class attribute is overridden.
TORCHVISION_VERSION = "0.9.1"

if Version(torchvision.__version__) >= Version(TORCHVISION_VERSION):
    # From this version on, only the mirror list is replaced.
    datasets.MNIST.mirrors = [MNIST_BASE_URL]
else:
    # Older versions take one full URL plus checksum per archive.
    datasets.MNIST.resources = [
        (MNIST_BASE_URL + archive_name, checksum)
        for archive_name, checksum in MNIST_ARCHIVES
    ]


# Convert each image to a tensor, then normalize it with mean 0.1307 and
# standard deviation 0.3081.
mnist_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# Test split (train=False), stored under ./data and downloaded when needed.
test_set = datasets.MNIST(
    "data",
    train=False,
    download=True,
    transform=mnist_transform,
)


# Randomly sample 16 images from the test set: shuffle the data and take the
# first batch of 16.
test_loader = DataLoader(test_set, shuffle=True, batch_size=16)

# Use the built-in next(): DataLoader iterators follow the Python 3 iterator
# protocol (__next__), and the Python-2-style `.next()` method is not available
# on recent PyTorch releases. The labels are not needed, so they are discarded.
test_images, _ = next(iter(test_loader))

# inspect the images
import torchvision
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


def imshow(img):
    """Draw an image tensor on the current matplotlib axes.

    Args:
        img: Object exposing ``.numpy()`` that returns a 3-D array; axis 0 is
            moved to the last position before plotting (channels-first to
            channels-last).

    Returns:
        None. The image is drawn through ``plt.imshow`` as a side effect.
    """
    pixels = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(pixels)


# Undo the Normalize((0.1307,), (0.3081,)) step applied when the test set was
# loaded, so the digits are displayed with their original pixel values.
unnorm_images = test_images * 0.3081 + 0.1307

print("Sampled test images: ")
# Tile the batch into one image and draw it.
image_grid = torchvision.utils.make_grid(unnorm_images)
imshow(image_grid)

ModuleNotFoundError: No module named 'torchvision'

In [29]:
# Send the sampled batch to the endpoint as a NumPy array.
image_batch = test_images.numpy()
outputs = predictor.predict(image_batch)

# For each row of the response, take the index of the largest value along
# axis 1 as the prediction.
predicted = np.argmax(outputs, axis=1)

print("Predictions: ")
print(predicted.tolist())

NameError: name 'predictor' is not defined

In [None]:
# Clean up: delete the endpoint that model.deploy(...) created, once
# predictions are no longer needed.
predictor.delete_endpoint()