In [1]:
import sagemaker

# Create the SageMaker session and look up the IAM execution role used by this notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
# The role name is the last "/"-separated component of the role ARN.
# (The split separator is the plain string "/"; the [-1] index applies to the
# result of split(), not to the separator.)
role_name = role.split("/")[-1]
print(f"The Amazon Resource Name (ARN) of the role used for this demo is: {role}")
print(f"The name of the role used for this demo is: {role_name}")

The Amazon Resource Name (ARN) of the role used for this demo is: arn:aws:iam::886035371869:role/torchserve-workshop-SageMakerAPIExecutionRole
The name of the role used for this demo is: torchserve-workshop-SageMakerAPIExecutionRole


In [4]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    # Training code: the entry-point script lives in the local `code` directory.
    entry_point="train_pytorch_smdataparallel_mnist.py",
    source_dir="code",
    # Job identity and permissions
    base_job_name="BRAD-pytorch-smdataparallel-mnist",
    role=role,
    sagemaker_session=sagemaker_session,
    # Framework / Python version of the training container
    framework_version="1.8.1",
    py_version="py36",
    # Number of training instances; set this count (for example 2) for
    # multi-node distributed training.
    instance_count=3,
    # Instance type; alternatives are ml.p3dn.24xlarge (p3dn) and ml.p4d.24xlarge (p4d).
    instance_type="ml.p3.16xlarge",
    # Train with the SMDataParallel distributed training framework.
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    # Debugger hook configuration is switched off for this job.
    debugger_hook_config=False,
)

In [5]:
# Launch the training job described by `estimator`. No `inputs` argument is
# passed, so no data channels are supplied from this notebook.
# NOTE(review): presumably the training script obtains the MNIST data itself — confirm.
estimator.fit()


2021-06-15 01:26:17 Starting - Starting the training job...
2021-06-15 01:26:41 Starting - Launching requested ML instancesProfilerReport-1623720377: InProgress
............
2021-06-15 01:28:41 Starting - Preparing the instances for training.........
2021-06-15 01:30:01 Downloading - Downloading input data
2021-06-15 01:30:01 Training - Downloading the training image.......................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-06-15 01:33:58,292 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-06-15 01:33:58,370 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-06-15 01:34:01,466 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel[0m
[34m2021-06-15 01:34:01,467 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-06-15 01:

UnexpectedStatusException: Error for Training job BRAD-pytorch-smdataparallel-mnist-2021-06-15-01-26-17-432: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "mpirun --host algo-1 -np 8 --allow-run-as-root --tag-output --oversubscribe -mca btl_tcp_if_include eth0 -mca oob_tcp_if_include eth0 -mca plm_rsh_no_tree_spawn 1 -mca pml ob1 -mca btl ^openib -mca orte_abort_on_non_zero_status 1 -mca btl_vader_single_copy_mechanism none -mca plm_rsh_num_concurrent 1 -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -x SMDATAPARALLEL_USE_SINGLENODE=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 -x LD_PRELOAD=/opt/conda/lib/python3.6/site-packages/gethostname.cpython-36m-x86_64-linux-gnu.so smddprun /opt/conda/bin/python3.6 -m mpi4py train_pytorch_smdataparallel_mnist.py"
[1,1]<stderr>:Traceback (most recent call last):
[1,1]<stderr>:  File "/opt/conda/lib/python3.6/runpy.py", line 193, in _run_module_as_main
[1,2]<stderr>:Traceback (most recent call last):
[1,2]<stderr>:  File "/opt/conda/lib/python3.6/runpy.py", line 193, in _run_module_as_main
[1,4]<stderr>:Traceback (most recent call last):
[1,4]<stderr>:  

In [6]:

# Keep the estimator's `model_data` value (the S3 location of the model
# artifact) and show it as the cell output.
model_data = estimator.model_data
model_data

's3://sagemaker-us-east-1-886035371869/BRAD-pytorch-smdataparallel-mnist-2021-06-15-01-26-17-432/output/model.tar.gz'

In [None]:
import sagemaker

# Look up the IAM execution role again for the hosting step.
role = sagemaker.get_execution_role()

from sagemaker.pytorch import PyTorchModel

# Wrap the trained artifact (`model_data`) together with the inference script
# `inference.py` from the local `code` directory so it can be deployed.
model = PyTorchModel(
    model_data=model_data,
    source_dir="code",
    entry_point="inference.py",
    role=role,
    # Serve with the same PyTorch version the estimator trained with (1.8.1),
    # so the artifact is loaded by the framework release that produced it.
    framework_version="1.8.1",
    py_version="py3",
)

In [None]:
# Deploy the model to an endpoint backed by a single ml.m4.xlarge instance and
# keep the returned predictor for sending inference requests.
# NOTE(review): no cell in view deletes the endpoint afterwards — confirm cleanup happens elsewhere.
predictor = model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")


In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# MNIST test split (train=False), downloaded into the local "data" directory.
# Each image is converted to a tensor and normalized with mean 0.1307 and
# standard deviation 0.3081.
test_set = datasets.MNIST(
    "data",
    download=True,
    train=False,
    transform=transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    ),
)


# Randomly sample 16 images from the test set
test_loader = DataLoader(test_set, shuffle=True, batch_size=16)
# Fetch one batch with the built-in next(): iterators implement __next__, and
# the Python-2-style `.next()` method is not provided by DataLoader iterators
# in recent PyTorch releases. The labels are discarded.
test_images, _ = next(iter(test_loader))

# inspect the images
import torchvision
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


def imshow(img):
    """Display an image tensor with matplotlib.

    ``img`` is converted to a NumPy array through ``img.numpy()`` and its axes
    are reordered to ``(1, 2, 0)`` (first axis moved last) before the array is
    passed to ``plt.imshow``. Presumably ``img`` is channel-first (C, H, W) —
    confirm against callers.

    Returns:
        None
    """
    pixels = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(pixels)


# Undo the normalization (multiply by the std 0.3081, add the mean 0.1307)
# so the sampled images can be displayed.
unnorm_images = 0.1307 + 0.3081 * test_images

print("Sampled test images: ")
# Tile the batch into one grid image and show it.
imshow(
    torchvision.utils.make_grid(unnorm_images)
)

In [None]:
# Send the sampled batch (as a NumPy array) to the deployed endpoint.
outputs = predictor.predict(
    test_images.numpy()
)
# For each row of the response, take the index of the largest value along axis 1.
predicted = np.argmax(outputs, axis=1)

# Header line followed by the predictions as a plain Python list.
print("Predictions: ", predicted.tolist(), sep="\n")
