In [14]:
import os

# The estimator further down is configured with source_dir="utils", which is
# resolved relative to the working directory. Step up one level only when
# `utils/` is not already visible from here, so that re-running this cell (or
# starting the kernel next to the notebook) does not keep climbing the tree.
if not os.path.isdir("utils"):
    os.chdir("..")

In [15]:
# List the working directory to confirm the notebook and the utils/ folder are present.
!ls

pytorch_smmodelparallel_mnist.ipynb  utils


In [2]:
!pip install sagemaker-experiments


Collecting sagemaker-experiments
  Downloading sagemaker_experiments-0.1.33-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 759 kB/s  eta 0:00:01
Installing collected packages: sagemaker-experiments
Successfully installed sagemaker-experiments-0.1.33


In [16]:
%%time
import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
import boto3
from time import gmtime, strftime

# IAM role the training job will run under. Replace this call with the ARN of
# an existing role if you prefer not to rely on the notebook's execution role.
role = get_execution_role()
print(f"SageMaker Execution Role:{role}")

# boto3 session with default settings; the SageMaker SDK session is built on it later.
session = boto3.session.Session()

SageMaker Execution Role:arn:aws:iam::886035371869:role/torchserve-workshop-SageMakerAPIExecutionRole
CPU times: user 41.6 ms, sys: 8.02 ms, total: 49.7 ms
Wall time: 97.3 ms


In [17]:
# SageMaker SDK session built on the boto3 session created earlier.
sagemaker_session = sagemaker.session.Session(boto_session=session)

# Passed as `custom_mpi_options` when the estimator is configured.
mpioptions = "-verbose -x orte_base_help_aggregate=0 "

# One SageMaker API client, derived from the same boto3 session, shared by
# every experiment/trial call below instead of building a new client per call.
sm_client = session.client("sagemaker")

# choose an experiment name (only need to create it once)
experiment_name = "SM-MP-DEMO"

all_experiment_names = [
    exp.experiment_name for exp in Experiment.list(sagemaker_boto_client=sm_client)
]

# Load the experiment if it exists, otherwise create it.
# NOTE(review): `customer_churn_experiment` looks like a leftover name from
# another example; it is kept because the training cell reads it.
if experiment_name in all_experiment_names:
    customer_churn_experiment = Experiment.load(
        experiment_name=experiment_name, sagemaker_boto_client=sm_client
    )
else:
    customer_churn_experiment = Experiment.create(
        experiment_name=experiment_name, sagemaker_boto_client=sm_client
    )

# Create a trial for the current run; the UTC timestamp keeps trial names unique.
trial = Trial.create(
    trial_name="SMD-MP-demo-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())),
    experiment_name=customer_churn_experiment.experiment_name,
    sagemaker_boto_client=sm_client,
)


# Settings handed to the SageMaker model-parallel library through the
# estimator's `distribution` argument.
smp_parameters = {
    "microbatches": 4,
    "placement_strategy": "spread",
    "pipeline": "interleaved",
    "optimize": "speed",
    "partitions": 2,
    "ddp": True,
}

# MPI launcher settings; adjust processes_per_host for your instance type.
mpi_config = {
    "enabled": True,
    "processes_per_host": 2,
    "custom_mpi_options": mpioptions,
}

# PyTorch estimator that runs utils/pt_mnist.py (swap in your own training
# script via entry_point/source_dir) on a single ml.p3.16xlarge instance.
smd_mp_estimator = PyTorch(
    base_job_name="SMD-MP-demo",
    entry_point="pt_mnist.py",
    source_dir="utils",
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type="ml.p3.16xlarge",
    instance_count=1,
    framework_version="1.6.0",
    py_version="py36",
    distribution={
        "smdistributed": {
            "modelparallel": {"enabled": True, "parameters": smp_parameters}
        },
        "mpi": mpi_config,
    },
)

In [18]:
# Tie the training job to the experiment and trial so the run is recorded
# as a trial component displayed as "Training".
experiment_config = {
    "ExperimentName": customer_churn_experiment.experiment_name,
    "TrialName": trial.trial_name,
    "TrialComponentDisplayName": "Training",
}

# Start the training job with that experiment configuration.
smd_mp_estimator.fit(experiment_config=experiment_config)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: SMD-MP-demo-2021-06-14-14-05-37-359


2021-06-14 14:05:37 Starting - Starting the training job...
2021-06-14 14:05:41 Starting - Launching requested ML instancesProfilerReport-1623679537: InProgress
.........
2021-06-14 14:07:28 Starting - Preparing the instances for training.........
2021-06-14 14:09:08 Downloading - Downloading input data...
2021-06-14 14:09:28 Training - Downloading the training image..............[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-06-14 14:11:46,900 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-06-14 14:11:46,979 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-06-14 14:11:46,988 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-06-14 14:11:47,593 sagemaker-training-toolkit INFO     Starting MPI run as worker node.[0m
[34m2021-06-14 14:11:47,593 



Training seconds: 2457
Billable seconds: 2457


In [None]:
# Display where the fitted estimator reports its model artifact
# (presumably an S3 URI; this cell has no recorded output to confirm).
smd_mp_estimator.model_data

In [20]:
# NOTE(review): `model_uri` appears to be the optional pre-trained-model input
# of the estimator constructor rather than the training output. It was not
# passed when the estimator was built, which would explain the empty result
# here. Use `model_data` for the trained artifact location; confirm against
# the SageMaker SDK docs.
smd_mp_estimator.model_uri