# Distributed Data Parallel (DDP) Mode Training

## Setup

In [None]:

import boto3
import sagemaker

# SageMaker session first: the default bucket below is looked up through it.
sagemaker_session = sagemaker.Session()
s3_bucket_name = sagemaker_session.default_bucket()

# IAM role, by short name and by full ARN (account id masked).
role = "sagemaker-role"
role_arn = "arn:aws:iam::************:role/sagemaker-role"

In [None]:
# Build a timestamped name for this training run, e.g.
# "sagemaker-31-12-2023-23-59-59" (day-month-year-hour-minute-second).
from datetime import datetime

now = datetime.now()
d = f"{now:%d-%m-%Y-%H-%M-%S}"
job_name = "sagemaker-" + d
# Last expression of the cell: displays the generated name in the notebook.
job_name

## Training Loop

In [None]:
# Distribution settings handed to the estimator: enable the "pytorchddp"
# strategy and pass extra MPI options (verbose output, NCCL_DEBUG=VERSION).
distribution = {
    "pytorchddp": {
        "custom_mpi_options": "-verbose -x NCCL_DEBUG=VERSION",
        "enabled": True,
    },
}

In [None]:
# Training cluster: how many instances to launch, and of which type.
instance_count = 2
instance_type = "ml.g4dn.12xlarge"

In [None]:
# Hyperparameters passed to the estimator for the training script.
hyperparameters = {
    "batch-size": 64,
    "max-epochs": 2,
}

In [None]:
from sagemaker.pytorch import PyTorch


# Estimator for the distributed PyTorch training job: runs b_train_ddp.py from
# the b_code directory on `instance_count` instances of `instance_type`, using
# the `distribution` and `hyperparameters` defined in the cells above.
#
# NOTE: the original call also passed job_name=job_name here. `job_name` is
# not an estimator constructor argument, so it was dropped; the job name is
# supplied where the job is actually started, via fit(job_name=...).
ddp_estimator = PyTorch(
    entry_point="b_train_ddp.py",
    source_dir="b_code",
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    sagemaker_session=sagemaker_session,
    framework_version="1.12.0",
    py_version="py38",
    hyperparameters=hyperparameters,
    disable_profiler=True,  # for distributed training
    debugger_hook_config=False,  # for distributed training
    distribution=distribution,
)

In [None]:
# Start the training job under the timestamped `job_name` built earlier in
# this notebook, with log output requested (logs=True).
ddp_estimator.fit(job_name=job_name, logs=True)

In [None]:
# Ask the estimator's most recent training job to describe itself; as the last
# expression in the cell, the result is displayed in the notebook output.
ddp_estimator.latest_training_job.describe()