In [1]:
import json
import os
import uuid

import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

In [2]:
!  mkdir -p data && cd data && wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz && tar -xvzf cifar-10-python.tar.gz

cifar-10-batches-py/
cifar-10-batches-py/data_batch_4
cifar-10-batches-py/readme.html
cifar-10-batches-py/test_batch
cifar-10-batches-py/data_batch_3
cifar-10-batches-py/batches.meta
cifar-10-batches-py/data_batch_2
cifar-10-batches-py/data_batch_5
cifar-10-batches-py/data_batch_1


In [None]:
! mkdir -p saved_models

## Run locally multiGPU training

In [3]:
!python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 \
        /home/ec2-user/SageMaker/pytorch_resnet_cifar10_mirror/trainer_adascale.py  --num_epochs 200 \
        --batch_size 256 

/home/ec2-user/anaconda3/envs/JupyterSystemEnv/bin/python: Error while finding module specification for 'torch.distributed.launch' (ModuleNotFoundError: No module named 'torch')


In [None]:
# Run Sagemaker ddp training

In [None]:
! pwd

In [None]:
! ls /home/ec2-user/SageMaker/pytorch_resnet_cifar10_mirror/sm


In [None]:
! cd /home/ec2-user/SageMaker/pytorch_resnet_cifar10_mirror/sm &&  python ddp-launcher.py --gpus 8 \
--data_dir /home/ec2-user/SageMaker/data/ \
--model_dir /home/ec2-user/SageMaker/ \
--num_epochs 100




## Multinode multi GPU

In [None]:
config = {
    'batch_size': 256,
    'num_epochs' : 50}

In [None]:
bucket = 'mansmane-us-west-2'


In [None]:
# Training time of this job 
token = str(uuid.uuid4())[:10]  # we create a unique token to avoid checkpoint collisions in S3

job = PyTorch(
    entry_point='ddp-launcher.py',
    source_dir='/home/ec2-user/SageMaker/pytorch_resnet_cifar10_mirror/sm',
    role=get_execution_role(),
    framework_version='1.8.1',
    instance_count=1,
    instance_type='ml.p3.16xlarge',
    base_job_name='resnet-multi-GPU-g5',
    py_version='py36',
    hyperparameters=config,
    checkpoint_s3_uri='s3://{}/{}/checkpoints'.format(bucket, token),  # S3 destination of /opt/ml/checkpoints files
    output_path='s3://{}/{}'.format(bucket, token),
    code_location='s3://{}/{}/code'.format(bucket, token), # source_dir code will be staged in S3 there
    environment={"SMDEBUG_LOG_LEVEL":"off"},  # reduce verbosity of Debugger
    debugger_hook_config=False,  # deactivate debugger to avoid warnings in model artifact
    disable_profiler=True,  # keep running resources to a minimum to avoid permission errors
    metric_definitions=[
        {"Name": "Train_loss", "Regex": "Training_loss: ([0-9.]+).*$"},
        {"Name": "Learning_rate", "Regex": "learning rate: ([0-9.]+).*$"},        
        {"Name": "Val_loss", "Regex": "Val_loss: ([0-9.]+).*$"},        
        {"Name": "Throughput", "Regex": "Throughput: ([0-9.]+).*$"},
        {"Name": "Val_pixel_acc", "Regex": "Val_pixel_acc: ([0-9.]+).*$"}
    ],
    tags=[{'Key': 'Project', 'Value': 'A2D2_segmentation'}])  # tag the job for experiment tracking

In [None]:
train_path = 's3://mansmane-us-west-2/cifar10/'

In [None]:
job.fit({'dataset': train_path}, wait=False)

In [None]:
token = str(uuid.uuid4())[:10]  # we create a unique token to avoid checkpoint collisions in S3
instance_count = 2

job = PyTorch(
    entry_point='ddp-launcher.py',
    source_dir='/home/ec2-user/SageMaker/pytorch_resnet_cifar10_mirror/sm',
    role=get_execution_role(),
    framework_version='1.8.1',
    instance_count=instance_count,
    instance_type='ml.p3.16xlarge',
    base_job_name='resnet-multi-GPU-g5-instance-' + str(instance_count),
    py_version='py36',
    hyperparameters=config,
    checkpoint_s3_uri='s3://{}/{}/checkpoints'.format(bucket, token),  # S3 destination of /opt/ml/checkpoints files
    output_path='s3://{}/{}'.format(bucket, token),
    code_location='s3://{}/{}/code'.format(bucket, token), # source_dir code will be staged in S3 there
    environment={"SMDEBUG_LOG_LEVEL":"off"},  # reduce verbosity of Debugger
    debugger_hook_config=False,  # deactivate debugger to avoid warnings in model artifact
    disable_profiler=True,  # keep running resources to a minimum to avoid permission errors
    metric_definitions=[
        {"Name": "Train_loss", "Regex": "Training_loss: ([0-9.]+).*$"},
        {"Name": "Learning_rate", "Regex": "learning rate: ([0-9.]+).*$"},        
        {"Name": "Val_loss", "Regex": "Val_loss: ([0-9.]+).*$"},        
        {"Name": "Throughput", "Regex": "Throughput: ([0-9.]+).*$"},
        {"Name": "Val_pixel_acc", "Regex": "Val_pixel_acc: ([0-9.]+).*$"}
    ],
    tags=[{'Key': 'Project', 'Value': 'A2D2_segmentation'}])  # tag the job for experiment tracking

train_path = 's3://mansmane-us-west-2/cifar10/'
job.fit({'dataset': train_path}, wait=False)

In [None]:
! pip install fairscale