# [모듈 3.1] 세이지메이커에서 멀티 노드 분산 훈련 하기

이 노트북은 'conda_python3' 커널을 사용합니다.

---
아래와 같이 pytorchddp 를 백엔드로 해서 2개의 "ml.p3.8xlarge" 로 학습합니다.

```
    distribution={"pytorchddp":{"enabled": True}},    
    instance_count=2,
```

# 1. 환경 설정


## 기본 세팅
사용하는 패키지는 import 시점에 자동으로 재로딩합니다.

In [1]:
# Enable IPython autoreload so edited modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Make modules under ./scripts importable from this notebook.
sys.path.append('./scripts')

In [2]:
import sagemaker

# Show the installed SageMaker SDK version. A bare expression in the middle of
# a cell is never displayed, so it is printed explicitly.
print("sagemaker version:", sagemaker.__version__)

# Execution role of this notebook; handed to the estimators below as `role`.
role = sagemaker.get_execution_role()

## 파라미터 세팅

In [3]:
import torch
import os

# Number of training epochs; forwarded to the training jobs through `hyperparameters`.
epochs = 1
print("epochs: ", epochs)

epochs:  1


# 2. 세이지메이커 로컬 모드 훈련


## 로컬 모드로 실행 여부
- False 이면 실행 안함.

In [4]:
# Set to False to skip the local-mode training run guarded by `if local_mode:`.
local_mode = True 

## 로컬의 GPU, CPU 여부로 instance_type 결정

In [5]:
import os
import subprocess

# Choose the SageMaker local-mode instance type from the hardware on this host.
# Start from the CPU value so `instance_type` is always defined, even when
# nvidia-smi cannot be launched at all.
instance_type = "local"
try:
    # A zero exit status from nvidia-smi is taken to mean a usable GPU is present.
    if subprocess.call("nvidia-smi") == 0:
        instance_type = "local_gpu"
except OSError as err:
    # subprocess.call raises OSError (e.g. FileNotFoundError) when the
    # executable is missing; report it and keep the CPU fallback.
    print("nvidia-smi could not be executed, falling back to CPU:", err)

print("Instance type = " + instance_type)

Sat Mar  4 14:17:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:17.0 Off |                    0 |
| N/A   28C    P0    40W / 300W |      3MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:00:18.0 Off |                    0 |
| N/A   27C    P0    41W / 300W |      3MiB / 16384MiB |      0%      Default |
|       

## 로컬 모드로 훈련 실행
- 아래의 두 라인이 로컬모드로 훈련을 지시 합니다.
```python
    instance_type=instance_type, # local_gpu or local 지정
    session = sagemaker.LocalSession(), # 로컬 세션을 사용합니다.
```

In [6]:
# Hyperparameters handed to the estimator through its `hyperparameters=` argument.
hyperparameters = dict(epochs=epochs)

In [7]:
from sagemaker.pytorch import PyTorch
import os

# Run the training job in SageMaker local mode when `local_mode` is enabled.
if local_mode:
    local_estimator = PyTorch(
        entry_point="TFT_Train_SM_DDP.py",
        source_dir='src',
        role=role,
        framework_version='1.12.1',
        py_version='py38',
        distribution={"pytorchddp": {"enabled": True}},
        instance_count=1,
        instance_type=instance_type,  # "local_gpu" or "local"
        # Sessions are supplied through the `sagemaker_session` keyword
        # (not `session`), so the local session is actually used.
        sagemaker_session=sagemaker.LocalSession(),
        hyperparameters=hyperparameters,
    )
    # Blocks until the local training container exits.
    local_estimator.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-03-04-14-17-21-759
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-rc30d:
    command: train
    container_name: n3a77bo8pk-algo-1-rc30d
    deploy:
      resources:
        reservations:
          devices:
          - capabilities:
            - gpu
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.12.1-gpu-py38
    networks:
      sagemaker-local:
        aliases:
        - algo-1-rc30d
 

Creating n3a77bo8pk-algo-1-rc30d ... 
Creating n3a77bo8pk-algo-1-rc30d ... done
Attaching to n3a77bo8pk-algo-1-rc30d
[36mn3a77bo8pk-algo-1-rc30d |[0m 2023-03-04 14:17:25,446 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mn3a77bo8pk-algo-1-rc30d |[0m 2023-03-04 14:17:25,511 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
[36mn3a77bo8pk-algo-1-rc30d |[0m 2023-03-04 14:17:25,521 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36mn3a77bo8pk-algo-1-rc30d |[0m 2023-03-04 14:17:25,523 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mn3a77bo8pk-algo-1-rc30d |[0m 2023-03-04 14:17:25,531 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel for native PT DDP job
[36mn3a77bo8pk-algo-1-rc30d |[0m 2023-03-04 14:17:25,531 sagemaker_pytorch_container.training INFO     Invoking user training script.
[

INFO:root:creating /tmp/tmpflvadli6/artifacts/output/data
INFO:root:copying /tmp/tmpflvadli6/algo-1-rc30d/output/success -> /tmp/tmpflvadli6/artifacts/output
INFO:root:copying /tmp/tmpflvadli6/model/model.pth -> /tmp/tmpflvadli6/artifacts/model


[36mn3a77bo8pk-algo-1-rc30d exited with code 0
[0mAborting on container exit...
===== Job Complete =====


# 3. SageMaker Cloud Mode


## 파라미터 셋업

In [8]:
# Instance type used for the cloud training job (alternative: 'ml.g4dn.12xlarge').
instance_type = "ml.p3.8xlarge"
# Hyperparameters handed to the estimator through its `hyperparameters=` argument.
hyperparameters = dict(epochs=epochs)

In [9]:
from sagemaker.pytorch import PyTorch
import os


# Estimator for the managed (cloud) training job: two instances running the
# same entry point with the PyTorch DDP distribution enabled.
estimator = PyTorch(
    entry_point="TFT_Train_SM_DDP.py",
    source_dir='src',
    role=role,
    framework_version='1.12.1',
    py_version='py38',
    distribution={"pytorchddp": {"enabled": True}},
    instance_count=2,
    instance_type=instance_type,  # cloud instance type, e.g. 'ml.p3.8xlarge'
    # Sessions are supplied through the `sagemaker_session` keyword
    # (not `session`).
    sagemaker_session=sagemaker.Session(),
    hyperparameters=hyperparameters,
)
# wait=False submits the job without blocking the notebook on its completion.
estimator.fit(wait=False)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-03-04-14-18-52-747


In [10]:
# Show the logs of the training job that was submitted with fit(wait=False).
estimator.logs()

2023-03-04 14:18:53 Starting - Starting the training job......
2023-03-04 14:19:49 Starting - Preparing the instances for training.........
2023-03-04 14:21:11 Downloading - Downloading input data
2023-03-04 14:21:11 Training - Downloading the training image.....................
2023-03-04 14:24:32 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-03-04 14:24:49,023 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-03-04 14:24:49,059 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-03-04 14:24:49,071 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-03-04 14:24:49,074 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel for native PT DDP job[0m
[34m2023

# 4. 모델 가중치 파일 확인

In [11]:
# Print the model artifact location stored on the estimator (`model_data`).
print("model artifact: \n", estimator.model_data)

model artifact: 
 s3://sagemaker-us-east-1-057716757052/pytorch-training-2023-03-04-14-18-52-747/output/model.tar.gz
