# [모듈 2.1] 세이지메이커에서 분산 훈련 하기

이 노트북은 'conda_python3' 커널을 사용합니다.

# 1. 환경 설정


## 기본 세팅
사용하는 패키지는 코드 실행 시점에 자동으로 재로딩합니다.

In [1]:
# Enable IPython's autoreload extension (mode 2) so that edited modules are
# picked up again without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Add ./scripts to the module search path so code in that folder can be imported.
sys.path.append('./scripts')

## 파라미터 세팅

In [2]:
import torch
import os

# Training parameters shared by the cells below.
# Optional overrides, kept for reference:
# model_dir = 'model'
# num_gpus = 4
# train_notebook = True
num_gpus = torch.cuda.device_count()  # CUDA devices visible to PyTorch
epochs = 1

# Echo the effective settings, one per line.
for _label, _value in (("num_gpus: ", num_gpus), ("epochs: ", epochs)):
    print(_label, _value)



num_gpus:  1
epochs:  2


In [3]:
# if train_notebook:


#     os.makedirs(model_dir, exist_ok=True)
        
#     src_dir = os.getcwd()
#     os.environ['SM_MODEL_DIR'] = f'{src_dir}/{model_dir}'
#     os.environ['SM_NUM_GPUS'] = str(num_gpus)
    

# 2. 세이지메이커 로컬 모드 훈련
#### 로컬의 GPU, CPU 여부로 instance_type 결정

In [4]:
import os
import subprocess


# Pick the SageMaker local-mode instance type from the host hardware:
# "local_gpu" when `nvidia-smi` runs successfully, otherwise "local" (CPU).
# Start from the CPU default so `instance_type` is always defined, even when
# the probe below cannot be executed at all.
instance_type = "local"
try:
    # A zero exit status is taken to mean that a GPU is present.
    if subprocess.call(["nvidia-smi"]) == 0:
        instance_type = "local_gpu"
except OSError:
    # `nvidia-smi` is not installed or cannot be launched: keep the CPU default.
    pass

print("Instance type = " + instance_type)

Sat Mar  4 09:17:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   37C    P0    36W / 300W |   1251MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import sagemaker

# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the installed SDK version explicitly.
print("sagemaker version:", sagemaker.__version__)

# sagemaker_session = sagemaker.Session()
# Execution role that is handed to the estimator in a later cell.
role = sagemaker.get_execution_role()

## 2.4. 로컬 모드로 훈련 실행
- 아래의 두 라인이 로컬모드로 훈련을 지시 합니다.
```python
    instance_type=instance_type, # local_gpu or local 지정
    session = sagemaker.LocalSession(), # 로컬 세션을 사용합니다.
```

In [6]:
# Hyperparameters handed to the estimator: epoch count and GPU count.
hyperparameters = dict(epochs=epochs, n_gpus=num_gpus)

In [7]:
from sagemaker.pytorch import PyTorch
import os
import subprocess

# region = sagemaker_session.boto_region_name


# Configure a PyTorch estimator that runs scripts/TFT_Train.py in SageMaker
# local mode, then start training.
local_estimator = PyTorch(
    entry_point="TFT_Train.py",
    source_dir="scripts",
    role=role,
    framework_version="1.12.1",
    py_version="py38",
    instance_count=1,
    instance_type=instance_type,  # "local_gpu" or "local"
    # The estimator's documented keyword is `sagemaker_session`; the previous
    # `session=` spelling is not an estimator parameter, so the local session
    # was not being bound explicitly.
    sagemaker_session=sagemaker.LocalSession(),
    hyperparameters=hyperparameters,
)
# No input channels are passed to fit() here.
local_estimator.fit()

Creating cucd7st2m9-algo-1-w67e8 ... 
Creating cucd7st2m9-algo-1-w67e8 ... done
Attaching to cucd7st2m9-algo-1-w67e8
[36mcucd7st2m9-algo-1-w67e8 |[0m 2023-03-04 09:17:38,627 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mcucd7st2m9-algo-1-w67e8 |[0m 2023-03-04 09:17:38,650 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
[36mcucd7st2m9-algo-1-w67e8 |[0m 2023-03-04 09:17:38,660 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36mcucd7st2m9-algo-1-w67e8 |[0m 2023-03-04 09:17:38,664 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mcucd7st2m9-algo-1-w67e8 |[0m 2023-03-04 09:17:38,667 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mcucd7st2m9-algo-1-w67e8 |[0m 2023-03-04 09:17:38,709 botocore.credentials INFO     Found credentials from IAM Role: BaseNotebookInstanceEc2Instance

# 2. 훈련 코드를 직접 로컬에서 실행
- --n_gpus {num_gpus} --epochs {epochs} 와 같은 파라미터를 전달하여 실행 합니다.
- 실행 완료 후에 model_dir 경로에 모델 가중치 파일이 저장 됩니다.

In [8]:
# import sagemaker
# from sagemaker.pytorch import PyTorch
# from sagemaker.local import LocalSession

# sagemaker_session = sagemaker.Session()
# role = sagemaker.get_execution_role()
# region = sagemaker_session.boto_region_name

# # hard code point to the DLC images
# image_uri = '763104351884.dkr.ecr.{}.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu113-ubuntu20.04-sagemaker'.format(region)

# estimator = PyTorch(
#   entry_point="mnist.py",
#   base_job_name="{}-ddp-mnist".format(your_user_string),
#   image_uri = image_uri,
#   role=role,
#   source_dir="scripts",
#   # configures the SageMaker training resource, you can increase as you need
#   instance_count=1,
#   instance_type="ml.g4dn.12xlarge",
#   py_version="py38",
#   sagemaker_session=sagemaker_session,
#   distribution={"pytorchddp":{"enabled": True}},
#   debugger_hook_config=False,
#   #profiler_config=profiler_config,
#   hyperparameters={"batch_size":32, "epochs":300},
#   # enable warm pools for 20 minutes
#   keep_alive_period_in_seconds = 20 *60)

# 3. 모델 가중치 파일 확인

In [None]:
# `model_dir` is only assigned in the parameter cell when its (currently
# commented-out) line is enabled. Fall back to SM_MODEL_DIR or 'model' so this
# cell does not fail with NameError.
try:
    model_dir
except NameError:
    model_dir = os.environ.get("SM_MODEL_DIR", "model")

# Show the saved files; an absent directory yields an empty list instead of
# raising FileNotFoundError.
os.listdir(model_dir) if os.path.isdir(model_dir) else []