# [모듈 2.1] 세이지메이커에서 분산 훈련 하기

이 노트북은 'conda_python3' 커널을 사용합니다.

---
이 노트북은 PyTorch Lightning 의 Multi-GPU 기능을 사용하여 1개의 인스턴스(ml.g4dn.12xlarge)에서 훈련합니다.

# 1. 환경 설정


## 기본 세팅
사용하는 패키지는 import 시점에 다시 로딩합니다.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically and edits to local source files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Make modules located under ./scripts importable from this notebook.
sys.path.append('./scripts')

In [2]:
import sagemaker

# Show the installed SageMaker Python SDK version. A bare expression is only
# echoed by the notebook when it is the last statement of a cell, so it is
# printed explicitly here.
print("sagemaker version:", sagemaker.__version__)

# Execution role of this notebook; it is passed as `role` to the estimators
# created later in the notebook.
role = sagemaker.get_execution_role()

## 파라미터 세팅

In [3]:
import torch
import os

# Number of GPUs visible to PyTorch on the machine running this notebook.
# (Assign a fixed value here instead, e.g. 4, to override the detection.)
num_gpus = torch.cuda.device_count()

# Number of training epochs requested from the training script.
epochs = 2

# Echo the chosen settings so they are recorded in the cell output.
for _label, _value in (("num_gpus: ", num_gpus), ("epochs: ", epochs)):
    print(_label, _value)



num_gpus:  8
epochs:  2


# 2. 세이지메이커 로컬 모드 훈련
#### 로컬의 GPU, CPU 여부로 instance_type 결정

In [6]:
import os
import subprocess


# Choose the SageMaker local-mode instance type for this machine.
# A zero exit status from `nvidia-smi` is taken to mean that a GPU is present.
# Start from the CPU value so that `instance_type` is always defined, even
# when `nvidia-smi` cannot be launched at all.
instance_type = "local"
try:
    if subprocess.call("nvidia-smi") == 0:
        # Set type to GPU if one is present.
        instance_type = "local_gpu"
except OSError:
    # `nvidia-smi` is missing or not executable: keep the CPU local mode.
    pass

print("Instance type = " + instance_type)

Tue Mar  7 06:27:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:17.0 Off |                    0 |
| N/A   28C    P0    40W / 300W |      3MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:00:18.0 Off |                    0 |
| N/A   27C    P0    41W / 300W |      3MiB / 16384MiB |      0%      Default |
|       

## 로컬 모드로 훈련 실행
- 아래의 두 인자가 로컬 모드 훈련을 지시합니다. (세션은 Estimator 의 `sagemaker_session` 인자로 전달해야 합니다.)
```python
    instance_type=instance_type, # local_gpu or local 지정
    sagemaker_session=sagemaker.LocalSession(), # 로컬 세션을 사용합니다.
```

In [7]:
# Hyperparameters handed to the training entry point for the local-mode run:
# the epoch count and the number of GPUs detected on this machine.
hyperparameters = {
    "epochs": epochs,
    "n_gpus": num_gpus,
}

In [8]:
from sagemaker.pytorch import PyTorch
import os
import subprocess


# Estimator that runs src/TFT_Train.py in SageMaker local mode.
local_estimator = PyTorch(
    entry_point="TFT_Train.py",
    source_dir="src",
    role=role,
    framework_version="1.12.1",
    py_version="py38",
    instance_count=1,
    instance_type=instance_type,  # "local_gpu" or "local"
    # The estimator keyword is `sagemaker_session`; an unknown `session`
    # keyword is swallowed by **kwargs and never reaches the estimator.
    sagemaker_session=sagemaker.LocalSession(),  # use a local session
    hyperparameters=hyperparameters,
)

# No input channels are passed to fit().
# NOTE(review): presumably the training script obtains its data itself; confirm.
local_estimator.fit()

Creating z5ar2vrarj-algo-1-zx6c3 ... 
Creating z5ar2vrarj-algo-1-zx6c3 ... done
Attaching to z5ar2vrarj-algo-1-zx6c3
[36mz5ar2vrarj-algo-1-zx6c3 |[0m 2023-03-07 06:27:41,386 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mz5ar2vrarj-algo-1-zx6c3 |[0m 2023-03-07 06:27:41,451 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
[36mz5ar2vrarj-algo-1-zx6c3 |[0m 2023-03-07 06:27:41,461 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36mz5ar2vrarj-algo-1-zx6c3 |[0m 2023-03-07 06:27:41,464 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mz5ar2vrarj-algo-1-zx6c3 |[0m 2023-03-07 06:27:41,471 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mz5ar2vrarj-algo-1-zx6c3 |[0m 2023-03-07 06:27:41,537 botocore.credentials INFO     Found credentials from IAM Role: BaseNotebookInstanceEc2Instance

# 3. SageMaker Cloud Mode


## 파라미터 셋업

In [9]:
instance_type = 'ml.g4dn.12xlarge' # 4x NVIDIA T4 GPUs

# Hyperparameters for the cloud training job.
# NOTE(review): unlike the local-mode run, 'n_gpus' is not passed here;
# presumably the training script determines the GPU count itself. Confirm.
hyperparameters = {'epochs': epochs, 
                    }  

In [10]:
from sagemaker.pytorch import PyTorch
import os

# Estimator that runs src/TFT_Train.py as a SageMaker training job on the
# instance type selected for cloud mode.
estimator = PyTorch(
    entry_point="TFT_Train.py",
    source_dir="src",
    role=role,
    framework_version="1.12.1",
    py_version="py38",
    instance_count=1,
    instance_type=instance_type,  # SageMaker ML instance type
    # The estimator keyword is `sagemaker_session`; an unknown `session`
    # keyword is swallowed by **kwargs and never reaches the estimator.
    sagemaker_session=sagemaker.Session(),
    hyperparameters=hyperparameters,
)

# Launch the job without blocking the notebook.
estimator.fit(wait=False)

In [11]:
# Display the logs of the training job launched with estimator.fit(wait=False).
estimator.logs()

2023-03-07 06:33:33 Starting - Starting the training job...
2023-03-07 06:34:01 Starting - Preparing the instances for trainingProfilerReport-1678170813: InProgress
......
2023-03-07 06:34:53 Downloading - Downloading input data...
2023-03-07 06:35:33 Training - Downloading the training image............
2023-03-07 06:37:33 Training - Training image download completed. Training in progress.......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-03-07 06:38:19,326 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-03-07 06:38:19,363 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-03-07 06:38:19,373 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-03-07 06:38:19,375 sagemaker_pytorch_container.training INFO     Invoking user training scrip

# 4. 모델 가중치 파일 확인

In [12]:
# Print the S3 location of the model archive (model.tar.gz) produced by the training job.
print("model artifact: \n", estimator.model_data)

model artifact: 
 s3://sagemaker-us-east-1-057716757052/pytorch-training-2023-03-07-06-33-33-202/output/model.tar.gz
