In [23]:
import os
from datetime import datetime
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
import boto3

# Default bucket used to automatically store the data produced by the training job run.
# It normally does not need to be changed.
S3_BUCKET_AUTO_SAVE = 'sagemaker-us-east-1-654654320282'

boto_session = boto3.session.Session()
region = boto_session.region_name
if region is None:
    # Without this guard the assignment below fails with an opaque
    # "str expected, not NoneType" TypeError.
    raise RuntimeError(
        "No AWS region is configured for the boto3 session; "
        "set AWS_DEFAULT_REGION or configure a default region before running this notebook."
    )
# Make sure that you are running training in the same region as your S3 bucket
os.environ['AWS_DEFAULT_REGION'] = region

| Instance Type    | GPUs                         | vCPUs | Memory (RAM) | Cost (us-east-1)   |
|------------------|------------------------------|-------|--------------|-------------------|
| ml.g5.4xlarge    | NVIDIA A10G Tensor Core (1x) | 16    | 64 GB        | \$2.03 per hour   |
| ml.p3.2xlarge    | NVIDIA V100 Tensor Core (1x) | 8     | 61 GB        | \$3.83 per hour   |
| ml.g4dn.12xlarge | NVIDIA T4 Tensor Core (4x)   | 48    | 192 GB       | \$4.89 per hour   |
| ml.p3.8xlarge    | NVIDIA V100 Tensor Core (4x) | 32    | 244 GB       | \$14.69 per hour  |
| ml.p3.16xlarge   | NVIDIA V100 Tensor Core (8x) | 64    | 488 GB       | \$28.15 per hour  |
| ml.p3dn.24xlarge | NVIDIA V100 Tensor Core (8x) | 96    | 768 GB       | \$35.89 per hour  |
| ml.p4d.24xlarge  | NVIDIA A100 Tensor Core (8x) | 96    | 1.1 TB       | \$37.69 per hour  |

In [24]:
# Usually only the variables in this cell need to be changed to run a training job.
user_id = "edmundo"  # This is used for naming your training job
S3_BUCKET_SHARED = "oncodata-sagemaker-shared"

# This is all for naming. A single timestamp is taken so that date_str and time_str
# can never disagree (e.g. when the cell happens to run across midnight).
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")

instance_type = "ml.g5.4xlarge"  # This can be any instance from the table above
nodes = 1  # This is the number of nodes or instances you want to use.

# Local directory (relative to where the notebook is run) that contains the training script
# and any additional code or resources needed by the training job. It is uploaded to S3 and
# then copied to the SageMaker instance(s) before training starts.
source_dir = 'code'
# Script executed when the training job starts. It must live inside source_dir.
entry_point = "dsmil_train_job.py"

job_name = f'{user_id}-{time_str}'

# S3 URIs always use "/" as separator, so they are built with f-strings rather than
# os.path.join (which would use the local OS separator).
# S3 location where the training output is stored automatically.
output_path_auto_save = f"s3://{S3_BUCKET_AUTO_SAVE}/{user_id}/sagemaker-output/{date_str}"
# Custom parameter: bucket-relative path (no "s3://" scheme) handed to the training script,
# which is expected to save the model there.
s3_manual_save_path = f"{S3_BUCKET_SHARED}/{user_id}/models/{date_str}"

# Hyperparameters passed to the training script. Not everything is parameterized in this
# example; other parameters (num-classes, learning-rate, etc.) can be added the same way.
hyperparameters = {"batch-size": 32, "epochs": 50, "s3-manual-save-path": s3_manual_save_path}

use_spot_instances = True  # Use spot instances to reduce cost. We recommend using it, since the wait time isn't usually long.
max_run = 3600 * 4  # Maximum number of seconds the job is allowed to run
# Maximum number of seconds to wait for the spot job to complete (waiting for capacity plus
# running). Only meaningful for spot training; there is no automatic fallback to on-demand.
max_wait = 3600 * 4 if use_spot_instances else None
if use_spot_instances:
    # Only validated for spot jobs: with on-demand instances max_wait is None and
    # comparing it against an int would raise TypeError.
    assert max_wait >= max_run, "max_wait must be greater than or equal to max_run"

volume_size = 50  # Size of the instances' storage volume, in GB

# Training data handed to the training job.
train_data = "s3://oncodata-sagemaker-shared/roraima/features/stomach/Stomach_feats_mocov3/v1/"
# Input channels for the training job. The 'train' entry is read in the training script via
# os.environ['SM_CHANNEL_TRAIN'], where it points to a local directory exposing the data of
# the given bucket location. Any channel name can be used.
channels = {'train': train_data}

In [25]:
# Enable the SageMaker distributed data-parallel configuration only for multi-node jobs
# running on one of the instance types listed below; otherwise run without a
# distribution strategy.
distribution = (
    {"smdistributed": {"dataparallel": {"enabled": True}}}
    if nodes > 1 and instance_type in ('ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge')
    else None
)

In [26]:
# Checkpoints get their own job-scoped prefix. SageMaker keeps checkpoint_s3_uri in sync with
# the container's local checkpoint directory, so pointing it at the per-day output prefix
# (shared by every job launched that day and by the job output artifacts) would pull
# unrelated files into a newly started job.
checkpoint_s3_uri = f"{output_path_auto_save}/{job_name}/checkpoints"

estimator = PyTorch(
    entry_point=entry_point,
    source_dir=source_dir,
    py_version='py310',
    framework_version='2.0.1',
    role=get_execution_role(),  # This is the role that Sagemaker assumes to perform tasks on your behalf
    instance_count=nodes,
    instance_type=instance_type,
    distribution=distribution,
    output_path=output_path_auto_save,
    checkpoint_s3_uri=checkpoint_s3_uri,
    model_dir=output_path_auto_save,
    hyperparameters=hyperparameters,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    volume_size=volume_size,
    disable_profiler=True,  # Reduce number of logs since we don't need profiler or debugger for this training
    debugger_hook_config=False,
    # Streams the data from S3, exposing it as a local filesystem without filling up
    # the instance's local storage.
    input_mode='FastFile',
)

In [27]:
# Launch the training job under the generated job name, feeding it the configured input
# channels, and block this cell until the job finishes (wait=True).
estimator.fit(
    inputs=channels,
    job_name=job_name,
    wait=True,
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: edmundo-21-03-2024-06-26-26


2024-03-21 06:26:27 Starting - Starting the training job...
2024-03-21 06:26:42 Starting - Preparing the instances for training...
2024-03-21 06:27:15 Downloading - Downloading input data...
2024-03-21 06:27:30 Downloading - Downloading the training image..................
2024-03-21 06:30:51 Training - Training image download completed. Training in progress.......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-03-21 06:31:43,760 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-03-21 06:31:43,778 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-03-21 06:31:43,788 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-03-21 06:31:43,790 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-03-21 06:31:45,