In [1]:
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch

# --- Job configuration -------------------------------------------------
# Role passed to the estimator, and the instance type used both for the
# container-image lookup and for the training job itself.
role = "sagemaker-iam-role"
train_instance_type = 'ml.g4dn.xlarge'

# S3 layout: the dataset lives under `prefix` inside the bucket, while
# model artifacts are written to a separate "model-artifacts/" folder of
# the same bucket.
bucket_name = "store-objects-detection"
prefix = "sagemaker/SKU110K"
training_data_path = f's3://{bucket_name}/{prefix}'
model_artifacts_path = f's3://{bucket_name}/model-artifacts/'

# SageMaker session created with the SDK defaults; later cells hand it to
# the estimator.
sagemaker_session = sagemaker.Session()

In [2]:
from sagemaker.image_uris import retrieve

# Resolve the URI of the PyTorch 1.13.1 / py39 training container.
# The region comes from the active SageMaker session instead of a hard-coded
# literal, so the image is always looked up in the same region the training
# job is launched in.
training_image = retrieve(
    "pytorch",
    region=sagemaker_session.boto_region_name,
    version="1.13.1",
    py_version="py39",
    instance_type=train_instance_type,
    image_scope="training",
)
print(training_image)


763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39


In [3]:
# Define the training job: run code/train.py inside the resolved PyTorch
# training image on a single instance and write the resulting model
# artifacts to model_artifacts_path.
estimator = PyTorch(
    image_uri=training_image,
    source_dir="code",
    entry_point="train.py",
    role=role,
    py_version="py39",
    framework_version="1.13.1",
    instance_count=1,
    instance_type=train_instance_type,
    output_path=model_artifacts_path,
    sagemaker_session=sagemaker_session,
    # Hyperparameters handed to the entry-point script.
    # NOTE(review): both 'epochs' and 'epoch' are set; presumably 'epoch' is
    # the starting epoch -- confirm against train.py.
    hyperparameters={
        'epochs': 30,
        'batch-size': 4,
        'model': 'Retina_Net',
        'sagemaker': True,
        'learning-rate': 0.005,
        'weight-decay': 0.00005,
        'epoch': 0,
    },
)

# Launch the job. Both channels read from the same S3 prefix, so the
# already-defined training_data_path is reused instead of rebuilding the URI
# twice.
# NOTE(review): 'train' and 'test' point at identical data; confirm that
# train.py really needs two separate channels.
estimator.fit(
    {'train': training_data_path, 'test': training_data_path},
    logs="All",
)

INFO:sagemaker:Creating training-job with name: pytorch-training-2023-05-01-18-10-09-990


2023-05-01 18:10:11 Starting - Starting the training job...
2023-05-01 18:10:27 Starting - Preparing the instances for training......
2023-05-01 18:11:18 Downloading - Downloading input data...............................................bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-05-01 18:19:38,580 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-05-01 18:19:38,601 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2023-05-01 18:19:38,613 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-05-01 18:19:38,619 sagemaker_pytorch_container.training INFO     Invoking user training script.
2023-05-01 18:19:38,866 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.9 -m pip install -r requirements.txt
Collecting Pillow==9.3.0 (from -r requirem

KeyboardInterrupt: 