In [1]:
import sagemaker
import torch
import torchvision
import torchvision.transforms as transforms
from sagemaker.pytorch import PyTorch

role = sagemaker.get_execution_role()

# Local artifact names. The same two constants are used for torch.save and
# for the S3 upload, so the file that is written is always the file that is
# uploaded (previously the save and upload steps used different names, which
# only worked when stale files happened to exist on disk).
TRAIN_FILE = 'fashion_mnist_train.pt'
TEST_FILE = 'fashion_mnist_test.pt'

# Preprocessing: convert images to tensors, then normalize with
# mean 0.5 and std 0.5 on the single channel.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Download FashionMNIST (train and test splits) into ./data
trainset = torchvision.datasets.FashionMNIST(
    root='data', train=True, download=True, transform=transform
)

testset = torchvision.datasets.FashionMNIST(
    root='data', train=False, download=True, transform=transform
)

# Serialize both dataset objects to disk with torch.save
torch.save(trainset, TRAIN_FILE)
torch.save(testset, TEST_FILE)

# Upload to the session's default S3 bucket under a common prefix
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'fashion-mnist'

# Training split -> s3://<bucket>/fashion-mnist/train/
train_data_path = session.upload_data(
    path=TRAIN_FILE,
    bucket=bucket,
    key_prefix=f'{prefix}/train'
)

# Test split -> s3://<bucket>/fashion-mnist/test/
test_data_path = session.upload_data(
    path=TEST_FILE,
    bucket=bucket,
    key_prefix=f'{prefix}/test'
)

print(f"Training data uploaded to: {train_data_path}")
print(f"Test data uploaded to: {test_data_path}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Training data uploaded to: s3://sagemaker-eu-west-1-934765130326/fashion-mnist/train/fashion_mnist_train.pt
Test data uploaded to: s3://sagemaker-eu-west-1-934765130326/fashion-mnist/test/fashion_mnist_test.pt


## Estimator, custom container

In [15]:
from sagemaker.estimator import Estimator

# Training job on the custom `mnist:latest` image in ECR.
# volume_size, max_run, output_path and code_location are not passed,
# so the SDK defaults apply to them.
custom_job_args = dict(
    image_uri='934765130326.dkr.ecr.eu-west-1.amazonaws.com/mnist:latest',
    entry_point='train_bare.py',
    hyperparameters={'epochs': 4},
    base_job_name="custom-image-estimator",
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    keep_alive_period_in_seconds=1800,  # 1800 s = 30 minutes
)
estimator = Estimator(**custom_job_args)

# fit() is called without any input channels.
estimator.fit()


INFO:sagemaker:Creating training-job with name: custom-image-estimator-2024-12-02-09-59-19-562


2024-12-02 09:59:20 Starting - Starting the training job.

KeyboardInterrupt: 

## Estimator, framework container

In [6]:
from sagemaker.estimator import Estimator

# Generic Estimator pointed at a prebuilt PyTorch training image
# (2.0.1, CPU, py310 according to the image tag).
framework_image_uri = (
    '763104351884.dkr.ecr.eu-west-1.amazonaws.com/'
    'pytorch-training:2.0.1-cpu-py310'
)

estimator = Estimator(
    image_uri=framework_image_uri,
    role=role,
    # The training script is train.py inside the local `src` directory.
    source_dir='src',
    entry_point='train.py',
    instance_type='ml.m5.xlarge',
    instance_count=1,
    keep_alive_period_in_seconds=1800,  # 1800 s = 30 minutes
)

# No base_job_name and no input channels are given for this run.
estimator.fit()


INFO:sagemaker:Creating training-job with name: pytorch-training-2024-12-02-09-26-17-649


2024-12-02 09:26:18 Starting - Starting the training job.

KeyboardInterrupt: 

In [None]:
# Shell escape: copy the local train_0.py script to the S3 key given below
# using the AWS CLI.
!aws s3 cp train_0.py s3://mlbucket13/masteringsagemaker/pytorch/train_0.py

In [12]:
from sagemaker.estimator import Estimator

# Same prebuilt PyTorch training image as a plain Estimator, this time with
# an explicit job-name prefix and a two-epoch hyperparameter.
base_image_uri = (
    '763104351884.dkr.ecr.eu-west-1.amazonaws.com/'
    'pytorch-training:2.0.1-cpu-py310'
)

estimator = Estimator(
    role=role,
    image_uri=base_image_uri,
    entry_point='train_bare.py',
    hyperparameters={'epochs': 2},
    base_job_name="base-estimator",
    instance_type='ml.m5.xlarge',
    instance_count=1,
)

# fit() is called without any input channels.
estimator.fit()


INFO:sagemaker:Creating training-job with name: base-estimator-2024-12-02-09-39-46-720


2024-12-02 09:39:47 Starting - Starting the training job.

KeyboardInterrupt: 

upload: ./train_0.py to s3://mlbucket13/masteringsagemaker/pytorch/train_0.py


In [19]:
from sagemaker.pytorch import PyTorch

# Framework estimator: no image_uri is passed, so the SDK picks the image
# from framework_version / py_version / instance_type.
pytorch_job_args = {
    'base_job_name': "pytorch-estimator",
    'role': role,
    # train_bare.py is taken from the local directory named '0'.
    'source_dir': '0',
    'entry_point': 'train_bare.py',
    'framework_version': '2.0.1',
    'py_version': 'py310',
    'instance_type': 'ml.m5.xlarge',
    'instance_count': 1,
    'keep_alive_period_in_seconds': 1800,  # 1800 s = 30 minutes
}
pytorch_estimator = PyTorch(**pytorch_job_args)

# fit() is called without any input channels.
pytorch_estimator.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-estimator-2024-12-02-10-17-38-285


2024-12-02 10:17:38 Starting - Starting the training job...
2024-12-02 10:17:59 Downloading - Downloading the training image
2024-12-02 10:17:59 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-12-02 10:18:00,220 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-12-02 10:18:00,221 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-12-02 10:18:00,222 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-12-02 10:18:00,237 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-12-02 10:18:00,240 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-12-02 10:18:02,093 sagemaker-training-toolkit INFO    

In [None]:
from sagemaker.pytorch import PyTorch

# Two-instance PyTorch job. No hyperparameters and no `distribution`
# setting are passed to the estimator.
pytorch_estimator = PyTorch(
    role=role,
    entry_point='train_0.py',
    framework_version='2.0.1',
    py_version='py310',
    instance_type='ml.m5.xlarge',
    instance_count=2,
    keep_alive_period_in_seconds=1800,  # 1800 s = 30 minutes
)

# Named input channels, pointing at the S3 URIs returned by the upload step.
input_channels = {
    'training': train_data_path,
    'testing': test_data_path,
}
pytorch_estimator.fit(input_channels)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-11-29-15-03-11-678


2024-11-29 15:03:12 Starting - Starting the training job...
2024-11-29 15:03:28 Starting - Preparing the instances for training...
2024-11-29 15:03:59 Downloading - Downloading input data...
2024-11-29 15:04:24 Downloading - Downloading the training image......
2024-11-29 15:05:30 Training - Training image download completed. Training in progress..[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[35m2024-11-29 15:05:34,814 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[35m2024-11-29 15:05:34,814 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2024-11-29 15:05:34,815 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[35m2024-11-29 15:05:34,824 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[35m2024-11-29 15:05:34,826 sagemak

In [None]:
from sagemaker.pytorch import PyTorch

# Metrics to extract from the training job logs. Each regex captures the
# number that follows `<metric name>": ` in a log line.
# NOTE(review): this presumably matches JSON-style lines printed by train.py;
# confirm against the script's logging format.
metric_names = [
    'train_loss',
    'train_accuracy',
    'test_loss',
    'test_accuracy',
    'precision',
    'recall',
    'f1_score',
]

# Raw strings keep `\.` as a literal backslash-dot for the regex engine
# without triggering Python's invalid-escape-sequence warning. The resulting
# pattern text is identical to the previous hand-written literals.
metric_definitions = [
    {'Name': name, 'Regex': rf'{name}": ([0-9\.]+)'}
    for name in metric_names
]

# Single-instance PyTorch job running train.py for 10 epochs.
pytorch_estimator = PyTorch(
    entry_point='train.py',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='2.0.1',
    py_version='py310',
    keep_alive_period_in_seconds=1800,  # 1800 s = 30 minutes
    hyperparameters={'epochs': 10},
    metric_definitions=metric_definitions,
)

# Named input channels, pointing at the S3 URIs returned by the upload step.
pytorch_estimator.fit({
    'training': train_data_path,
    'testing': test_data_path
})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-11-28-19-32-11-067


2024-11-28 19:32:11 Starting - Starting the training job...
2024-11-28 19:32:26 Starting - Preparing the instances for training...
2024-11-28 19:32:57 Downloading - Downloading input data...
2024-11-28 19:33:27 Downloading - Downloading the training image......
2024-11-28 19:34:34 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-11-28 19:34:37,695 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-11-28 19:34:37,696 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-11-28 19:34:37,697 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-11-28 19:34:37,707 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-11-28 19:34:37,709 sagemak

INFO:sagemaker:Creating training-job with name: pytorch-training-2024-11-28-20-30-00-947


ClientError: An error occurred (ValidationException) when calling the CreateTrainingJob operation: Invalid DNS suffix 'amazonaws.com' for region 'us-west-2' in training image. Please provide the valid <region>.<dns-suffix>: 'eu-west-1.amazonaws.com'