In [3]:
import sagemaker
import torch
import torchvision
import torchvision.transforms as transforms

# Execution role obtained from the SageMaker SDK; it is passed as `role=` to the
# PyTorch estimators created later in this notebook.
role = sagemaker.get_execution_role()

### Download and process data, then upload to S3 for training

In [4]:
# Preprocessing applied to every image: convert to a tensor, then normalize
# with mean 0.5 and std 0.5 (one value each).
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Fetch the FashionMNIST train and test splits into ./data, sharing the same
# transform. Only the `train` flag differs between the two calls.
fashion_mnist_kwargs = dict(root='data', download=True, transform=transform)
trainset = torchvision.datasets.FashionMNIST(train=True, **fashion_mnist_kwargs)
testset = torchvision.datasets.FashionMNIST(train=False, **fashion_mnist_kwargs)

# Serialize each dataset object to a local .pt file.
# NOTE(review): this stores the dataset objects themselves, so whatever loads
# these files presumably needs torchvision importable - confirm in src/train.py.
torch.save(trainset, 'train_dataset.pt')
torch.save(testset, 'test_dataset.pt')

# S3 destination: the session's default bucket, under one shared key prefix.
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'fashion-mnist'

# Each split goes under its own sub-prefix; upload_data returns the S3 URI,
# which the training cells pass to `fit` as input channels.
train_data_path = session.upload_data(path='train_dataset.pt', bucket=bucket, key_prefix=f'{prefix}/train')
test_data_path = session.upload_data(path='test_dataset.pt', bucket=bucket, key_prefix=f'{prefix}/test')

print(f"Training data uploaded to: {train_data_path}")
print(f"Test data uploaded to: {test_data_path}")

Training data uploaded to: s3://sagemaker-eu-west-1-934765130326/fashion-mnist/train/train_dataset.pt
Test data uploaded to: s3://sagemaker-eu-west-1-934765130326/fashion-mnist/test/test_dataset.pt


### Create training job with data as input

In [6]:
from sagemaker.pytorch import PyTorch

# Estimator that runs src/train.py on a single ml.m5.xlarge instance using the
# PyTorch 2.0.1 / Python 3.10 framework image.
pytorch_estimator = PyTorch(
    entry_point='train.py',
    source_dir='src',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='2.0.1',
    py_version='py310',
    # Keep the instance alive for 1800 seconds (30 minutes) after the job has
    # finished, so a follow-up job can reuse it instead of provisioning anew.
    keep_alive_period_in_seconds=1800,
)

# Launch the training job. The dict keys are the input channel names and the
# values are the S3 URIs returned by the upload step.
pytorch_estimator.fit({
    'training': train_data_path,
    'testing': test_data_path
})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-12-02-12-48-17-911


2024-12-02 12:48:18 Starting - Starting the training job.

KeyboardInterrupt: 

### Create training job with hyperparameters and custom metric definitions

In [10]:
# SageMaker parses the training job's stdout with the regexes below; the single
# capture group in each pattern is the metric value.
# Raw strings are used so that `\d` and `\.` reach the regex engine untouched
# (in a normal string literal they are invalid escape sequences and trigger a
# warning on recent Python versions).
# Note: `(\d+\.\d+)` only matches values written with a decimal point, so a
# value logged as `1` or `1e-05` will not be captured.
metric_definitions = [
    {'Name': 'train:loss', 'Regex': r'"metric": "train_loss", "value": (\d+\.\d+)'},
    {'Name': 'test:accuracy', 'Regex': r'"metric": "test_accuracy", "value": (\d+\.\d+)'},
    {'Name': 'test:loss', 'Regex': r'"metric": "test_loss", "value": (\d+\.\d+)'},
    {'Name': 'f1_score', 'Regex': r'"metric": "f1_score", "value": (\d+\.\d+)'}
]

In [11]:
from sagemaker.pytorch import PyTorch

# Estimator that runs src/train_extended.py with explicit hyperparameters and
# custom metric definitions, on the same instance type and framework version as
# the basic training job.
pytorch_estimator = PyTorch(
    entry_point='train_extended.py',
    source_dir='src',
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='2.0.1',
    py_version='py310',
    # Hyperparameters reach the entry point as command-line arguments
    # (`--batch-size 64 --epochs 5`).
    hyperparameters={'epochs': 5, 'batch-size': 64},
    # Regexes used to extract metrics from the job's stdout.
    metric_definitions=metric_definitions,
    # Keep the instance alive for 1800 seconds (30 minutes) after the job has
    # finished, so a follow-up job can reuse it instead of provisioning anew.
    keep_alive_period_in_seconds=1800,
)

# Launch the training job. The dict keys are the input channel names and the
# values are the S3 URIs returned by the upload step.
pytorch_estimator.fit({
    'training': train_data_path,
    'testing': test_data_path
})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-12-02-13-17-33-432


2024-12-02 13:17:33 Starting - Starting the training job...
2024-12-02 13:18:04 Downloading - Downloading the training image.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-12-02 13:18:05,096 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-12-02 13:18:05,096 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-12-02 13:18:05,097 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-12-02 13:18:05,107 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-12-02 13:18:05,109 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-12-02 13:18:06,488 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.10 -m p

UnexpectedStatusException: Error for Training job pytorch-training-2024-12-02-13-17-33-432: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "NameError: name 'f1_score' is not defined"
Command "/opt/conda/bin/python3.10 train_extended.py --batch-size 64 --epochs 5", exit code: 1