In [32]:
!pip install --upgrade sagemaker



In [41]:
import time
from sagemaker_core.helper.session_helper import Session, get_execution_role

# Create one SageMaker session and derive from it (and from the execution
# environment) the region, IAM role and default bucket used by later cells.
session = Session()
region = session.boto_region_name
role = get_execution_role()
bucket = session.default_bucket()

# Echo the resolved settings, one per line.
print(
    f"AWS region: {region}",
    f"Execution role: {role}",
    f"Default S3 bucket: {bucket}",
    sep="\n",
)

AWS region: eu-west-1
Execution role: arn:aws:iam::934765130326:role/service-role/SageMaker-ExecutionRole-20231205T101213
Default S3 bucket: sagemaker-eu-west-1-934765130326


### Download data

In [42]:
import torch
import torchvision
import torchvision.transforms as transforms

# Preprocessing applied to every image: convert to a tensor, then normalize
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Arguments shared by both FashionMNIST splits; only the `train` flag differs.
dataset_options = dict(root='data', download=True, transform=transform)
trainset = torchvision.datasets.FashionMNIST(train=True, **dataset_options)
testset = torchvision.datasets.FashionMNIST(train=False, **dataset_options)

# Serialize the dataset objects themselves (not raw tensors) to local files.
# NOTE(review): whatever loads these files presumably needs torchvision
# importable to unpickle them -- confirm against the training script.
torch.save(trainset, 'train_dataset.pt')
torch.save(testset, 'test_dataset.pt')

# Common key prefix for everything this notebook stores in the bucket.
prefix = 'fashion-mnist'

# Push each serialized split to its own sub-prefix and keep the returned paths
# for use as training job input channels.
train_data_path = session.upload_data(
    path='train_dataset.pt', bucket=bucket, key_prefix=f'{prefix}/train'
)
test_data_path = session.upload_data(
    path='test_dataset.pt', bucket=bucket, key_prefix=f'{prefix}/test'
)

print(
    f"Training data uploaded to: {train_data_path}",
    f"Test data uploaded to: {test_data_path}",
    sep="\n",
)

Training data uploaded to: s3://sagemaker-eu-west-1-934765130326/fashion-mnist/train/train_dataset.pt
Test data uploaded to: s3://sagemaker-eu-west-1-934765130326/fashion-mnist/test/test_dataset.pt


### Upload script to S3

In [43]:
# Upload the local training script next to the data, under the `code` prefix.
training_script_path = session.upload_data(
    path='train_extended.py',
    bucket=bucket,
    key_prefix=f'{prefix}/code'
)
print(f"Training script uploaded to: {training_script_path}")

# S3 prefix (with trailing slash) of the folder that holds the uploaded script.
training_script_prefix = f"s3://{bucket}/{prefix}/code/"

Traning script uploaded to: s3://sagemaker-eu-west-1-934765130326/fashion-mnist/code/train_extended.py


In [48]:
# Name of the training job. The UTC timestamp suffix (second resolution) keeps
# names distinct between runs; the hyphen separates it from the base name.
job_name = "pytorch-mnist-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

instance_type = "ml.m4.xlarge"  # SageMaker instance type to use for training
instance_count = 1  # Number of instances to use for training
volume_size_in_gb = 30  # Amount of storage to allocate to training job
max_runtime_in_seconds = 1200  # Maximum runtime. Job exits if it doesn't finish before this
s3_output_path = f"s3://{bucket}"  # Bucket and optional prefix where the training job stores output artifacts, like the model artifact.

# Hyperparameters are passed to the training job as strings.
hyper_parameters = {
    "epochs": "5",
    "batch-size": "64",
}

# Build the training image URI from the session's region instead of hard-coding
# eu-west-1, so the image is pulled from the same region the job runs in.
# NOTE(review): the registry account 763104351884 is assumed to serve this
# region; some regions use a different account ID -- confirm before switching.
image_uri = f"763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.0.1-cpu-py310"

In [49]:
from sagemaker_core.resources import TrainingJob
from sagemaker_core.shapes import (
    AlgorithmSpecification,
    Channel,
    DataSource,
    S3DataSource,
    ResourceConfig,
    StoppingCondition,
    OutputDataConfig,
    MetricDefinition
)

# Regexes applied to the training job's output to extract metric values.
# Raw strings keep `\d` and `\.` as regex escapes without triggering Python's
# invalid-escape-sequence warning; the resulting pattern text is unchanged.
# Each pattern only matches numbers with digits on both sides of a decimal
# point (e.g. "0.123"), so integers or scientific notation are not captured.
metric_definitions = [
    MetricDefinition(name='train:loss', regex=r'train_loss: (\d+\.\d+)'),
    MetricDefinition(name='test:accuracy', regex=r'test_accuracy: (\d+\.\d+)'),
    MetricDefinition(name='test:loss', regex=r'test_loss: (\d+\.\d+)'),
    MetricDefinition(name='f1_score', regex=r'f1_score: (\d+\.\d+)'),
]

# Container image, input mode, metric regexes and the command the container runs.
algorithm_spec = AlgorithmSpecification(
    training_image=image_uri,
    training_input_mode="File",
    metric_definitions=metric_definitions,
    # NOTE(review): no input channel in this call delivers train_extended.py to
    # the container -- confirm the script is resolvable from the container's
    # working directory, otherwise this entrypoint cannot start.
    container_entrypoint=["python", "train_extended.py"],
)

# The two data channels are configured identically apart from their name and
# S3 location, so build them from (name, uri) pairs.
input_channels = [
    Channel(
        channel_name=channel_name,
        content_type="application/x-torch",
        data_source=DataSource(
            s3_data_source=S3DataSource(
                s3_data_type="S3Prefix",
                s3_uri=channel_uri,
                s3_data_distribution_type="FullyReplicated",
            )
        ),
    )
    for channel_name, channel_uri in (
        ("training", train_data_path),
        ("testing", test_data_path),
    )
]

# Submit the training job with the configuration assembled above.
training_job = TrainingJob.create(
    training_job_name=job_name,
    hyper_parameters=hyper_parameters,
    algorithm_specification=algorithm_spec,
    role_arn=role,
    input_data_config=input_channels,
    output_data_config=OutputDataConfig(s3_output_path=s3_output_path),
    resource_config=ResourceConfig(
        instance_type=instance_type,
        instance_count=instance_count,
        volume_size_in_gb=volume_size_in_gb,
    ),
    stopping_condition=StoppingCondition(
        max_runtime_in_seconds=max_runtime_in_seconds
    ),
)

# Block this cell until the training job has finished.
training_job.wait()

Output()