In [8]:
%%time
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris

role = get_execution_role()

bucket = sagemaker.Session().default_bucket()

training_image = image_uris.retrieve(
    region=boto3.Session().region_name, framework="image-classification"
)

CPU times: user 250 ms, sys: 11.5 ms, total: 261 ms
Wall time: 608 ms


In [9]:
# The algorithm supports multiple network depth (number of layers). They are 18, 34, 50, 101, 152 and 200
# For this training, we will use 18 layers
num_layers = "18"
# we need to specify the input image shape for the training data
image_shape = "1000,1000"
# we also need to specify the number of training samples in the training set
# for caltech it is 15420
num_training_samples = "100"
# specify the number of output classes
num_classes = "1"
# batch size for training
mini_batch_size = "64"
# number of epochs
epochs = "2"
# learning rate
learning_rate = "0.01"

In [28]:
job_name_prefix = "task-4-imageclassification"
job_name = job_name_prefix + "-" + time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
training_params = {
    # specify the training image
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "Pipe"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/output".format(bucket, job_name_prefix)},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.p3.2xlarge", "VolumeSizeInGB": 50},
    "TrainingJobName": job_name,
    "HyperParameters": {
        "num_layers": str(num_layers),
        "num_training_samples": str(num_training_samples),
        "num_classes": str(num_classes),
        "mini_batch_size": str(mini_batch_size),
        "epochs": str(epochs),
        "learning_rate": str(learning_rate),
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 360000},
    # Training data should be inside a subdirectory called "train"
    # Validation data should be inside a subdirectory called "validation"
    # The algorithm currently only supports fullyreplicated model (where data is copied onto each machine)
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "AttributeNames": [ "source-ref", "class" ],
                    "S3DataType": "AugmentedManifestFile",
                    "S3Uri": "s3://dug-cloudy/cloud_training.json",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/jpeg",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "AttributeNames": [ "source-ref", "class" ],
                    "S3DataType": "AugmentedManifestFile",
                    "S3Uri": "s3://dug-cloudy/cloud_validation.json",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/jpeg",
            "CompressionType": "None",
        },
    ],
}
print("Training job name: {}".format(job_name))
print(
    "\nInput Data Location: {}".format(
        training_params["InputDataConfig"][0]["DataSource"]["S3DataSource"]
    )
)

Training job name: task-4-imageclassification--2022-08-12-20-45-41

Input Data Location: {'AttributeNames': ['source-ref', 'class'], 'S3DataType': 'AugmentedManifestFile', 'S3Uri': 's3://dug-cloudy/cloud_training.json', 'S3DataDistributionType': 'FullyReplicated'}


In [29]:
# create the Amazon SageMaker training job
sagemaker = boto3.client(service_name="sagemaker")
sagemaker.create_training_job(**training_params)

# confirm that the training job has started
status = sagemaker.describe_training_job(TrainingJobName=job_name)["TrainingJobStatus"]
print("Training job current status: {}".format(status))

try:
    # wait for the job to finish and report the ending status
    sagemaker.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=job_name)
    training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
    status = training_info["TrainingJobStatus"]
    print("Training job ended with status: " + status)
except:
    print("Training failed to start")
    # if exception is raised, that means it has failed
    message = sagemaker.describe_training_job(TrainingJobName=job_name)["FailureReason"]
    print("Training failed with the following error: {}".format(message))

Training job current status: InProgress
Training failed to start
Training failed with the following error: ClientError: Artifact upload failed:PlatformError: DataAgent status is not healthy. Health: ERROR. Msg: s3://lp-prod-public/HLSL30.020/HLS.L30.T01FBE.2013226T214019.v2.0/HLS.L30.T01FBE.2013226T214019.v2.0.jpg: HTTP 403 : Forbidden


In [30]:
training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
status = training_info["TrainingJobStatus"]
print("Training job ended with status: " + status)

Training job ended with status: Failed
