In [1]:
%%time
import sagemaker
from sagemaker import get_execution_role

# Session object reused by the image lookup and the estimator further down.
sess = sagemaker.Session()

# Execution role handed to the estimator when the training job is created.
# NOTE(review): the printed ARN embeds the AWS account ID -- consider clearing
# this cell's output before sharing the notebook.
role = get_execution_role()
print(role)

arn:aws:iam::065122976270:role/service-role/AmazonSageMaker-ExecutionRole-20200403T093426
CPU times: user 851 ms, sys: 192 ms, total: 1.04 s
Wall time: 12.8 s


In [2]:
# S3 location shared by every path built in the cells below.
prefix = "ObjectDetection-v0"
bucket = "sagemaker-object-detection-test-200408"  # custom bucket name
# Alternative: fall back to the session's default bucket.
# bucket = sess.default_bucket()

In [3]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the object-detection training image for the session's region
# and show the resolved URI.
training_docker_image = get_image_uri(
    sess.boto_region_name,
    'object-detection',
    repo_version="latest",
)
print(training_docker_image)

825641698319.dkr.ecr.us-east-2.amazonaws.com/object-detection:latest


In [4]:
# S3 URI passed to the estimator as its output path; echoed as the cell result.
s3_output_location = 's3://%s/%s/output' % (bucket, prefix)
s3_output_location

's3://sagemaker-object-detection-test-200408/ObjectDetection-v0/output'

# SageMaker configuration

In [5]:
# Key prefixes (relative to the bucket) for the four input channels.
train_channel = '/'.join([prefix, 'train'])
validation_channel = '/'.join([prefix, 'validation'])
train_annotation_channel = '/'.join([prefix, 'train_annotation'])
validation_annotation_channel = '/'.join([prefix, 'validation_annotation'])

# Full S3 URIs for the same four channels.
s3_train_data = 's3://%s/%s' % (bucket, train_channel)
s3_validation_data = 's3://%s/%s' % (bucket, validation_channel)
s3_train_annotation = 's3://%s/%s' % (bucket, train_annotation_channel)
s3_validation_annotation = 's3://%s/%s' % (bucket, validation_annotation_channel)

In [14]:

# Estimator that runs the object-detection training image with the execution
# role and session created earlier; artifacts go to s3_output_location.
od_model = sagemaker.estimator.Estimator(
    training_docker_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',  # alternative noted by the author: 'ml.p3.2xlarge'
    train_volume_size=50,   # presumably GB -- confirm against the SDK docs
    train_max_run=360000,   # presumably seconds -- confirm against the SDK docs
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

In [15]:
# Hyperparameters for the training job, grouped by concern. Values are
# unchanged; only the keyword order differs.
od_model.set_hyperparameters(
    # --- network ---
    base_network='resnet-50',
    use_pretrained_model=1,
    image_shape=512,
    # --- dataset ---
    # NOTE(review): num_classes and num_training_samples are hard-coded;
    # confirm they match the data actually uploaded to the train channel.
    num_classes=80,
    num_training_samples=174,
    label_width=600,
    # --- training schedule ---
    epochs=30,
    mini_batch_size=16,
    # --- optimizer ---
    optimizer='sgd',
    learning_rate=0.001,
    momentum=0.9,
    weight_decay=0.0005,
    lr_scheduler_step='10',
    lr_scheduler_factor=0.1,
    # --- detection thresholds ---
    overlap_threshold=0.5,
    nms_threshold=0.45,
)

In [16]:
# One s3_input per channel; all four share the same distribution, content
# type and S3 data type and differ only in the S3 URI they point at.
# NOTE(review): the two annotation channels also declare 'image/jpeg' --
# confirm this is what the algorithm expects for annotation files.
train_data = sagemaker.session.s3_input(
    s3_train_data,
    distribution='FullyReplicated',
    content_type='image/jpeg',
    s3_data_type='S3Prefix',
)
validation_data = sagemaker.session.s3_input(
    s3_validation_data,
    distribution='FullyReplicated',
    content_type='image/jpeg',
    s3_data_type='S3Prefix',
)
train_annotation = sagemaker.session.s3_input(
    s3_train_annotation,
    distribution='FullyReplicated',
    content_type='image/jpeg',
    s3_data_type='S3Prefix',
)
validation_annotation = sagemaker.session.s3_input(
    s3_validation_annotation,
    distribution='FullyReplicated',
    content_type='image/jpeg',
    s3_data_type='S3Prefix',
)

# Channel name -> input definition, as handed to the estimator's fit().
data_channels = {
    'train': train_data,
    'validation': validation_data,
    'train_annotation': train_annotation,
    'validation_annotation': validation_annotation,
}

In [18]:
# Start the training job on the four channels and stream its log output
# into this cell.
od_model.fit(
    inputs=data_channels,
    logs=True,
)

2020-04-15 05:45:26 Starting - Starting the training job...
2020-04-15 05:45:27 Starting - Launching requested ML instances...
2020-04-15 05:46:22 Starting - Preparing the instances for training.........
2020-04-15 05:47:45 Downloading - Downloading input data
2020-04-15 05:47:45 Training - Downloading the training image........[34mDocker entrypoint called with argument(s): train[0m
[34m[04/15/2020 05:49:00 INFO 139779977344832] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'label_width': u'350', u'early_stopping_min_epochs': u'10', u'epochs': u'30', u'overlap_threshold': u'0.5', u'lr_scheduler_factor': u'0.1', u'_num_kv_servers': u'auto', u'weight_decay': u'0.0005', u'mini_batch_size': u'32', u'use_pretrained_model': u'0', u'freeze_layer_pattern': u'', u'lr_scheduler_step': u'', u'early_stopping': u'False', u'early_stopping_patience': u'5', u'momentum': u'0.9', u'num_training_samples': u'', u'optimizer': u'sgd', u'_tuning