In [31]:
%%time
import sagemaker
from sagemaker import get_execution_role
 
# Open the SageMaker session that the later cells reuse.
sess = sagemaker.Session()

# Look up the execution role that training jobs will run under.
# NOTE(review): the printed ARN contains the AWS account id; clear this output
# before sharing the notebook.
role = get_execution_role()
print(role)

arn:aws:iam::023375022819:role/service-role/AmazonSageMaker-ExecutionRole-20181029T121824
CPU times: user 97.5 ms, sys: 8.13 ms, total: 106 ms
Wall time: 127 ms


In [32]:
# Bucket returned by the session, plus the key prefix under which this
# project's data and outputs are stored.
bucket = sess.default_bucket()
prefix = 'ships'
print(bucket, prefix, sep='\n')

INFO:sagemaker:Created S3 bucket: sagemaker-us-east-1-023375022819


sagemaker-us-east-1-023375022819
ships


In [33]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Resolve the container image of the built-in semantic-segmentation algorithm
# for the session's region.
training_image = get_image_uri(
    sess.boto_region_name,
    'semantic-segmentation',
    repo_version="latest",
)
print(training_image)


811284229777.dkr.ecr.us-east-1.amazonaws.com/semantic-segmentation:latest


In [34]:
# S3 location handed to the estimator as its output path.
output_location_template = 's3://{}/{}/output1'
s3_output_location = output_location_template.format(bucket, prefix)
print(s3_output_location)


s3://sagemaker-us-east-1-023375022819/ships/output1


In [41]:
# Build the estimator that launches the semantic-segmentation training job.
# The keyword options are collected first so they can be reviewed at a glance.
estimator_options = {
    'train_instance_count': 1,
    'train_instance_type': 'ml.p2.xlarge',
    'train_volume_size': 50,   # presumably GB of attached storage; confirm in SDK docs
    'train_max_run': 3600,     # presumably a runtime cap in seconds; confirm in SDK docs
    'output_path': s3_output_location,
    'base_job_name': 'image-kings',
    'sagemaker_session': sess,
}
ss_model = sagemaker.estimator.Estimator(training_image, role, **estimator_options)


In [42]:
import glob

def _s3_uri(channel):
    """Return the full s3:// URI of `channel` inside the notebook's bucket."""
    return 's3://{}/{}'.format(bucket, channel)


def _s3_channel(s3_uri, content_type):
    """Wrap an S3 prefix as an s3_input using the shared distribution setting."""
    return sagemaker.session.s3_input(s3_uri, distribution=distribution,
                                      content_type=content_type, s3_data_type='S3Prefix')


# Key prefixes of the four input channels: images and their PNG annotations,
# each for training and validation.
train_channel = '/'.join((prefix, 'trainsm1'))
validation_channel = '/'.join((prefix, 'validatesm1'))
train_annotation_channel = '/'.join((prefix, 'trainsmpng1'))
validation_annotation_channel = '/'.join((prefix, 'validatesmpng1'))

# Full S3 URIs for each channel.
s3_train_data = _s3_uri(train_channel)
s3_train_annotation = _s3_uri(train_annotation_channel)
s3_validation_data = _s3_uri(validation_channel)
s3_validation_annotation = _s3_uri(validation_annotation_channel)

distribution = 'FullyReplicated'
# Hard-coded sample count. An earlier attempt derived it with glob on an s3://
# path, but glob only matches local filesystem paths. A value of 192556 was also
# used previously (presumably the full training set -- TODO confirm).
# NOTE(review): keep this in sync with the number of images under train_channel.
num_training_samples = 6

# Echo the resolved configuration so it is visible in the cell output.
for label, value in (
    ('s3_train_data: ', s3_train_data),
    ('s3_train_annotation: ', s3_train_annotation),
    ('s3_validation_data: ', s3_validation_data),
    ('s3_validation_annotation: ', s3_validation_annotation),
    ('distribution: ', distribution),
    ('num_training_samples = ', num_training_samples),
):
    print(label, value)

# s3_input objects: JPEG content type for the image channels, PNG for annotations.
train_data = _s3_channel(s3_train_data, 'image/jpeg')
validation_data = _s3_channel(s3_validation_data, 'image/jpeg')
train_annotation = _s3_channel(s3_train_annotation, 'image/png')
validation_annotation = _s3_channel(s3_validation_annotation, 'image/png')

# Channel name -> input mapping, in the form later passed to fit().
data_channels = {
    'train': train_data,
    'validation': validation_data,
    'train_annotation': train_annotation,
    'validation_annotation': validation_annotation,
}


s3_train_data:  s3://sagemaker-us-east-1-023375022819/ships/trainsm1
s3_train_annotation:  s3://sagemaker-us-east-1-023375022819/ships/trainsmpng1
s3_validation_data:  s3://sagemaker-us-east-1-023375022819/ships/validatesm1
s3_validation_annotation:  s3://sagemaker-us-east-1-023375022819/ships/validatesmpng1
distribution:  FullyReplicated
num_training_samples =  6


In [43]:
# ---------------------
# Set Hyper Parameters
# ---------------------
# Options that are NOT set in this run (values tried previously in parentheses):
#   algorithm                 -- the decoder ('fcn'; other options are 'psp' and 'deeplab')
#   crop_size                 -- size of the random image crop (768)
#   optimizer                 -- ('sgd'; others include 'adam', 'rmsprop', 'nag', 'adagrad')
#   lr_scheduler              -- ('poly'; others include 'cosine' and 'step')
#   early_stopping            -- (True); if off, the other early-stopping options are ignored
#   early_stopping_patience   -- (2) epochs tolerated without an mIoU increase
#   early_stopping_min_epochs -- (10) epochs that are always run
segmentation_hyperparameters = {
    # Encoder network. The other supported option is 'resnet-101'.
    'backbone': 'resnet-50',
    # Use the pre-trained model.
    'use_pretrained_model': 'True',
    # Mandatory parameter: number of classes to segment.
    # NOTE(review): presumably 226 matches the label ids used in the annotation
    # PNGs of this dataset -- confirm against the masks.
    'num_classes': 226,
    # Number of epochs to run.
    'epochs': 10,
    'learning_rate': 0.0001,
    # Batch sizes for training and validation.
    'mini_batch_size': 2,
    'validation_mini_batch_size': 1,
    # Sample count defined alongside the data channels.
    'num_training_samples': num_training_samples,
}
ss_model.set_hyperparameters(**segmentation_hyperparameters)


In [44]:
# Launch the training job on the configured channels and stream its logs here.
# NOTE(review): the recorded run of this cell ended with
# "InternalServerError: We encountered an internal error"; the cause is not
# visible in the captured log.
ss_model.fit(
    inputs=data_channels,
    logs=True,
)


INFO:sagemaker:Creating training-job with name: image-kings-2019-01-10-20-29-56-442


2019-01-10 20:29:56 Starting - Starting the training job...
2019-01-10 20:29:59 Starting - Launching requested ML instances......
2019-01-10 20:31:12 Starting - Preparing the instances for training......
2019-01-10 20:32:11 Downloading - Downloading input data...
2019-01-10 20:32:38 Training - Downloading the training image...
2019-01-10 20:33:16 Training - Training image download completed. Training in progress.
[31mDocker entrypoint called with argument(s): train[0m
[31m[01/10/2019 20:33:19 INFO 140057577924416] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'gamma2': u'0.9', u'gamma1': u'0.9', u'early_stopping_min_epochs': u'5', u'epochs': u'10', u'_workers': u'16', u'_num_kv_servers': u'auto', u'weight_decay': u'0.0001', u'crop_size': u'240', u'use_pretrained_model': u'True', u'_aux_weight': u'0.5', u'_hybrid': u'False', u'_augmentation_type': u'default', u'lr_scheduler': u'poly', u'early_stopping_patience': u'4', u'mom

ValueError: Error for Training job image-kings-2019-01-10-20-29-56-442: Failed Reason: InternalServerError: We encountered an internal error. Please try again.