In [42]:
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import os
import shutil
import glob
import json

In [5]:
role = 'AmazonSageMaker-ExecutionRole-20190815T111389'
session = sagemaker.Session()

In [7]:
bucket = session.default_bucket()
prefix = 'semantic-segmentation'
print(bucket)

sagemaker-us-east-1-756448110530


In [20]:
training_image = get_image_uri(session.boto_region_name, 'semantic-segmentation')
print(training_image)

811284229777.dkr.ecr.us-east-1.amazonaws.com/semantic-segmentation:1


In [21]:
%%time

# comment this if data is already downloaded
# Download the dataset
# !wget -P /tmp http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar    
# # Extract the data.
# !tar -xf /tmp/VOCtrainval_11-May-2012.tar && rm /tmp/VOCtrainval_11-May-2012.tar

--2020-04-23 16:36:22--  http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1999639040 (1.9G) [application/x-tar]
Saving to: ‘/tmp/VOCtrainval_11-May-2012.tar’


2020-04-23 16:52:52 (1.93 MB/s) - ‘/tmp/VOCtrainval_11-May-2012.tar’ saved [1999639040/1999639040]

CPU times: user 34.1 s, sys: 10.7 s, total: 44.8 s
Wall time: 16min 45s


In [30]:
# create directories for copying to S3
VOC2012 = 'VOCdevkit/VOC2012'
os.makedirs('train', exist_ok=True)
os.makedirs('validation', exist_ok=True)
os.makedirs('train_annotation', exist_ok=True)
os.makedirs('validation_annotation', exist_ok=True)

In [29]:
# load train and validation file lists
filename = VOC2012 + '/ImageSets/Segmentation/train.txt'
with open(filename) as f:
    train_list = f.read().splitlines()
    
filename = VOC2012 + '/ImageSets/Segmentation/val.txt'
with open(filename) as f:
    val_list = f.read().splitlines()

In [31]:
# copy training images
for image in train_list:
    shutil.copy2(VOC2012 + '/JPEGImages/' + image + '.jpg', 'train/')
    shutil.copy2(VOC2012 + '/SegmentationClass/' + image + '.png', 'train_annotation/')

In [32]:
# copy validation images
for image in val_list:
    shutil.copy2(VOC2012 + '/JPEGImages/' + image + '.jpg', 'validation/')
    shutil.copy2(VOC2012 + '/SegmentationClass/' + image + '.png', 'validation_annotation/')

In [46]:
# verify train/validation and annotation image count
num_training_samples = len(glob.glob1('train', "*.jpg"))
print(num_training_samples)

assert num_training_samples == len(glob.glob1('train_annotation', "*.png"))

num_validation_samples = len(glob.glob1('validation', "*.jpg"))
print(num_validation_samples)

assert num_validation_samples == len(glob.glob1('validation_annotation', "*.png"))

1464
1449


In [47]:
# create label map
label_map = {'scale': 1}
with open('train_label_map.json', 'w') as f:
    json.dump(label_map, f)

In [49]:
train_channel = prefix + '/train'
validation_channel = prefix + '/validation'
train_annotation_channel = prefix + '/train_validation'
validation_annotation_channel = prefix + '/validation_annotation'

In [51]:
session.upload_data(path='train', bucket=bucket, key_prefix=train_channel)

's3://sagemaker-us-east-1-756448110530/semantic-segmentation/train'

In [52]:
session.upload_data(path='validation', bucket=bucket, key_prefix=validation_channel)

's3://sagemaker-us-east-1-756448110530/semantic-segmentation/validation'

In [54]:
session.upload_data(path='train_annotation', bucket=bucket, key_prefix=train_annotation_channel)

's3://sagemaker-us-east-1-756448110530/semantic-segmentation/train_validation'

In [55]:
session.upload_data(path='validation_annotation', bucket=bucket, key_prefix=validation_annotation_channel)

's3://sagemaker-us-east-1-756448110530/semantic-segmentation/validation_annotation'

In [58]:
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)
print(s3_output_location)

s3://sagemaker-us-east-1-756448110530/semantic-segmentation/output


In [59]:
ss_model = sagemaker.estimator.Estimator(training_image,
                                        role,
                                        train_instance_count=1,
                                        train_instance_type='ml.p2.xlarge',
                                        train_use_spot_instances=True,
                                        train_volume_size=50,
                                        train_max_run=3600,
                                        train_max_wait=5400,
                                        output_path=s3_output_location,
                                        base_job_name='ss-job',
                                        sagemaker_session=session)

In [76]:
ss_model.set_hyperparameters(backbone='resnet-50',
                             algorithm='deeplab',
                             use_pretrained_model=True,
                             crop_size=240,
                             num_classes=21,
                             epochs=10,
                             learning_rate=0.0001,
                             optimizer='rmsprop',
                             lr_scheduler='poly',
                             mini_batch_size=16,
                             validation_mini_batch_size=16,
                             early_stopping=True,
                             early_stopping_patience=2,
                             early_stopping_min_epochs=1,
                             num_training_samples=num_training_samples)

In [77]:
s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)
s3_train_annotation = 's3://{}/{}'.format(bucket, train_annotation_channel)
s3_validation_annotation = 's3://{}/{}'.format(bucket, validation_annotation_channel)

distribution = 'FullyReplicated'

train_data = sagemaker.session.s3_input(s3_train_data,
                                        distribution=distribution,
                                        content_type='image/jpeg',
                                        s3_data_type='S3Prefix')

validation_data = sagemaker.session.s3_input(s3_validation_data,
                                        distribution=distribution,
                                        content_type='image/jpeg',
                                        s3_data_type='S3Prefix')

train_annotation = sagemaker.session.s3_input(s3_train_annotation,
                                        distribution=distribution,
                                        content_type='image/jpeg',
                                        s3_data_type='S3Prefix')

validation_annotation = sagemaker.session.s3_input(s3_validation_annotation,
                                        distribution=distribution,
                                        content_type='image/jpeg',
                                        s3_data_type='S3Prefix')

data_channels = {
    'train': train_data,
    'validation': validation_data,
    'train_annotation': train_annotation,
    'validation_annotation': validation_annotation
}

In [78]:
ss_model.fit(inputs=data_channels, logs=True)

2020-04-23 19:07:28 Starting - Starting the training job...
2020-04-23 19:07:30 Starting - Launching requested ML instances......
2020-04-23 19:08:38 Starting - Preparing the instances for training..
2020-04-23 19:09:49 Downloading - Downloading input data......
2020-04-23 19:11:03 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mRunning custom environment configuration script[0m
[34m[04/23/2020 19:11:43 INFO 140525295597376] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'syncbn': u'False', u'gamma2': u'0.9', u'gamma1': u'0.9', u'early_stopping_min_epochs': u'5', u'epochs': u'10', u'_workers': u'16', u'_num_kv_servers': u'auto', u'weight_decay': u'0.0001', u'crop_size': u'240', u'use_pretrained_model': u'True', u'_aux_weight': u'0.5', u'_hybrid': u'False', u'_augmentation_type': u'default', u'lr_scheduler': u'p

[34m[04/23/2020 19:19:12 INFO 140525295597376] #progress_notice. epoch: 1, iterations: 20 speed: 5.19462839549 samples/sec learning_rate: 0.000089[0m
[34m[04/23/2020 19:20:11 INFO 140525295597376] #progress_notice. epoch: 1, iterations: 40 speed: 5.18814988263 samples/sec learning_rate: 0.000087[0m
[34m[04/23/2020 19:21:11 INFO 140525295597376] #progress_notice. epoch: 1, iterations: 60 speed: 5.18597406351 samples/sec learning_rate: 0.000085[0m
[34m[04/23/2020 19:22:10 INFO 140525295597376] #progress_notice. epoch: 1, iterations: 80 speed: 5.17599841733 samples/sec learning_rate: 0.000083[0m
[34m[04/23/2020 19:22:46 INFO 140525295597376] #quality_metric. host: algo-1, epoch: 1, train loss: 1.0069251675968585 .[0m
[34m[04/23/2020 19:22:46 INFO 140525295597376] #throughput_metric. host: algo-1, epoch: 1, train throughput: 5.39209737048 samples/sec.[0m
[34m[04/23/2020 19:23:11 INFO 140525295597376] #progress_notice. epoch: 1, iterations: 20 speed: 14.1185842298 samples/sec[

[34m[04/23/2020 19:38:16 INFO 140525295597376] #progress_notice. epoch: 4, iterations: 20 speed: 5.63391981044 samples/sec learning_rate: 0.000061[0m
[34m[04/23/2020 19:39:16 INFO 140525295597376] #progress_notice. epoch: 4, iterations: 40 speed: 5.66967928216 samples/sec learning_rate: 0.000059[0m
[34m[04/23/2020 19:40:15 INFO 140525295597376] #progress_notice. epoch: 4, iterations: 60 speed: 5.66552172394 samples/sec learning_rate: 0.000057[0m
[34m[04/23/2020 19:41:14 INFO 140525295597376] #progress_notice. epoch: 4, iterations: 80 speed: 5.66277857022 samples/sec learning_rate: 0.000054[0m
[34m[04/23/2020 19:41:47 INFO 140525295597376] #quality_metric. host: algo-1, epoch: 4, train loss: 0.7150815394553509 .[0m
[34m[04/23/2020 19:41:47 INFO 140525295597376] #throughput_metric. host: algo-1, epoch: 4, train throughput: 5.39106292664 samples/sec.[0m
[34m[04/23/2020 19:42:12 INFO 140525295597376] #progress_notice. epoch: 4, iterations: 20 speed: 14.163013964 samples/sec[0

[34m[04/23/2020 19:57:18 INFO 140525295597376] #progress_notice. epoch: 7, iterations: 20 speed: 5.67212132646 samples/sec learning_rate: 0.000031[0m
[34m[04/23/2020 19:58:17 INFO 140525295597376] #progress_notice. epoch: 7, iterations: 40 speed: 5.66568626375 samples/sec learning_rate: 0.000029[0m
[34m[04/23/2020 19:59:16 INFO 140525295597376] #progress_notice. epoch: 7, iterations: 60 speed: 5.65626265327 samples/sec learning_rate: 0.000027[0m
[34m[04/23/2020 20:00:16 INFO 140525295597376] #progress_notice. epoch: 7, iterations: 80 speed: 5.65547328626 samples/sec learning_rate: 0.000024[0m
[34m[04/23/2020 20:00:51 INFO 140525295597376] #quality_metric. host: algo-1, epoch: 7, train loss: 0.6173880582918292 .[0m
[34m[04/23/2020 20:00:51 INFO 140525295597376] #throughput_metric. host: algo-1, epoch: 7, train throughput: 5.39594212082 samples/sec.[0m
[34m[04/23/2020 20:01:17 INFO 140525295597376] #progress_notice. epoch: 7, iterations: 20 speed: 14.0655384283 samples/sec[