# Prepare the COCO Dataset

## Download / Unpack

In [2]:
# Destination S3 bucket for the prepared dataset (hard-coded name).
bucket = 'sagemaker-object-detection-test-200408' # custom bucket name.
# Alternative: use the SageMaker session's default bucket. `sess` must exist
# before this line can be enabled.
# bucket = sess.default_bucket()
# Key prefix put in front of every S3 key this notebook builds from it.
prefix = 'ObjectDetection-v0'

In [3]:
%%time
import sagemaker
from sagemaker import get_execution_role

# Execution role as reported by get_execution_role(); printed for reference.
# NOTE(review): the printed ARN includes the AWS account ID -- consider
# clearing this cell's output before sharing the notebook.
role = get_execution_role()
print(role)
# SageMaker session object; its upload_data() is used to push data to S3.
sess = sagemaker.Session()

arn:aws:iam::065122976270:role/service-role/AmazonSageMaker-ExecutionRole-20200403T093426
CPU times: user 773 ms, sys: 61.5 ms, total: 835 ms
Wall time: 896 ms


In [3]:
import os
import urllib.request
import json
import logging

In [4]:
def download(url):
    """Download ``url`` into the current directory unless it is already there.

    The local file name is the last ``/``-separated segment of ``url``.  The
    payload is written to ``<name>.part`` first and renamed to its final name
    only after the transfer has finished, so an interrupted download is never
    mistaken for a complete file on the next run.

    Args:
        url: Location of the file to fetch.

    Raises:
        ValueError: If ``url`` has no file name component (e.g. ends in ``/``).
        urllib.error.URLError: If the transfer fails (propagated from
            ``urllib.request.urlretrieve``).
    """
    filename = url.split("/")[-1]
    if not filename:
        raise ValueError("Cannot derive a file name from URL: %r" % url)
    if os.path.exists(filename):
        # Already fetched by an earlier run; nothing to do.
        return

    partial = filename + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Publish under the real name only once the whole file is on disk.
        os.replace(partial, filename)
    finally:
        # Still present only when the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)


# MS COCO 2017 validation images, followed by the train/val 2017 annotation
# archive (presumably the source of ./annotations/instances_val2017.json,
# which is read further down -- confirm against the unpacked archive).
download('http://images.cocodataset.org/zips/val2017.zip')
download('http://images.cocodataset.org/annotations/annotations_trainval2017.zip')

In [5]:
%%bash
# Extract both archives quietly (-q), overwriting existing files without
# prompting (-o). `|| true` keeps the cell from failing when a command
# errors, e.g. on a re-run after the archives have already been removed.
unzip -qo val2017.zip || true
unzip -qo annotations_trainval2017.zip || true
# Delete the archives afterwards.
# NOTE(review): this also runs when an unzip above failed.
rm val2017.zip annotations_trainval2017.zip || true

In [6]:
%%bash
# Create the folders that hold the generated annotations and the train /
# validation images and annotations. Anything left in them by a previous run
# is deleted first, so every run starts from empty folders.
rm -rf generated train train_annotation validation validation_annotation || true
mkdir generated train train_annotation validation validation_annotation || true

## Prepare Mappers

In [7]:
# Look up which COCO category ids carry the names 'cat' and 'dog'.
file_name = './annotations/instances_val2017.json'
with open(file_name) as f:
    js = json.load(f)

images = js['images']
categories = js['categories']

# One list per animal, in the order the categories appear in the file.
CAT_IDS = [entry['id'] for entry in categories if entry['name'] == 'cat']
DOG_IDS = [entry['id'] for entry in categories if entry['name'] == 'dog']

print("CAT_IDS %s" % CAT_IDS)
print("DOG_IDS %s" % DOG_IDS)

CAT_IDS [17]
DOG_IDS [18]


In [8]:
def is_cat_or_dog(category):
    """Tell whether a COCO category id denotes a cat or a dog.

    Args:
        category: Category id to look up in the module-level ``CAT_IDS`` and
            ``DOG_IDS`` lists.

    Returns:
        ``True`` when the id is in either list, ``False`` otherwise.
    """
    return category in CAT_IDS or category in DOG_IDS

def get_coco_mapper():
    """Build the table from original COCO category ids to training class indices.

    Walks the listed category ids in ascending order, keeps only those
    accepted by ``is_cat_or_dog`` and numbers the survivors 0, 1, 2, ...

    Returns:
        dict mapping each retained category id to its zero-based index.
    """
    # The 80 category ids listed here; the numbering has gaps (no 12, 26, ...).
    coco_ids = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        11, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 27, 28,
        31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        41, 42, 43, 44, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
        61, 62, 63, 64, 65, 67, 70,
        72, 73, 74, 75, 76, 77, 78, 79, 80,
        81, 82, 84, 85, 86, 87, 88, 89, 90,
    ]
    retained = filter(is_cat_or_dog, coco_ids)
    return {coco_id: index for index, coco_id in enumerate(retained)}

def get_mapper_fn(map):
    """Wrap a lookup table in a single-argument function.

    Args:
        map: Table (anything supporting ``map[key]``) from input category to
            output category.

    Returns:
        A function ``mapper(in_category)`` returning ``map[in_category]``.  A
        key missing from the table raises whatever ``map[...]`` raises
        (``KeyError`` for a ``dict``).
    """
    def mapper(in_category):
        """Translate ``in_category`` through the captured table."""
        translated = map[in_category]
        return translated

    return mapper

# Translator from an original COCO category id to its zero-based training
# class index. Only ids accepted by is_cat_or_dog are in the table; any other
# id raises KeyError.
fix_index_mapping = get_mapper_fn(get_coco_mapper())
 

## Map annotations to SageMaker format

In [9]:
# Write one annotation JSON file into 'generated/' for every image that has at
# least one cat or dog bounding box; images without one get no file.
file_name = './annotations/instances_val2017.json'
with open(file_name) as f:
    js = json.load(f)

images = js['images']
categories = js['categories']
annotations = js['annotations']

# Index everything once up front so the per-image work is a dictionary lookup
# instead of a rescan of every annotation and every category for every image.
category_names = {k['id']: str(k['name']) for k in categories}
boxes_by_image = {}
for j in annotations:
    if len(j['bbox']) > 0 and is_cat_or_dog(j['category_id']):
        # setdefault keeps the boxes of an image in their original file order.
        boxes_by_image.setdefault(j['image_id'], []).append(j)

for i in images:
    image_boxes = boxes_by_image.get(i['id'])
    if not image_boxes:
        continue  # no cat/dog box -> no annotation file for this image

    line = {
        'file': i['file_name'],
        'image_size': [{
            'width': int(i['width']),
            'height': int(i['height']),
            'depth': 3
        }],
        'annotations': [],
        'categories': [],
    }
    for j in image_boxes:
        category_id = int(j['category_id'])
        class_name = category_names.get(category_id, '')
        # Compare by value: `is not ''` tests object identity, which is not a
        # reliable emptiness check (and a SyntaxWarning on Python 3.8+).
        if class_name == '':
            raise ValueError('No category name for category id %s' % category_id)

        # bbox is read as [left, top, width, height].
        line['annotations'].append({
            'class_id': int(fix_index_mapping(j['category_id'])),
            'top': int(j['bbox'][1]),
            'left': int(j['bbox'][0]),
            'width': int(j['bbox'][2]),
            'height': int(j['bbox'][3])
        })
        # NOTE(review): this entry keeps the original COCO id, while the
        # 'annotations' entry above uses the remapped class index -- confirm
        # which of the two the consumer of these files expects.
        line['categories'].append({
            'class_id': category_id,
            'name': class_name
        })

    # Same stem as the image file (text before the first '.'), with '.json'.
    jsonFile = i['file_name'].split('.')[0] + '.json'
    with open(os.path.join('generated', jsonFile), 'w') as p:
        json.dump(line, p)

In [10]:
# Report how many files are currently in the 'generated' folder.
jsons = os.listdir('generated')
print ('There are {} images have annotation files'.format(len(jsons)))

There are 349 images have annotation files


## Split Data and upload to S3 for training

In [11]:
import shutil

# os.listdir() returns names in arbitrary order; sort them so the same files
# land in the same half on every run and on every machine.
jsons = sorted(os.listdir('generated'))
# First half (rounded down) -> training set, the remainder -> validation set.
split_idx = len(jsons) // 2
print("Split idx %i" % split_idx)

train_jsons = jsons[:split_idx]
val_jsons = jsons[split_idx:]


Split idx 174


In [12]:
# Move every image together with its annotation file into the folders of the
# split it belongs to. Each row: annotation file names, image destination,
# annotation destination, and the message printed when the image is missing.
split_plan = (
    (train_jsons, './train/', './train_annotation/',
     "Train image file not available %s"),
    (val_jsons, './validation/', './validation_annotation/',
     "Validation image file not available %s"),
)
for json_names, image_dir, annotation_dir, missing_message in split_plan:
    for i in json_names:
        # The image shares the annotation file's stem (text before the first '.').
        image_file = './val2017/' + i.split('.')[0] + '.jpg'
        if os.path.exists(image_file):
            shutil.move(image_file, image_dir)
            shutil.move('./generated/' + i, annotation_dir)
        else:
            # Leave the annotation where it is when its image is missing.
            print(missing_message % image_file)

### Upload to S3

In [4]:
# S3 key prefix of each of the four data folders, all rooted at `prefix`.
train_channel = '/'.join((prefix, 'train'))
validation_channel = '/'.join((prefix, 'validation'))
train_annotation_channel = '/'.join((prefix, 'train_annotation'))
validation_annotation_channel = '/'.join((prefix, 'validation_annotation'))

# Upload each local folder under its own key prefix, in the same order as above.
for local_dir, key_prefix in (
    ('train', train_channel),
    ('validation', validation_channel),
    ('train_annotation', train_annotation_channel),
    ('validation_annotation', validation_annotation_channel),
):
    sess.upload_data(path=local_dir, bucket=bucket, key_prefix=key_prefix)

# s3:// URIs pointing at the uploaded folders.
s3_train_data = 's3://%s/%s' % (bucket, train_channel)
s3_validation_data = 's3://%s/%s' % (bucket, validation_channel)
s3_train_annotation = 's3://%s/%s' % (bucket, train_annotation_channel)
s3_validation_annotation = 's3://%s/%s' % (bucket, validation_annotation_channel)


In [5]:
# s3:// URI of an 'output' folder under the same bucket and prefix
# (presumably where training results are meant to go -- nothing in the
# visible cells uses it yet; confirm against the training setup).
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)
# Bare expression so the notebook displays the value as the cell result.
s3_output_location

's3://sagemaker-object-detection-test-200408/ObjectDetection-v0/output'