In [None]:
import sagemaker
import json
import pandas as pd
import boto3
from PIL import Image, ImageDraw
from io import BytesIO

In [None]:
# Execution role of this notebook; resolved first because it does not depend on the session below.
role = sagemaker.get_execution_role()

# SageMaker session, and the default S3 bucket it reports.
sm_session = sagemaker.session.Session()
default_bucket = sm_session.default_bucket()

## Google's Open Images Dataset (v4)

Google publishes annotated images for ML at https://storage.googleapis.com/openimages/web/download_v4.html

Let's download the hierarchy as well as the annotations CSV file locally (not the full image dataset).

In [None]:
# Download and process the Open Images annotations.
!wget https://storage.googleapis.com/openimages/2018_04/test/test-annotations-bbox.csv
!wget https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy.json

## Bird images selection and copying

View the full hierarchy of images at: 

https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html

Entity >> Animal >> Bird >> [Species of Bird]


In [None]:
# Load the Open Images class hierarchy (nested 'LabelName' / 'Subcategory' nodes).
with open("bbox_labels_600_hierarchy.json", "r") as f:
    hierarchy = json.load(f)

# To inspect the tree, evaluate `hierarchy` as the last expression of its own cell;
# a bare expression in the middle of a cell displays nothing.

# Human-readable class names paired with their Open Images label IDs.
BIRD_CLASS_NAME = "Bird"
BIRD_ID = "/m/015p6"
ANIMAL_CLASS_NAME = "Animal"
ANIMAL_ID = "/m/0jbk"

# CLASS_NAME used to be assigned twice ("Bird", then "Animal"); only the second
# value ever survived, so that is the value kept here.
CLASS_NAME = ANIMAL_CLASS_NAME

N = 10  # How many images to take

In [None]:
def _index_of_label(nodes, label_id):
    """Return the position in `nodes` of the entry whose 'LabelName' equals `label_id`.

    Args:
        nodes: list of hierarchy nodes (dicts with a 'LabelName' key).
        label_id: label ID to look for.

    Raises:
        ValueError: if no entry matches. A miss is reported here with the
            offending label instead of surfacing later as an opaque
            "list indices must be integers" TypeError.
    """
    for index, node in enumerate(nodes):
        if node["LabelName"] == label_id:
            return index
    raise ValueError("Label {!r} not found at this level of the hierarchy".format(label_id))


# Walk Entity >> Animal >> Bird, one 'Subcategory' level at a time.
entities = hierarchy['Subcategory']

animal_index = _index_of_label(entities, ANIMAL_ID)
animals = entities[animal_index]['Subcategory']

bird_index = _index_of_label(animals, BIRD_ID)
birds = animals[bird_index]['Subcategory']

# Label IDs of the direct children of Bird (the generic Bird label itself is not included).
bird_labels = [label['LabelName'] for label in birds]

In [None]:
# Load every bounding-box annotation from the downloaded CSV.
images = pd.read_csv('./test-annotations-bbox.csv')

# Keep only the rows whose label is one of the bird classes.
is_bird = images['LabelName'].isin(bird_labels)
birds = images[is_bird]

# Image IDs excluded from the demo (the reason is not recorded here).
skip_these_images = ["251d4c429f6f9c39", "065ad49f98157c8d"]
is_skipped = birds['ImageID'].isin(skip_these_images)
birds_clean = birds[~is_skipped]

### Copying bird images to own S3 bucket

To be used with Ground Truth labelling jobs

In [None]:
s3 = boto3.client("s3")

# First N distinct image IDs that have at least one bird bounding box.
unique_images = birds_clean.ImageID.unique()[0:N]
# Report progress against the number of images actually selected, which can be
# smaller than N when fewer distinct images are available.
total_images = len(unique_images)

for it, img_id in enumerate(unique_images):
    if it % 100 == 0:
        print("Copying image {} / {}".format(it, total_images))

    # Copy from the "open-images-dataset" bucket into our own default bucket.
    copy_source = {"Bucket": "open-images-dataset", "Key": "test/{}.jpg".format(img_id)}
    s3.copy(copy_source, default_bucket, "groundtruth_demo/images/{}.jpg".format(img_id))


## Create Manifest

Let's create a file with the images we wish to annotate

In [None]:
# Names used by the manifest cells that follow.
BUCKET = default_bucket              # bucket referenced by the manifest entries
PREFIX = "groundtruth_demo/images"   # key prefix of the images inside BUCKET
manifest_name = "input.manifest"     # local file name of the manifest

### Generate Manifest

In [None]:
# Write one JSON object per line; each object points at an image through 'source-ref'.
with open(manifest_name, "w") as manifest_file:
    for img_id in unique_images:
        record = {'source-ref': "s3://{}/{}/{}.jpg".format(BUCKET, PREFIX, img_id)}
        manifest_file.write(json.dumps(record) + '\n')

### Upload Manifest

In [None]:
# Upload the manifest to the parent folder of the images.
# A separate name is used for this prefix so that PREFIX keeps pointing at the
# images folder: overwriting PREFIX here made a re-run of the manifest-generation
# cell emit 'source-ref' paths without the "images/" segment.
MANIFEST_PREFIX = "groundtruth_demo"
s3.upload_file(manifest_name, BUCKET, "{}/{}".format(MANIFEST_PREFIX, manifest_name))
print("Uploaded manifest to s3://{}/{}/{}".format(BUCKET, MANIFEST_PREFIX, manifest_name))

## Create Manifest with Bounding Boxes

First, let's see if we can draw and calculate bounding boxes accurately with the PIL image library

### Verify size conversion

In [None]:
# All distinct image IDs in our list of clean images with bounding boxes.
unique_images = birds_clean.ImageID.unique()

# Pick one sample image (position 10 in the list), load it into memory and read its width x height.
img_id = unique_images[10]
image_bytes = s3.get_object(Bucket="open-images-dataset", Key="test/{}.jpg".format(img_id))['Body'].read()
img = Image.open(BytesIO(image_bytes))
width, height = img.size

# All bounding boxes defined for this image, selected in a single .loc call (rows and columns together).
bboxes = birds_clean.loc[birds_clean['ImageID'] == img_id, ['XMin', 'XMax', 'YMin', 'YMax']]

# One drawing context serves the whole image, so it is created once rather than per box.
draw = ImageDraw.Draw(img)
for index, bbox in bboxes.iterrows():
    # Multiply the normalised X and Y min/max values by width and height to get pixel corners.
    draw.rectangle(((bbox.XMin*width, bbox.YMin*height), (bbox.XMax*width, bbox.YMax*height)), outline="red")

# Display the image with its boxes as the cell output.
img

### Generate Annotated Manifest

Now let's do this for all images and generate the Ground Truth manifest

In [None]:
# Build the annotated manifest: one JSON line per image, carrying the image's
# existing bird bounding boxes converted from normalised to pixel coordinates.
manifest_name = "annotated_input.manifest"

BUCKET = default_bucket
PREFIX = "groundtruth_demo/images"
LABELING_JOB_NAME = "labeling-job-name"
METADATA_KEY = LABELING_JOB_NAME + '-metadata'

# N is deliberately not reassigned here. The images read below are the ones the
# copy cell placed under PREFIX, so both cells must agree on N; a second
# hard-coded value could request keys that were never copied.

with open(manifest_name, "w") as f:

    for img_id in unique_images[0:N]:

        # Rows and columns are selected in one .loc call instead of chained indexing.
        bboxes = birds_clean.loc[birds_clean['ImageID'] == img_id, ['XMin', 'XMax', 'YMin', 'YMax']]

        # Read our own copy of the image to obtain its pixel dimensions.
        img_path = "s3://{}/{}/{}.jpg".format(BUCKET, PREFIX, img_id)
        image_bytes = s3.get_object(Bucket=BUCKET, Key="{}/{}.jpg".format(PREFIX, img_id))['Body'].read()
        img = Image.open(BytesIO(image_bytes))
        width, height = img.size

        annotations = []
        objects = []
        for _, bbox in bboxes.iterrows():
            # int(...) guarantees a plain JSON integer whatever type round()
            # returns for a NumPy scalar.
            annotations.append(
                {"class_id": 0,
                 "top": int(round(bbox.YMin * height)),
                 "left": int(round(bbox.XMin * width)),
                 "height": int(round((bbox.YMax - bbox.YMin) * height)),
                 "width": int(round((bbox.XMax - bbox.XMin) * width)),
                }
            )
            # One metadata entry per annotation, in the same order.
            objects.append({"confidence": 0})

        img_json = {
            'source-ref': img_path,
            LABELING_JOB_NAME: {
                'image_size': [{"width": width, "height": height, "depth": 3}],
                'annotations': annotations,
            },
            METADATA_KEY: {
                'objects': objects,
                'class-map': {"0": "bird"},
                'type': "groundtruth/object-detection",
            },
        }

        f.write(json.dumps(img_json)+'\n')

### Upload Manifest

In [None]:
# Push the file named by manifest_name to the demo folder of the default bucket.
manifest_key = "groundtruth_demo/" + manifest_name
s3.upload_file(manifest_name, default_bucket, manifest_key)

## Inspect Manifests

Install jq and inspect the manifest file

In [None]:
%%capture
# Install the jq command-line tool; %%capture keeps the installer output out of the notebook.
!apt-get update
!apt-get -y install jq


In [None]:
#manifest_name = 'input.manifest'
manifest_name = 'annotated_input.manifest'
!head {manifest_name} -n 1 | jq