In [1]:
import os, sys
# Put the parent directory on the import path (once) so that modules living
# there can be imported by the cells below.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import json
import boto3
from dotenv import dotenv_values
# Read settings from the local ".env" file into a mapping; the AWS_PROFILE and
# REKOG_DATASET_BUCKET entries are looked up from it in later cells.
config = dotenv_values(".env")
from pascal import annotation_from_xml
from rekog import pascal_voc_to_manifest_line
from lib.file_utils import list_images

In [3]:
# Bind a boto3 session to the AWS profile named in .env, then derive the S3
# client that the upload cells use.
aws_profile = config['AWS_PROFILE']
session = boto3.Session(profile_name=aws_profile)
s3_client = session.client(service_name='s3')

## Read dataset labels

In [4]:
DATASET_DIR = "../dataset"
# The catalog maps each class directory code to its label name.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default text encoding.
with open(f'{DATASET_DIR}/catalog.json', encoding='utf-8') as json_file:
    catalog = json.load(json_file)
print(catalog)

{'MM': 'medical-mask', 'PM': 'person-mask', 'PMM': 'person-masked-medical', 'PMN': 'person-masked-non-medical'}


In [5]:
# Number the catalog's label names in catalog order; keys are the positions
# rendered as strings ('0', '1', ...).
class_map = {
    str(position): label
    for position, label in enumerate(catalog.values())
}
class_map

{'0': 'medical-mask',
 '1': 'person-mask',
 '2': 'person-masked-medical',
 '3': 'person-masked-non-medical'}

In [6]:
def read_pascal_voc(image_path):
    """Load the Pascal VOC annotation stored next to an image.

    The annotation file is expected at the image's path with the file
    extension replaced by ``.xml`` (or with ``.xml`` appended when the image
    path has no extension).

    Args:
        image_path: Path to the image file.

    Returns:
        Whatever ``annotation_from_xml`` returns for the sibling XML file.
    """
    # splitext only inspects the final path component, so a dot in a directory
    # name (such as a leading '..') is never mistaken for the file extension.
    base_path, _extension = os.path.splitext(image_path)
    return annotation_from_xml(base_path + '.xml')

In [7]:
def add_images(s3_client, images, s3_bucket, s3_key_prefix, job_name='face-mask-job'):
    """Upload images to S3 and build one manifest line per image.

    For each image, the sibling Pascal VOC annotation is read and converted
    into a manifest line that references the image's S3 location; the image
    file is then uploaded to that location.

    Args:
        s3_client: Object exposing ``upload_file(filename, bucket, key)``
            (in this notebook, a boto3 S3 client).
        images: Iterable of local image file paths.
        s3_bucket: Name of the destination bucket.
        s3_key_prefix: Key prefix; each image is stored under
            ``<s3_key_prefix>/<file name>``.
        job_name: Job name forwarded to ``pascal_voc_to_manifest_line``.

    Returns:
        List of manifest lines, in the same order as ``images``.

    Note:
        The module-level ``class_map`` is passed through to
        ``pascal_voc_to_manifest_line``.
    """
    manifest_lines = []

    for image_path in images:
        annotation = read_pascal_voc(image_path)
        # basename honours the platform's path separators, unlike split('/').
        s3_key = f"{s3_key_prefix}/{os.path.basename(image_path)}"
        image_src = f's3://{s3_bucket}/{s3_key}'
        manifest_line = pascal_voc_to_manifest_line(job_name=job_name,
                                                    annotation=annotation, class_map=class_map,
                                                    image_src=image_src)
        manifest_lines.append(manifest_line)
        s3_client.upload_file(image_path, s3_bucket, s3_key)

    return manifest_lines

## Upload images with manifest (annotations)

In [8]:
IMAGES_DIR = "../images"
DATASET_BUCKET = config['REKOG_DATASET_BUCKET']

# For each dataset split, upload every class directory's images under
# '<split>/<class_dir>/' and then write the split's combined manifest
# (one line per image) to '<split>/manifest.json' in the same bucket.
for dataset_type in ['test', 'train']:
    manifest_lines = []

    for class_dir in catalog:
        split_prefix = f'{dataset_type}/{class_dir}'
        images = list_images(f"{IMAGES_DIR}/{split_prefix}")
        manifest_lines += add_images(s3_client, images, DATASET_BUCKET, split_prefix)

    manifest = '\n'.join(manifest_lines).encode('utf-8')
    s3_client.put_object(Body=manifest,
                         Bucket=DATASET_BUCKET, Key=f'{dataset_type}/manifest.json')
    print(f'Uploaded {len(manifest_lines)} images for {dataset_type} dataset')

Uploaded 23 images for test dataset
Uploaded 88 images for train dataset
