In [1]:
import os, sys
# Put the parent of the current working directory on the import path so that
# modules stored one level above this notebook can be imported by later cells.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:  # guard keeps re-running the cell from adding duplicates
    sys.path.append(parent_dir)

In [2]:
import json
import boto3
from dotenv import dotenv_values
# Read the settings stored in the local ".env" file. Later cells look up
# AWS_PROFILE, REKOG_DATASET_BUCKET and REKOG_PROJECT_ARN in this mapping.
config = dotenv_values(".env")
from pascal import annotation_from_xml
from rekog import find_project, get_project_dataset, pascal_voc_to_manifest_line, add_image_to_dataset
from lib.file_utils import list_images

In [3]:
# One boto3 session, built from the AWS profile named in the .env settings,
# is the source of both service clients used by the rest of the notebook.
session = boto3.Session(profile_name=config["AWS_PROFILE"])

rekognition_client = session.client("rekognition")
s3_client = session.client("s3")

## Read dataset labels

In [4]:
DATASET_DIR = "../dataset"

# catalog.json maps each class directory code (e.g. 'MM') to its label name.
# Decode it explicitly as UTF-8 instead of relying on the platform's locale
# default, which is what open() uses when no encoding is given.
with open(f'{DATASET_DIR}/catalog.json', encoding='utf-8') as json_file:
    catalog = json.load(json_file)
print(catalog)

{'MM': 'medical-mask', 'PM': 'person-mask', 'PMM': 'person-masked-medical', 'PMN': 'person-masked-non-medical'}


In [5]:
# Number the catalog's label names in catalog order; the keys are the indices
# rendered as strings (e.g. '0' -> 'medical-mask').
class_map = {
    str(class_id): label
    for class_id, label in enumerate(catalog.values())
}
class_map

{'0': 'medical-mask',
 '1': 'person-mask',
 '2': 'person-masked-medical',
 '3': 'person-masked-non-medical'}

In [6]:
def read_pascal_voc(image_path):
    """Load the Pascal VOC annotation stored next to an image.

    The annotation is read from the image's path with its file extension
    replaced by ``.xml``. When the image path has no extension, ``.xml`` is
    simply appended.

    Args:
        image_path: Path to the image file.

    Returns:
        Whatever ``annotation_from_xml`` returns for the derived XML path.
    """
    # splitext only inspects the final path component, so a dot in a directory
    # name (such as a leading "..") is never mistaken for the extension
    # separator, and a path without any dot no longer raises ValueError.
    root, _extension = os.path.splitext(image_path)
    return annotation_from_xml(root + '.xml')

In [7]:
def create_manifest_line(image_path, s3_bucket, s3_key_prefix, job_name='face-mask-job'):
    """Build the manifest entry for one annotated image.

    Args:
        image_path: Local path of the image; its Pascal VOC annotation is
            loaded through ``read_pascal_voc``.
        s3_bucket: Name of the bucket used in the entry's ``s3://`` location.
        s3_key_prefix: Key prefix placed in front of the image's file name.
        job_name: Job name forwarded to ``pascal_voc_to_manifest_line``.
            Defaults to the value that was previously hard-coded.

    Returns:
        The value produced by ``pascal_voc_to_manifest_line`` for this image,
        using the module-level ``class_map``.
    """
    annotation = read_pascal_voc(image_path)
    # basename honours the platform's path separators, unlike splitting on '/'.
    file_name = os.path.basename(image_path)
    image_src = f's3://{s3_bucket}/{s3_key_prefix}/{file_name}'
    return pascal_voc_to_manifest_line(job_name=job_name,
                                       annotation=annotation,
                                       class_map=class_map,
                                       image_src=image_src)

## Create training dataset

In [8]:
# Local image root and the destination bucket named in the .env settings.
IMAGES_DIR = "../images"
DATASET_BUCKET = config["REKOG_DATASET_BUCKET"]

# Look the project up by the ARN from the settings and display the result.
project_arn = config["REKOG_PROJECT_ARN"]
project = find_project(rekognition_client, project_arn)
project

{'ProjectArn': 'arn:aws:rekognition:us-east-1:391874106884:project/face-mask/1704048039527',
 'CreationTimestamp': datetime.datetime(2023, 12, 31, 12, 40, 39, 527000, tzinfo=tzlocal()),
 'Status': 'CREATED',
 'Datasets': [{'CreationTimestamp': datetime.datetime(2024, 1, 13, 0, 16, 37, 320000, tzinfo=tzlocal()),
   'DatasetType': 'TRAIN',
   'DatasetArn': 'arn:aws:rekognition:us-east-1:391874106884:project/face-mask/dataset/train/1705126597316',
   'Status': 'UPDATE_COMPLETE',
   'StatusMessage': 'The dataset was updated successfully.',
   'StatusMessageCode': 'SUCCESS'}]}

In [9]:
DATASET_TYPE = 'TRAIN'
dataset_arn = get_project_dataset(project, DATASET_TYPE)['DatasetArn']

# Only the first image of the first catalog entry is submitted: the loop is
# left after one pass, and at most one image is taken from that class.
for class_dir, class_name in catalog.items():
    key_prefix = f"{DATASET_TYPE.lower()}/{class_dir}"
    images = list_images(f"{IMAGES_DIR}/{key_prefix}")

    image_path = next(iter(images), None)
    if image_path is not None:
        print(image_path)
        manifest_line = create_manifest_line(image_path, DATASET_BUCKET, key_prefix)
        resp = add_image_to_dataset(rekognition_client, dataset_arn, manifest_line)
        print(resp)
        print(manifest_line)
    break

Dataset status: UPDATE_COMPLETE
../images/train/MM/2.jpg
{'ResponseMetadata': {'RequestId': '8f992749-bbd0-4edd-93b3-1f50a6f03e37', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '8f992749-bbd0-4edd-93b3-1f50a6f03e37', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'date': 'Sat, 13 Jan 2024 06:24:51 GMT'}, 'RetryAttempts': 0}}
{"source-ref": "s3://face-mask-dataset/train/MM/2.jpg", "bounding-box": {"image_size": [{"width": 720, "height": 720, "depth": 3}], "annotations": [{"class_id": 0, "top": 121, "left": 235, "width": 246, "height": 586}]}, "bounding-box-metadata": {"objects": [{"confidence": 1}], "class-map": {"0": "medical-mask"}, "type": "groundtruth/object-detection", "human-annotated": "yes", "creation-date": "2024-01-13T00:24:51.345789", "job-name": "face-mask-job"}}


## Upload images with manifest (annotations)

In [None]:
IMAGES_DIR = "../images"
DATASET_BUCKET = config['REKOG_DATASET_BUCKET']


def add_images(s3_client, images, s3_bucket, s3_key_prefix):
    """Upload images to S3 and return the manifest line for each of them.

    NOTE(review): this helper was referenced below but was neither defined nor
    imported in the notebook; its behaviour is inferred from the call site
    (it must return one manifest line per image) - confirm against the
    intended implementation.

    Args:
        s3_client: boto3 S3 client used for the uploads.
        images: Iterable of local image paths; each one is passed to
            ``create_manifest_line``.
        s3_bucket: Destination bucket name.
        s3_key_prefix: Key prefix under which the images are stored.

    Returns:
        A list with one manifest line per image, in input order.
    """
    bucket_uri = f's3://{s3_bucket}/'
    lines = []
    for image_path in images:
        manifest_line = create_manifest_line(image_path, s3_bucket, s3_key_prefix)
        # Take the object key from the manifest entry itself so the uploaded
        # object and its "source-ref" can never disagree. This relies on
        # manifest lines being JSON strings (they are newline-joined below).
        source_ref = json.loads(manifest_line)['source-ref']
        s3_client.upload_file(image_path, s3_bucket, source_ref[len(bucket_uri):])
        lines.append(manifest_line)
    return lines


# For each split, upload every class directory's images and then store the
# collected manifest lines as one newline-separated object next to them.
for dataset_type in ['test', 'train']:
    manifest_lines = []

    for class_dir, class_name in catalog.items():
        images = list_images(f"{IMAGES_DIR}/{dataset_type}/{class_dir}")
        lines = add_images(s3_client, images, DATASET_BUCKET, f'{dataset_type}/{class_dir}')
        manifest_lines.extend(lines)

    manifest = '\n'.join(manifest_lines).encode('utf-8')
    s3_client.put_object(Body=manifest,
                         Bucket=DATASET_BUCKET, Key=f'{dataset_type}/manifest.json')
    print(f'Uploaded {len(manifest_lines)} images for {dataset_type} dataset')