In [None]:
from datetime import datetime
from glob import glob
from math import floor
from os import environ, listdir, makedirs, path, unlink
from shutil import copy, move, rmtree
import yaml

from boto3 import client
from numpy import random
from openimages.download import download_dataset
from yolov5 import export, train

## Step 1: ingest data

In [None]:
def ingest_data(
        data_folder='./data', limit=0,
        configuration_path='configuration-local.yaml'):
    """Empty the data folder and download a fresh set of images.

    Args:
        data_folder: Folder that is cleaned first; the download is written
            to its ``download`` subfolder.
        limit: Value forwarded to ``download_dataset`` as ``limit``. When
            falsy, the ``sample_count`` environment variable is used
            (default 100).
        configuration_path: YAML file whose ``names`` entry lists the
            class labels to request.
    """
    _clean_folder(data_folder)
    labels_to_fetch = _read_class_labels(configuration_path)

    print('Commencing data ingestion.')

    # An explicit argument wins; otherwise fall back to the environment.
    if not limit:
        limit = int(environ.get('sample_count', 100))

    download_dataset(
        f'{data_folder}/download',
        class_labels=labels_to_fetch,
        annotation_format='darknet',
        limit=limit,
    )

    print('data ingestion done')


def _clean_folder(folder):
    print(f'Cleaning folder {folder}')

    for filename in listdir(folder):
        file_path = path.join(folder, filename)
        try:
            if path.isfile(file_path) or path.islink(file_path):
                unlink(file_path)
            elif path.isdir(file_path):
                rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')


def _read_class_labels(configuration_file_path):
    """Return the ``names`` entry of the YAML configuration file.

    Args:
        configuration_file_path: Path of the YAML file to parse.

    Returns:
        Whatever is stored under the top-level ``names`` key.
    """
    with open(configuration_file_path, 'r') as stream:
        parsed = yaml.load(stream, Loader=yaml.SafeLoader)
    return parsed['names']


# Run step 1 with its default arguments.
ingest_data()

## Step 2: preprocess data

In [None]:
def preprocess_data(
        data_folder='./data', configuration_path='configuration-local.yaml'):
    """Split the downloaded images and labels into train/val/test folders.

    Creates ``images/{train,val,test}`` and ``labels/{train,val,test}``
    under ``data_folder`` and copies every downloaded image with its label
    file into one of them: 75 % train, 12.5 % val, the remainder test,
    computed separately for each class.

    An image that is listed under several classes is kept only for the
    first class (in configuration order) that lists it, so no file name is
    copied twice. This works for any number of classes.

    Args:
        data_folder: Root folder holding the ``download`` subfolder and
            receiving the split folders.
        configuration_path: YAML file whose ``names`` entry lists the class
            labels; the lower-cased label is used as the download
            subfolder name (assumes the labels are strings).
    """
    print('preprocessing data')

    for folder in ['images', 'labels']:
        for split in ['train', 'val', 'test']:
            makedirs(f'{data_folder}/{folder}/{split}', exist_ok=True)

    download_folder = f'{data_folder}/download'
    class_labels = _read_class_labels(configuration_path)

    folder_names = [class_name.lower() for class_name in class_labels]
    images = [
        _get_filenames(f'{download_folder}/{folder_name}/images')
        for folder_name in folder_names
    ]

    # Keep each file name under the first class that lists it. All classes
    # are copied into the same target folders, so a repeated name would
    # overwrite its own label file and could end up in two splits.
    already_assigned = set()
    unique_images = []
    for image_set in images:
        unique_images.append(image_set - already_assigned)
        already_assigned |= image_set

    random.seed(42)
    train_ratio = 0.75
    val_ratio = 0.125
    for folder_name, image_set in zip(folder_names, unique_images):
        # Sets of strings iterate in a hash-dependent order that changes
        # between interpreter runs; sorting first makes the seeded shuffle
        # (and therefore the split) reproducible.
        image_list = sorted(image_set)
        random.shuffle(image_list)
        train_size = floor(train_ratio * len(image_list))
        val_size = floor(val_ratio * len(image_list))
        _split_dataset(
            download_folder,
            data_folder,
            folder_name,
            image_list,
            train_size=train_size,
            val_size=val_size,
        )

    print('data processing done')


def _get_filenames(folder):
    filenames = set()

    for local_path in glob(path.join(folder, '*.jpg')):
        filename = path.split(local_path)[-1]
        filenames.add(filename)

    return filenames


def _split_dataset(
        download_folder, data_folder, item, image_names, train_size, val_size):

    for i, image_name in enumerate(image_names):
        # Label filename
        label_name = image_name.replace('.jpg', '.txt')

        # Split into train, val, or test
        if i < train_size:
            split = 'train'
        elif i < train_size + val_size:
            split = 'val'
        else:
            split = 'test'

        # Source paths
        source_image_path = f'{download_folder}/{item}/images/{image_name}'
        source_label_path = f'{download_folder}/{item}/darknet/{label_name}'

        # Destination paths
        target_image_folder = f'{data_folder}/images/{split}'
        target_label_folder = f'{data_folder}/labels/{split}'

        # Copy files
        copy(source_image_path, target_image_folder)
        copy(source_label_path, target_label_folder)


# Run step 2 with its default arguments.
preprocess_data()

## Step 3: train model

In [None]:
def train_model(
        data_folder='./data', batch_size=0, epochs=0, base_model='yolov5m',
        configuration_path='configuration-local.yaml'):
    """Train a YOLOv5 model and store the best weights as ``model.pt``.

    Args:
        data_folder: Not used by this function; kept for a uniform step
            signature.
        batch_size: Training batch size. When falsy, the ``batch_size``
            environment variable is used (default 4).
        epochs: Number of epochs. When falsy, the ``epochs`` environment
            variable is used (default 2).
        base_model: Name of the pretrained weights, without ``.pt``.
        configuration_path: Dataset YAML passed to YOLOv5 as ``data``.
    """
    print('training model')

    batch_size = batch_size or int(environ.get('batch_size', 4))
    epochs = epochs or int(environ.get('epochs', 2))
    # NOTE(review): the signature default 'yolov5m' is truthy, so the
    # 'base_model' environment variable is only consulted when a caller
    # passes a falsy value explicitly. Confirm whether the default should
    # be '' to match batch_size/epochs.
    base_model = base_model or environ.get('base_model', 'yolov5m')

    _clean_folder('yolov5/runs')
    train.run(
        data=configuration_path,
        weights=f'{base_model}.pt',
        epochs=epochs,
        batch_size=batch_size,
        freeze=[10],
        cache='disk',
        # YOLOv5's option is spelled `exist_ok`. With it set, the run
        # directory stays at .../exp instead of being incremented, which
        # is the fixed path the move() below relies on.
        exist_ok=True
    )

    move('yolov5/runs/train/exp/weights/best.pt', 'model.pt')

    print('model training done')


# Run step 3 with its default arguments.
train_model()

## Step 4: convert model

In [None]:
def convert_model(model_file_path='model.pt'):
    """Export the given weights file to ONNX through ``export.run``.

    Args:
        model_file_path: Path of the weights file passed as ``weights``.
    """
    print('converting model')

    export_options = {
        'weights': model_file_path,
        'include': ['onnx'],
        'imgsz': (640, 640),
        'opset': 13,
    }
    export.run(**export_options)

    print('model converted')


# Run step 4 with its default arguments.
convert_model()

## Step 5: upload model

In [None]:
# Upload settings, all taken from environment variables. Only the object
# prefix has a fallback; the S3 values are None when their variable is unset.
model_object_prefix = environ.get('model_object_prefix', 'model')
s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket_name = (
    environ.get(variable_name)
    for variable_name in (
        'AWS_S3_ENDPOINT',
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_S3_BUCKET',
    )
)


def upload_model(model_object_prefix='model', version=''):
    """Upload the model twice: under a versioned name and as ``latest``.

    Args:
        model_object_prefix: Prefix used when building the object names.
        version: Version suffix for the first upload; when empty, the name
            helper substitutes a timestamp.
    """
    connection = _initialize_s3_client(
        s3_endpoint_url, s3_access_key, s3_secret_key
    )

    # The versioned object goes first, then the moving 'latest' alias.
    for tag in (version, 'latest'):
        object_name = _generate_model_name(model_object_prefix, version=tag)
        _do_upload(connection, object_name)


def _initialize_s3_client(s3_endpoint_url, s3_access_key, s3_secret_key):
    """Create an ``'s3'`` client for the given endpoint and credentials.

    Args:
        s3_endpoint_url: Passed to the client as ``endpoint_url``.
        s3_access_key: Passed to the client as ``aws_access_key_id``.
        s3_secret_key: Passed to the client as ``aws_secret_access_key``.

    Returns:
        The object returned by ``client``.
    """
    print('initializing S3 client')
    connection_settings = {
        'aws_access_key_id': s3_access_key,
        'aws_secret_access_key': s3_secret_key,
        'endpoint_url': s3_endpoint_url,
    }
    return client('s3', **connection_settings)


def _generate_model_name(model_object_prefix, version=''):
    version = version if version else _timestamp()
    model_name = f'models/{model_object_prefix}-{version}.onnx'
    return model_name


def _timestamp():
    return datetime.now().strftime('%y%m%d%H%M')


def _do_upload(s3_client, object_name, model_file_path='model.onnx'):
    """Upload a local model file to the configured bucket.

    Args:
        s3_client: Object exposing ``upload_file(filename, bucket, key)``.
        object_name: Key under which the file is stored in the bucket.
        model_file_path: Local file to upload; defaults to ``model.onnx``.

    Raises:
        Exception: Any error raised by ``upload_file`` is re-raised after
            a failure message has been printed.
    """
    print(f'uploading model to {object_name}')
    try:
        s3_client.upload_file(model_file_path, s3_bucket_name, object_name)
    except Exception:
        # Not a bare `except:` so that KeyboardInterrupt/SystemExit are not
        # reported as an S3 failure.
        print(f'S3 upload to bucket {s3_bucket_name} at {s3_endpoint_url} failed!')
        raise
    print(f'model uploaded and available as "{object_name}"')


# Run step 5. The prefix read from the environment is passed explicitly;
# a bare upload_model() would always use the parameter default 'model'.
upload_model(model_object_prefix=model_object_prefix)