# STAGE I - Download Dataset From S3

In [None]:
!pip install boto3 botocore
!pip list | grep -E "boto3"

In [None]:
import os, sys, threading, datetime
try:
    import boto3
    from boto3.s3.transfer import TransferConfig
    from botocore.exceptions import ClientError
except Exception as e:
    print(f"Caught exception: {e}")

## Settings

Here we set up all the options for training. Most are environment variables, which allow us to override values from pipelines and run this notebook with different settings, such as the base model, the number of training steps, and the learning rate.

In [None]:
# setup working environment
# PERSISTENCE_DIR is the root directory under which the dataset is stored.
# It has no default, so fail early with a clear message instead of letting
# os.path.join() raise an opaque TypeError on None.
PERSISTENCE_DIR: str = os.environ.get('PERSISTENCE_DIR')
if PERSISTENCE_DIR is None:
    raise RuntimeError("PERSISTENCE_DIR environment variable is not set")
OUTPUT_DIR = os.path.join(PERSISTENCE_DIR, "data")
try:
    # exist_ok has to be passed by keyword: the second positional parameter
    # of os.makedirs() is `mode`, so a positional True would be interpreted
    # as permission bits rather than "do not fail if the directory exists".
    os.makedirs(OUTPUT_DIR, exist_ok=True)
except Exception as e:
    # Best effort: report the problem and let later cells surface it.
    print(f"TRACE: {e}")

# setup connection to S3 Storage (MinIO)
# All connection parameters come from the environment; only the bucket name
# has a default value.
ACCESS_KEY_ID = os.environ.get('ACCESS_KEY_ID')
SECRET_KEY_ID = os.environ.get('SECRET_ACCESS_KEY')
ENDPOINT_URL = os.environ.get('S3_ENDPOINT')
REGION = os.environ.get('DEFAULT_REGION')
BUCKET_NAME = os.environ.get('S3_BUCKET', "trainingdata")

# dataset filename
# TARBALL_NAME is used both as the object key in the bucket and as the name
# of the local file written inside OUTPUT_DIR.
TARBALL_NAME: str = os.environ.get("TARBALL_NAME", "mario-dataset.tar.gz")
FILE_NAME: str = os.path.join(OUTPUT_DIR, TARBALL_NAME)

# Access the S3 bucket containing the tarball of the training dataset

Now we have our training dataset stored in an S3 bucket. We need to download and decompress the tarball to make everything available to the training pipeline.
Note: This requires a data connection to an S3 compatible bucket. As part of the setup you should have deployed an instance of MinIO from this [helm chart](https://github.com/mcaimi/minio-helm)


In [None]:
# create the S3 client used to talk to MinIO
print(f"Accessing S3 endpoint {ENDPOINT_URL} with ACCESS_KEY {ACCESS_KEY_ID}...")

# instantiate connection
# REGION comes from the DEFAULT_REGION environment variable; when it is unset
# the value is None, which is also boto3's default for region_name.
minio_api = boto3.client("s3",
                         endpoint_url=ENDPOINT_URL,
                         region_name=REGION,
                         aws_access_key_id=ACCESS_KEY_ID,
                         aws_secret_access_key=SECRET_KEY_ID)

Now download the tarball

In [None]:
# Set the desired multipart threshold value (5GB)
GB = 1024 ** 3
transfer_config = TransferConfig(multipart_threshold = 5*GB, use_threads=False)

# download
# The object key TARBALL_NAME is fetched from BUCKET_NAME and written to
# FILE_NAME. The transfer configuration defined above must be handed to
# download_file explicitly, otherwise boto3 uses its default settings.
try:
    print(f"Downloading {TARBALL_NAME} FROM MinIO bucket {BUCKET_NAME}")
    minio_api.download_file(BUCKET_NAME,
                            TARBALL_NAME,
                            FILE_NAME,
                            Config=transfer_config)
except ClientError as e:
    # Errors reported by the S3 service (missing bucket/key, access denied, ...)
    print(f"S3 Exception: {e.response['Error']['Code']}, trace: {e}")
except Exception as e:
    print(f"Caught exception: {e}")
else:
    # Only report success when download_file returned without raising.
    print("Download Complete.")

# Decompress dataset

Decompress the dataset tarball into the output directory (`OUTPUT_DIR`) for further use.

The dataset is composed of:
- images/{train,validate} folders: where actual training images are stored for training and validation purposes
- labels/{train,validate} folders: where label annotations are stored, one txt file per source image
- task.yaml: the training task descriptor

In [None]:
# decompress tarball
def decompressDataset(datasetPath:str, destination: str) -> None:
    """Extract the members of a tar archive into ``destination``.

    Members whose target path already exists under ``destination`` are
    skipped, so calling the function again does not overwrite files that
    were extracted (or modified) earlier.

    Args:
        datasetPath: Path of the tarball to read. It is opened with
            ``tarfile.open`` in its default read mode.
        destination: Directory to extract into. It is created, including
            any missing parent directories, when it does not exist.

    Raises:
        Exceptions raised by ``tarfile`` or by the filesystem calls are not
        caught here and propagate to the caller.
    """
    import tarfile as tf

    # check destination path
    if not os.path.isdir(destination):
        print(f"Creating destination dir {destination}...")
        # makedirs (not mkdir) so a nested destination path works as well
        os.makedirs(destination, exist_ok=True)

    # Use the "data" extraction filter when this Python version provides it:
    # it rejects members that would be written outside `destination`.
    extract_kwargs = {"filter": "data"} if hasattr(tf, "data_filter") else {}

    # decompress file
    with tf.open(datasetPath) as dset_file:
        # Index members by name once instead of letting extract() look every
        # name up again. For duplicate names the last entry wins, which is
        # also what a by-name lookup in tarfile returns.
        members = {member.name: member for member in dset_file.getmembers()}
        for name, member in members.items():
            if not os.path.exists(os.path.join(destination, name)):
                print(f"Extracting {name}...")
                dset_file.extract(member, destination, **extract_kwargs)

In [None]:
# Unpack the downloaded tarball (FILE_NAME) into OUTPUT_DIR; any failure is
# reported on stdout instead of aborting the notebook run.
try:
    decompressDataset(datasetPath=FILE_NAME, destination=OUTPUT_DIR)
except Exception as e:
    print(f"Caught exception: {e}")