In [None]:
# Install the git-remote-codecommit helper; the DVC repository URL used in this
# notebook relies on the `codecommit::` remote scheme.
! pip install --quiet git-remote-codecommit

In [None]:
import boto3
import sagemaker
import time
from time import strftime

# Build every AWS handle from one explicit boto3 session so that the region and
# credentials stay consistent if the session is ever customised.
boto_session = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=boto_session)
sm_client = boto_session.client("sagemaker")
region = boto_session.region_name
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)
account = boto_session.client("sts").get_caller_identity()["Account"]

# Key prefix for this project's artifacts.
prefix = 'sagemaker-intel-dvc'

# Echo the resolved environment so a reader can confirm which account/region is used.
print(f"account: {account}")
print(f"bucket: {bucket}")
print(f"region: {region}")
print(f"role: {role}")

In [None]:
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role

In [None]:
# CodeCommit repository (git-remote-codecommit URL form) and the branch name;
# both are handed to the processing job through its environment variables.
# NOTE(review): the region is hardcoded in the URL rather than taken from `region`.
dvc_repo_url = "codecommit::us-west-2://sagemaker-intel"
dvc_branch = "processed-dataset"

In [None]:
# Processor that runs the preprocessing container on a single instance.
# The repository coordinates and the git identity reach the container as
# environment variables.
# NOTE(review): the SageMaker SDK documents `framework_version` as a required
# argument of PyTorchProcessor — confirm this call works with the installed SDK.
pytorch_processor = PyTorchProcessor(
    base_job_name='preprocess-intel-dataset',
    image_uri='public.ecr.aws/f2t6q8t2/emlo:infer',
    instance_count=1,
    instance_type='ml.t3.medium',
    role=get_execution_role(),
    env=dict(
        DVC_REPO_URL=dvc_repo_url,
        DVC_BRANCH=dvc_branch,
        GIT_USER="m",
        GIT_EMAIL="m@emlo.com",
    ),
)

In [None]:
# S3 location of the zipped dataset; passed to the processing job as its 'data' input.
# NOTE(review): the bucket name is hardcoded instead of being derived from `bucket`.
input_dataset = "s3://sagemaker-us-west-2-350104937619/intel.zip"

In [None]:
# Launch the processing job with the zipped dataset mounted at
# /opt/ml/processing/input inside the container.
# NOTE(review): `code` / `source_dir` / `dependencies` are commented out below;
# the SDK documents `code` as a required argument of FrameworkProcessor.run —
# confirm this call is accepted. No ProcessingOutput is configured either.
pytorch_processor.run(
    # code='preprocess.py',
    # source_dir='sagemaker-intel',
    # dependencies=["sagemaker-intel/requirements.txt"],
    inputs=[
        ProcessingInput(
            input_name='data',
            source=input_dataset,
            destination='/opt/ml/processing/input'
        )
    ]
)

In [None]:
# Show the kernel's working directory; the download below copies the archive into it.
%pwd

In [None]:
! aws s3 cp s3://sagemaker-us-west-2-350104937619/intel.zip .

In [None]:
from torchvision.datasets.utils import extract_archive

from pathlib import Path

In [None]:
# Show the working directory again before choosing the local dataset paths.
%pwd

In [None]:
# The archive is downloaded into the kernel's working directory (`aws s3 cp ... .`),
# so resolve both paths from there instead of hardcoding /root/intel-project.
# When the kernel runs in /root/intel-project the values are unchanged.
dataset_extracted = Path.cwd()
dataset_zip = dataset_extracted / "intel.zip"

In [None]:
# Unpack the downloaded archive into the extraction directory.
extract_archive(dataset_zip, dataset_extracted)

In [None]:
# Collect every JPEG stored as intel/<class>/<file>.jpg. Sorting makes the list
# order independent of the filesystem's enumeration order, so downstream steps
# see the same sequence on every run.
dataset_full = sorted((dataset_extracted / "intel").glob("*/*.jpg"))
# The label is the containing folder's full name (`.name`); `.stem` would drop
# anything after a dot in the folder name.
labels = [x.parent.name for x in dataset_full]

In [None]:
import numpy as np

In [None]:
from collections import Counter

In [None]:
# Number of images per class label.
Counter(labels)

In [None]:
! pip install --quiet scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Peek at the first image path together with its label.
dataset_full[0], labels[0]

In [None]:
# File name (without directories) of the first image.
dataset_full[0].name

In [None]:
# Stratified split so each class keeps the same proportion in both subsets.
# A fixed seed makes the split reproducible; without it every run would send
# different files to train/ and test/.
d_train, d_test = train_test_split(dataset_full, stratify=labels, random_state=42)

In [None]:
# Images per class in the training split (class = containing folder's full name;
# `.stem` would truncate a folder name that contains a dot).
Counter(x.parent.name for x in d_train)

In [None]:
# Images per class in the test split (class = containing folder's full name;
# `.stem` would truncate a folder name that contains a dot).
Counter(x.parent.name for x in d_test)

In [None]:
# Root of the local git checkout; make sure dataset/train and dataset/test exist
# beneath it (missing parents are created, existing folders are left alone).
git_path = Path("/root/intel-project/example-git")
for path in ("train", "test"):
    output_dir = git_path.joinpath("dataset", path)
    output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import shutil

In [None]:
def write_dataset(image_paths, output_dir):
    """Copy images into ``output_dir/<class>/<file>``, one folder per class.

    The class of an image is the name of the directory that contains it.

    Args:
        image_paths: Iterable of image file paths (``pathlib.Path`` or ``str``).
        output_dir: Destination root (``pathlib.Path`` or ``str``). Class
            sub-folders are created on demand; nothing is created when
            ``image_paths`` is empty.

    Raises:
        OSError: If a source file cannot be read or a destination cannot be
            written.
    """
    output_dir = Path(output_dir)
    for img_path in image_paths:
        img_path = Path(img_path)
        # Use `.name`, not `.stem`: `.stem` would truncate a class folder whose
        # name contains a dot (e.g. "sea.v1" -> "sea").
        class_dir = output_dir / img_path.parent.name
        class_dir.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(img_path, class_dir / img_path.name)

In [None]:
# Materialise both splits under <git_path>/dataset, training files first.
for split_name, split_paths in (("train", d_train), ("test", d_test)):
    write_dataset(split_paths, git_path / "dataset" / split_name)

In [None]:
from torchvision.datasets import ImageFolder

In [None]:
# Load the written training split as a folder-per-class dataset (sanity check).
img_dset = ImageFolder(git_path.joinpath("dataset", "train"))

In [None]:
# Display element 0 of the first sample (presumably the image of an
# (image, class_index) pair — confirm against the torchvision docs).
img_dset[0][0]

In [None]:
# Re-point `img_dset` at the written test split (the name is reused from the train check).
img_dset = ImageFolder(git_path.joinpath("dataset", "test"))

In [None]:
# Class names that ImageFolder found in the test split.
img_dset.classes