In [1]:
!pip install boto3 sagemaker


project_name = "tensorflow-project-A1"

Collecting boto3
  Downloading boto3-1.24.85-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.5/132.5 kB[0m [31m901.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting sagemaker
  Downloading sagemaker-2.110.0.tar.gz (576 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m576.0/576.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Using cached s3transfer-0.6.0-py3-none-any.whl (79 kB)
Collecting botocore<1.28.0,>=1.27.85
  Downloading botocore-1.27.85-py3-none-any.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting attrs<22,>=20.3.0
  Using cached attrs-21.4.0-py2.py3-none-any.whl (60 kB)
Collecting g

In [2]:
import os


# Remember the current working directory. Later cells chdir into the container
# sub-directories (processingContainer / trainingContainer) and use this value
# to get back.
base_dir = os.getcwd()
print(base_dir)

/Users/omerhaim/work/aws/quicklizard-sagemaker


In [3]:
import boto3
import sagemaker


# Resolve the AWS context (account, region, default bucket) from the SageMaker session.
sm_session = sagemaker.Session()
account_id = sm_session.account_id()
# Use the session's public region accessor instead of the private `_region_name`.
region = sm_session.boto_region_name
bucket = sm_session.default_bucket()
# Registry host from which the image URIs in the build and job cells are composed.
ecr_url = f"{account_id}.dkr.ecr.{region}.amazonaws.com"
# `base_dir` was captured by the previous cell and is deliberately not re-read here:
# re-running this cell after an `os.chdir(...)` into a container directory would
# otherwise overwrite it with the wrong path.

print(region, account_id, ecr_url, base_dir)

eu-west-1 910416587115 910416587115.dkr.ecr.eu-west-1.amazonaws.com /Users/omerhaim/work/aws/quicklizard-sagemaker


## Building the Docker image for BYOC processing

In [6]:
os.chdir(f"{base_dir}/processingContainer")

repo_name='processing-byoc'
version='v1'

# create the repo in ECR
!aws ecr describe-repositories --repository-names {repo_name} > /dev/null || aws ecr create-repository --repository-name {repo_name} > /dev/null

# Build to custom image
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {ecr_url}

# If you use an x86 architecture workstation, than run native docker build for your BYC image
#!docker build -t {ecr_url}/{repo_name}:{version} .
#!docker push {ecr_url}/{repo_name}:{version}

# If you are running on Mac M1 you need to build the BYC using buildx
!docker buildx build --platform=linux/amd64 -t {ecr_url}/{repo_name}:{version} . --push



zsh:1: command not found: docker

[Errno 32] Broken pipe


In [7]:
# Go back to base_dir so the relative paths used by later cells (e.g. ./scripts/...) resolve.
os.chdir(base_dir)

## Processing with your own container

This uses a generic ScriptProcessor that runs your image

In [8]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role

try:
    execution_role = get_execution_role() # will work when running in sagemaker
except ValueError:
    # On the local machine place the role ARN manually (replace the role name with your own).
    # A full ARN is used as-is, whereas a bare role name has to be resolved through IAM first;
    # this is the same role the training cell refers to.
    execution_role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20210510T183593"

# S3 input and output
s3_npy_input_train_files = f"s3://{bucket}/{project_name}/train/"
s3_npy_input_test_files = f"s3://{bucket}/{project_name}/test/"
s3_parquet_output_files = f"s3://{bucket}/{project_name}/parquet/"

# Local paths inside the processing container
container_local_input_path = "/opt/ml/processing/input/"
container_local_output_path = "/opt/ml/processing/output/"

# Create a generic script processor that will run your processing to parquet files.
# The container paths are also passed as environment variables
# (presumably read by the processing script -- confirm in scripts/process.py).
processor = ScriptProcessor(
    command=['python3'],
    image_uri=f"{ecr_url}/{repo_name}:{version}",
    role=execution_role,
    instance_count=1,
    instance_type='ml.c5.xlarge',
    env={"sm_input": container_local_input_path,
         "sm_output": container_local_output_path}
    )

# Run the processing script and add input and output files, you can add as many as you want.
# The input is copied from S3 to `destination` in the container; whatever the script leaves
# in the output `source` directory is uploaded to the S3 `destination`.
processor.run(code='./scripts/process.py',
    inputs=[ProcessingInput(
        source=s3_npy_input_train_files,
        destination=container_local_input_path)],
    outputs=[ProcessingOutput(
        source=container_local_output_path,
        destination=s3_parquet_output_files)]
    )

Couldn't call 'get_role' to get Role ARN from role name omer to get Role path.


## Building the training container

In [9]:
os.chdir(f"{base_dir}/trainingContainer")

training_repo_name='training-byoc'
version='v1'

# create the repo in ECR
!aws ecr describe-repositories --repository-names {training_repo_name} > /dev/null || aws ecr create-repository --repository-name {training_repo_name} > /dev/null

# Build to custom image
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {ecr_url}

# If you use an x86 architecture workstation, than run native docker build for your BYC image
#!docker build -t {ecr_url}/{repo_name}:{version} .
#!docker push {ecr_url}/{repo_name}:{version}

# If you are running on Mac M1 you need to build the BYC using buildx
!docker buildx build --platform=linux/amd64 -t {ecr_url}/{training_repo_name}:{version} . --push

zsh:1: command not found: docker

[Errno 32] Broken pipe


## Training

This uses a generic `Estimator`, because it is a BYOC (bring-your-own-container) image.

In [10]:
# Go back to base_dir so that the relative `scripts` paths used by the training cell resolve.
os.chdir(base_dir)

In a training job, save the model to the directory given by the `SM_MODEL_DIR` environment variable. SageMaker takes whatever is in that local directory, compresses it into a `tar.gz` archive and uploads it to S3. The `estimator` object then holds the S3 location of the model, which the next step (the offline processing job for inference) uses as an input.

In [11]:
from sagemaker.estimator import Estimator

# Hyperparameters will be added as command line arguments to the script command, and we will use
# argparse to read them. SageMaker SDK will parse and add them to the train command.
hyperparameters = {'epochs': 15, 'batch_size': 128, 'learning_rate': 0.01 }

# This is a generic estimator for running training on your own containers.
estimator = Estimator(
    source_dir='scripts', # adding source_dir will upload the entire directory
    entry_point='train.py',
    instance_type='ml.c5.xlarge',
    instance_count=1,
    hyperparameters=hyperparameters,
    # Reuse the role resolved for the processing job instead of a hard-coded,
    # account-specific ARN, so the notebook also runs in other accounts.
    role=execution_role,
    base_job_name='ql-byoc',
    image_uri=f"{ecr_url}/{training_repo_name}:{version}"
    )

# Define the input channels for train and test (the channel names are the dict keys).
inputs = {'train': s3_npy_input_train_files, 'test': s3_npy_input_test_files}

# Submit a training job
estimator.fit(inputs)


2022-10-04 14:10:18 Starting - Starting the training job...
2022-10-04 14:10:42 Starting - Preparing the instances for trainingProfilerReport-1664892617: InProgress
......
2022-10-04 14:11:42 Downloading - Downloading input data...
2022-10-04 14:12:22 Training - Downloading the training image...
2022-10-04 14:12:57 Uploading - Uploading generated training model[34m2022-10-04 14:12:44,276 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-04 14:12:44,290 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-04 14:12:44,304 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-10-04 14:12:44,318 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "test": "/opt/ml/input/data/test",
        "train": "/opt/ml/input/data/train"
    },
    

In [14]:
# After `fit`, the estimator holds the S3 URI of the model archive (model.tar.gz) that SageMaker
# uploaded from the training container's SM_MODEL_DIR directory.
print(estimator.model_data)

s3://sagemaker-eu-west-1-910416587115/ql-byoc-2022-10-04-14-10-15-979/output/model.tar.gz


Now that we have a model, we can run a generic processing job on the processing container we built earlier and use it to run the inference.
In this sample I run `model.evaluate`, the same way I would run inference.

In [18]:
# Container directory the model archive is downloaded to (it must live under /opt/ml/processing/).
container_local_model_path = "/opt/ml/processing/model"
# S3 prefix that receives the evaluation output, exactly as it would receive inference results.
s3_inference_result_path = f"s3://{bucket}/{project_name}/results"

# Container paths exposed as environment variables
# (presumably read by the inference script -- confirm in scripts/offline-inference.py).
inference_env = {
    "sm_input": container_local_input_path,
    "sm_output": container_local_output_path,
    "sm_model": container_local_model_path,
}

# Once more a generic script processor, running on the image that has every dependency
# needed for inference installed.
processor = ScriptProcessor(
    image_uri=f"{ecr_url}/{repo_name}:{version}",
    command=["python3"],
    role=execution_role,
    instance_type="ml.c5.xlarge",
    instance_count=1,
    env=inference_env,
)

# Two inputs: the test files used to evaluate the model (the same way inference data would be
# supplied) and the trained model archive itself.
test_data_input = ProcessingInput(
    source=s3_npy_input_test_files,
    destination=container_local_input_path,
)
model_input = ProcessingInput(
    source=estimator.model_data,
    destination=container_local_model_path,
)
# Whatever the script writes to the output directory is uploaded to the results prefix.
results_output = ProcessingOutput(
    source=container_local_output_path,
    destination=s3_inference_result_path,
)

# Launch the processing job that runs the offline inference script.
processor.run(
    code="./scripts/offline-inference.py",
    inputs=[test_data_input, model_input],
    outputs=[results_output],
)


Job Name:  processing-byoc-2022-10-04-14-45-57-596
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-910416587115/tensorflow-project-A1/test/', 'LocalPath': '/opt/ml/processing/input/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-910416587115/ql-byoc-2022-10-04-14-10-15-979/output/model.tar.gz', 'LocalPath': '/opt/ml/processing/model', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-910416587115/processing-byoc-2022-10-04-14-45-57-596/input/code/offline-inference.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyRepli

# Building the pipeline from the above steps

The pipeline will be: Processing job to Generate Parquet files -> Training job to Train -> Processing job to Infer