In [45]:
!%pip install boto3 sagemaker

project_name = "tensorflow-project-A1"

In [2]:
import os

base_dir = os.getcwd()
print(base_dir)

/Users/omerhaim/work/aws/quicklizard-sagemaker


In [3]:
import boto3
import os 
import sagemaker

# session = boto3.session.Session()
# region = session.region_name
# account_id = boto3.client('sts').get_caller_identity().get('Account')

sm_session = sagemaker.Session()
account_id = sm_session.account_id()
region = sm_session._region_name
bucket = sm_session.default_bucket() 
ecr_url = f"{account_id}.dkr.ecr.{region}.amazonaws.com"
base_dir = os.getcwd()

print(region, account_id, ecr_url, base_dir)

eu-west-1 910416587115 910416587115.dkr.ecr.eu-west-1.amazonaws.com /Users/omerhaim/work/aws/quicklizard-sagemaker


## Building to docker image for BYOC processing

In [37]:
os.chdir(f"{base_dir}/processingContainer")

repo_name='processing-byoc'
version='v1'

# create the repo in ECR
!aws ecr describe-repositories --repository-names {repo_name} > /dev/null || aws ecr create-repository --repository-name {repo_name} > /dev/null

# Build to custom image
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {ecr_url}

#!docker build -t {ecr_url}/{repo_name}:{version} .
#!docker push {ecr_url}/{repo_name}:{version}

!docker buildx build --platform=linux/amd64 -t {ecr_url}/{repo_name}:{version} . --push



Login Succeeded
[1A[1B[0G[?25l[+] Building 0.0s (0/0)                                                         
[?25h[1A[0G[?25l[+] Building 0.1s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 157B                                       0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m => [internal] load metadata for docker.io/library/python:3.10.7-bullseye  0.1s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 157B                                       0.0s
[0m[34m => [internal] load .dockerignore           

In [38]:
os.chdir(base_dir)

## Processing with your own container

This uses a generic ScriptProcessor that runs your image

In [47]:
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker import get_execution_role

try:
    execution_role = get_execution_role() # will work when running in sagemaker
except ValueError:
    execution_role = "AmazonSageMaker-ExecutionRole-20220125T110249" # on the local machine place the ARN manually

# S3 input and output
s3_npy_input_train_files = f"s3://{bucket}/{project_name}/train/"
s3_npy_input_test_files = f"s3://{bucket}/{project_name}/test/"
s3_parquet_output_files = f"s3://{bucket}/{project_name}/parquet/"

# local processing path
container_local_input_path = "/opt/ml/processing/input/"
container_local_output_path = "/opt/ml/processing/output/"


processor = ScriptProcessor(
    command=['python3'],
    image_uri=f"{ecr_url}/{repo_name}:{version}",
    role=execution_role,
    instance_count=1,
    instance_type='ml.c5.xlarge',
    env={"sm_input": container_local_input_path,
         "sm_output": container_local_output_path}
    )

processor.run(code='./scripts/process.py',
    inputs=[ProcessingInput(
        source=s3_npy_input_train_files,
        destination=container_local_input_path)],
    outputs=[ProcessingOutput(
        source=container_local_output_path,
        destination=s3_parquet_output_files)]
    )

Couldn't call 'get_role' to get Role ARN from role name omer to get Role path.



Job Name:  processing-byoc-2022-10-04-08-00-41-090
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-910416587115/tensorflow-project-A1/train/', 'LocalPath': '/opt/ml/processing/input/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-910416587115/processing-byoc-2022-10-04-08-00-41-090/input/code/process.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-west-1-910416587115/tensorflow-project-A1/parquet/', 'LocalPath': '/opt/ml/processing/output/', 'S3UploadMode': 'EndOfJob'}}]
.........................[34mThis is a processing job[0m
[34menviro

## Building the training container

In [52]:
os.chdir(f"{base_dir}/trainingContainer")

training_repo_name='training-byoc'
version='v1'

# create the repo in ECR
!aws ecr describe-repositories --repository-names {training_repo_name} > /dev/null || aws ecr create-repository --repository-name {training_repo_name} > /dev/null

# Build to custom image
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {ecr_url}

#!docker build -t {ecr_url}/{repo_name}:{version} .
#!docker push {ecr_url}/{repo_name}:{version}

!docker buildx build --platform=linux/amd64 -t {ecr_url}/{training_repo_name}:{version} . --push

## Training

This uses a generic Estimator as its a BYOC

In [53]:
os.chdir(base_dir)

In a training job, save the model according to `SM_MODEL_DIR` environment variable. SageMaker will take what is in the local directory, it will compress it using tar.gz and upload it to S3, then the `estimator` object will have the output location of the model in S3 for the next step of offline processing job for inference

In [54]:
from sagemaker.estimator import Estimator

hyperparameters = {'epochs': 100, 'batch_size': 128, 'learning_rate': 0.01 }

estimator = Estimator(
    source_dir='scripts', # adding source_dir will upload the entire directory
    entry_point='train.py',
    instance_type='ml.c5.xlarge',
    instance_count=1,
    hyperparameters=hyperparameters,
    role=execution_role,
    base_job_name='ql-byoc',
    image_uri=f"{ecr_url}/{repo_name}:{version}"
    )

inputs = {'train': s3_npy_input_train_files, 'test': s3_npy_input_test_files}
estimator.fit(inputs, wait=True)


2022-10-04 08:20:43 Starting - Starting the training job...
2022-10-04 08:21:06 Starting - Preparing the instances for trainingProfilerReport-1664871640: InProgress
......
2022-10-04 08:22:06 Downloading - Downloading input data...
2022-10-04 08:22:46 Training - Downloading the training image...
2022-10-04 08:23:18 Uploading - Uploading generated training model
2022-10-04 08:23:18 Failed - Training job failed
..

UnexpectedStatusException: Error for Training job ql-byoc-2022-10-04-08-20-39-306: Failed. Reason: ClientError: Artifact upload failed:API error (400): OCI runtime create failed: container_linux.go:380: starting container process caused: exec: "train": executable file not found in $PATH: unknown