In [3]:
# Initial setup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

import os, boto3, json, sagemaker, numpy as np
from sagemaker.pytorch import PyTorchModel
from io import BytesIO

def make_dir(path):
    """Create the directory ``path`` if it does not already exist.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the isdir()-then-mkdir() sequence, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)
        
# SageMaker SDK session used by the upload/download calls in later cells.
sagemaker_session = sagemaker.Session()

# AWS settings
role = 'han_s3_full_access'
region = boto3.Session().region_name
bucket = 'sagemaker-han-batch'
prefix = 'batch-images'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)
# S3 keys always use '/' as their separator. os.path.join would insert the
# platform separator (a backslash on Windows), so the keys are joined explicitly.
model_prefix = '/'.join([prefix, 'model'])
input_prefix = '/'.join([prefix, 'inputs'])
output_prefix = '/'.join([prefix, 'outputs'])
inference_prefix = '/'.join([prefix, 'inference'])

# Local settings
base_dir = './'
data_dir = os.path.join(base_dir, 'data')
model_dir = os.path.join(base_dir, 'model')
input_dir = os.path.join(base_dir, 'inputs')
output_dir = os.path.join(base_dir, 'outputs')
inference_dir = os.path.join(base_dir, 'inference')
source_dir = os.path.join(base_dir, 'src')
# Create the local working directories up front.
# NOTE(review): data_dir is not in this list, so it is not created here.
for dir_name in [model_dir, input_dir, output_dir, source_dir, inference_dir]:
    make_dir(dir_name)

In [7]:
# Draw 100 samples from the CIFAR10 dataset
from src.utils import convert_np_to_png, create_init_sample_data

sampled_imgs = create_init_sample_data(data_dir, 100)
convert_np_to_png(inference_dir, sampled_imgs)

# Push the contents of the local inference directory (PNG images) to S3;
# the value returned by upload_data is kept for later cells.
inference_inputs = sagemaker_session.upload_data(
    bucket=bucket,
    key_prefix=inference_prefix,
    path=inference_dir,
)

Files already downloaded and verified


In [10]:
# Build the CIFAR10 train/test tensors and stage them locally and on S3.

# Convert each image to a tensor, then normalise all three channels with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

train_data = datasets.CIFAR10(data_dir, train=True, transform=transform, download=True)
test_data = datasets.CIFAR10(data_dir, train=False, transform=transform, download=True)

# batch_size equals the dataset length, so the first batch drawn from the
# loader is what gets saved for the training split.
training_data_loader = DataLoader(dataset=train_data, batch_size=len(train_data))
training_data_loaded = next(iter(training_data_loader))
torch.save(training_data_loaded, os.path.join(input_dir, 'training.pt'))

# Same procedure for the test split; note it is written to output_dir.
test_data_loader = DataLoader(dataset=test_data, batch_size=len(test_data))
test_data_loaded = next(iter(test_data_loader))
torch.save(test_data_loaded, os.path.join(output_dir, 'test.pt'))

# Upload both local directories under their S3 prefixes.
inputs = sagemaker_session.upload_data(
    path=input_dir,
    bucket=bucket,
    key_prefix=input_prefix,
)
outputs = sagemaker_session.upload_data(
    path=output_dir,
    bucket=bucket,
    key_prefix=output_prefix,
)

In [1]:
# Hard-coded S3 URIs for the three data prefixes. Assigning them here
# overwrites any values of the same names set by earlier upload cells.
inference_inputs = 's3://sagemaker-han-batch/batch-images/inference'
inputs = 's3://sagemaker-han-batch/batch-images/inputs'
outputs = 's3://sagemaker-han-batch/batch-images/outputs'

In [4]:
# Configure and launch the SageMaker training job
from sagemaker.pytorch import PyTorch

# The entry point resnet.py is taken from the local "src" directory.
estimator = PyTorch(
    entry_point="resnet.py",
    source_dir="src",
    role=role,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    framework_version='1.12.0',
    py_version='py38',
    hyperparameters={'batch-size': 128, 'lr': 0.01, 'epochs': 1},
)

# The uploaded input location is passed to the job as the 'training' channel.
estimator.fit({'training': inputs})

2022-08-22 10:11:10 Starting - Starting the training job...ProfilerReport-1661163068: InProgress
...
2022-08-22 10:12:13 Starting - Preparing the instances for training....
2022-08-22 10:21:55 Downloading - Downloading input data
2022-08-22 10:21:55 Training - Training image download completed. Training in progress.
2022-08-22 10:21:55 Uploading - Uploading generated training model
2022-08-22 10:21:55 Completed - Training job completed
ProfilerReport-1661163068: NoIssuesFound
[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-08-22 10:15:29,984 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-08-22 10:15:29,987 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-08-22 10:15:29,997 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-08-22 10:15:30,01

In [8]:
# Deploy the trained estimator to one ml.m4.xlarge instance and keep the
# returned predictor for the cells that follow.
cifar10_predictor = estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)


--------!

In [13]:
# Smoke-test the endpoint with one random tensor created as (1, 3, 32, 32).
test_input = torch.rand(1, 3, 32, 32)
test_output = cifar10_predictor.predict(test_input)

# Print the shape of the request first, then the shape of the response.
for tensor_like in (test_input, test_output):
    print(tensor_like.shape)

torch.Size([1, 3, 32, 32])
(1, 10)


In [14]:
# Delete the endpoint behind `cifar10_predictor` once the smoke test is done.
cifar10_predictor.delete_endpoint()

In [16]:
# S3 URI of a model.tar.gz artifact.
# NOTE(review): no other cell visible in this section reads `model_path`; the
# batch-transform cell assigns its own `model_data` — confirm which artifact
# is the intended one.
model_path = 's3://sagemaker-us-west-2-582981179587/pytorch-training-2022-08-22-11-21-23-382/model.tar.gz'

In [135]:
from sagemaker.pytorch.model import PyTorchModel

# Batch-transform tuning knobs; None leaves each setting unspecified so the
# service defaults apply.
max_concurrent_transforms = None
max_payload = None

# Input prefix holding the images to score, and the location for the results.
# The output location embeds both knob values so runs can be told apart.
inference_inputs = 's3://sagemaker-han/sagemaker/batch_transform'
output_s3_path = 'https://sagemaker-han.s3.us-west-2.amazonaws.com/sagemaker/batch_transform_output_{}_{}'.format(max_concurrent_transforms,max_payload)


# Trained model artifact that the transform job serves.
model_data = 's3://sagemaker-us-west-2-608095525235/pytorch-training-2022-08-20-08-30-28-845/output/model.tar.gz'

# Wrap the artifact together with the inference script from the "src" directory.
pytorch_model = PyTorchModel(model_data = model_data,
                             entry_point='resnet_deploy.py',
                             source_dir = 'src',
                             framework_version='1.12.0',
                             py_version='py38',
                             role = role)

# `transformer` was previously used without being created anywhere in the
# notebook, so this cell only worked with leftover kernel state. Build it
# explicitly from the model defined above.
# NOTE(review): instance_count / instance_type mirror the training and endpoint
# cells of this notebook — confirm they match the intended batch configuration.
transformer = pytorch_model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    max_concurrent_transforms=max_concurrent_transforms,
    max_payload=max_payload,
    output_path=output_s3_path,
)

# Run the transform over every object under the input prefix and block until
# the job finishes.
transformer.transform(
    data=inference_inputs,
    data_type="S3Prefix",
    content_type="application/x-image",
    wait=True,
)

# transformer = o2v.transformer(instance_count=4,
#                               instance_type="ml.p2.xlarge",
#                               max_concurrent_transforms=2,
#                               max_payload=1,  # 1MB
#                               strategy='MultiRecord',
#                               env={'INFERENCE_PREFERRED_MODE': 'classification'},  # only useful with GPU
#                               output_path=output_s3_path)

# {"in0": [6, 17, 606, 19, 53, 67, 52, 12, 5, 10, 15, 10178, 7, 33, 652, 80, 15, 69, 821, 4], "in1": [16, 21, 13, 45, 14, 9, 80, 59, 164, 4]}
# {"in0": [22, 1016, 32, 13, 25, 11, 5, 64, 573, 45, 5, 80, 15, 67, 21, 7, 9, 107, 4], "in1": [22, 32, 13, 25, 1016, 573, 3252, 4]}
# {"in0": [774, 14, 21, 206], "in1": [21, 366, 125]}

# {"scores":[0.195667684078216,0.395351558923721,0.408980727195739]}
# {"scores":[0.251988261938095,0.258233487606048,0.489778339862823]}
# {"scores":[0.280087798833847,0.368331134319305,0.351581096649169]}



[34m2022-08-21T09:06:53,604 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...[0m
[34m2022-08-21T09:06:53,754 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.6.0[0m
[34mTS Home: /opt/conda/lib/python3.8/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 2[0m
[34mMax heap size: 980 M[0m
[34mPython executable: /opt/conda/bin/python3.8[0m
[34mConfig file: /etc/sagemaker-ts.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mMetrics address: http://127.0.0.1:8082[0m
[34mModel Store: /.sagemaker/ts/models[0m
[34mInitial Models: model=/opt/ml/model[0m
[34mLog dir: /logs[0m
[34mMetrics dir: /logs[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 2[0m
[34mBlacklist Regex: N/A[0m
[34mMaximum Response

[32m2022-08-21T09:07:02.682:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m2022-08-21T09:07:06,804 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661072826804[0m
[34m2022-08-21T09:07:06,807 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend received inference at: 1661072826[0m
[34m2022-08-21T09:07:06,821 [INFO ] W-9000-model_1.0-stdout MODEL_METRICS - PredictionTime.Milliseconds:14.58|#ModelName:model,Level:Model|#hostname:59c63a18ea25,requestID:b9a671c9-ca1c-441a-9577-0b8eb3f12313,timestamp:1661072826[0m
[34m2022-08-21T09:07:06,821 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Backend response time: 13[0m
[34m2022-08-21T09:07:06,822 [INFO ] W-9000-model_1.0 ACCESS_LOG - /169.254.255.130:51170 "POST /invocations HTTP/1.1" 200 18[0m
[34m2022-08-21T09:07:06,824 [INFO ] W-9000-model_1.0 TS_METRICS - Requests2XX.Count:1|#Level:Host|#hostname:59c63a18ea25,timestamp:16

[35m2022-08-21T09:07:16,823 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661072836823[0m
[35m2022-08-21T09:07:16,824 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend received inference at: 1661072836[0m
[35m2022-08-21T09:07:16,824 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Backend response time: 1[0m
[35m2022-08-21T09:07:16,825 [INFO ] W-9000-model_1.0 ACCESS_LOG - /169.254.255.130:37156 "POST /invocations HTTP/1.1" 500 2[0m
[35m2022-08-21T09:07:16,825 [INFO ] W-9000-model_1.0 TS_METRICS - Requests5XX.Count:1|#Level:Host|#hostname:59c63a18ea25,timestamp:1661072822[0m
[35m2022-08-21T09:07:16,825 [INFO ] W-9000-model_1.0 TS_METRICS - QueueTime.ms:0|#Level:Host|#hostname:59c63a18ea25,timestamp:1661072836[0m
[35m2022-08-21T09:07:16,825 [INFO ] W-9000-model_1.0 TS_METRICS - WorkerThreadTime.ms:1|#Level:Host|#hostname:59c63a18ea25,timestamp:1661072836[0m
[35m2022-08-21T09:07:16,825 [INFO ] W-9000-model_1.0-st

[34m2022-08-21T09:07:17,928 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661072837928[0m
[34m2022-08-21T09:07:17,929 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend received inference at: 1661072837[0m
[35m2022-08-21T09:07:17,928 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661072837928[0m
[35m2022-08-21T09:07:17,929 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend received inference at: 1661072837[0m
[34m2022-08-21T09:07:17,929 [INFO ] W-9000-model_1.0-stdout MODEL_METRICS - PredictionTime.Milliseconds:0.46|#ModelName:model,Level:Model|#hostname:59c63a18ea25,requestID:53092055-2cff-4f53-a0c5-b3e7df8ef5bc,timestamp:1661072837[0m
[34m2022-08-21T09:07:17,930 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Backend response time: 0[0m
[34m2022-08-21T09:07:17,930 [INFO ] W-9000-model_1.0 ACCESS_LOG - /169.254.255.130:54694 "POST /invocations HTTP/1.1" 500 3[0m
[3

[35m2022-08-21T09:07:17,979 [INFO ] W-9001-model_1.0 ACCESS_LOG - /169.254.255.130:54754 "POST /invocations HTTP/1.1" 500 2[0m
[35m2022-08-21T09:07:17,980 [INFO ] W-9001-model_1.0 TS_METRICS - Requests5XX.Count:1|#Level:Host|#hostname:59c63a18ea25,timestamp:1661072822[0m
[35m2022-08-21T09:07:17,980 [INFO ] W-9001-model_1.0 TS_METRICS - QueueTime.ms:0|#Level:Host|#hostname:59c63a18ea25,timestamp:1661072837[0m
[35m2022-08-21T09:07:17,980 [INFO ] W-9001-model_1.0 TS_METRICS - WorkerThreadTime.ms:2|#Level:Host|#hostname:59c63a18ea25,timestamp:1661072837[0m
[35m2022-08-21T09:07:17,984 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661072837984[0m
[35m2022-08-21T09:07:17,985 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend received inference at: 1661072837[0m
[35m2022-08-21T09:07:17,986 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Backend response time: 1[0m
[35m2022-08-21T09:07:17,986 [INFO ] W-9000-model_1.0 AC

UnexpectedStatusException: Error for Transform job pytorch-inference-2022-08-21-09-02-13-653: Failed. Reason: AlgorithmError: See job logs for more information

In [128]:
# Show the name of the most recent transform job tracked by `transformer`.
print("Latest transform job:", transformer.latest_transform_job.name)

Latest transform job: pytorch-inference-2022-08-21-08-18-59-697


In [126]:
# Ask `transformer` to stop its transform job.
transformer.stop_transform_job()

[34m2022-08-21T06:36:10,025 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...[0m
[34m2022-08-21T06:36:10,173 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.6.0[0m
[34mTS Home: /opt/conda/lib/python3.8/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 2[0m
[34mMax heap size: 980 M[0m
[34mPython executable: /opt/conda/bin/python3.8[0m
[34mConfig file: /etc/sagemaker-ts.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mMetrics address: http://127.0.0.1:8082[0m
[34mModel Store: /.sagemaker/ts/models[0m
[34mInitial Models: model=/opt/ml/model[0m
[34mLog dir: /logs[0m
[34mMetrics dir: /logs[0m
[35m2022-08-21T06:36:10,025 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...[0m
[35m2022-08-21T06:

[34m2022-08-21T06:36:28,660 [INFO ] W-9001-model_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661063788660[0m
[34m2022-08-21T06:36:28,661 [INFO ] W-9001-model_1.0-stdout MODEL_LOG - Backend received inference at: 1661063788[0m
[34m2022-08-21T06:36:28,661 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661063788661[0m
[34m2022-08-21T06:36:28,662 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Backend received inference at: 1661063788[0m
[34m2022-08-21T06:36:28,662 [INFO ] W-9001-model_1.0 org.pytorch.serve.wlm.WorkerThread - Backend response time: 2[0m
[34m2022-08-21T06:36:28,662 [INFO ] W-9001-model_1.0 ACCESS_LOG - /169.254.255.130:58684 "POST /invocations HTTP/1.1" 500 3[0m
[34m2022-08-21T06:36:28,663 [INFO ] W-9001-model_1.0 TS_METRICS - Requests5XX.Count:1|#Level:Host|#hostname:3717f2878b4a,timestamp:1661063778[0m
[34m2022-08-21T06:36:28,663 [INFO ] W-9001-model_1.0 TS_METRICS - QueueTime.ms:0|#Level:H

[35m2022-08-21T06:37:11,334 [INFO ] pool-3-thread-1 TS_METRICS - MemoryUtilization.Percent:36.2|#Level:Host|#hostname:3717f2878b4a,timestamp:1661063831[0m


UnexpectedStatusException: Error for Transform job pytorch-inference-2022-08-21-06-31-31-788: Failed. Reason: AlgorithmError: See job logs for more information

In [113]:
import pprint as pp

# Client object exposed by the SDK session; used here for the describe call.
sm_cli = sagemaker_session.sagemaker_client
job_name = 'pytorch-inference-2022-08-21-03-13-37-242'

# Fetch the description of the named transform job and pretty-print it.
job_info = sm_cli.describe_transform_job(TransformJobName=job_name)
pp.pprint(job_info)

{'CreationTime': datetime.datetime(2022, 8, 21, 12, 13, 37, 860000, tzinfo=tzlocal()),
 'DataProcessing': {'InputFilter': '$',
                    'JoinSource': 'None',
                    'OutputFilter': '$'},
 'FailureReason': 'AlgorithmError: See job logs for more information',
 'ModelName': 'pytorch-inference-2022-08-21-03-13-10-667',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '949',
                                      'content-type': 'application/x-amz-json-1.1',
                                      'date': 'Sun, 21 Aug 2022 05:10:13 GMT',
                                      'x-amzn-requestid': 'cd218913-c7fe-4e1f-9b31-2b2fa07372d4'},
                      'HTTPStatusCode': 200,
                      'RequestId': 'cd218913-c7fe-4e1f-9b31-2b2fa07372d4',
                      'RetryAttempts': 0},
 'TransformEndTime': datetime.datetime(2022, 8, 21, 12, 18, 21, 796000, tzinfo=tzlocal()),
 'TransformInput': {'CompressionType': 'None',
                    'ContentType'

In [117]:
# Local directory that receives the downloaded objects.
local_path = 'outputs'

# Download everything stored under the given key prefix of `bucket`.
sagemaker_session.download_data(
    path=local_path,
    bucket=bucket,
    key_prefix='sagemaker/batch_transform/outputs',
)

In [118]:
import json

# Print the JSON content of every file found directly inside `local_path`.
# Sorting makes the output order deterministic (os.listdir order is arbitrary).
for file_name in sorted(os.listdir(local_path)):
    file_path = os.path.join(local_path, file_name)
    # Skip sub-directories, which open() cannot read.
    if not os.path.isfile(file_path):
        continue
    # A distinct name for the handle avoids rebinding the loop variable,
    # which the previous `for f ...: with open(...) as f` form did.
    with open(file_path, "r", encoding="utf-8") as pred_file:
        pred = json.load(pred_file)
    print(pred)

{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'predictions': 8}
{'prediction