In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

import os

def make_dir(path):
    if os.path.isdir(data_path):
        pass
    else:
        os.path.make_dir(path)

In [37]:
# 初期設定
import sagemaker
import os
import boto3
import re
import numpy as np

sagemaker_session = sagemaker.Session()

# AWS設定
role = 'FullAccessHan'
region = boto3.Session().region_name
bucket='sagemaker-han'
prefix = 'sagemaker/cnn-cifar10'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)

# Local設定
data_dir = 'cifar10/testing'

In [24]:
# データセットの準備 → S3に保存　
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
])

train_data = datasets.CIFAR10(root=training_dir, train=True, download=True, transform=transform)
test_data = datasets.CIFAR10(root=test_dir, train=False, download=True, transform=transform)

training_data_loader = DataLoader(train_data, batch_size=len(train_data))
training_data_loaded = next(iter(training_data_loader))
torch.save(training_data_loaded, os.path.join(data_dir, 'training.pt'))

test_data_loader = DataLoader(test_data, batch_size=len(test_data))
test_data_loaded = next(iter(test_data_loader))
torch.save(test_data_loaded, os.path.join(data_dir, 'test.pt'))

inputs = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print('input spec (in this case, just an S3 path): {}'.format(inputs))

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to mnist_png/training/cifar-10-python.tar.gz


100.0%


Extracting mnist_png/training/cifar-10-python.tar.gz to mnist_png/training
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to mnist_png/testing/cifar-10-python.tar.gz


100.0%


Extracting mnist_png/testing/cifar-10-python.tar.gz to mnist_png/testing
input spec (in this case, just an S3 path): s3://sagemaker-han/sagemaker/cnn-cifar10


In [26]:
# Train Script
!pygmentize ./src/resnet_train.py

[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01margparse[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mjson[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mlogging[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mos[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01msys[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mdistributed[39;00m [38;2;0;128;0;01mas[39;00m [38;2;0;0;255;01mdist[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mnn[39;00m [38;2;0;128;0;01mas[39;00m [38;2;0;0;255;01mnn[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mnn[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mfunctional[39;00m [38;2;0;128;0;01mas[39;00m [38;2;0;0;255;01mF[39;

In [30]:
# Create Training Container
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point="resnet_train.py",
                    role=role,
                    source_dir = "src",
                    framework_version='1.12.0',
                    py_version='py38',
                    instance_count=1,
                    instance_type='ml.m4.xlarge',
                    hyperparameters={
                        'batch-size':128,
                        'lr': 0.01,
                        'epochs': 1,
                    })

In [31]:
estimator.fit({'training': inputs})

2022-08-20 08:30:34 Starting - Starting the training job...
2022-08-20 08:31:00 Starting - Preparing the instances for trainingProfilerReport-1660984232: InProgress
......
2022-08-20 08:32:12 Downloading - Downloading input data......
2022-08-20 08:33:12 Training - Downloading the training image......
2022-08-20 08:34:18 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-08-20 08:34:15,963 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-08-20 08:34:15,965 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-08-20 08:34:15,974 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-08-20 08:34:15,983 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m20

[34mTest set: Average loss: -1459.9684, Accuracy: 1851/10000 (19%)[0m
[34mINFO:__main__:Test set: Average loss: -1459.9684, Accuracy: 1851/10000 (19%)[0m
[34mSaving the model.[0m
[34mINFO:__main__:Saving the model.[0m
[34m2022-08-20 08:40:13,274 sagemaker-training-toolkit INFO     Waiting for the process to finish and give a return code.[0m
[34m2022-08-20 08:40:13,274 sagemaker-training-toolkit INFO     Done waiting for a return code. Received 0 from exiting process.[0m
[34m2022-08-20 08:40:13,275 sagemaker-training-toolkit INFO     Reporting training SUCCESS[0m

2022-08-20 08:40:39 Uploading - Uploading generated training model
2022-08-20 08:40:39 Completed - Training job completed
ProfilerReport-1660984232: NoIssuesFound
Training seconds: 504
Billable seconds: 504


In [34]:
!pygmentize ./src/resnet_deploy.py

[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mos[39;00m

[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01msys[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mnn[39;00m [38;2;0;128;0;01mas[39;00m [38;2;0;0;255;01mnn[39;00m
[38;2;0;128;0;01mimport[39;00m [38;2;0;0;255;01mtorch[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mnn[39;00m[38;2;0;0;255;01m.[39;00m[38;2;0;0;255;01mfunctional[39;00m [38;2;0;128;0;01mas[39;00m [38;2;0;0;255;01mF[39;00m


[38;2;61;123;123;03m# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py[39;00m
[38;2;0;128;0;01mclass[39;00m [38;2;0;0;255;01mBasicBlock[39;00m(nn[38;2;102;102;102m.[39mModule):
    expansion [38;2;102;102;102m=[39m [38;2;102;102;102m1[39m

    [38;2;0;128;0;01mdef[39;00m [3

In [54]:
from sagemaker.pytorch.model import PyTorchModel

model_data = 's3://sagemaker-us-west-2-608095525235/pytorch-training-2022-08-20-08-30-28-845/output/model.tar.gz'
inference_inputs = 's3://sagemaker-han/sagemaker/batch_transform'
output_s3_path = 'https://sagemaker-han.s3.us-west-2.amazonaws.com/sagemaker/batch_transform_output_one_predict'

pytorch_model = PyTorchModel(model_data = model_data,
                             entry_point='resnet_train.py',
                             source_dir = 'src',
                             framework_version='1.12.0',
                             py_version='py38',
                             role = role)
predictor = pytorch_model.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')

-----!

In [87]:
# Select 5 images and labels from test set
import numpy as np
n_sample = 1100
n_test = len(test_data)
sampled_index = np.random.choice(n_test, size=n_sample, replace=False)
sampled_testimg = np.array([test_data[i][0].numpy() for i in sampled_index])
sampled_testlabel = np.array([test_data[i][1] for i in sampled_index])

# Run inference and pick up the most likely label based on the score
labels = []
for split_sample in np.array_split(sampled_testimg,3):
    prediction_scores = predictor.predict(split_sample)
    labels += np.argmax(prediction_scores,axis = 1).tolist()
    
print("Predicted labels: {}".format(labels))
print("Ground Truth:     {}".format(sampled_testlabel))
predictor.delete_endpoint()

Predicted labels: [8, 6, 6, 8, 8, 8, 6, 6, 8, 8, 6, 8, 6, 8, 6, 6, 6, 6, 8, 6, 8, 6, 8, 6, 6, 8, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 8, 8, 6, 6, 6, 8, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 8, 8, 8, 6, 6, 8, 8, 8, 6, 8, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 8, 6, 6, 6, 6, 8, 6, 8, 8, 8, 6, 8, 8, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 8, 6, 6, 6, 8, 6, 8, 6, 6, 6, 6, 8, 8, 8, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 8, 6, 6, 8, 8, 6, 6, 6, 8, 6, 8, 6, 8, 6, 6, 6, 8, 8, 8, 6, 8, 8, 6, 6, 8, 6, 6, 8, 6, 8, 6, 6, 6, 8, 8, 6, 8, 8, 6, 6, 6, 8, 8, 6, 8, 6, 8, 6, 6, 6, 8, 6, 8, 8, 6, 6, 8, 6, 8, 8, 8, 8, 8, 6, 6, 6, 8, 8, 8, 8, 6, 8, 8, 8, 8, 6, 6, 6, 8, 6, 6, 8, 6, 6, 8, 6, 6, 8, 6, 6, 6, 6, 8, 6, 6, 6, 8, 6, 6, 4, 6, 6, 8, 6, 6, 8, 8, 6, 6, 6, 8, 8, 6, 6, 8, 6, 8, 6, 8, 6, 6, 8, 8, 8, 6, 8, 6, 6, 8, 6, 6, 8, 8, 8, 6, 6, 6, 6, 8, 6, 8, 6, 6, 6, 6, 6, 6, 8, 6, 8, 8, 6, 8, 8, 8, 8, 6, 6, 8, 8, 6, 8, 8, 6, 8, 6, 8, 6, 8, 8, 8, 6, 8, 8, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 

In [104]:
# sample_dataloaded = test_data_loaded[0][:1100].numpy()
# sampled_1100_path = './data/sampled_1100.npy'
# np.save(sampled_1100_path,sample_dataloaded)

# prefix = 'sagemaker-han/sagemaker/cifar10'

# new_input = sagemaker_session.upload_data(path=sampled_1100_path, bucket=bucket, key_prefix=prefix)

In [160]:
# datset files
dataset_jsonl_file="./data/cifar10.jsonl"

with open(dataset_jsonl_file, "w+") as outfile:
    for row in sample_dataloaded:
        json.dump(row.tolist(), outfile)
        outfile.write('\n')
        
prefix = 'sagemaker-han/sagemaker/cifar10'

new_input = sagemaker_session.upload_data(path=dataset_jsonl_file, bucket=bucket, key_prefix=prefix)

In [165]:
!pip install jsonlines
import jsonlines
import numpy as np
tmp = './data/test_img_path.json'
inputs = []
with jsonlines.open(dataset_jsonl_file) as reader:
    for obj in reader:
        inputs.append(np.array(obj))
inputs.pop(0)
print(len(inputs))
print




TypeError: pop expected at most 1 argument, got 2

In [138]:
model_data = 's3://sagemaker-us-west-2-608095525235/pytorch-training-2022-08-20-08-30-28-845/output/model.tar.gz'
inference_inputs = 's3://sagemaker-han/sagemaker/batch_transform'
output_s3_path = 'https://sagemaker-han.s3.us-west-2.amazonaws.com/sagemaker/batch_transform_output_one_predict'

pytorch_model = PyTorchModel(model_data = model_data,
                             entry_point='resnet_deploy.py',
                             source_dir = 'src',
                             framework_version='1.12.0',
                             py_version='py38',
                             role = role)

In [139]:
# Batch Tranform

max_concurrent_transforms = None
max_payload = None

inference_inputs = 's3://sagemaker-han/sagemaker/batch_transform'
output_s3_path = 'https://sagemaker-han.s3.us-west-2.amazonaws.com/sagemaker/batch_transform_output_cifar10_{}_{}'.format(max_concurrent_transforms,max_payload)

transformer = pytorch_model.transformer(instance_count=1, 
                                        instance_type="ml.m4.xlarge",
                                        output_path=output_s3_path,
                                        max_concurrent_transforms = max_concurrent_transforms,
                                        max_payload = max_payload
                                       )


In [140]:

transformer.transform(
    data=new_input,
    wait=True,
    content_type='application/jsonlines',    
    split_type='Line'
)

[34m2022-08-21T17:20:13,468 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...[0m
[34m2022-08-21T17:20:13,601 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.6.0[0m
[34mTS Home: /opt/conda/lib/python3.8/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 4[0m
[34mMax heap size: 3088 M[0m
[34mPython executable: /opt/conda/bin/python3.8[0m
[34mConfig file: /etc/sagemaker-ts.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mMetrics address: http://127.0.0.1:8082[0m
[34mModel Store: /.sagemaker/ts/models[0m
[34mInitial Models: model=/opt/ml/model[0m
[34mLog dir: /logs[0m
[34mMetrics dir: /logs[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 4[0m
[34mBlacklist Regex: N/A[0m
[34mMaximum Respons


[34m2022-08-21T17:20:20,876 [INFO ] pool-2-thread-5 ACCESS_LOG - /169.254.255.130:33978 "GET /ping HTTP/1.1" 200 35[0m
[34m2022-08-21T17:20:20,878 [INFO ] pool-2-thread-5 TS_METRICS - Requests2XX.Count:1|#Level:Host|#hostname:56a3d289cfe0,timestamp:1661102420[0m
[34m2022-08-21T17:20:20,907 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:33982 "GET /execution-parameters HTTP/1.1" 404 2[0m
[34m2022-08-21T17:20:20,910 [INFO ] epollEventLoopGroup-3-2 TS_METRICS - Requests4XX.Count:1|#Level:Host|#hostname:56a3d289cfe0,timestamp:1661102420[0m
[35m2022-08-21T17:20:20,876 [INFO ] pool-2-thread-5 ACCESS_LOG - /169.254.255.130:33978 "GET /ping HTTP/1.1" 200 35[0m
[35m2022-08-21T17:20:20,878 [INFO ] pool-2-thread-5 TS_METRICS - Requests2XX.Count:1|#Level:Host|#hostname:56a3d289cfe0,timestamp:1661102420[0m
[35m2022-08-21T17:20:20,907 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:33982 "GET /execution-parameters HTTP/1.1" 404 2[0m
[35m2022-08-21T17:20:2

[32m2022-08-21T17:20:23.065:[sagemaker logs]: sagemaker-han/sagemaker-han/sagemaker/cifar10/cifar10.jsonl:     raise JSONDecodeError("Extra data", s, end)[0m
[32m2022-08-21T17:20:23.065:[sagemaker logs]: sagemaker-han/sagemaker-han/sagemaker/cifar10/cifar10.jsonl: json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 65293)[0m


UnexpectedStatusException: Error for Transform job pytorch-inference-2022-08-21-17-14-31-810: Failed. Reason: AlgorithmError: See job logs for more information

In [131]:
transformer.transform(
    data=new_input,
    wait=True,
    content_type='application/jsonlines',    
    split_type='Line'
)

[34m2022-08-21T16:49:47,004 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...[0m
[34m2022-08-21T16:49:47,157 [INFO ] main org.pytorch.serve.ModelServer - [0m
[34mTorchserve version: 0.6.0[0m
[34mTS Home: /opt/conda/lib/python3.8/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 0[0m
[34mNumber of CPUs: 4[0m
[34mMax heap size: 3070 M[0m
[34mPython executable: /opt/conda/bin/python3.8[0m
[34mConfig file: /etc/sagemaker-ts.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mMetrics address: http://127.0.0.1:8082[0m
[34mModel Store: /.sagemaker/ts/models[0m
[34mInitial Models: model=/opt/ml/model[0m
[34mLog dir: /logs[0m
[34mMetrics dir: /logs[0m
[34mNetty threads: 0[0m
[34mNetty client threads: 0[0m
[34mDefault workers per model: 4[0m
[34mBlacklist Regex: N/A[0m
[34mMaximum Respons

[34m2022-08-21T16:49:53,855 [INFO ] pool-2-thread-5 ACCESS_LOG - /169.254.255.130:35634 "GET /ping HTTP/1.1" 200 34[0m
[34m2022-08-21T16:49:53,856 [INFO ] pool-2-thread-5 TS_METRICS - Requests2XX.Count:1|#Level:Host|#hostname:92efc52e3711,timestamp:1661100593[0m
[34m2022-08-21T16:49:53,900 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:35648 "GET /execution-parameters HTTP/1.1" 404 2[0m
[34m2022-08-21T16:49:53,903 [INFO ] epollEventLoopGroup-3-2 TS_METRICS - Requests4XX.Count:1|#Level:Host|#hostname:92efc52e3711,timestamp:1661100593[0m
[35m2022-08-21T16:49:53,855 [INFO ] pool-2-thread-5 ACCESS_LOG - /169.254.255.130:35634 "GET /ping HTTP/1.1" 200 34[0m
[35m2022-08-21T16:49:53,856 [INFO ] pool-2-thread-5 TS_METRICS - Requests2XX.Count:1|#Level:Host|#hostname:92efc52e3711,timestamp:1661100593[0m
[35m2022-08-21T16:49:53,900 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:35648 "GET /execution-parameters HTTP/1.1" 404 2[0m
[35m2022-08-21T16:49:53

[34m2022-08-21T16:49:57,475 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Listening on port: /home/model-server/tmp/.ts.sock.9000[0m
[34m2022-08-21T16:49:57,477 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - [PID]158[0m
[34m2022-08-21T16:49:57,477 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Torch worker started.[0m
[34m2022-08-21T16:49:57,478 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Python runtime: 3.8.13[0m
[34m2022-08-21T16:49:57,478 [INFO ] W-9000-model_1.0 org.pytorch.serve.wlm.WorkerThread - Connecting to: /home/model-server/tmp/.ts.sock.9000[0m
[34m2022-08-21T16:49:57,480 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Connection accepted: /home/model-server/tmp/.ts.sock.9000.[0m
[35m2022-08-21T16:49:57,475 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Listening on port: /home/model-server/tmp/.ts.sock.9000[0m
[35m2022-08-21T16:49:57,477 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - [PID]158[0m
[35m2022-08-21T16:49:57,477 [INFO ] W-9000-model_1.0-stdout MODEL_LOG - Torch w




UnexpectedStatusException: Error for Transform job pytorch-inference-2022-08-21-16-43-47-224: Failed. Reason: AlgorithmError: See job logs for more information