In [10]:
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

import sagemaker

# One SageMaker session is shared by the SDK calls in the rest of the notebook.
sagemaker_session = sagemaker.Session()

# IAM execution role; passed to both the estimator and the model further down.
role = sagemaker.get_execution_role()
# Region name taken from the session's underlying boto3 session.
region = sagemaker_session.boto_session.region_name

In [11]:
# Hyperparameters handed to the training entry point, one per line for easy tuning.
hyperparameters = {
    'num_train_epochs': 6,
    'save_steps': 400,
    'train_batch_size': 32,
    'eval_batch_size': 8,
}


In [12]:
#git_config = {'repo': 'https://github.com/awslabs/amazon-sagemaker-examples.git', 'branch': 'training-scripts'}

In [15]:
from sagemaker.pytorch import PyTorch

# PyTorch training job definition: runs train.py from the local
# 'email-compliance-bert' directory on a single ml.p3.8xlarge instance.
estimator = PyTorch(
    entry_point='train.py',
    source_dir='email-compliance-bert',
    # git_config=git_config,
    role=role,
    framework_version='1.1.0',
    train_instance_type='ml.p3.8xlarge',
    train_instance_count=1,
    hyperparameters=hyperparameters,
)

In [16]:
# Launch the training job against the data stored under this S3 prefix.
estimator.fit(
    's3://sagemaker-us-east-1-665028136136/compliance-data/batch2/'
)

2019-09-17 01:28:30 Starting - Starting the training job...
2019-09-17 01:28:32 Starting - Launching requested ML instances......
2019-09-17 01:29:32 Starting - Preparing the instances for training......
2019-09-17 01:30:43 Downloading - Downloading input data...
2019-09-17 01:31:30 Training - Training image download completed. Training in progress..[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-09-17 01:31:31,775 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-09-17 01:31:31,818 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-09-17 01:31:32,452 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-09-17 01:31:32,787 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-09-17 01:31:32,787 sage

[31mTraining: use 4 GPUs![0m
[31mlen(train_dataloader) 280[0m
[31margs  {'workers': 2, 'num_train_epochs': 6, 'train_batch_size': 32, 'eval_batch_size': 8, 'weight_decay': 0, 'learning_rate': 4e-05, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'max_grad_norm': 1.0, 'model_type': 'bert', 'model_name': 'bert-base-uncased', 'task_name': 'binary', 'output_mode': 'classification', 'max_seq_length': 512, 'fp16': False, 'fp16_opt_level': 'O1', 'gradient_accumulation_steps': 1, 'logging_steps': 500, 'save_steps': 400, 'reprocess_input_data': False, 'hosts': ['algo-1'], 'current_host': 'algo-1', 'model_dir': '/opt/ml/model', 'data_dir': '/opt/ml/input/data/training', 'output_dir': './outputs', 'num_gpus': 4}[0m
[31m#0150.649105[0m
[31malgo-1:103:200 [0] misc/ibvwrap.cu:63 NCCL WARN Failed to open libibverbs.so[.1][0m
[31mNCCL version 2.4.2+cuda9.0[0m
[31m#0150.384912#0150.318247#0150.326684#0150.141499#0150.278126#0150.237001#0150.425965#0150.140065#0150.095901#0150.279403#0150.192553

[31m2019-09-17 01:45:37,196 sagemaker-containers INFO     Reporting training SUCCESS[0m

2019-09-17 01:46:36 Uploading - Uploading generated training model
2019-09-17 01:47:37 Completed - Training job completed
Training seconds: 1014
Billable seconds: 1014


In [19]:
!aws s3 ls s3://sagemaker-us-east-1-665028136136/${estimator.latest_training_job.name}/

# Import model into SageMaker
The PyTorch model uses an npy serializer and deserializer by default. Since we have a custom implementation of all the hosting functions and plan on using JSON instead, we need a predictor that can serialize and deserialize JSON.

In [20]:
from sagemaker.predictor import RealTimePredictor, json_serializer, json_deserializer

class JSONPredictor(RealTimePredictor):
    """RealTimePredictor wired up with the JSON serializer and deserializer."""

    def __init__(self, endpoint_name, sagemaker_session):
        """Create a predictor for ``endpoint_name`` using ``sagemaker_session``.

        Both arguments are forwarded unchanged to RealTimePredictor, followed
        by ``json_serializer`` and ``json_deserializer``.
        """
        super().__init__(
            endpoint_name,
            sagemaker_session,
            json_serializer,
            json_deserializer,
        )

Since the hosting functions are implemented outside of the training script, we can't just use the estimator object to deploy the model. Instead, we need to create a PyTorchModel object, using the latest training job to get the S3 location of the trained model data. Besides the model data location in S3, we also need to configure PyTorchModel with the entry-point script, the source directory (because our inference script requires the model and data classes from the source directory), and an IAM role.

In [21]:
from sagemaker.pytorch import PyTorchModel

# Ask SageMaker where the most recent training job stored its model artifact.
training_job_name = estimator.latest_training_job.name
desc = sagemaker_session.sagemaker_client.describe_training_job(
    TrainingJobName=training_job_name
)
trained_model_location = desc['ModelArtifacts']['S3ModelArtifacts']

# Hosting model: trained artifact plus inference.py from the same source
# directory; deploy() will hand back a JSONPredictor.
model = PyTorchModel(
    model_data=trained_model_location,
    entry_point='inference.py',
    source_dir='email-compliance-bert',
    # git_config=git_config,
    role=role,
    framework_version='1.1.0',
    predictor_cls=JSONPredictor,
)

Create endpoint
Now the model is ready to be deployed at a SageMaker endpoint, and we are going to use the sagemaker.pytorch.model.PyTorchModel.deploy method to do this. We can use a CPU-based instance for inference (in this case an ml.m5.xlarge), even though we trained on GPU instances, because at the end of training we moved the model to the CPU before returning it. This way we can load the trained model on any device and then move it to a GPU if CUDA is available.

In [None]:
# Create Endpoint Configuration
from time import gmtime, strftime

# UTC timestamp suffix gives every run a distinct configuration name.
endpoint_config_name = 'EmailComplianceEndpointConfig-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)

# `sagemaker` is the SageMaker Python SDK module and has no
# create_endpoint_config function; the low-level API call lives on the boto3
# SageMaker client that the session exposes as `sagemaker_client`.
# NOTE(review): `model_name` is not assigned in any visible cell. It has to be
# set to the name of a model already registered in SageMaker before this cell
# is run — confirm where it is meant to come from.
create_endpoint_config_response = sagemaker_session.sagemaker_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[{
        'InstanceType': 'ml.m5.xlarge',
        'InitialInstanceCount': 2,
        'ModelName': model_name,
        'VariantName': 'AllTraffic',
        'AcceleratorType': 'ml.eia1.medium'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])


In [None]:
# Create an endpoint from the endpoint configuration named in
# `endpoint_config_name`; the UTC timestamp suffix keeps the endpoint name
# distinct between runs.
endpoint_name = 'EmailComplianceEndpoint-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# The `sagemaker` SDK module has no create_endpoint function; use the boto3
# SageMaker client held by the session instead.
endpoint_response = sagemaker_session.sagemaker_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)

In [23]:
# Elastic Inference pricing, for reference:
# https://aws.amazon.com/machine-learning/elastic-inference/pricing/
# Deploy the model to an endpoint backed by two ml.m5.xlarge instances.
predictor = model.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=2,
)

---------------------------------------------------------------------------------------------------!

In [24]:
# Display the name of the endpoint the predictor is bound to.
predictor.endpoint

'sagemaker-pytorch-2019-09-17-01-54-42-510'

In [25]:
# Sample text used as the prediction payload; it starts and ends with a newline.
hello_song = (
    "\n"
    "Hello, it's me.\n"
    "I was wondering if after all these years you'd like to meet.\n"
    "To go over everything.\n"
    "They say that time's supposed to heal you.\n"
    "But I ain't done much healing.\n"
)

In [27]:
import time
import json
# Time a single request made through the SDK predictor.
start_t = time.time()
input_json = {'txt': hello_song}
# JSON string form of the payload, kept in `d`; predict() below is given the
# dict itself and relies on the predictor's own serializer.
d = json.dumps(input_json)
response = predictor.predict(input_json)
print("response=", response, time.time()-start_t)

response= 1 0.9275727272033691


In [29]:
import boto3
import json
# Call the same endpoint through the low-level SageMaker runtime client.
# Use the region of the SageMaker session (which is where the endpoint was
# deployed) rather than a hard-coded 'us-east-1', so the cell keeps working
# when the notebook runs in another region.
runtime = boto3.Session().client(service_name='runtime.sagemaker', region_name=region)
endpoint_name = predictor.endpoint

# `d` is the JSON string built from the input payload in the previous request cell.
start_t = time.time()
response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                   ContentType='application/json',
                                   Body=d)
# The body is a stream: read it once and decode the JSON reply into `result`.
result = json.loads(response['Body'].read().decode())
print("response=", response, time.time()-start_t)

response= {'ResponseMetadata': {'RequestId': '65b1e034-25aa-445c-9100-41c35b8a2b62', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '65b1e034-25aa-445c-9100-41c35b8a2b62', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Tue, 17 Sep 2019 02:21:13 GMT', 'content-type': 'application/json', 'content-length': '3'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f0f86096710>} 0.9273343086242676


In [None]:
# Show the endpoint's reply as decoded from the JSON response body.
print(result)

Cleanup
Delete the prediction endpoint to release the instance(s) associated with it.

In [None]:
#sagemaker_session.delete_endpoint(predictor.endpoint)

In [33]:
# Show the contents of requirements.txt from the notebook's working directory.
!more requirements.txt


sklearn
uuid
pytorch-transformers


In [35]:
# Print the Dockerfile from the notebook's working directory.
!cat Dockerfile

# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

# For more information on creating a Dockerfile
# https://docs.docker.com/compose/gettingstarted/#step-2-create-a-dockerfile
# https://github.com/awslabs/amazon-sagemaker-examples/master/advanced_functionality/pytorch_extending_our_containers/pytorch_extending_our_containers.ipynb
# SageMaker PyTorch image
FROM 520713654638.dkr.ecr.us-east-1.amazonaws.com/sagemaker-pytorch:1.1.0-cpu-py3

ENV PATH="/opt/ml/c

In [None]:
 # Run the local build_and_push.sh script with the image name as its argument.
 # NOTE(review): presumably builds the Dockerfile shown above and pushes the image to ECR — verify against the script.
 !./build_and_push.sh sagemaker-pytorch-email-compliance