In [6]:
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

import sagemaker

# SageMaker SDK session shared by the cells in this notebook.
sagemaker_session = sagemaker.Session()

# IAM execution role handed to the estimator and the model further down.
role = sagemaker.get_execution_role()

In [2]:
# Hyperparameters handed to the estimator for the training job.
hyperparameters = {
    'num_train_epochs': 1,
    'save_steps': 1000,
    'train_batch_size': 32,
    'eval_batch_size': 8,
}


In [3]:
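# Optional: uncomment this line (and the `git_config=git_config` arguments in the
# estimator and model cells) to fetch the scripts from Git instead of the local directory.
#git_config = {'repo': 'https://github.com/awslabs/amazon-sagemaker-examples.git', 'branch': 'training-scripts'}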

In [4]:
from sagemaker.pytorch import PyTorch

# Training job definition: train.py from the local email-compliance-bert
# directory, run on a single ml.p2.8xlarge instance with PyTorch 1.1.0.
estimator = PyTorch(
    entry_point='train.py',
    source_dir='email-compliance-bert',
    # git_config=git_config,  # enable together with the git_config cell
    framework_version='1.1.0',
    role=role,
    train_instance_type='ml.p2.8xlarge',
    train_instance_count=1,
    hyperparameters=hyperparameters,
)

In [None]:
# S3 prefix that holds the training data for this job.
training_data_uri = 's3://sagemaker-us-east-1-517600888691/compliance-data/batch'

# Launch the training job on that data.
estimator.fit(training_data_uri)

2019-09-15 12:42:23 Starting - Starting the training job...
2019-09-15 12:42:24 Starting - Launching requested ML instances......
2019-09-15 12:43:53 Starting - Preparing the instances for training.........
2019-09-15 12:45:20 Downloading - Downloading input data
2019-09-15 12:45:20 Training - Downloading the training image...
2019-09-15 12:45:49 Training - Training image download completed. Training in progress..[31mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[31mbash: no job control in this shell[0m
[31m2019-09-15 12:45:50,949 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[31m2019-09-15 12:45:51,029 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[31m2019-09-15 12:45:54,043 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[31m2019-09-15 12:45:54,296 sagemaker-containers INFO     Module train does not provide a setup.py. [

[31mTraining: use 8 GPUs![0m
[31mlen(train_dataloader) 1329[0m
[31m#0150.354053[0m
[31malgo-1:97:169 [0] misc/ibvwrap.cu:63 NCCL WARN Failed to open libibverbs.so[.1][0m
[31mNCCL version 2.4.2+cuda9.0[0m


In [None]:
# IPython shell escape: list the top level of the SageMaker bucket with the AWS CLI.
!aws s3 ls s3://sagemaker-us-east-1-517600888691/

# Import model into SageMaker
The PyTorch model uses an npy serializer and deserializer by default. Since we have a custom implementation of all the hosting functions and plan on using JSON instead, we need a predictor that can serialize and deserialize JSON.

In [7]:
from sagemaker.predictor import RealTimePredictor, json_serializer, json_deserializer

class JSONPredictor(RealTimePredictor):
    """Real-time predictor that talks JSON to the endpoint.

    Requests are encoded with ``json_serializer`` and responses are decoded
    with ``json_deserializer``.
    """

    def __init__(self, endpoint_name, sagemaker_session):
        """Attach to ``endpoint_name`` using the given ``sagemaker_session``."""
        super(JSONPredictor, self).__init__(
            endpoint_name,
            sagemaker_session,
            serializer=json_serializer,
            deserializer=json_deserializer,
        )

Since the hosting functions are implemented outside of the training script, we can't just use the estimator object to deploy the model. Instead, we need to create a PyTorchModel object, using the latest training job to get the S3 location of the trained model data. Besides the model data location in S3, we also need to configure PyTorchModel with the entry-point script, the source directory (because our inference script requires the model and data classes from the source directory), and an IAM role.

In [28]:
from sagemaker.pytorch import PyTorchModel

# Ask SageMaker where the most recent training job stored its model artifacts.
training_job_name = estimator.latest_training_job.name
desc = sagemaker_session.sagemaker_client.describe_training_job(
    TrainingJobName=training_job_name
)
trained_model_location = desc['ModelArtifacts']['S3ModelArtifacts']

# Wrap those artifacts in a PyTorchModel that uses inference.py as its entry
# point and JSONPredictor as the predictor class.
model = PyTorchModel(
    entry_point='inference.py',
    source_dir='email-compliance-bert',
    # git_config=git_config,  # enable together with the git_config cell
    model_data=trained_model_location,
    framework_version='1.1.0',
    role=role,
    predictor_cls=JSONPredictor,
)

Create endpoint
Now the model is ready to be deployed to a SageMaker endpoint, and we are going to use the sagemaker.pytorch.model.PyTorchModel.deploy method to do this. We could use a CPU-based instance for inference (for example, an ml.m4.xlarge) even though we trained on GPU instances, because at the end of training we moved the model to the CPU before returning it. This way we can load the trained model on any device and then move it to a GPU if CUDA is available. In this notebook, however, the endpoint is deployed on an ml.p2.8xlarge GPU instance.

In [29]:
# Create the endpoint on a single ml.p2.8xlarge instance and keep the predictor.
predictor = model.deploy(
    instance_type='ml.p2.8xlarge',
    initial_instance_count=1,
)


--------------------------------------------------------------------------------------------------------------!

In [30]:
# Display the name of the endpoint the predictor is bound to.
predictor.endpoint

'sagemaker-pytorch-2019-09-15-14-22-24-975'

In [39]:
# Sample text sent to the endpoint; starts and ends with a newline.
hello_song = (
    "\n"
    "Hello, it's me.\n"
    "I was wondering if after all these years you'd like to meet.\n"
    "To go over everything.\n"
    "They say that time's supposed to heal you.\n"
    "But I ain't done much healing.\n"
)

In [40]:
# `json` is imported here because this cell calls json.dumps; previously the
# import only appeared in a later cell, so a fresh kernel raised NameError.
import json
import time

start_t = time.time()

# Request payload: the sample text under the 'txt' key
# (presumably the schema inference.py expects -- confirm against the script).
input_json = {
    'txt': hello_song
}

# JSON-encoded copy of the payload; reused as the request body by the
# boto3 invoke_endpoint cell further down.
d = json.dumps(input_json)

# The predictor is given the dict, not `d`: JSONPredictor was constructed
# with json_serializer and json_deserializer.
response = predictor.predict(input_json)
print("response=", response, time.time() - start_t)

('response=', u'0', 0.11596512794494629)


In [41]:
import boto3
import json
import time

# Call the same endpoint through the low-level boto3 runtime client.
# The region comes from the SageMaker session so the client always targets
# the region the endpoint was deployed in.
runtime = boto3.Session().client(service_name='runtime.sagemaker',
                                 region_name=sagemaker_session.boto_region_name)

# Use the endpoint created by model.deploy() instead of a hard-coded name,
# which goes stale every time the notebook is re-run.
endpoint_name = predictor.endpoint

start_t = time.time()
# `d` is the JSON-encoded payload built in the predictor cell above.
response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                   ContentType='application/json',
                                   Body=d)
# The body is a stream: read it once and decode the JSON document it carries.
result = json.loads(response['Body'].read().decode())
print("response=", response, time.time() - start_t)

('response=', {u'InvokedProductionVariant': 'AllTraffic', u'Body': <botocore.response.StreamingBody object at 0x7f72c7872d10>, u'ContentType': 'application/json', 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': '5e826411-0d9a-4d1d-bef5-104797628d05', 'HTTPHeaders': {'x-amzn-requestid': '5e826411-0d9a-4d1d-bef5-104797628d05', 'x-amzn-invoked-production-variant': 'AllTraffic', 'content-length': '3', 'date': 'Sun, 15 Sep 2019 14:38:16 GMT', 'content-type': 'application/json'}}}, 0.19006991386413574)


In [42]:
# Show the prediction decoded from the invoke_endpoint response body.
print(result)

0


Cleanup
Delete the prediction endpoint to release the instance(s) associated with it.

In [None]:
# Tear down the endpoint that the predictor is bound to.
sagemaker_session.delete_endpoint(endpoint_name=predictor.endpoint)