# Day 2: SageMaker Inference

In [38]:
# Work from the day_2 directory so the relative paths used below
# (../data/..., ../day_1/...) resolve.
%cd /root/sagemaker-workshop-420/day_2/

# S3 bucket and key prefix used when uploading the inference data further down.
BUCKET = 'sagemaker-workshop-420'
PREFIX = 'data'

# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
LOCAL_DATA_DIRECTORY = './data/'

/root/sagemaker-workshop-420/day_2


## 1. Loading a pretrained model

In [19]:
import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn import SKLearnModel

In [4]:
# Create boto3 and SageMaker sessions with their default settings.
# sagemaker_session is used later to upload the inference data to S3.
# NOTE(review): boto_session is not referenced by any cell visible here.
boto_session = boto3.Session()
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
# The role is passed to SKLearnModel below.
role = get_execution_role()
print(role)

arn:aws:iam::209970524256:role/service-role/AmazonSageMaker-ExecutionRole-20200402T065938


### TODO Mention how misleading the documentation can be

In [17]:
# S3 location of the model.tar.gz artifact to serve, and the script that accompanies it.
model_data = 's3://sagemaker-us-east-2-209970524256/sagemaker-scikit-learn-2020-04-03-11-03-52-730/output/model.tar.gz'
entry_point = '../day_1/sklearn_iris.py'

sklearn_model = SKLearnModel(model_data=model_data,
                             role=role, 
                             # The serving container installs this script (see "sklearn-iris"
                             # in the transform job log below). Presumably it supplies the
                             # model-loading code (e.g. model_fn) — TODO confirm against the
                             # SKLearnModel documentation.
                             entry_point=entry_point)

In [18]:
# Deploy the model to an endpoint backed by a single ml.m4.xlarge instance.
# The returned predictor is used below for predict() and delete_endpoint().
sklearn_predictor = sklearn_model.deploy(initial_instance_count=1,
                                         instance_type="ml.m4.xlarge")

-------------!

In [21]:
# Read the iris CSV with header=None, so the columns get integer labels
# (0..4 in the output below) instead of names taken from the first row.
iris_df = pd.read_csv("../data/iris.csv", header=None)
iris_df.head()

Unnamed: 0,0,1,2,3,4
0,0.0,5.1,3.5,1.4,0.2
1,0.0,4.9,3.0,1.4,0.2
2,0.0,4.7,3.2,1.3,0.2
3,0.0,4.6,3.1,1.5,0.2
4,0.0,5.0,3.6,1.4,0.2


In [22]:
# The first column is taken as the target (iris_y); every remaining column
# is kept as a feature (iris_X).
iris_y = iris_df.iloc[:, 0]
iris_X = iris_df.iloc[:, 1:]

In [23]:
# Pass the feature values (a NumPy array) to the deployed predictor.
iris_preds = sklearn_predictor.predict(iris_X.values)

In [26]:
# Show the predicted class for every input row.
print(iris_preds)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2.]


### Endpoint cleanup <a class="anchor" id="endpoint_cleanup"></a>

When you're done with the endpoint, you'll want to clean it up.

In [27]:
# Remove the endpoint created by deploy() now that real-time predictions are done.
sklearn_predictor.delete_endpoint()

## Batch Transform <a class="anchor" id="batch_transform"></a>
We can also use the trained model for asynchronous batch inference on S3 data using SageMaker Batch Transform.

In [28]:
# Build a Transformer from the same model object for batch inference,
# using a single ml.m4.xlarge instance.
sklearn_transformer = sklearn_model.transformer(instance_count=1,
                                                instance_type='ml.m4.xlarge')

Using already existing model: sagemaker-scikit-learn-2020-04-07-11-58-34-595


### Prepare Input Data <a class="anchor" id="prepare_input_data"></a>
We will write the iris feature columns to a CSV file and upload it to S3 as the input for the transform job.

In [39]:
# Write only the feature values. With the to_csv defaults, pandas also emits a
# header row and the row index as an extra first column, so each CSV line would
# carry one more value than iris_X has feature columns and the file would start
# with a non-data line. header=False and index=False keep the file to one
# feature row per line, matching what predict() was given via iris_X.values.
iris_X.to_csv('../data/iris_inference.csv', header=False, index=False)

# Upload the file to s3://BUCKET/PREFIX/sklearn/ and keep the returned S3 URI
# as the input location for the batch transform job.
train_input = sagemaker_session.upload_data(
    '../data/iris_inference.csv',
    bucket=BUCKET,
    key_prefix="{}/sklearn".format(PREFIX))

### Run Transform Job <a class="anchor" id="run_transform_job"></a>
Using the Transformer, run a transform job on the S3 input data.

In [33]:
# Start a transform job and wait for it to finish.
# The input is the S3 URI returned by upload_data() in the previous cell
# (train_input). The name used here before, s3_iris, is not defined anywhere
# in this notebook, so the cell failed with a NameError on a fresh kernel.
sklearn_transformer.transform(train_input, content_type='text/csv')
print('Waiting for transform job: ' + sklearn_transformer.latest_transform_job.job_name)
sklearn_transformer.wait()

Waiting for transform job: sagemaker-scikit-learn-2020-04-07-11-58-2020-04-07-12-15-05-071
.....................
[34mProcessing /opt/ml/code[0m
[34mBuilding wheels for collected packages: sklearn-iris
  Building wheel for sklearn-iris (setup.py): started
  Building wheel for sklearn-iris (setup.py): finished with status 'done'
  Created wheel for sklearn-iris: filename=sklearn_iris-1.0.0-py2.py3-none-any.whl size=7003 sha256=ce09da208bf633dce7b3f75cb27e8141e27077e206db96e5d5635786521a634a
  Stored in directory: /tmp/pip-ephem-wheel-cache-9q3kmuq3/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3[0m
[34mSuccessfully built sklearn-iris[0m
[34mInstalling collected packages: sklearn-iris[0m
[34mSuccessfully installed sklearn-iris-1.0.0[0m
  import imp[0m
[34m[2020-04-07 12:18:12 +0000] [40] [INFO] Starting gunicorn 19.9.0[0m
[34m[2020-04-07 12:18:12 +0000] [40] [INFO] Listening at: unix:/tmp/gunicorn.sock (40)[0m
[34m[2020-04-07 12:18:12 +0000] [40] [INFO] 

UnexpectedStatusException: Error for Transform job sagemaker-scikit-learn-2020-04-07-11-58-2020-04-07-12-15-05-071: Failed. Reason: AlgorithmError: See job logs for more information

### Check Output Data  <a class="anchor" id="check_output_data"></a>
After the transform job has completed, download the output data from S3. For each file "f" in the input data, we have a corresponding file "f.out" containing the predicted labels from each input row. We can compare the predicted labels to the true labels saved earlier.

In [None]:
# Download the output data from S3 to local filesystem
# batch_output is the transformer's output location; it is interpolated into
# the shell commands below as $batch_output.
batch_output = sklearn_transformer.output_path
!mkdir -p batch_data/output
!aws s3 cp --recursive $batch_output/ batch_data/output/
# Head to see what the batch output looks like
!head batch_data/output/*

In [None]:
%%bash
# For each sample file, compare the predicted labels from batch output to the true labels
# NOTE(review): the cells visible in this notebook only upload iris_inference.csv and
# never create batch_data/Y/iris_sample_Y_*.csv or iris_sample_X_*.csv inputs —
# confirm these files exist or adapt the file names before running.
#
# Pipeline per file:
#   1. strip '[' and '"' from the .out file,
#   2. turn each ', ' separator and the closing ']' into a newline (one label per line),
#   3. diff -s against the true labels (-s also reports when the files are identical),
#   4. replace the process-substitution path /dev/fd/63 in diff's message with the
#      real .out file name so the report is readable.
for i in {1..9}; do
    diff -s batch_data/Y/iris_sample_Y_${i}.csv \
        <(cat batch_data/output/iris_sample_X_${i}.csv.out | sed 's/[["]//g' | sed 's/, \|]/\n/g') \
        | sed "s/\/dev\/fd\/63/batch_data\/output\/iris_sample_X_${i}.csv.out/"
done