In [155]:
import datetime
import os
import tarfile

import boto3
import mlflow
import numpy as np
import pandas as pd
import sagemaker
from mlflow.sagemaker import deploy
from sagemaker import get_execution_role
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

# A single SageMaker session exposes the active region and the default S3 bucket.
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

# Low-level boto3 client for the SageMaker API.
sm_boto3 = boto3.client('sagemaker')

print('Using bucket ' + bucket)

Using bucket sagemaker-eu-west-1-505529183986


## Prepare data

Load the data from Redshift, then prepare the train and test datasets.

In [156]:
# The Redshift connection URL (user, password, host, port, database) is read from
# the REDSHIFT_URL environment variable so that credentials are never stored in
# the notebook or its history. Expected form:
#   postgresql://<user>:<password>@<host>:5439/<database>
# NOTE(review): the password that used to be embedded here should be rotated.
redshift_url = os.environ.get('REDSHIFT_URL')
if not redshift_url:
    raise RuntimeError(
        'Set the REDSHIFT_URL environment variable to the Redshift connection URL '
        'before running this cell.')

engine = create_engine(redshift_url)
data_frame = pd.read_sql_query('SELECT * FROM boston_data;', engine)

In [157]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
trainX, testX = train_test_split(
    data_frame,
    test_size=0.25,
    random_state=42,
)

In [158]:
# Write each split to a local CSV file (index included, as pandas does by default).
for split_frame, csv_name in ((trainX, 'boston_train.csv'), (testX, 'boston_test.csv')):
    split_frame.to_csv(csv_name)

In [159]:
# SageMaker reads its training channels from S3, so push both CSV files there
# under a shared key prefix and keep the returned S3 locations.
s3_key_prefix = 'sagemaker/sklearncontainer'

trainpath = sess.upload_data(path='boston_train.csv', bucket=bucket, key_prefix=s3_key_prefix)
testpath = sess.upload_data(path='boston_test.csv', bucket=bucket, key_prefix=s3_key_prefix)

## Writing a Script Mode script

In [153]:
%%writefile script.py

import argparse
import os

import mlflow
from mlflow.sklearn import log_model

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error, r2_score

if __name__ =='__main__':

    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    parser.add_argument('--n-estimators', type=int, default=10)
    parser.add_argument('--min-samples-leaf', type=int, default=3)

    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='boston_train.csv')
    parser.add_argument('--test-file', type=str, default='boston_test.csv')
#     parser.add_argument('--data_table', type=str, default='boston_data')
    parser.add_argument('--features', type=str)  # in this script we ask user to explicitly name features
    parser.add_argument('--target', type=str) # in this script we ask user to explicitly name the target

    args, _ = parser.parse_known_args()

    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
#     engine = create_engine('postgresql://mlflow_admin:EdBx6FUjDEgvG5@mlflow-redshift0.cdwamzbulp7n.eu-west-1.redshift.amazonaws.com:5439/dev')
#     data_frame = pd.read_sql_query('SELECT * FROM boston_data;', engine)
#     train_df, test_df = train_test_split(data_frame, test_size=0.25, random_state=42)

    print('building training and testing datasets')
    X_train = train_df[args.features.split()]
    X_test = test_df[args.features.split()]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # setting MLFlow tracker 
    uri = "http://internal-a19641f33008a11eaa1590a387f0e3c9-331214759.eu-west-1.elb.amazonaws.com:5000"
    mlflow.set_tracking_uri(uri)
    mlflow.set_experiment("ml-demo")
    
    # train
    print('training model')
    with mlflow.start_run():
        model = RandomForestRegressor(
            n_estimators=args.n_estimators,
            min_samples_leaf=args.min_samples_leaf,
            n_jobs=-1)

        model.fit(X_train, y_train)

        # print abs error, rmse and r2_score
        print('validating model')
        y_test_predict = model.predict(X_test)
        abs_err = np.abs(y_test_predict - y_test)
#         rmse = (np.sqrt(mean_squared_error(y_test_predict, y_test)))
#         r2 = r2_score(y_test, y_test_predict)
#         print('rmse: {}, r2_score: {}'.format(rmse, r2))
#         mlflow.log_metric('rmse', rmse)
#         mlflow.log_metric('r2', r2)


        # print couple perf metrics
        for q in [10, 50, 90]:
            print('AE-at-' + str(q) + 'th-percentile: '
                  + str(np.percentile(a=abs_err, q=q)))
            mlflow.log_metric('AE-at-' + str(q) + 'th-percentile', np.percentile(a=abs_err, q=q))

        print(args.min_samples_leaf)
        
        mlflow.log_param("n_estimators", args.n_estimators)
        mlflow.log_param("min_samples_leaf", args.min_samples_leaf)
        log_model(model, "model")
        mlflow.log_artifcat()

Overwriting script.py


In [160]:
# Run script.py locally with explicit hyperparameters, data directories, feature
# columns and target column.
# NOTE(review): --train/--test presumably point at the notebook's working directory,
# where the CSV files were written — confirm the path on your instance.
! python script.py --n-estimators 600 \
                   --min-samples-leaf 5 \
                   --train /home/ec2-user/SageMaker/mlflow_demo \
                   --test /home/ec2-user/SageMaker/mlflow_demo \
                   --features 'crim zn indus chas nox rm age dis rad tax ptratio b lstat' \
                   --target target

  import imp
extracting arguments
reading data
building training and testing datasets
training model
  from IPython.utils.signatures import signature
validating model
AE-at-10th-percentile: 0.3721773623439724
AE-at-50th-percentile: 1.7404672064140279
AE-at-90th-percentile: 5.067331733496558
5


## SageMaker Training

### Launching a training job with the Python SDK

In [166]:
from sagemaker.sklearn.estimator import SKLearn

# Hyperparameters handed to script.py for the training job.
training_hyperparameters = {
    'n-estimators': 100,
    'min-samples-leaf': 3,
    'features': 'crim zn indus chas nox rm age dis rad tax ptratio b lstat',
    'target': 'target',
}

# Metric captured from the training log: the value printed after
# "AE-at-50th-percentile: " is reported as median-AE.
median_ae_metric = {
    'Name': 'median-AE',
    'Regex': "AE-at-50th-percentile: ([0-9.]+).*$",
}

sklearn_estimator = SKLearn(
    entry_point='script.py',
    role=get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    framework_version='0.20.0',
    metric_definitions=[median_ae_metric],
    hyperparameters=training_hyperparameters)

In [167]:
# Map each input channel name to the S3 location uploaded earlier, then start the
# training job without blocking the notebook (wait=False).
data_channels = {'train': trainpath, 'test': testpath}
sklearn_estimator.fit(data_channels, wait=False)

## Deploy the model 

In [148]:
ml_endpoint_name = 'ml-demo'

# Deploy the MLflow-logged model as a SageMaker endpoint named after
# ml_endpoint_name, using the replace deployment mode.
deploy(
    app_name=ml_endpoint_name,
    model_uri='s3://3stripes-mlflow-artifacts/1/dc33eb2ba2a44e6b8301f5e97b3c5af5/artifacts/model/',
    execution_role_arn=get_execution_role(),
    bucket='3stripes-mlflow-artifacts',
    image_url='505529183986.dkr.ecr.eu-west-1.amazonaws.com/mlflow-pyfunc:1.4.0',
    region_name='eu-west-1',
    mode=mlflow.sagemaker.DEPLOYMENT_MODE_REPLACE,
)

2019/11/08 11:41:48 INFO mlflow.sagemaker: Using the python_function flavor for deployment!
2019/11/08 11:41:49 INFO mlflow.sagemaker: tag response: {'ResponseMetadata': {'RequestId': '65331DBD49E8D987', 'HostId': 'wnqnDmR8oVzR3Ce4Nf9RpEOQQB08P212bQUc09gSd98L619kfK5V8f6v2x3p3WTAfmfpBGjSlzY=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'wnqnDmR8oVzR3Ce4Nf9RpEOQQB08P212bQUc09gSd98L619kfK5V8f6v2x3p3WTAfmfpBGjSlzY=', 'x-amz-request-id': '65331DBD49E8D987', 'date': 'Fri, 08 Nov 2019 11:41:50 GMT', 'content-length': '0', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}
2019/11/08 11:41:49 INFO mlflow.sagemaker: Found active endpoint with arn: arn:aws:sagemaker:eu-west-1:505529183986:endpoint/ml-demo. Updating...
2019/11/08 11:41:49 INFO mlflow.sagemaker: Created new model with arn: arn:aws:sagemaker:eu-west-1:505529183986:model/ml-demo-model-7xegvplrt-cuyrosaftobg
2019/11/08 11:41:49 INFO mlflow.sagemaker: Created new endpoint configuration with arn: arn:aws:sagemaker:eu-west-1:50552918

## Model Inference 

In [85]:
# Client for the SageMaker runtime API, used to invoke deployed endpoints.
runtime = boto3.client(service_name='sagemaker-runtime')

In [128]:
# csv serialization: send the first two test rows, without header or index, to the
# deployed endpoint. The last column of testX is left out of the payload
# (presumably the target column — confirm against the table schema).
csv_payload = testX[testX.columns[:-1]].iloc[:2].to_csv(header=False, index=False).encode('utf-8')

response = runtime.invoke_endpoint(
    # The endpoint name is the one used for deployment (ml_endpoint_name); the
    # previously referenced `endpoint_name` is never defined in this notebook.
    EndpointName=ml_endpoint_name,
    Body=csv_payload,
    ContentType='text/csv')

print(response['Body'].read().decode("utf-8") )

[33.85619167709908]


In [133]:
# Show the first test row with the last column left out.
testX.iloc[:1, :-1]

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
173,0.0315,95.0,1.47,0.0,0.403,6.975,15.3,7.6534,3.0,402.0,17.0,396.9,4.56


In [140]:
# Client for querying the MLflow tracking server at the given URI.
uri = "http://internal-a19641f33008a11eaa1590a387f0e3c9-331214759.eu-west-1.elb.amazonaws.com:5000"
mlflow_client = mlflow.tracking.MlflowClient(tracking_uri=uri)

In [147]:
# Fetch one tracked run by its id and display it as a plain dictionary.
tracked_run = mlflow_client.get_run('f19af831e29748e69afe1ecd02404bb7')
tracked_run.to_dictionary()

{'info': {'artifact_uri': 's3://3stripes-mlflow-artifacts/1/f19af831e29748e69afe1ecd02404bb7/artifacts',
  'end_time': 1573212611607,
  'experiment_id': '1',
  'lifecycle_stage': 'active',
  'run_id': 'f19af831e29748e69afe1ecd02404bb7',
  'run_uuid': 'f19af831e29748e69afe1ecd02404bb7',
  'start_time': 1573212609959,
  'status': 'FINISHED',
  'user_id': 'ec2-user'},
 'data': {'metrics': {'AE-at-10th-percentile': 0.363594086901652,
   'AE-at-50th-percentile': 1.50249762300608,
   'AE-at-90th-percentile': 4.93729709268675},
  'params': {'n_estimators': '400', 'min_samples_leaf': '4'},
  'tags': {'mlflow.source.name': 'script.py',
   'mlflow.source.git.commit': '4ea8cd6b1d7acdeab6e86ae727be6dde70d1c4f5',
   'mlflow.source.type': 'LOCAL',
   'mlflow.user': 'ec2-user'}}}