In [None]:
import boto3
import sagemaker
from sagemaker import image_uris, get_execution_role
from sagemaker.parameter import ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.tuner import HyperparameterTuner

import os, io, time

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## SageMaker session shared by the estimator below
sess = sagemaker.Session()

## S3 bucket that holds the data set and receives the model artifacts
bucket_name = 'oscarm524-mastering-ml-aws'
prefix = 'regression'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## S3 URIs of the train / validation / test channels and of the output folder
train_data = f's3://{bucket.name}/{prefix}/train'
validation_data = f's3://{bucket.name}/{prefix}/validation'
test_data = f's3://{bucket.name}/{prefix}/test'
output_path = f's3://{bucket.name}/{prefix}/output'

In [None]:
## Resolving the XGBoost container image for the region of the current boto3 session
current_region = boto3.Session().region_name
container = image_uris.retrieve(framework = 'xgboost',
                                region = current_region,
                                version = 'latest')

## Execution role handed to the estimator for the training jobs
role = get_execution_role()

## Both channels are CSV files under an S3 prefix and share every option
## except the location, so the common settings are declared once.
channel_options = {'distribution': 'FullyReplicated',
                   'compression': None,
                   'content_type': 'text/csv',
                   'record_wrapping': None,
                   's3_data_type': 'S3Prefix'}

## Training channel
training = sagemaker.inputs.TrainingInput(s3_data = train_data, **channel_options)

## Validation channel
validation = sagemaker.inputs.TrainingInput(s3_data = validation_data, **channel_options)

## Estimator wrapping the XGBoost container: one ml.c4.xlarge instance,
## artifacts written to output_path
xgboost = sagemaker.estimator.Estimator(container,
                                        role,
                                        sagemaker_session = sess,
                                        output_path = output_path,
                                        instance_type = 'ml.c4.xlarge',
                                        instance_count = 1)

## Search space explored by the tuning job
hyperparameter_ranges = dict(eta = ContinuousParameter(0, 0.5),
                             min_child_weight = ContinuousParameter(1, 5),
                             alpha = ContinuousParameter(0, 2),
                             max_depth = IntegerParameter(5, 7),
                             num_round = IntegerParameter(1, 100))

## Tuner minimizing the validation RMSE: max_jobs = 10 and max_parallel_jobs = 3
tuner = HyperparameterTuner(estimator = xgboost,
                            objective_metric_name = 'validation:rmse',
                            objective_type = 'Minimize',
                            hyperparameter_ranges = hyperparameter_ranges,
                            max_parallel_jobs = 3,
                            max_jobs = 10)

## Launching the tuning job on the train and validation channels
tuning_channels = {'train': training, 'validation': validation}
tuner.fit(inputs = tuning_channels)

In [None]:
## Deploying the best model found by the tuner behind a single-instance endpoint
## NOTE(review): the endpoint and model names are fixed, so re-running this cell
## while they still exist will presumably fail -- confirm before re-running.
xgboost_predictor = tuner.deploy(instance_type = 'ml.c4.xlarge',
                                 initial_instance_count = 1,
                                 model_name = 'xgboost-regressor',
                                 endpoint_name = 'xgboost-regressor-aws')

In [None]:
## Streaming the test csv file straight from the bucket
test_file_key = 'regression/test/test.csv'
test_file_content_stream = bucket.Object(test_file_key).get().get('Body')

## Reading test csv file (no header row): first column is the target,
## every other column is a feature
test = pd.read_csv(test_file_content_stream, header = None)
target_column = test.columns[0]
Y_test = test[target_column]
X_test = test.drop(columns = target_column)

def np2csv(arr):
    """Serialize an array-like into header-less CSV text.

    Values are written with numpy's ``savetxt`` using the ``%g`` format and
    ``,`` as the column delimiter; each row of a 2-D input becomes one line
    and each element of a 1-D input gets its own line.

    Parameters
    ----------
    arr : array-like
        Data accepted by ``np.savetxt`` (1-D or 2-D).

    Returns
    -------
    str
        The CSV text with trailing whitespace (including the final newline)
        removed.
    """
    buffer = io.BytesIO()
    np.savetxt(buffer, arr, fmt = '%g', delimiter = ',')
    csv_text = buffer.getvalue().decode()
    return csv_text.rstrip()

## Low-level runtime client used to call the deployed endpoint
runtime = boto3.client('runtime.sagemaker')

## Sending the test features as header-less CSV, one observation per line
test_payload = np2csv(X_test)
test_response = runtime.invoke_endpoint(EndpointName = xgboost_predictor.endpoint_name,
                                        ContentType = 'text/csv',
                                        Body = test_payload)

## The built-in XGBoost container answers a CSV request with plain-text
## predictions, not with the {'predictions': [{'score': ...}]} JSON document
## that Linear Learner returns. The values are accepted whether they are
## separated by commas or by newlines, since this differs between container
## versions.
test_body = test_response['Body'].read().decode('utf-8')
test_results = np.array([float(value)
                         for value in test_body.replace('\n', ',').split(',')
                         if value.strip()])

## One prediction is expected per test observation
if len(test_results) != len(Y_test):
    raise ValueError('Endpoint returned {} predictions for {} test rows'.format(len(test_results), len(Y_test)))

## Creating a data-frame to store predictions 
test_preds = pd.DataFrame({'Y_actual': Y_test, 'Y_pred': test_results})

## Computing mse
mse = mean_squared_error(test_preds['Y_actual'], test_preds['Y_pred'])
mse

In [None]:
## Deleting the endpoint to stop incurring charges.
## `endpoint_name` is the SageMaker SDK v2 attribute (already used when invoking
## the endpoint); `endpoint` is the pre-v2 name. The session created at the top
## of the notebook is reused instead of building a new one.
sess.delete_endpoint(xgboost_predictor.endpoint_name)