In [None]:
import json
import os, io, time

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import image_uris, get_execution_role
from sagemaker.parameter import ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.tuner import HyperparameterTuner
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

sess = sagemaker.Session()

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'oscarm524-mastering-ml-aws'
bucket = s3.Bucket(bucket_name)
prefix = 'regression'

## Defining the file to be read from s3 bucket
## (built from `prefix` so every key in this notebook lives under one root)
file_key = '{}/raw_data/housing.csv'.format(prefix)

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
housing = pd.read_csv(file_content_stream)

## Defining inputs and target variable
X = housing.drop(['MEDV'], axis = 1)
Y = housing['MEDV']

## Splitting data into training (70%), validation (15%) and testing (15%):
## 30% is held out first, then that hold-out is split in half
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.3, random_state = 618, shuffle = True)
X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size = 0.5, random_state = 620, shuffle = True)

## Putting datasets in the right format to be uploaded to s3:
## target in the first column, followed by the features
train = pd.concat([Y_train, X_train], axis = 1)
validation = pd.concat([Y_val, X_val], axis = 1)
test = pd.concat([Y_test, X_test], axis = 1)

## Writing each split to a local csv (no header, no index) and uploading it
## to s3://<bucket_name>/<prefix>/<split_name>/<split_name>.csv
for split_name, split_frame in [('train', train), ('validation', validation), ('test', test)]:
    local_file = '{}.csv'.format(split_name)
    split_frame.to_csv(local_file, index = False, header = False)
    sess.upload_data(path = local_file,
                     bucket = bucket_name,
                     key_prefix = '{}/{}'.format(prefix, split_name))

## Defining bucket locations (same bucket/prefix the uploads above were written to)
train_data = 's3://{}/{}/train'.format(bucket_name, prefix)
validation_data = 's3://{}/{}/validation'.format(bucket_name, prefix)
test_data = 's3://{}/{}/test'.format(bucket_name, prefix)
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)

In [None]:
## Getting linear-learner image
container = image_uris.retrieve(region = boto3.Session().region_name, framework = "linear-learner")

role = get_execution_role()


def _csv_channel(s3_uri):
    """Return a fully replicated, uncompressed text/csv input channel for an S3 prefix."""
    return sagemaker.inputs.TrainingInput(s3_data = s3_uri,
                                          distribution = 'FullyReplicated',
                                          compression = None,
                                          content_type = 'text/csv',
                                          record_wrapping = None,
                                          s3_data_type = 'S3Prefix')


## Input channels for the tuning jobs.
## NOTE: `validation` is rebound here from the validation data-frame to its input channel.
training = _csv_channel(train_data)
validation = _csv_channel(validation_data)

linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       instance_count = 1,
                                       instance_type = 'ml.c4.xlarge',
                                       output_path = output_path,
                                       sagemaker_session = sess,
                                       )

## feature_dim is taken from the training features instead of being hard-coded,
## so it always matches the number of feature columns written to train.csv.
## mini_batch_size is also listed in hyperparameter_ranges below.
linear.set_hyperparameters(feature_dim = X_train.shape[1],
                           predictor_type = 'regressor',
                           mini_batch_size = 100,
                           epochs = 10)

## Search space explored by the tuner
## NOTE(review): the lower bound of 0 for learning_rate may be outside the
## algorithm's accepted range -- confirm against the linear learner tuning docs.
hyperparameter_ranges = {'wd': ContinuousParameter(0, 1),
                         'l1': ContinuousParameter(0, 1),
                         'learning_rate': ContinuousParameter(0, 1),
                         'mini_batch_size': IntegerParameter(100, 200),
                         'use_bias': CategoricalParameter([True, False])}

## Minimizing validation mse over at most 10 training jobs, 3 at a time
tuner = HyperparameterTuner(estimator = linear,
                            objective_metric_name = 'validation:mse',
                            hyperparameter_ranges = hyperparameter_ranges,
                            objective_type = 'Minimize',
                            max_jobs = 10,
                            max_parallel_jobs = 3)

tuner.fit(inputs = {'train': training, 'validation': validation})

In [None]:
## Deploying the tuner's best model to a single-instance endpoint
linear_predictor = tuner.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.c4.xlarge',
    endpoint_name = 'linear-learner-regressor-aws',
    model_name = 'linear-learner-regressor',
)

In [None]:
## Locating the uploaded test file in the s3 bucket
test_file_key = 'regression/test/test.csv'
test_bucket_object = bucket.Object(test_file_key)
test_file_object = test_bucket_object.get()
test_file_content_stream = test_file_object.get('Body')

## Reading test csv file (it was written without a header row)
test = pd.read_csv(test_file_content_stream, header = None)

## First column holds the target; the remaining columns are the features
Y_test = test.loc[:, 0]
X_test = test.drop(columns = test.columns[0])

def np2csv(arr):
    """Serialize an array-like to comma-separated text.

    Values are formatted with '%g' and rows are separated by newlines;
    trailing whitespace (including the final newline) is stripped.

    Parameters
    ----------
    arr : array-like
        1-D or 2-D data accepted by ``np.savetxt``.

    Returns
    -------
    str
        The CSV text.
    """
    buffer = io.BytesIO()
    np.savetxt(buffer, arr, fmt = '%g', delimiter = ',')
    text = buffer.getvalue().decode()
    return text.rstrip()

## Client used to invoke the deployed endpoint
runtime = boto3.client('runtime.sagemaker')

## Sending the test features (csv-encoded) to the endpoint in a single request
test_payload = np2csv(X_test)
test_response = runtime.invoke_endpoint(EndpointName = linear_predictor.endpoint_name,
                                        ContentType = 'text/csv',
                                        Body = test_payload)

## Parsing the json response and extracting the 'score' of each prediction
test_response_json = json.loads(test_response['Body'].read().decode())
test_results = np.array([r['score'] for r in test_response_json['predictions']])

## One prediction is expected per test row
assert len(test_results) == len(Y_test), 'number of predictions does not match number of test rows'

## Creating a data-frame to store predictions
test_preds = pd.DataFrame({'Y_actual': Y_test, 'Y_pred': test_results})

## Computing mse
mse = mean_squared_error(test_preds['Y_actual'], test_preds['Y_pred'])
mse

In [None]:
## Deleting the endpoint so it stops incurring charges.
## Uses `endpoint_name` (as the invocation cell does) instead of the old
## `endpoint` attribute, and reuses the existing session.
sess.delete_endpoint(linear_predictor.endpoint_name)