In [1]:
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import image_uris, get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.parameter import ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.session import s3_input
from sagemaker.tuner import HyperparameterTuner
from sklearn.model_selection import train_test_split

## SageMaker session shared by the upload calls below
sess = sagemaker.Session()

## Defining the s3 bucket and the key prefix of this project
bucket_name = 'oscarm524-mastering-ml-aws'
prefix = 'regression'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetching the raw csv object from the bucket
file_key = 'regression/raw_data/housing.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Loading the streamed csv into a data-frame
housing = pd.read_csv(file_content_stream)

## MEDV is the target; all remaining columns are inputs
Y = housing['MEDV']
X = housing.drop(columns = ['MEDV'])

## 70% training; the held-out 30% is then halved into validation and testing
X_train, X_holdout, Y_train, Y_holdout = train_test_split(X, Y, test_size = 0.3, random_state = 618, shuffle = True)
X_val, X_test, Y_val, Y_test = train_test_split(X_holdout, Y_holdout, test_size = 0.5, random_state = 620, shuffle = True)

## Putting the target in the first column, followed by the inputs
train = pd.concat([Y_train, X_train], axis = 1)
validation = pd.concat([Y_val, X_val], axis = 1)
test = pd.concat([Y_test, X_test], axis = 1)

## (data-frame, local csv file, s3 key prefix) for each split
splits = [(train, 'train.csv', 'regression/train'),
          (validation, 'validation.csv', 'regression/validation'),
          (test, 'test.csv', 'regression/test')]

## Writing each split locally, without index or header row
for split_frame, csv_name, s3_prefix in splits:
    split_frame.to_csv(csv_name, index = False, header = False)

## Uploading datasets to the s3 bucket (train, validation, test -- in that order)
uploaded_uris = [sess.upload_data(path = csv_name, bucket = bucket_name, key_prefix = s3_prefix)
                 for split_frame, csv_name, s3_prefix in splits]

## Location of the last upload (the test set), shown as the cell output
uploaded_uris[-1]

's3://oscarm524-mastering-ml-aws/regression/test/test.csv'

In [3]:
## Common s3 root of the prepared splits and of the model output
s3_root = 's3://{}/{}'.format(bucket.name, prefix)

train_data = s3_root + '/train'
validation_data = s3_root + '/validation'
test_data = s3_root + '/test'
output_path = s3_root + '/output'

's3://oscarm524-mastering-ml-aws/regression/train'

In [None]:
## Getting linear-learner image for the region of the current boto3 session
container = image_uris.retrieve(region = boto3.Session().region_name, framework = "linear-learner")

## Execution role handed to the estimator
role = get_execution_role()

## Defining the estimator; output_path is the s3 location built from the
## bucket and prefix earlier in the notebook (the previous name,
## output_location, was never defined and raised a NameError)
linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       instance_count = 1,
                                       instance_type = 'ml.c4.xlarge',
                                       output_path = output_path,
                                       sagemaker_session = sess,
                                       )

## feature_dim has to equal the number of input columns in the uploaded csv
## files, so it is read from the training inputs instead of being hard-coded
linear.set_hyperparameters(feature_dim = X_train.shape[1],
                           predictor_type = 'regressor',
                           mini_batch_size = 100)

In [None]:
## Fitting the model on the train channel, with the validation channel alongside.
## TrainingInput is the SageMaker SDK v2 name for what v1 called s3_input; the
## rest of this notebook already relies on v2 (image_uris, instance_count).
linear.fit({'train': TrainingInput(train_data, content_type = 'text/csv'),
            'validation': TrainingInput(validation_data, content_type = 'text/csv')})

In [None]:
## Applying fitted model through a batch-transform job; results are written
## to output_path (the previous name, output_location, was never defined)
linear_learner = linear.transformer(instance_count = 1,
                                   instance_type = 'ml.c4.xlarge',
                                   output_path = output_path
                                   )

## test.csv was written with the target in its first column, so '$[1:]' drops
## that column and only the inputs reach the model. split_type = 'Line' sends
## the file record by record, which is what the filter operates on.
linear_learner.transform(test_data,
                         content_type = 'text/csv',
                         split_type = 'Line',
                         input_filter = '$[1:]')

## Blocking until the transform job has finished
linear_learner.wait()