Using Simple Linear Regression on SageMaker & SageMaker SDK

In [None]:
#importing 

In [6]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [3]:
# Load the fuel-economy dataset (CSV in the working directory) and preview the first rows

df = pd.read_csv("FuelEconomy.csv")
df.head(n=5)

Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739


In [45]:
# Split the frame into the model input X and the target y.
# Both are cast to float32, the format the data needs to be in for SageMaker.

# Double-bracket selection keeps X two-dimensional (a feature matrix)
X = df[['Horse Power']].to_numpy(dtype='float32')
# Single-bracket selection makes y one-dimensional (a target vector)
y = df['Fuel Economy (MPG)'].to_numpy(dtype='float32')
       

In [46]:
# Split train/test, holding out 20% of the rows for testing.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and therefore the same uploaded data, model and predictions).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Sanity check: for SageMaker the target must be a vector, the features a 2-D matrix
print(y_train.shape)
print(X_train.shape)
type(X_train)

(80,)
(80, 1)


numpy.ndarray

In [47]:
# Boto3 is the Amazon Web Services (AWS) SDK for Python; it lets Python code
# use services such as Amazon S3 and Amazon EC2.
import sagemaker
import boto3

# Step 1: create a SageMaker session
sagemaker_session = sagemaker.Session()

# Step 2: choose where the work is saved in S3
bucket = "udemy-linear-learner"      # this bucket must be created before running this code
prefix = "linear-learner-capstone"   # sub-folder that will be created by the code

# Step 3: get the role that is executing this session
role = sagemaker.get_execution_role()





In [48]:
# Libraries used to prepare the data for SageMaker
import io
import sagemaker.amazon.common as smac

# From the AWS Udemy course:
# convert the numpy training arrays to RecordIO format, the format required by
# SageMaker Linear Learner, writing the result into an in-memory byte buffer.
buff = io.BytesIO()
smac.write_numpy_to_dense_tensor(buff, X_train, labels=y_train)

# Writing advances the buffer's stream position; move it back to the start so
# that whatever reads the buffer next starts from the first byte.
buff.seek(0, io.SEEK_SET)

0

In [49]:
# Upload the serialized training data to S3 under <prefix>/train/<file_name>

import os  # not needed by this cell any more; kept so any cell relying on `os` being imported here still works

file_name = 'linear-train-capstone'

# S3 object keys use '/' as the separator regardless of the local OS, so the
# key is built explicitly instead of with os.path.join (which would produce
# backslashes on Windows).
train_key = f'{prefix}/train/{file_name}'

boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buff)

# Print the path so we can find our files later. It is derived from the same
# key that was uploaded, so the printed URI cannot drift from the real object.
s3_data_train = f's3://{bucket}/{train_key}'
print(f'training data path:{s3_data_train}')
                                           


training data path:s3://udemy-linear-learner/linear-learner-capstone/train/linear-train-capstone


In [50]:
# Same conversion for the test data: write the test arrays into a fresh
# in-memory buffer, then move the stream position back to the start.

buff = io.BytesIO()
smac.write_numpy_to_dense_tensor(buff, X_test, labels=y_test)
buff.seek(0, io.SEEK_SET)

0

In [51]:
# Upload the serialized test data to S3 under <prefix>/test/<file_name>

file_name = 'linear-test-capstone'

# S3 object keys use '/' as the separator regardless of the local OS, so the
# key is built explicitly rather than with an OS-dependent path join.
test_key = f'{prefix}/test/{file_name}'

boto3.resource('s3').Bucket(bucket).Object(test_key).upload_fileobj(buff)

# Print the path so we can find our files later; it is derived from the same
# key that was uploaded.
s3_data_test = f's3://{bucket}/{test_key}'
print(f'testing data path:{s3_data_test}')
 

testing data path:s3://udemy-linear-learner/linear-learner-capstone/test/linear-test-capstone


In [52]:
# S3 location (same bucket and prefix) where the linear learner output will be stored

output_location = f's3://{bucket}/{prefix}/output'
print(f'Training artifacts will be uploaded to: {output_location}')

Training artifacts will be uploaded to: s3://udemy-linear-learner/linear-learner-capstone/output


In [53]:
# Obtain the 'linear-learner' container image for the current boto3 region from the SageMaker SDK

container = sagemaker.image_uris.retrieve(
    framework='linear-learner',
    region=boto3.Session().region_name,
)



In [54]:
# Instantiate the estimator, set its hyperparameters, and launch training.

# Constructor reference, from the AWS documentation:
# class sagemaker.estimator.EstimatorBase(role, instance_count=None, instance_type=None, keep_alive_period_in_seconds=None,
# volume_size=30, volume_kms_key=None, max_run=86400, input_mode='File', output_path=None, output_kms_key=None,
# base_job_name=None, sagemaker_session=None, tags=None, subnets=None, security_group_ids=None, model_uri=None,
# model_channel_name='model', metric_definitions=None, encrypt_inter_container_traffic=False, use_spot_instances=False,
# max_wait=None, checkpoint_s3_uri=None, checkpoint_local_path=None, rules=None, debugger_hook_config=None,
# tensorboard_output_config=None, enable_sagemaker_metrics=None, enable_network_isolation=False, profiler_config=None,
# disable_profiler=False, environment=None, max_retry_attempts=None, source_dir=None, git_config=None,
# hyperparameters=None, container_log_level=20, code_location=None, entry_point=None, dependencies=None,
# instance_groups=None, **kwargs)

linear_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_location,      # training artifacts are written here
    sagemaker_session=sagemaker_session,
)

# Hyperparameters for the training job
hyperparameters = {
    'feature_dim': 1,                 # X has a single column ('Horse Power')
    'predictor_type': 'regressor',
    'mini_batch_size': 5,
    'epochs': 4,
    'num_models': 20,
    'loss': 'absolute_loss',
}
linear_model.set_hyperparameters(**hyperparameters)

# Fit on the training data uploaded to S3 (only the 'train' channel is supplied)
linear_model.fit(inputs={'train': s3_data_train})


2022-11-25 17:23:30 Starting - Starting the training job...ProfilerReport-1669397010: InProgress
...
2022-11-25 17:24:14 Starting - Preparing the instances for training......
2022-11-25 17:25:21 Downloading - Downloading input data...
2022-11-25 17:25:55 Training - Downloading the training image............
2022-11-25 17:27:57 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/25/2022 17:27:45 INFO 139838445905728] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'i

Deploying Model


In [55]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

In [56]:
# Deploy the trained model to one ml.m4.xlarge instance.
# Requests are serialized as CSV and responses are deserialized from JSON.
# NOTE(review): no delete_endpoint() call is visible in this section — confirm
# the endpoint is cleaned up once the predictions have been collected.
linear_regressor = linear_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

--------!

In [58]:
# Send the held-out test features to the deployed endpoint and keep the deserialized response
result = linear_regressor.predict(data=X_test)

In [59]:
# Flatten the records stored under the response's 'predictions' key into a DataFrame
pred_df = pd.json_normalize(data=result, record_path='predictions')

In [60]:
# Preview the first predictions
pred_df.head(n=5)

Unnamed: 0,score
0,22.294613
1,23.806149
2,25.494226
3,31.493713
4,21.038033


In [None]:
# Visualize test-set results: actual values (blue points) against the
# endpoint's predictions (red line).
plt.figure(figsize = (10, 6))
plt.scatter(X_test, y_test, color = 'blue', label = 'Actual')
plt.plot(X_test, pred_df['score'], color = 'red', label = 'Predicted')
plt.xlabel('Horse Power')
# Label the y-axis with the name of the target column so the figure stands on its own
plt.ylabel('Fuel Economy (MPG)')
plt.title('Fuel Economy vs. Horse Power (test set)')
plt.legend()

plt.grid()
plt.show()