In [1]:
import pandas as pd
import numpy as np

# Load the petrol-consumption dataset from the notebook's working directory.
DATASET_CSV = "petrol_consumption.csv"
df = pd.read_csv(DATASET_CSV)

# Preview the first five rows to sanity-check the column layout.
df.head(n=5)

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [2]:
# Split the data 80-20 by row order; the held-out 20% is used for batch
# inference later in the notebook.
TRAIN_FRACTION = 0.8

# A single split index is shared by both slices so that no row can fall
# between them (the earlier [:35] / [36:] bounds silently dropped row 35).
split_index = int(len(df) * TRAIN_FRACTION)
train = df.iloc[:split_index, :]
test = df.iloc[split_index:, :]

# Every row must land in exactly one of the two splits.
assert len(train) + len(test) == len(df)

# Write both splits to local CSV files (header kept, index omitted).
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [3]:
# Open a SageMaker session; its upload_data helper is used below to push the
# local CSV files to S3.
import boto3
import sagemaker

sagemaker_session = sagemaker.Session()

# Key prefix shared by both uploads. No bucket is passed to upload_data, so
# presumably the session's default bucket is used -- confirm if it matters.
prefix = "sklearn-petrol-data"
training_prefix = prefix + '/training'
test_prefix = prefix + '/test'

# Upload the training split first, then the test split that the batch
# inference step reads later on. Each call returns the path of the upload.
training_input_path = sagemaker_session.upload_data('train.csv', key_prefix=training_prefix)
test_data_path = sagemaker_session.upload_data('test.csv', key_prefix=test_prefix)

In [8]:
import sagemaker

# Report the AWS region that a freshly created SageMaker session resolves to.
region_session = sagemaker.Session()
region = region_session.boto_region_name
print("AWS Region: {}".format(region))

# Look up the execution role; it is handed to the estimator further down.
role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

AWS Region: us-east-2
RoleArn: arn:aws:iam::126500756700:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole


In [None]:
#Docs: https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html
from sagemaker.sklearn import SKLearn

# All estimator settings gathered in one mapping so they can be read (and
# tweaked) at a glance before the estimator is constructed.
estimator_settings = dict(
    entry_point='train.py',
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    py_version='py3',
    framework_version='0.23-1',
    script_mode=True,
    # presumably consumed by train.py as a command-line argument -- verify there
    hyperparameters={'estimators': 20},
)
sk_estimator = SKLearn(**estimator_settings)

# Launch training, passing the uploaded training CSV under the 'train' channel.
training_channels = {'train': training_input_path}
sk_estimator.fit(training_channels)

2022-07-28 04:51:19 Starting - Starting the training job...
2022-07-28 04:51:46 Starting - Preparing the instances for trainingProfilerReport-1658983878: InProgress
.........
2022-07-28 04:53:02 Downloading - Downloading input data.

In [None]:
def input_fn(input_data, content_type):
    """Parse a CSV request body into the model's feature DataFrame.

    Args:
        input_data: The body of the request sent to the model. Expected to be
            CSV text whose first line is a header row; ``bytes`` /
            ``bytearray`` payloads are accepted too and decoded as UTF-8.
        content_type: (string) format of the request body. Only
            ``"text/csv"`` is handled.

    Returns:
        pandas.DataFrame containing only the four feature columns, in a fixed
        order; any other columns present in the CSV are discarded.

    Raises:
        ValueError: if ``content_type`` is anything other than ``"text/csv"``.
        KeyError: if the CSV is missing one of the feature columns.
    """
    # Imported locally because nothing else in the notebook brings StringIO
    # into scope; without it the read_csv call below raises NameError.
    from io import StringIO

    if content_type == "text/csv":
        # Normalise binary payloads to text so StringIO can wrap them.
        if isinstance(input_data, (bytes, bytearray)):
            input_data = input_data.decode("utf-8")

        # Read the raw input data as CSV.
        frame = pd.read_csv(StringIO(input_data))

        # Keep only the feature columns, dropping e.g. the saved index column
        # and the Petrol_Consumption target when they are present.
        frame = frame[['Petrol_tax', 'Average_income', 'Paved_Highways', 'Population_Driver_licence(%)']]
        return frame
    else:
        raise ValueError("{} not supported by script!".format(content_type))

In [None]:
# Derive a batch Transformer from the trained SKLearn estimator. The settings
# are spelled out one per entry: a single ml.m5.xlarge instance, results
# assembled with "Line", and text/csv requested as the response format.
transform_settings = {
    "instance_count": 1,
    "instance_type": "ml.m5.xlarge",
    "assemble_with": "Line",
    "accept": "text/csv",
}
transformer = sk_estimator.transformer(**transform_settings)

In [None]:
# Run batch inference over the uploaded test CSV.
transformer.transform(test_data_path, content_type="text/csv")

# Announce which job is being awaited, then block until it finishes.
job_name = transformer.latest_transform_job.job_name
print("Waiting for transform job: " + job_name)
transformer.wait()

# Keep the location the transformer reports for its results.
output = transformer.output_path

In [None]:
import boto3
client = boto3.client('sagemaker')

output_path = client.describe_transform_job(TransformJobName = "Enter your transform job name from console")['TransformOutput']['S3OutputPath']
output_path

#emits a file called output.csv to your local directory
!aws s3 cp 'Replace with your S3 output path' output.csv
results = pd.read_csv('output.csv')
results