In [2]:
import boto3
import sagemaker

In [3]:
from sagemaker import get_execution_role
# Execution role for this notebook; it is passed as `role` to the processor and
# estimator created further down.
role = get_execution_role()
role

'arn:aws:iam::257548842387:role/service-role/AmazonSageMaker-ExecutionRole-20230618T190858'

In [4]:
# Region name of the default boto3 session.
region = boto3.session.Session().region_name
region

'ap-south-1'

In [5]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor used below to run preprocessing.py on a single instance.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_type='ml.t3.medium',
    instance_count=1,
)

In [6]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [7]:
import pandas as pd
# Load the abalone CSV straight from S3 into a DataFrame.
d = pd.read_csv("s3://preprocessoroutputs/abalone.csv")

In [8]:
# Preview the first rows of the loaded frame.
d.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [9]:
# Save a local copy. `d` already carries the previously saved index as the
# "Unnamed: 0" column, so index=False avoids stacking yet another index column
# on every read/write round trip.
d.to_csv("abalone.csv", index=False)

In [10]:
input_data = "s3://preprocessoroutputs/abalone.csv"

# The raw CSV is made available to the job under /opt/ml/processing/input.
processing_inputs = [
    ProcessingInput(
        source=input_data,
        destination='/opt/ml/processing/input',
    ),
]

# Job-side directories and the S3 locations they are uploaded to.
processing_outputs = [
    ProcessingOutput(
        output_name='train_data',
        source='/opt/ml/processing/train',
        destination="s3://preprocessoroutputs/train",
    ),
    ProcessingOutput(
        output_name='test_data',
        source='/opt/ml/processing/test',
        destination="s3://preprocessoroutputs/test",
    ),
]

# Run preprocessing.py as a processing job; the split ratio is forwarded to the
# script as a command-line argument.
sklearn_processor.run(
    code='preprocessing.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=['--train-test-split-ratio', '0.3'],
)

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2023-06-24-07-04-07-775


  import imp[0m
  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'[0m
[34mrecieved args: Namespace(train_test_split_ratio=0.3)[0m
[34msplit: 0.3[0m
[34mSaving features:[0m



In [11]:
# Describe the most recent job recorded on the processor (last entry of `.jobs`).
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

In [12]:
# Inspect the full job description.
preprocessing_job_description

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://preprocessoroutputs/abalone.csv',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-ap-south-1-257548842387/sagemaker-scikit-learn-2023-06-24-07-04-07-775/input/code/preprocessing.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'train_data',
    'S3Output': {'S3Uri': 's3://preprocessoroutputs/train',
     'LocalPath': '/opt/ml/processing/train',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False},
   {'OutputName': 'test_data',
    'S3Output': {'S3Uri': 's3://

In [13]:
# Keep only the output section of the job description.
output_config = preprocessing_job_description["ProcessingOutputConfig"]

In [14]:
# Inspect the configured outputs.
output_config

{'Outputs': [{'OutputName': 'train_data',
   'S3Output': {'S3Uri': 's3://preprocessoroutputs/train',
    'LocalPath': '/opt/ml/processing/train',
    'S3UploadMode': 'EndOfJob'},
   'AppManaged': False},
  {'OutputName': 'test_data',
   'S3Output': {'S3Uri': 's3://preprocessoroutputs/test',
    'LocalPath': '/opt/ml/processing/test',
    'S3UploadMode': 'EndOfJob'},
   'AppManaged': False}]}

In [15]:
# Index the job outputs by name so each S3 URI is looked up explicitly.
# A missing output now raises KeyError here instead of leaving the variable
# unset (or silently holding a stale value from an earlier run of the kernel).
s3_uri_by_output = {
    output['OutputName']: output['S3Output']['S3Uri']
    for output in output_config['Outputs']
}
preprocessed_training_data = s3_uri_by_output['train_data']
preprocessed_test_data = s3_uri_by_output['test_data']

In [16]:
# S3 URI of the 'train_data' output.
preprocessed_training_data

's3://preprocessoroutputs/train'

In [17]:
# S3 URI of the 'test_data' output.
preprocessed_test_data

's3://preprocessoroutputs/test'

In [18]:
# Training features; the file is read without a header row.
x = pd.read_csv(
    's3://preprocessoroutputs/train/Xtrain.csv',
    header=None,
)

# Training labels.
# NOTE(review): ytrain.csv is read from the test/ prefix, not train/ --
# presumably preprocessing.py writes the label files there; confirm.
y = pd.read_csv(
    's3://preprocessoroutputs/test/ytrain.csv',
    header=None,
)


In [19]:
# Preview the first rows of the training features.
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,0.008394,0.222908,-0.107991,0.030098,0.32952,-0.005416,-0.411901
1,1,-0.782755,-0.835261,-0.94487,-0.946795,-0.909574,-0.895005,-0.961525
2,2,-0.574558,-0.583316,-0.825316,-0.841763,-0.887045,-0.762708,-0.67414
3,2,-2.656529,-2.548487,-2.259966,-1.603495,-1.542638,-1.597553,-1.60455
4,0,0.549706,0.575631,0.48978,0.554234,0.660696,0.656073,0.403555


In [20]:
# Display the training labels.
y

Unnamed: 0,0
0,9
1,7
2,11
3,7
4,13
...,...
2918,9
2919,12
2920,11
2921,9


In [21]:
import io
import numpy as np
import sagemaker.amazon.common as smac

In [22]:
# float32 copies of the feature and label values for serialization below.
vectors = x.values.astype("float32")
labels = y.values.astype("float32")

In [23]:
# Flatten the labels to one dimension.
labels = labels.reshape(-1)

In [24]:
# Serialize features and labels into an in-memory buffer; the buffer is
# uploaded to S3 as the training data in a later cell.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, vectors, labels)

In [25]:
# Rewind to the start of the buffer so the next read begins at byte 0.
buf.seek(0)

0

In [26]:
from sagemaker import get_execution_role

# Execution role plus the S3 bucket/prefix used for the kNN training data.
role = get_execution_role()
bucket = 'preprocessoroutputs'
prefix = 'knn-trial'

In [27]:
import os
key = 'recordio-pb-data'
# S3 object keys always use '/', independent of the local OS path separator,
# so build the key once and reuse it for both the upload and the URI.
train_object_key = '{}/train/{}'.format(prefix, key)
# Rewind before uploading: upload_fileobj reads from the current position, so
# re-running this cell without rewinding would upload an empty object.
buf.seek(0)
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://preprocessoroutputs/knn-trial/train/recordio-pb-data


In [28]:
from sagemaker.amazon.amazon_estimator import get_image_uri

In [29]:
# SageMaker session reused by the estimator and predictor below.
sess = sagemaker.Session()
sess

<sagemaker.session.Session at 0x7f33b2e27a50>

In [30]:
from sagemaker import image_uris

# SDK v2 API: image_uris.retrieve replaces the deprecated get_image_uri, and
# instance_count / instance_type replace the deprecated train_* keyword names.
container = image_uris.retrieve(framework="knn", region=boto3.Session().region_name)

# NOTE(review): output_path uses a "vehicle/" prefix while the training data
# lives under "knn-trial/" -- looks like a leftover from another project; confirm.
knn = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://preprocessoroutputs/vehicle/model/",
    sagemaker_session=sess,
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [31]:
# kNN settings: 5 neighbours, regression output, 700 sampled training points.
knn_hyperparameters = {
    "k": 5,
    "predictor_type": "regressor",
    "sample_size": 700,
}
knn.set_hyperparameters(**knn_hyperparameters)

In [32]:
# Start the training job, passing the uploaded data as the 'train' channel.
knn.fit({'train': s3_train_data})

INFO:sagemaker:Creating training-job with name: knn-2023-06-24-07-22-11-519


2023-06-24 07:22:11 Starting - Starting the training job...
2023-06-24 07:22:25 Starting - Preparing the instances for training......
2023-06-24 07:23:34 Downloading - Downloading input data...
2023-06-24 07:23:54 Training - Downloading the training image..................
2023-06-24 07:27:05 Training - Training image download completed. Training in progress....
2023-06-24 07:27:40 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/24/2023 07:27:28 INFO 140523072866112] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'_kvstore': 'dist_async', '_log_level': 'info', '_num_gpus': 'auto', '_num_kv_servers': '1', '_tuning_objective_metric': '', '_faiss_index_nprobe': '5', 'epochs': '1', 'feature_dim': 'auto', 'faiss_index_ivf_nlists': 'auto', 'index_metric': 'L2', 'index_type': 'faiss.Flat', 'mini_batch_siz

In [33]:
endpoint_name = 'abaloneage-endpoint-1a'

# Deploy the trained model to a single-instance endpoint with a fixed name.
abalone_predictor = knn.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: knn-2023-06-24-07-29-08-435
INFO:sagemaker:Creating endpoint-config with name abaloneage-endpoint-1a
INFO:sagemaker:Creating endpoint with name abaloneage-endpoint-1a


-----------!

In [34]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# SDK v2 API: Predictor replaces the deprecated RealTimePredictor alias, and the
# request content type comes from the serializer rather than a `content_type`
# keyword. Requests are sent as CSV; responses are parsed from JSON.
abalone_predictor = Predictor(
    endpoint_name,
    sagemaker_session=sess,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [35]:
# Test features, read without a header row.
# NOTE(review): Xtest.csv is read from the train/ prefix -- presumably
# preprocessing.py writes all feature files there; confirm.
test_vectors = pd.read_csv("s3://preprocessoroutputs/train/Xtest.csv", header=None)

In [36]:
# Preview the first rows of the test features.
test_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,2,0.674624,0.474853,0.48978,0.560353,0.277704,1.103149,0.619093
1,2,0.549706,0.323686,0.250672,0.089242,0.124506,0.313924,0.044323
2,0,0.29987,0.374075,1.326659,0.310521,-0.244969,0.400602,0.69094
3,0,0.924461,0.827576,0.728888,0.882584,0.807135,0.783809,1.014248
4,2,-0.408,-0.230593,0.131117,-0.430816,-0.560375,-0.662344,-0.171216


In [37]:
# Convert the test features to a NumPy array. np.asarray is a no-op when the
# name already holds an ndarray, so re-running this cell no longer fails with
# AttributeError the way `.values` on the rebound name did.
test_vectors = np.asarray(test_vectors)

In [38]:
# Look at a single test row (index 2).
test_vectors[2]

array([ 0.        ,  0.29986958,  0.37407537,  1.32665906,  0.31052096,
       -0.2449688 ,  0.40060164,  0.69093973])

In [39]:
# Smoke-test the endpoint with one row and print the raw response.
print(abalone_predictor.predict(test_vectors[2]))

{'predictions': [{'predicted_label': 15.2}]}


In [40]:
# (rows, columns) of the test feature array.
test_vectors.shape

(1254, 8)

In [41]:
import numpy as np

# Number of rows sent per request. Batching avoids one HTTP round trip per row
# while keeping each request body bounded.
PREDICT_BATCH_SIZE = 500

# Iterate over however many rows test_vectors actually has (previously the row
# count 1254 was hard-coded) and collect one predicted label per row, in order.
predictions = []
for start in range(0, len(test_vectors), PREDICT_BATCH_SIZE):
    batch = test_vectors[start:start + PREDICT_BATCH_SIZE]
    # A 2-D batch is serialized by CSVSerializer as one CSV line per row.
    result = abalone_predictor.predict(batch)
    predictions.extend(r['predicted_label'] for r in result['predictions'])

predictions = np.array(predictions)


In [42]:
# Display the collected predictions.
predictions

array([11.4,  9.4, 15.2, ..., 10. ,  9.8,  8.4])

In [43]:
# Wrap the predictions in a single-column DataFrame named "pred".
predictions = pd.DataFrame(predictions, columns=["pred"])

In [44]:
# Preview the first predictions.
predictions.head()

Unnamed: 0,pred
0,11.4
1,9.4
2,15.2
3,10.0
4,13.2


In [45]:
# Labels to compare the predictions against, read without a header row
# (presumably the held-out targets written by preprocessing.py -- confirm).
original = pd.read_csv("s3://preprocessoroutputs/test/ytest.csv", header=None)

In [46]:
from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination of the predictions against the reference labels.
score = r2_score(y_true=original, y_pred=predictions)
score

0.44170384095794357

In [47]:
# Root-mean-squared error. The earlier `squared=False` call already returned
# the RMSE, so the old name `mse` was misleading; taking the square root
# explicitly also avoids the `squared` keyword, which newer scikit-learn
# releases deprecate.
rmse = np.sqrt(mean_squared_error(original, predictions))
rmse

2.3810187732825803

In [48]:
# Tear down the endpoint so it stops incurring charges. `endpoint_name` is the
# SDK v2 attribute; `.endpoint` is a deprecated alias that emits a warning.
sess.delete_endpoint(abalone_predictor.endpoint_name)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: abaloneage-endpoint-1a
