In [2]:
import boto3
import sagemaker

In [3]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")


In [4]:
from sagemaker import get_execution_role
role = get_execution_role()
role

'arn:aws:iam::257548842387:role/service-role/AmazonSageMaker-ExecutionRole-20230618T190858'

In [5]:
region = boto3.session.Session().region_name
region

'ap-south-1'

In [6]:
from sagemaker.sklearn.processing import SKLearnProcessor
sklearn_processor = SKLearnProcessor(framework_version='0.20.0', role=role, instance_type='ml.m5.xlarge', instance_count=1)

In [7]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [8]:
import pandas as pd
d = pd.read_csv("s3://preprocessoroutputs/heart.csv")

In [None]:
d.head()

In [None]:
d.to_csv("heart.csv")

In [None]:
input_data = "s3://preprocessoroutputs/heart.csv"

sklearn_processor.run(
    code='preprocessing.py',
    inputs = [
        ProcessingInput(source=input_data, 
                        destination = '/opt/ml/processing/input')],
    outputs = [
        ProcessingOutput(output_name = 'train_data',
                         source = '/opt/ml/processing/train',
                         destination = "s3://preprocessoroutputs/"),
        
        ProcessingOutput(output_name = 'test_data',
                         source = '/opt/ml/processing/test',
                         destination = "s3://preprocessoroutputs/")],
    
    arguments = ['--train-test-split-ratio','0.2']
)

In [None]:
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

In [None]:
preprocessing_job_description

In [None]:
output_config = preprocessing_job_description["ProcessingOutputConfig"]

In [None]:
output_config

In [None]:
for output in output_config['Outputs']:
    if output['OutputName'] == 'train_data':
        preprocessed_training_data = output['S3Output']['S3Uri']
    if output['OutputName'] == 'test_data':
         preprocessed_test_data = output['S3Output']['S3Uri']

In [None]:
preprocessed_training_data

In [None]:
preprocessed_test_data

In [None]:
pd.read_csv(preprocessed_test_data+"Xtest.csv", header=None)

In [9]:
x_train = pd.read_csv("s3://preprocessoroutputs/Xtrain.csv", header=None)

In [10]:
y_train = pd.read_csv("s3://preprocessoroutputs/ytrain.csv", header=None)

In [12]:
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.599275,1,2,-0.777449,0.052511,0,0,0.419847,0,0.129032,2,3,2
1,1.054918,0,0,2.763755,0.454338,0,1,0.633588,1,0.000000,2,0,2
2,-0.047877,0,2,-1.348611,0.321918,0,0,0.732824,0,0.000000,2,0,2
3,0.503520,1,0,0.193526,0.246575,0,1,0.687023,0,0.080645,1,0,3
4,-0.378716,1,2,-0.377636,0.271689,1,0,0.725191,0,0.387097,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
815,-1.481512,1,2,-0.092055,0.200913,0,0,0.740458,0,0.322581,1,0,2
816,0.724079,1,0,0.479107,0.184932,0,0,0.511450,1,0.306452,2,1,3
817,-0.378716,1,0,0.479107,0.394977,0,1,0.778626,1,0.258065,2,0,3
818,-1.260953,1,0,-1.234378,0.194064,0,1,0.687023,0,0.000000,2,0,3


In [13]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.599275,1,2,-0.777449,0.052511,0,0,0.419847,0,0.129032,2,3,2
1,1.054918,0,0,2.763755,0.454338,0,1,0.633588,1,0.0,2,0,2
2,-0.047877,0,2,-1.348611,0.321918,0,0,0.732824,0,0.0,2,0,2
3,0.50352,1,0,0.193526,0.246575,0,1,0.687023,0,0.080645,1,0,3
4,-0.378716,1,2,-0.377636,0.271689,1,0,0.725191,0,0.387097,1,0,2


In [14]:
x_train.shape

(820, 13)

In [15]:
import io
import numpy as np
import sagemaker.amazon.common as smac

In [16]:
vectors = np.array(x_train.values, dtype="float32")
labels = np.array(y_train.values, dtype="float32")

In [19]:
labels = labels.reshape(-1)

In [20]:
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, vectors, labels)

In [21]:
buf.seek(0)

0

In [22]:
from sagemaker import get_execution_role
bucket = 'preprocessoroutputs'
prefix = 'linear-learner'
role = get_execution_role()

In [23]:
import os
key = 'recordio-pb-data'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://preprocessoroutputs/linear-learner/train/recordio-pb-data


In [24]:
from sagemaker.amazon.amazon_estimator import get_image_uri

In [25]:
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [26]:
sess = sagemaker.Session()

In [27]:
linear = sagemaker.estimator.Estimator(container, role, train_instance_count=1, train_instance_type="ml.m4.xlarge", output_path="s3://preprocessoroutputs/model/", sagemaker_session=sess)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [28]:
linear.set_hyperparameters(feature_dim=13,
 predictor_type='binary_classifier',
 mini_batch_size=100)

In [29]:
linear.fit({'train': s3_train_data})

INFO:sagemaker:Creating training-job with name: linear-learner-2023-06-22-06-50-50-257


2023-06-22 06:50:50 Starting - Starting the training job...
2023-06-22 06:51:15 Starting - Preparing the instances for training.........
2023-06-22 06:52:44 Downloading - Downloading input data
2023-06-22 06:52:44 Training - Downloading the training image...............
2023-06-22 06:55:20 Training - Training image download completed. Training in progress....
2023-06-22 06:55:51 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/22/2023 06:55:41 INFO 139895614142272] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uni

In [30]:
endpoint_name = 'linear-endpoint1'
linear_predictor = linear.deploy(initial_instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       endpoint_name=endpoint_name)

INFO:sagemaker:Creating model with name: linear-learner-2023-06-22-06-59-28-839
INFO:sagemaker:Creating endpoint-config with name linear-endpoint1
INFO:sagemaker:Creating endpoint with name linear-endpoint1


-------!

In [44]:
from sagemaker.predictor import RealTimePredictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

linear_predictor = RealTimePredictor(endpoint_name, sess, content_type='text/csv')

linear_predictor.serializer = CSVSerializer()
linear_predictor.deserializer = JSONDeserializer()


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [45]:
test_vectors = pd.read_csv("s3://preprocessoroutputs/Xtest.csv", header=None)

In [46]:
test_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.834359,0,0,-0.434752,0.189498,0,1,0.70229,0,0.0,2,0,2
1,-0.158157,0,2,-0.206287,0.205479,0,0,0.335878,0,0.0,2,0,0
2,0.062402,1,0,1.621431,0.372146,0,0,0.564885,1,0.129032,1,1,3
3,-0.488996,0,1,-0.663216,0.269406,0,1,0.694656,0,0.177419,2,0,2
4,-0.709555,1,0,-0.092055,0.296804,1,0,0.603053,1,0.0,2,2,3


In [47]:
test_vectors.loc[2]

0     0.062402
1     1.000000
2     0.000000
3     1.621431
4     0.372146
5     0.000000
6     0.000000
7     0.564885
8     1.000000
9     0.129032
10    1.000000
11    1.000000
12    3.000000
Name: 2, dtype: float64

In [48]:
result = linear_predictor.predict(test_vectors.loc[2])
print(result)

{'predictions': [{'score': 0.027960801497101784, 'predicted_label': 0}]}


In [None]:
import logging

logging.getLogger('sagemaker.deprecations').setLevel(logging.WARNING)


In [49]:
test_vectors.shape

(205, 13)

In [50]:
import numpy as np
predictions = []
import warnings

# Filter the deprecation warning

for array in range(205):
    result = linear_predictor.predict(test_vectors.loc[array])
    predictions += [r['predicted_label'] for r in result['predictions']]

predictions = np.array(predictions)


In [51]:
predictions

array([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0])

In [52]:
original = pd.read_csv("s3://preprocessoroutputs/ytest.csv", header=None)

In [53]:
original = np.array(original).reshape(-1)

In [54]:
original

array([1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0])

In [55]:
len(predictions)

205

In [56]:
from sklearn.metrics import accuracy_score, classification_report
score = accuracy_score(original, predictions)
score*100

80.0

In [57]:
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: linear-endpoint1
