In [2]:
import pandas as pd
import numpy as np
import time
import boto3

import sagemaker
import io

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sagemaker.predictor import csv_serializer, json_deserializer

In [3]:
# setup variables to point to s3
filename = 'iris_recordio_train.data'
bucket = 'lkurgan'
data_dir = 'dataset'
dataset_name = 'iris.data'
raw_prefix = 'raw'
train_prefix = 'train'
output_prefix = 'output'
train_path = f"{train_prefix}/{filename}"
s3_train_data = f's3://{bucket}/{train_prefix}'
output_location = f's3://{bucket}/{output_prefix}'

In [4]:
%env DATA_DIR=$data_dir
%env S3_DATA_BUCKET_NAME = $bucket/$raw_prefix
%env DATASET_NAME = $dataset_name
%env TRAINING_PATH = $bucket/$train_prefix

env: DATA_DIR=dataset
env: S3_DATA_BUCKET_NAME=lkurgan/raw
env: DATASET_NAME=iris.data
env: TRAINING_PATH=lkurgan/train


In [5]:
!aws s3 cp s3://$S3_DATA_BUCKET_NAME/$DATASET_NAME ./$DATA_DIR/

download: s3://lkurgan/raw/iris.data to dataset/iris.data      


In [6]:
# loading the data
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

df = pd.read_csv(f"{data_dir}/iris.data", header=None, names = columns)
df['class'].replace(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], [1, 2, 3], inplace=True)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [7]:
# Spliting to train and test
x_train, x_test, y_train, y_test = train_test_split(
    df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']],
    df['class'], random_state = 5)

In [8]:
buf = io.BytesIO()
sagemaker.amazon.common.write_numpy_to_dense_tensor(buf, 
                                                    np.array(x_train).astype('float32'),
                                                    np.array(y_train).astype('float32'))
buf.seek(0)
boto3.resource('s3').Bucket(bucket).Object(f'{train_path}').upload_fileobj(buf)



In [9]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(boto3.Session().region_name, 'knn')
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

# create knn estimator
knn = sagemaker.estimator.Estimator(container,
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m5.4xlarge',
                                    output_path=output_location,
                                    sagemaker_session=sess)
# set hyperparameters
knn.set_hyperparameters(predictor_type='classifier',
                           feature_dim=4,
                           k=3,
                           sample_size=x_train.shape[0])

knn.fit({'train': s3_train_data},  job_name=f"iris-job-{int(time.time())}")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2022-05-12 17:07:46 Starting - Starting the training job...
2022-05-12 17:08:10 Starting - Preparing the instances for trainingProfilerReport-1652375266: InProgress
......
2022-05-12 17:09:12 Downloading - Downloading input data...
2022-05-12 17:09:30 Training - Downloading the training image............
2022-05-12 17:11:33 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/12/2022 17:11:35 INFO 140561448105792] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'_kvstore': 'dist_async', '_log_level': 'info', '_num_gpus': 'auto', '_num_kv_servers': '1', '_tuning_objective_metric': '', '_faiss_index_nprobe': '5', 'epochs': '1', 'feature_dim': 'auto', 'faiss_index_ivf_nlists': 'auto', 'index_metric': 'L2', 'index_type': 'faiss.Flat', 'mini_batch_size': '5000', '_enable_profiler': 'false

In [12]:
# deploy the model
knn_predictor = knn.deploy(initial_instance_count=1,
                       instance_type='ml.t2.medium',
                       endpoint_name="iris2-endpoint")
knn_predictor.serializer = csv_serializer
knn_predictor.deserializer = json_deserializer

----------------!

In [13]:
result = knn_predictor.predict(x_test.values)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [21]:
result = [x['predicted_label'] for x in result["predictions"]]

TypeError: list indices must be integers or slices, not str

In [20]:
print("Accuracy:", metrics.accuracy_score(y_test, result))

Accuracy: 0.9473684210526315


In [23]:
sagemaker.Session().delete_endpoint(knn_predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
