In [1]:
# S3 location used throughout the notebook: bucket name and key prefix.
bucket, prefix = 'testawslearn', 'git'

# Imports; the IAM execution role is resolved right after them via get_execution_role()
import boto3
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

# IAM role that SageMaker assumes for training and hosting.
role = get_execution_role()

# SageMaker session shared by the Estimator and the endpoint deployment cells.
# Created here so the notebook works on a fresh kernel (Restart & Run All).
sess = sagemaker.Session()

In [2]:

# Load the raw iris data from the configured bucket (reuses `bucket` instead of
# repeating the bucket name, so the two cannot drift apart).
dataset = pd.read_csv('s3://{}/iris.csv'.format(bucket))

In [3]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'variety' column with integer class ids; `le` keeps the mapping
# so the ids can be translated back to the original labels later.
le = LabelEncoder()
dataset = dataset.assign(variety=le.fit_transform(dataset['variety']))

In [4]:
# Move the label column to the front; the training log shows the CSV being read
# with label_column=0, i.e. the label is taken from the first column.
label_first = ['variety'] + [col for col in dataset.columns if col != 'variety']
dataset = dataset.loc[:, label_first]

In [5]:
# Preview the first three rows to confirm that 'variety' is now the leading column.
dataset.head(n=3)

Unnamed: 0,variety,sepal.length,sepal.width,petal.length,petal.width
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2


In [6]:
# Shuffle once with a fixed seed, then cut into 70% train / 20% validation / 10% test.
# Positional slicing replaces np.split(DataFrame, ...), which goes through
# DataFrame.swapaxes (deprecated in recent pandas releases); the resulting
# three frames are the same.
shuffled = dataset.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled))
validation_end = int(0.9 * len(shuffled))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Written without header or index so the files contain only label + feature values.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)


In [7]:
# Upload the local CSV splits to s3://<bucket>/<prefix>/{train,validation}/.
# upload_file() returns None, so its result is deliberately not bound to a name.
# S3 keys always use '/', so they are built with str.format rather than os.path.join.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/train/train.csv'.format(prefix)).upload_file('train.csv')
s3_bucket.Object('{}/validation/validation.csv'.format(prefix)).upload_file('validation.csv')

In [8]:
# Input channels for the training job. Both prefixes end with '/' so that each
# channel matches only objects inside its own folder (a bare '.../train' prefix
# would also match sibling keys such as '.../train_old/...').
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

In [9]:
# Prefix for the training job name; the job name printed further down starts with it.
# NOTE(review): 'lamba' looks like a typo for 'lambda'; kept because existing job
# and endpoint names already use this spelling.
base_job_name = 'iris-lamba-api'

In [16]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the built-in XGBoost container image for the region of the current boto3 session.
# NOTE(review): no explicit version is passed; the SDK output below suggests
# get_image_uri(region, 'xgboost', '0.90-1') — confirm the hyperparameters are
# accepted by that version before switching.
aws_region = boto3.Session().region_name
image_name = get_image_uri(aws_region, 'xgboost')

	get_image_uri(region, 'xgboost', '0.90-1').


In [17]:
# Configure the training job: one ml.m4.xlarge instance running the XGBoost image,
# with model artifacts written under s3://<bucket>/<prefix>/output.
# `sess` is expected to be a sagemaker.Session created earlier in the notebook.
output_s3_uri = 's3://{}/{}/output'.format(bucket, prefix)

estimator = sagemaker.estimator.Estimator(
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_s3_uri,
    base_job_name=base_job_name,
    sagemaker_session=sess,
)

In [20]:
# XGBoost hyperparameters for a 3-class softmax classifier.
# `lambda` (L2 regularisation) is a reserved word in Python, so it is supplied
# through dict unpacking rather than as a plain keyword argument.
estimator.set_hyperparameters(
    alpha=1.448983,
    colsample_bytree=0.6897649,
    eta=0.246274,
    gamma=0.546408,
    max_depth=18,
    min_child_weight=0.00282088,
    num_class=3,
    num_round=8,
    objective='multi:softmax',
    subsample=0.538571908,
    **{'lambda': 0.0003157054}
)

In [21]:

# Launch the training job with the two named input channels.
data_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
estimator.fit(data_channels)

2020-06-07 07:38:57 Starting - Starting the training job...
2020-06-07 07:38:59 Starting - Launching requested ML instances......
2020-06-07 07:40:07 Starting - Preparing the instances for training...
2020-06-07 07:40:58 Downloading - Downloading input data...
2020-06-07 07:41:14 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-06-07:07:41:33:INFO] Running standalone xgboost training.[0m
[34m[2020-06-07:07:41:33:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8469.67mb[0m
[34m[2020-06-07:07:41:33:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:41:33] S3DistributionType set as FullyReplicated[0m
[34m[07:41:33] 105x4 matrix with 420 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-06-07:07:41:34:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:41:33] S3DistributionType set as FullyReplicated[0m
[34m[07:41:34] 30x4 matrix with 120 entri

In [22]:
# Name of the training job that was just run; used below to deploy an endpoint from it.
job_name = estimator.latest_training_job.name
print(job_name)

iris-lamba-api-2020-06-07-07-38-57-724


In [23]:
# Deploy the model produced by `job_name` to an endpoint on one ml.m4.xlarge
# instance. The call returns the endpoint's name.
endpoint_settings = {
    'job_name': job_name,
    'initial_instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
    'deployment_image': image_name,
    'role': role,
}
endpoint_name = sess.endpoint_from_job(**endpoint_settings)

-------------!

In [25]:
# Show the endpoint name so it can be copied into the client that will invoke it.
print('endpoint name: {0}'.format(endpoint_name))

endpoint name: iris-lamba-api-2020-06-07-07-38-57-724


In [29]:
# test_data_array = test_data.drop(['variety'], axis=1).values #load the data into an array
# xgb_predictor_iris.content_type = 'text/csv' # set the data type for an inference
# xgb_predictor_iris.serializer = csv_serializer # set the serializer type
# predictions = xgb_predictor_iris.predict(test_data_array).decode('utf-8') # predict!
# predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
# print(predictions_array.shape)
# cm = pd.crosstab(index=test_data['variety'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
# tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
# print("\n{0:<20}{1:<4.1f}%\n".format("Overall Accuracy Rate: ", p))
# result=xgb_predictor_iris.predict([5.1,3.5,1.4,0.2])


Overall Accuracy Rate: 100.0%

