In [73]:
%%time
from sagemaker.amazon.common import write_numpy_to_dense_tensor
import io
import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import copy 

role = get_execution_role()
bucket = 'sagemaker-yelp-data' # Use the name of your s3 bucket here
prefix = 'knn-blog-2019-04-30'

CPU times: user 154 ms, sys: 3.58 ms, total: 158 ms
Wall time: 465 ms


In [2]:
data_key = 'FILE_2.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)
print('training data will be uploaded to: {}'.format(data_location))

test_data_key = 'FILE_1.csv'
res_location = 's3://{}/{}'.format(bucket, test_data_key)
print('testing data will be uploaded to: {}'.format(res_location))

training data will be uploaded to: s3://sagemaker-yelp-data/FILE_2.csv
testing data will be uploaded to: s3://sagemaker-yelp-data/FILE_1.csv


In [3]:
train_data = pd.read_csv(data_location, header=None, engine="python")

In [4]:
test_data = pd.read_csv(res_location, header=None, engine="python")

In [5]:
def stateToNumber(s):
    l = 0
    for x in s:
        l = l + int(hex(ord(x)),16)
    return l

In [6]:
train_data[0]=train_data[0].apply(lambda x: stateToNumber(x))
train_data[1]=train_data[1].apply(lambda x: stateToNumber(x))
test_data[0]=test_data[0].apply(lambda x: stateToNumber(x))
test_data[1]=test_data[1].apply(lambda x: stateToNumber(x))

In [7]:
train_data

Unnamed: 0,0,1,2,3,4
0,1899,703,4,2444,1
1,1997,807,4,695,1
2,1943,598,4,903,1
3,2075,703,4,629,1
4,1843,1776,4,582,1
5,1955,807,4,946,1
6,2124,1211,4,979,1
7,1908,948,4,529,1
8,1758,929,4,754,1
9,1880,703,4,3736,1


In [8]:
trainX_Array = train_data.as_matrix().astype(np.float32)
testX_Array = test_data.as_matrix().astype(np.float32)

In [31]:
# labels are in the first column
train_labels = trainX_Array[:,-1]
train_features = trainX_Array[:,1:4]
test_features = testX_Array[:,1:4]

In [32]:
import io
import sagemaker.amazon.common as smac

print('train_features shape = ', train_features.shape)
print('train_labels shape = ', train_labels.shape)

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_features, train_labels)
buf.seek(0)

train_features shape =  (200, 3)
train_labels shape =  (200,)


0

In [33]:
import boto3
import os
import sagemaker

bucket = sagemaker.Session().default_bucket() # modify to your bucket name
prefix = 'knn-blog-2019-04-17'
key = 'recordio-pb-data'

boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-us-west-2-844243140541/knn-blog-2019-04-17/train/recordio-pb-data


In [34]:
print('test_features shape = ', test_features.shape)
# print('test_labels shape = ', test_labels.shape)

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, test_features)
buf.seek(0)

boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test', key)).upload_fileobj(buf)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded test data location: {}'.format(s3_test_data))

test_features shape =  (1955, 3)
uploaded test data location: s3://sagemaker-us-west-2-844243140541/knn-blog-2019-04-17/test/recordio-pb-data


In [35]:
import matplotlib.pyplot as plt

import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker.amazon.amazon_estimator import get_image_uri


def trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path, s3_test_data=None):
    """
    Create an Estimator from the given hyperparams, fit to training data, 
    and return a deployed predictor
    
    """
    # set up the estimator
    knn = sagemaker.estimator.Estimator(get_image_uri(boto3.Session().region_name, "knn"),
        get_execution_role(),
        train_instance_count=1,
        train_instance_type='ml.m5.large',
        output_path=output_path,
        sagemaker_session=sagemaker.Session())
    knn.set_hyperparameters(**hyperparams)
    
    # train a model. fit_input contains the locations of the train and test data
    fit_input = {'train': s3_train_data}
    if s3_test_data is not None:
        fit_input['test'] = s3_test_data
    knn.fit(fit_input)
    return knn

In [36]:
hyperparams = {
    'feature_dim': 3,
    'k': 10,
    'sample_size': 20000,
    'predictor_type': 'classifier' 
}
output_path = 's3://' + bucket + '/' + prefix + '/default_example/output'
knn_estimator = trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path, s3_test_data=None)

INFO:sagemaker:Creating training-job with name: knn-2019-04-27-21-27-40-481


2019-04-27 21:27:40 Starting - Starting the training job...
2019-04-27 21:27:42 Starting - Launching requested ML instances......
2019-04-27 21:28:46 Starting - Preparing the instances for training......
2019-04-27 21:30:09 Downloading - Downloading input data...
2019-04-27 21:30:36 Training - Training image download completed. Training in progress.
2019-04-27 21:30:36 Uploading - Uploading generated training model
[31mDocker entrypoint called with argument(s): train[0m
[31m[04/27/2019 21:30:32 INFO 140061320103744] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'index_metric': u'L2', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'faiss_index_ivf_nlists': u'auto', u'epochs': u'1', u'index_type': u'faiss.Flat', u'_faiss_index_nprobe': u'5', u'_kvstore': u'dist_async', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000'}[0m
[31m[04/27/2019 21:30:32 INFO 140061320103744] Reading


2019-04-27 21:30:43 Completed - Training job completed
Billable seconds: 34


## endpoint

In [37]:
def predictor_from_estimator(knn_estimator, estimator_name, instance_type, endpoint_name=None): 
    knn_predictor = knn_estimator.deploy(initial_instance_count=1, instance_type=instance_type,
                                        endpoint_name=endpoint_name)
    knn_predictor.content_type = 'text/csv'
    knn_predictor.serializer = csv_serializer
    knn_predictor.deserializer = json_deserializer
    return knn_predictor

In [38]:
import time

instance_type = 'ml.t2.medium'
model_name = 'knn_%s'% instance_type
endpoint_name = 'knn-ml-m4-xlarge-%s'% (str(time.time()).replace('.','-'))
print('setting up the endpoint..')
predictor = predictor_from_estimator(knn_estimator, model_name, instance_type, endpoint_name=endpoint_name)

INFO:sagemaker:Creating model with name: knn-2019-04-27-21-32-16-568


setting up the endpoint..


INFO:sagemaker:Creating endpoint with name knn-ml-m4-xlarge-1556400736-4812799


--------------------------------------------------------------------------------------------------!

In [39]:
batches = np.array_split(test_features, 100)
print('data split into 100 batches, of size %d.' % batches[0].shape[0])

# obtain an np array with the predictions for the entire test set
start_time = time.time()
predictions = []
for batch in batches:
    result = predictor.predict(batch)
    cur_predictions = np.array([result['predictions'][i]['predicted_label'] for i in range(len(result['predictions']))])
    predictions.append(cur_predictions)
predictions = np.concatenate(predictions)
run_time = time.time() - start_time


data split into 100 batches, of size 20.


In [40]:
predictions

array([0., 0., 0., ..., 0., 1., 1.])

### Sklearn to get prediction probability

In [41]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=10)

neigh.fit(train_features, train_labels) 

pred = neigh.predict(test_features)
score = neigh.predict_proba(test_features)

In [63]:
original_test_data = pd.read_csv(res_location, header=None, engine="python")

### Create final data

In [64]:
score_new = []
for x in list(score):
    score_new.append(np.max(x))

In [65]:
original_test_data['score'] = score_new
original_test_data['bestAnswer'] = predictions

In [75]:
final_data_key = 'FILE_3.csv'
bucket = 'sagemaker-yelp-data'
dest_location = 's3://{}/{}'.format(bucket, final_data_key)

In [78]:
# original_test_data.to_csv(dest_location)

# boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
# s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
# print('uploaded training data location: {}'.format(s3_train_data)
      
from io import StringIO

csv_buffer = StringIO()
original_test_data.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, final_data_key).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '88AB93C92FFC8B10',
  'HostId': 'fKzZFSdIRIOI5Hg7bPzqxvR59RgnrH07aePQJDue9fq9PET9WtnX4IPi7zB82Uu5QXja7CGDrhs=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'fKzZFSdIRIOI5Hg7bPzqxvR59RgnrH07aePQJDue9fq9PET9WtnX4IPi7zB82Uu5QXja7CGDrhs=',
   'x-amz-request-id': '88AB93C92FFC8B10',
   'date': 'Sat, 27 Apr 2019 22:08:05 GMT',
   'etag': '"fa78579f414aac98e53de5fed3e32fe5"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"fa78579f414aac98e53de5fed3e32fe5"'}