In [65]:
import boto3
import io
import os
import sagemaker
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker.amazon.common as smac

# Exploratory Data Analysis

In [68]:
# Load the ratings file, one-hot encode (user_id, movie_id), binarise the rating
# and serialise a train/test split into in-memory RecordIO-protobuf buffers.

# load movie dataset (whitespace-separated, no header row)
df = pd.read_csv(
    'ml-100k/u.data',
    sep=r'\s+',  # equivalent to delim_whitespace=True, which newer pandas deprecates
    header=None,
    names=['user_id', 'movie_id', 'rating', 'time_stamp'],
)
print(df.head())

# one-hot encode user id and movie id, needs to be in float32 for sagemaker
X = df[['user_id', 'movie_id']]

enc = OneHotEncoder(categories='auto', dtype='float32')
# NOTE: .toarray() densifies the encoder's sparse output; the dense form is what
# write_numpy_to_dense_tensor below is given.
one_hot_labels = enc.fit_transform(X).toarray()

# create target labels, needs to be in float32 for sagemaker
# ratings >= 4 become the positive class (1), everything else 0
df['rating'] = (df['rating'] >= 4).astype('int64')
# Series.as_matrix() was removed in pandas 1.0; .values works on old and new versions
target = df['rating'].values.astype('float32')

# create new data frame: one-hot features with the binary rating as the last column
# (only consumed by the optional CSV export at the end of this cell)
df_temp = pd.DataFrame(one_hot_labels)
df_temp.insert(one_hot_labels.shape[1], 'rating', df['rating'])

# fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    one_hot_labels, target, test_size=0.33, random_state=42
)

# convert train data to recordio protobuf
train_data_buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(train_data_buffer, X_train, y_train)
train_data_buffer.seek(0)  # rewind so the buffer can be read from the start

# convert test data to recordio protobuf
test_data_buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(test_data_buffer, X_test, y_test)
test_data_buffer.seek(0)  # rewind so the buffer can be read from the start

# df_temp.to_csv('train.csv', header=False, index=False)

   user_id  movie_id  rating  time_stamp
0      196       242       3   881250949
1      186       302       3   891717742
2       22       377       1   878887116
3      244        51       2   880606923
4      166       346       1   886397596


  from ipykernel import kernelapp as app


0

In [73]:
# df = pd.read_csv('ml-100k/u.user', header=None, delim_whitespace=True)
# df.head()

In [74]:
# S3 location settings. Later cells combine them as
#   s3://<bucket>/<prefix>/train/<key>  and  s3://<bucket>/<prefix>/test/<key>
# NOTE: the spelling of the prefix is part of the stored object path; keep it as-is.
key = 'recordio-pb-data'
prefix = 'movie-dateset'
bucket = 'sagemaker-us-east-1-756448110530'

In [75]:
# SageMaker session: handed to the Estimator as `sagemaker_session` and used
# at the end of the notebook to delete the endpoint.
session = sagemaker.Session()

In [76]:
# upload train and test data to S3
# S3 object keys always use '/' as the separator, so build them with str.format
# rather than os.path.join (which would produce '\\' on Windows).
train_key = '{}/train/{}'.format(prefix, key)
test_key = '{}/test/{}'.format(prefix, key)

s3_bucket = boto3.resource('s3').Bucket(bucket)

# upload_fileobj reads from the buffer's current position; rewind first so that
# re-running this cell does not upload an empty object.
train_data_buffer.seek(0)
s3_bucket.Object(train_key).upload_fileobj(train_data_buffer)
test_data_buffer.seek(0)
s3_bucket.Object(test_key).upload_fileobj(test_data_buffer)

# create train and test channel for training
# (URIs are derived from the same keys that were just uploaded)
train_path = 's3://{}/{}'.format(bucket, train_key)
test_path = 's3://{}/{}'.format(bucket, test_key)
s3_input_train = sagemaker.s3_input(s3_data=train_path, content_type='application/x-recordio-protobuf')
s3_input_test = sagemaker.s3_input(s3_data=test_path, content_type='application/x-recordio-protobuf')

In [77]:
# Image URI for the "factorization-machines" algorithm in the region of the
# default boto3 session; passed to the Estimator as its first argument.
container = get_image_uri(boto3.Session().region_name, "factorization-machines")

In [78]:

# Generic Estimator around the algorithm image in `container`.
# Configuration: one ml.m5.large instance, spot instances enabled with
# max run / max wait both 3600, input read in 'Pipe' mode, and
# output_path set to s3://<bucket>/<prefix>/output.
estimator = sagemaker.estimator.Estimator(
    container,
    'AmazonSageMaker-ExecutionRole-20190815T111389',  # role, passed positionally
    sagemaker_session=session,
    input_mode='Pipe',
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    train_use_spot_instances=True,
    train_max_run=3600,
    train_max_wait=3600,
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

In [81]:
# Hyperparameters for the factorization-machines training job.
# feature_dim must equal the width of the one-hot encoded feature matrix, so it
# is read from the training data instead of being hard-coded (previously 2625).
estimator.set_hyperparameters(feature_dim=X_train.shape[1],
                             predictor_type='binary_classifier',
                             mini_batch_size=100,
                             num_factors=64,
                             epochs=10,
                             linear_lr=0.001)

In [82]:
# Start training, feeding the two S3 inputs as the 'train' and 'test' channels.
estimator.fit({'train': s3_input_train, 'test': s3_input_test})

2020-04-10 19:07:29 Starting - Starting the training job...
2020-04-10 19:07:31 Starting - Launching requested ML instances......
2020-04-10 19:08:40 Starting - Preparing the instances for training...
2020-04-10 19:09:26 Downloading - Downloading input data...
2020-04-10 19:10:17 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[04/10/2020 19:10:19 INFO 140314233935680] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_

[34m[04/10/2020 19:10:55 INFO 140314233935680] Iter[1] Batch [500]#011Speed: 3356.76 samples/sec[0m
[34m[04/10/2020 19:10:55 INFO 140314233935680] #quality_metric: host=algo-1, epoch=1, batch=500 train binary_classification_accuracy <score>=0.663473053892[0m
[34m[04/10/2020 19:10:55 INFO 140314233935680] #quality_metric: host=algo-1, epoch=1, batch=500 train binary_classification_cross_entropy <loss>=6.17977654189[0m
[34m[04/10/2020 19:10:55 INFO 140314233935680] #quality_metric: host=algo-1, epoch=1, batch=500 train binary_f_1.000 <score>=0.695414965495[0m
[34m[2020-04-10 19:11:00.464] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 5, "duration": 19551, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:11:00 INFO 140314233935680] Epoch[1] Train-binary_classification_accuracy=0.680015[0m
[34m[04/10/2020 19:11:00 INFO 140314233935680] Epoch[1] Train-binary_classification_cross_entropy=5.875615[0m
[34m[04/10/2020 19:11

[34m[04/10/2020 19:11:59 INFO 140314233935680] Iter[4] Batch [500]#011Speed: 2915.65 samples/sec[0m
[34m[04/10/2020 19:11:59 INFO 140314233935680] #quality_metric: host=algo-1, epoch=4, batch=500 train binary_classification_accuracy <score>=0.849880239521[0m
[34m[04/10/2020 19:11:59 INFO 140314233935680] #quality_metric: host=algo-1, epoch=4, batch=500 train binary_classification_cross_entropy <loss>=2.76073460349[0m
[34m[04/10/2020 19:11:59 INFO 140314233935680] #quality_metric: host=algo-1, epoch=4, batch=500 train binary_f_1.000 <score>=0.864469392536[0m
[34m[2020-04-10 19:12:05.046] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 14, "duration": 22491, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:12:05 INFO 140314233935680] Epoch[4] Train-binary_classification_accuracy=0.854343[0m
[34m[04/10/2020 19:12:05 INFO 140314233935680] Epoch[4] Train-binary_classification_cross_entropy=2.678921[0m
[34m[04/10/2020 19:1

[34m[04/10/2020 19:13:09 INFO 140314233935680] Iter[7] Batch [500]#011Speed: 2743.61 samples/sec[0m
[34m[04/10/2020 19:13:09 INFO 140314233935680] #quality_metric: host=algo-1, epoch=7, batch=500 train binary_classification_accuracy <score>=0.913313373253[0m
[34m[04/10/2020 19:13:09 INFO 140314233935680] #quality_metric: host=algo-1, epoch=7, batch=500 train binary_classification_cross_entropy <loss>=1.59284621431[0m
[34m[04/10/2020 19:13:09 INFO 140314233935680] #quality_metric: host=algo-1, epoch=7, batch=500 train binary_f_1.000 <score>=0.9218870843[0m
[34m[2020-04-10 19:13:14.738] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 23, "duration": 23927, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:13:14 INFO 140314233935680] Epoch[7] Train-binary_classification_accuracy=0.915910[0m
[34m[04/10/2020 19:13:14 INFO 140314233935680] Epoch[7] Train-binary_classification_cross_entropy=1.545405[0m
[34m[04/10/2020 19:13:

[34m[04/10/2020 19:14:22 INFO 140314233935680] Iter[10] Batch [500]#011Speed: 2654.16 samples/sec[0m
[34m[04/10/2020 19:14:22 INFO 140314233935680] #quality_metric: host=algo-1, epoch=10, batch=500 train binary_classification_accuracy <score>=0.944231536926[0m
[34m[04/10/2020 19:14:22 INFO 140314233935680] #quality_metric: host=algo-1, epoch=10, batch=500 train binary_classification_cross_entropy <loss>=1.0251873378[0m
[34m[04/10/2020 19:14:22 INFO 140314233935680] #quality_metric: host=algo-1, epoch=10, batch=500 train binary_f_1.000 <score>=0.949699348288[0m
[34m[2020-04-10 19:14:28.338] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 32, "duration": 24685, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:14:28 INFO 140314233935680] Epoch[10] Train-binary_classification_accuracy=0.944955[0m
[34m[04/10/2020 19:14:28 INFO 140314233935680] Epoch[10] Train-binary_classification_cross_entropy=1.012048[0m
[34m[04/10/2020

[34m[04/10/2020 19:15:38 INFO 140314233935680] Iter[13] Batch [500]#011Speed: 2500.69 samples/sec[0m
[34m[04/10/2020 19:15:38 INFO 140314233935680] #quality_metric: host=algo-1, epoch=13, batch=500 train binary_classification_accuracy <score>=0.958842315369[0m
[34m[04/10/2020 19:15:38 INFO 140314233935680] #quality_metric: host=algo-1, epoch=13, batch=500 train binary_classification_cross_entropy <loss>=0.757727640118[0m
[34m[04/10/2020 19:15:38 INFO 140314233935680] #quality_metric: host=algo-1, epoch=13, batch=500 train binary_f_1.000 <score>=0.962888304957[0m
[34m[2020-04-10 19:15:45.068] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 41, "duration": 26121, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:15:45 INFO 140314233935680] Epoch[13] Train-binary_classification_accuracy=0.960015[0m
[34m[04/10/2020 19:15:45 INFO 140314233935680] Epoch[13] Train-binary_classification_cross_entropy=0.736045[0m
[34m[04/10/20

[34m[04/10/2020 19:16:58 INFO 140314233935680] Iter[16] Batch [500]#011Speed: 2457.67 samples/sec[0m
[34m[04/10/2020 19:16:58 INFO 140314233935680] #quality_metric: host=algo-1, epoch=16, batch=500 train binary_classification_accuracy <score>=0.971796407186[0m
[34m[04/10/2020 19:16:58 INFO 140314233935680] #quality_metric: host=algo-1, epoch=16, batch=500 train binary_classification_cross_entropy <loss>=0.519076260968[0m
[34m[04/10/2020 19:16:58 INFO 140314233935680] #quality_metric: host=algo-1, epoch=16, batch=500 train binary_f_1.000 <score>=0.974599579356[0m
[34m[2020-04-10 19:17:04.813] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 50, "duration": 26784, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:17:04 INFO 140314233935680] Epoch[16] Train-binary_classification_accuracy=0.971851[0m
[34m[04/10/2020 19:17:04 INFO 140314233935680] Epoch[16] Train-binary_classification_cross_entropy=0.518033[0m
[34m[04/10/20

[34m[04/10/2020 19:18:20 INFO 140314233935680] Iter[19] Batch [500]#011Speed: 2386.56 samples/sec[0m
[34m[04/10/2020 19:18:20 INFO 140314233935680] #quality_metric: host=algo-1, epoch=19, batch=500 train binary_classification_accuracy <score>=0.976067864271[0m
[34m[04/10/2020 19:18:20 INFO 140314233935680] #quality_metric: host=algo-1, epoch=19, batch=500 train binary_classification_cross_entropy <loss>=0.440450917927[0m
[34m[04/10/2020 19:18:20 INFO 140314233935680] #quality_metric: host=algo-1, epoch=19, batch=500 train binary_f_1.000 <score>=0.978430984547[0m
[34m[2020-04-10 19:18:26.411] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 59, "duration": 27386, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:18:26 INFO 140314233935680] Epoch[19] Train-binary_classification_accuracy=0.976806[0m
[34m[04/10/2020 19:18:26 INFO 140314233935680] Epoch[19] Train-binary_classification_cross_entropy=0.426954[0m
[34m[04/10/20

[34m[04/10/2020 19:19:42 INFO 140314233935680] Iter[22] Batch [500]#011Speed: 2400.25 samples/sec[0m
[34m[04/10/2020 19:19:42 INFO 140314233935680] #quality_metric: host=algo-1, epoch=22, batch=500 train binary_classification_accuracy <score>=0.98119760479[0m
[34m[04/10/2020 19:19:42 INFO 140314233935680] #quality_metric: host=algo-1, epoch=22, batch=500 train binary_classification_cross_entropy <loss>=0.346352925253[0m
[34m[04/10/2020 19:19:42 INFO 140314233935680] #quality_metric: host=algo-1, epoch=22, batch=500 train binary_f_1.000 <score>=0.983075209314[0m
[34m[2020-04-10 19:19:49.455] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 68, "duration": 27375, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:19:49 INFO 140314233935680] Epoch[22] Train-binary_classification_accuracy=0.981657[0m
[34m[04/10/2020 19:19:49 INFO 140314233935680] Epoch[22] Train-binary_classification_cross_entropy=0.337896[0m
[34m[04/10/202

[34m[04/10/2020 19:21:06 INFO 140314233935680] Iter[25] Batch [500]#011Speed: 2334.69 samples/sec[0m
[34m[04/10/2020 19:21:06 INFO 140314233935680] #quality_metric: host=algo-1, epoch=25, batch=500 train binary_classification_accuracy <score>=0.984291417166[0m
[34m[04/10/2020 19:21:06 INFO 140314233935680] #quality_metric: host=algo-1, epoch=25, batch=500 train binary_classification_cross_entropy <loss>=0.288811348936[0m
[34m[04/10/2020 19:21:06 INFO 140314233935680] #quality_metric: host=algo-1, epoch=25, batch=500 train binary_f_1.000 <score>=0.985836407811[0m
[34m[2020-04-10 19:21:13.652] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 77, "duration": 28167, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:21:13 INFO 140314233935680] Epoch[25] Train-binary_classification_accuracy=0.984149[0m
[34m[04/10/2020 19:21:13 INFO 140314233935680] Epoch[25] Train-binary_classification_cross_entropy=0.291344[0m
[34m[04/10/20

[34m[04/10/2020 19:22:32 INFO 140314233935680] Iter[28] Batch [500]#011Speed: 2302.78 samples/sec[0m
[34m[04/10/2020 19:22:32 INFO 140314233935680] #quality_metric: host=algo-1, epoch=28, batch=500 train binary_classification_accuracy <score>=0.986067864271[0m
[34m[04/10/2020 19:22:32 INFO 140314233935680] #quality_metric: host=algo-1, epoch=28, batch=500 train binary_classification_cross_entropy <loss>=0.256022356107[0m
[34m[04/10/2020 19:22:32 INFO 140314233935680] #quality_metric: host=algo-1, epoch=28, batch=500 train binary_f_1.000 <score>=0.987443333093[0m
[34m[2020-04-10 19:22:39.284] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 86, "duration": 28563, "num_examples": 670, "num_bytes": 706716000}[0m
[34m[04/10/2020 19:22:39 INFO 140314233935680] Epoch[28] Train-binary_classification_accuracy=0.986104[0m
[34m[04/10/2020 19:22:39 INFO 140314233935680] Epoch[28] Train-binary_classification_cross_entropy=0.255504[0m
[34m[04/10/20


2020-04-10 19:23:21 Completed - Training job completed
Training seconds: 835
Billable seconds: 365
Managed Spot Training savings: 56.3%


In [34]:
# Deploy the trained estimator to an endpoint backed by a single ml.c4.large
# instance; the returned predictor is used for the test request below.
fm_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.c4.large',
)

----------------!

In [48]:
import json
from sagemaker.predictor import json_deserializer

def fm_serializer(data):
    """Serialise feature vectors into the JSON request body sent to the endpoint.

    Args:
        data: array-like exposing ``tolist()``. A 2-D input (``ndim == 2``) is
            treated as a batch and yields one instance per row; any other input
            is sent as a single instance, as before.

    Returns:
        str: JSON of the form ``{"instances": [{"features": [...]}, ...]}``.
    """
    values = data.tolist()
    # Only a 2-D array is split into rows; 1-D input keeps the original
    # single-instance behaviour.
    rows = values if getattr(data, 'ndim', 1) == 2 else [values]
    return json.dumps({'instances': [{'features': row} for row in rows]})

# Exchange JSON with the endpoint in both directions: requests are encoded by
# fm_serializer, responses decoded by json_deserializer.
fm_predictor.deserializer = json_deserializer
fm_predictor.serializer = fm_serializer
fm_predictor.content_type = 'application/json'

In [50]:
# Send one held-out example to the endpoint and show it next to its label.
sample_idx = 1000
prediction = X_test[sample_idx]  # NB: despite the name, this is the input feature vector
print(prediction)
result = fm_predictor.predict(prediction)

print(y_test[sample_idx])
print(result)

[0. 0. 0. ... 0. 0. 0.]
4.0
{'predictions': [{'score': -1285.267578125}]}


In [52]:
# Delete the endpoint that estimator.deploy(...) created for fm_predictor.
session.delete_endpoint(fm_predictor.endpoint)