# [Product Recommendations for Online Retail Store](https://medium.com/@peggy1502/product-recommendations-for-online-retail-store-1d565e1607b7)
### Build and Train a Personalized Recommender Engine with Amazon SageMaker Factorization Machines

**This is `Notebook Part 2`**

[Click here for `Notebook Part 1`](https://github.com/peggy1502/FM-Recommender-Engine/blob/main/rating/fm_v5_part1.ipynb)

In [2]:
import numpy as np 
import pandas as pd 
import time

import boto3
import sagemaker
import sagemaker.amazon.common as smac

from scipy.sparse import csr_matrix, hstack, save_npz, load_npz
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
# Report the versions of the core libraries used by this notebook.
for lib_name, lib_module in (
    ("numpy", np),
    ("pandas", pd),
    ("boto3", boto3),
    ("sagemaker", sagemaker),
):
    print(f"{lib_name} version:", lib_module.__version__)

numpy version: 1.19.5
pandas version: 1.1.5
boto3 version: 1.19.3
sagemaker version: 2.63.2


# Reading npz files

In [4]:
# load array and sparse matrices.

# Sparse feature matrices (presumably written by Notebook Part 1 — confirm).
X_train = load_npz("X_train.npz")
X_test = load_npz("X_test.npz")

# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open. Use it as a context manager so the handle is closed as soon as
# the label array has been read. The labels are stored under the key "arr_0".
with np.load("y_train.npz") as y_train_npz:
    y_train = y_train_npz["arr_0"]
with np.load("y_test.npz") as y_test_npz:
    y_test = y_test_npz["arr_0"]

# Example of sparse matrix for X_test
# pd.DataFrame(X_test.todense())

In [5]:
# Read the saved feature dimension.
# No placeholder value is assigned first: if the file is missing or does not
# contain an integer, this cell fails and feature_dim stays undefined, instead
# of a meaningless 0 being passed on to the training hyperparameters.
with open("feature_dim.txt", "r") as dim_file:
    feature_dim = int(dim_file.read())

feature_dim

156638

# Creating Sparse RecordIO File

https://docs.aws.amazon.com/sagemaker/latest/dg/fact-machines.html

For **training**, the Factorization Machines algorithm currently supports only the `recordIO-protobuf` format with Float32 tensors.

For **inference**, the Factorization Machines algorithm supports the `application/json` and `x-recordio-protobuf` formats.

In [6]:
# Function to create sparse RecordIO file.

def write_sparse_recordio_file(filename, X, y=None):
    """Write a sparse matrix, with optional labels, to a file via SageMaker's
    ``write_spmatrix_to_sparse_tensor``.

    Args:
        filename: Path of the file to create (opened in binary write mode,
            so an existing file is overwritten).
        X: Sparse feature matrix handed to the SageMaker writer.
        y: Optional labels handed to the SageMaker writer; ``None`` by default.
    """
    with open(filename, "wb") as recordio_file:
        smac.write_spmatrix_to_sparse_tensor(recordio_file, X, y)

In [7]:
# Function to upload file to S3.
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_fileobj

def upload_to_s3(filename, bucket, prefix, key):
    """Upload a local file to S3 and return its ``s3://`` URI.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix (folder) inside the bucket.
        key: Object name appended to ``prefix``.

    Returns:
        The URI ``s3://{bucket}/{prefix}/{key}`` of the uploaded object.
    """
    object_key = f"{prefix}/{key}"
    with open(filename, "rb") as payload:  # Read in binary mode
        s3_object = boto3.Session().resource("s3").Bucket(bucket).Object(object_key)
        s3_object.upload_fileobj(payload)
    return f"s3://{bucket}/{object_key}"

In [8]:
# Creating the train and test RecordIO files.

# Serialize each (features, labels) split to its own RecordIO file.
for recordio_name, split_features, split_labels in (
    ("fm_train.recordio", X_train, y_train),
    ("fm_test.recordio", X_test, y_test),
):
    write_sparse_recordio_file(recordio_name, split_features, split_labels)

In [9]:
# Uploading the train and test RecordIO files to S3.

# Resolve the SageMaker session, its region, and the session's default bucket.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# S3 layout: the data files and the model output all live under s3://<bucket>/fm/.
prefix = "fm"
train_key = "fm_train.recordio"
test_key = "fm_test.recordio"
output_location = f"s3://{bucket}/{prefix}/output"

# The local RecordIO files have the same names as their S3 keys.
train_file_location = upload_to_s3(train_key, bucket, prefix, train_key)
test_file_location = upload_to_s3(test_key, bucket, prefix, test_key)

# Summarize where everything ended up.
for summary_label, summary_value in (
    ("SageMaker version", sagemaker.__version__),
    ("Region", region),
    ("Bucket", bucket),
    ("train file location", train_file_location),
    ("test file location", test_file_location),
    ("model output location", output_location),
):
    print(f"{summary_label}:", summary_value)

SageMaker version: 2.63.2
Region: us-east-2
Bucket: sagemaker-us-east-2-802795124455
train file location: s3://sagemaker-us-east-2-802795124455/fm/fm_train.recordio
test file location: s3://sagemaker-us-east-2-802795124455/fm/fm_test.recordio
model output location: s3://sagemaker-us-east-2-802795124455/fm/output


# Training Job & Hyperparameters

In [17]:
# Base name for the training job. It is passed to the estimator as
# base_job_name and is embedded in the checkpoint URI when spot training is on.
job_name = 'fm-job-v5'
job_name

'fm-job-v5'

In [18]:
# https://github.com/aws-samples/amazon-sagemaker-managed-spot-training/blob/main/xgboost_built_in_managed_spot_training_checkpointing/xgboost_built_in_managed_spot_training_checkpointing.ipynb
    
use_spot_instances = False
max_run = 3600  # set to 60 mins

# max_wait and the checkpoint location only apply to managed spot training;
# both stay None for on-demand training.
if use_spot_instances:
    max_wait = 3600  # set to 60 mins (must be equal or greater than max_run)
    checkpoint_s3_uri = f"s3://{bucket}/{prefix}/checkpoints/{job_name}"
else:
    max_wait = None
    checkpoint_s3_uri = None

print(f"Checkpoint uri: {checkpoint_s3_uri}")

Checkpoint uri: None


In [19]:
# ARN of the IAM execution role; it is passed to the Estimator below.
role = sagemaker.get_execution_role()
role

'arn:aws:iam::802795124455:role/service-role/AmazonSageMaker-ExecutionRole-20211026T153321'

In [20]:
# Look up the container image URI of the built-in Factorization Machines
# algorithm for the session's region.
container = sagemaker.image_uris.retrieve("factorization-machines", region=region)
container

'404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:1'

In [21]:
# Configure the training job: a single instance running the Factorization
# Machines container, with model artifacts written under output_location.
estimator = sagemaker.estimator.Estimator(    
    container,                        # training image URI retrieved above
    role,                             # IAM execution role ARN
    instance_count = 1,
    instance_type = "ml.m4.xlarge",   # Or "ml.c5.xlarge",
    output_path = output_location,
    sagemaker_session = sess,
    base_job_name = job_name,         # the actual job name gets a timestamp suffix
    use_spot_instances = use_spot_instances,
    max_run = max_run,
    max_wait = max_wait,              # None unless spot training is enabled
    checkpoint_s3_uri = checkpoint_s3_uri
)

In [22]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/fact-machines-hyperparameters.html

estimator.set_hyperparameters(
    feature_dim = feature_dim,     # value read from feature_dim.txt
    num_factors = 64,              # NOTE(review): no rationale recorded for 64 — see the linked hyperparameter docs
    predictor_type = "regressor",  # presumably because the target is the numeric star_rating — confirm against Part 1
    epochs = 83,                   # NOTE(review): no rationale recorded for 83 — confirm how it was chosen
    mini_batch_size = 1000,        # same value as the algorithm default shown in the training log
)

# Echo the hyperparameters that will be submitted with the training job.
estimator.hyperparameters()

{'feature_dim': 156638,
 'num_factors': 64,
 'predictor_type': 'regressor',
 'epochs': 83,
 'mini_batch_size': 1000}

# Train Model

In [23]:
# Launch the training job with two input channels: 'train' is used for
# fitting, and 'test' is read after the final epoch (see the
# /opt/ml/input/data/test entries at the end of the training log).
estimator.fit({'train':train_file_location, 
               'test':test_file_location})

2021-11-16 06:52:20 Starting - Starting the training job...
2021-11-16 06:52:44 Starting - Launching requested ML instancesProfilerReport-1637045540: InProgress
......
2021-11-16 06:53:44 Starting - Preparing the instances for training......
2021-11-16 06:54:48 Downloading - Downloading input data...
2021-11-16 06:55:04 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[11/16/2021 06:55:34 INFO 140589165307712] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'li


2021-11-16 06:55:45 Training - Training image download completed. Training in progress.[34m[2021-11-16 06:55:39.370] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 4, "duration": 2013, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:55:39 INFO 140589165307712] #quality_metric: host=algo-1, epoch=1, train rmse <loss>=1.3420804018254218[0m
[34m[11/16/2021 06:55:39 INFO 140589165307712] #quality_metric: host=algo-1, epoch=1, train mse <loss>=1.8011798049638856[0m
[34m[11/16/2021 06:55:39 INFO 140589165307712] #quality_metric: host=algo-1, epoch=1, train absolute_loss <loss>=1.0714545524165315[0m
[34m#metrics {"StartTime": 1637045737.3533494, "EndTime": 1637045739.371646, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 2017.0040130615234, "count": 1, "min": 2017.0040130615234, "max": 2017.0040130615234}}}[0m
[34m[11/16/2021 06:55:39 INFO 1

[34m[2021-11-16 06:55:51.507] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 16, "duration": 2081, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:55:51 INFO 140589165307712] #quality_metric: host=algo-1, epoch=7, train rmse <loss>=1.2387667379037761[0m
[34m[11/16/2021 06:55:51 INFO 140589165307712] #quality_metric: host=algo-1, epoch=7, train mse <loss>=1.534543030936763[0m
[34m[11/16/2021 06:55:51 INFO 140589165307712] #quality_metric: host=algo-1, epoch=7, train absolute_loss <loss>=0.974174281282245[0m
[34m#metrics {"StartTime": 1637045749.4230914, "EndTime": 1637045751.5082593, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 2083.857297897339, "count": 1, "min": 2083.857297897339, "max": 2083.857297897339}}}[0m
[34m[11/16/2021 06:55:51 INFO 140589165307712] #progress_metric: host=algo-1, completed 9.63855421686747 % of epochs[0m


[34m[2021-11-16 06:56:03.277] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 28, "duration": 2010, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:56:03 INFO 140589165307712] #quality_metric: host=algo-1, epoch=13, train rmse <loss>=1.140399518231467[0m
[34m[11/16/2021 06:56:03 INFO 140589165307712] #quality_metric: host=algo-1, epoch=13, train mse <loss>=1.3005110611825619[0m
[34m[11/16/2021 06:56:03 INFO 140589165307712] #quality_metric: host=algo-1, epoch=13, train absolute_loss <loss>=0.8675872917895047[0m
[34m#metrics {"StartTime": 1637045761.2587187, "EndTime": 1637045763.2783256, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 2018.157720565796, "count": 1, "min": 2018.157720565796, "max": 2018.157720565796}}}[0m
[34m[11/16/2021 06:56:03 INFO 140589165307712] #progress_metric: host=algo-1, completed 16.867469879518072 % of epochs

[34m[2021-11-16 06:56:19.741] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 44, "duration": 1879, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:56:19 INFO 140589165307712] #quality_metric: host=algo-1, epoch=21, train rmse <loss>=0.9838468436318113[0m
[34m[11/16/2021 06:56:19 INFO 140589165307712] #quality_metric: host=algo-1, epoch=21, train mse <loss>=0.9679546117242777[0m
[34m[11/16/2021 06:56:19 INFO 140589165307712] #quality_metric: host=algo-1, epoch=21, train absolute_loss <loss>=0.7033089582335275[0m
[34m#metrics {"StartTime": 1637045777.8573508, "EndTime": 1637045779.741952, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1883.1729888916016, "count": 1, "min": 1883.1729888916016, "max": 1883.1729888916016}}}[0m
[34m[11/16/2021 06:56:19 INFO 140589165307712] #progress_metric: host=algo-1, completed 26.50602409638554 % of epoc

[34m[2021-11-16 06:56:31.349] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 56, "duration": 1946, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:56:31 INFO 140589165307712] #quality_metric: host=algo-1, epoch=27, train rmse <loss>=0.8570726509098127[0m
[34m[11/16/2021 06:56:31 INFO 140589165307712] #quality_metric: host=algo-1, epoch=27, train mse <loss>=0.7345735289375737[0m
[34m[11/16/2021 06:56:31 INFO 140589165307712] #quality_metric: host=algo-1, epoch=27, train absolute_loss <loss>=0.5807626486724278[0m
[34m#metrics {"StartTime": 1637045789.3998144, "EndTime": 1637045791.350267, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1948.9123821258545, "count": 1, "min": 1948.9123821258545, "max": 1948.9123821258545}}}[0m
[34m[11/16/2021 06:56:31 INFO 140589165307712] #progress_metric: host=algo-1, completed 33.734939759036145 % of epo

[34m[2021-11-16 06:56:42.781] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 68, "duration": 1951, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:56:42 INFO 140589165307712] #quality_metric: host=algo-1, epoch=33, train rmse <loss>=0.7327361523055643[0m
[34m[11/16/2021 06:56:42 INFO 140589165307712] #quality_metric: host=algo-1, epoch=33, train mse <loss>=0.5369022688955631[0m
[34m[11/16/2021 06:56:42 INFO 140589165307712] #quality_metric: host=algo-1, epoch=33, train absolute_loss <loss>=0.47280185541116965[0m
[34m#metrics {"StartTime": 1637045800.8266811, "EndTime": 1637045802.7822502, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1954.1685581207275, "count": 1, "min": 1954.1685581207275, "max": 1954.1685581207275}}}[0m
[34m[11/16/2021 06:56:42 INFO 140589165307712] #progress_metric: host=algo-1, completed 40.963855421686745 % of e

[34m[2021-11-16 06:57:00.208] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 86, "duration": 1922, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:57:00 INFO 140589165307712] #quality_metric: host=algo-1, epoch=42, train rmse <loss>=0.5635630174623717[0m
[34m[11/16/2021 06:57:00 INFO 140589165307712] #quality_metric: host=algo-1, epoch=42, train mse <loss>=0.3176032746512935[0m
[34m[11/16/2021 06:57:00 INFO 140589165307712] #quality_metric: host=algo-1, epoch=42, train absolute_loss <loss>=0.34071515468381486[0m
[34m#metrics {"StartTime": 1637045818.2826781, "EndTime": 1637045820.208908, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1924.8850345611572, "count": 1, "min": 1924.8850345611572, "max": 1924.8850345611572}}}[0m
[34m[11/16/2021 06:57:00 INFO 140589165307712] #progress_metric: host=algo-1, completed 51.80722891566265 % of epo

[34m[2021-11-16 06:57:12.811] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 98, "duration": 1912, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:57:12 INFO 140589165307712] #quality_metric: host=algo-1, epoch=48, train rmse <loss>=0.4665247310004646[0m
[34m[11/16/2021 06:57:12 INFO 140589165307712] #quality_metric: host=algo-1, epoch=48, train mse <loss>=0.21764532463505584[0m
[34m[11/16/2021 06:57:12 INFO 140589165307712] #quality_metric: host=algo-1, epoch=48, train absolute_loss <loss>=0.27166045278873086[0m
[34m#metrics {"StartTime": 1637045830.8956738, "EndTime": 1637045832.812328, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1915.2545928955078, "count": 1, "min": 1915.2545928955078, "max": 1915.2545928955078}}}[0m
[34m[11/16/2021 06:57:12 INFO 140589165307712] #progress_metric: host=algo-1, completed 59.036144578313255 % of e

[34m[2021-11-16 06:57:30.129] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 116, "duration": 1843, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:57:30 INFO 140589165307712] #quality_metric: host=algo-1, epoch=57, train rmse <loss>=0.3475708386310215[0m
[34m[11/16/2021 06:57:30 INFO 140589165307712] #quality_metric: host=algo-1, epoch=57, train mse <loss>=0.12080548786667158[0m
[34m[11/16/2021 06:57:30 INFO 140589165307712] #quality_metric: host=algo-1, epoch=57, train absolute_loss <loss>=0.1922812268239147[0m
[34m#metrics {"StartTime": 1637045848.2827122, "EndTime": 1637045850.1306102, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1846.468210220337, "count": 1, "min": 1846.468210220337, "max": 1846.468210220337}}}[0m
[34m[11/16/2021 06:57:30 INFO 140589165307712] #progress_metric: host=algo-1, completed 69.87951807228916 % of epoc

[34m[2021-11-16 06:57:41.598] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 128, "duration": 1855, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:57:41 INFO 140589165307712] #quality_metric: host=algo-1, epoch=63, train rmse <loss>=0.28592368806313156[0m
[34m[11/16/2021 06:57:41 INFO 140589165307712] #quality_metric: host=algo-1, epoch=63, train mse <loss>=0.08175235539562298[0m
[34m[11/16/2021 06:57:41 INFO 140589165307712] #quality_metric: host=algo-1, epoch=63, train absolute_loss <loss>=0.15055321214783868[0m
[34m#metrics {"StartTime": 1637045859.7393174, "EndTime": 1637045861.5994768, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1858.7276935577393, "count": 1, "min": 1858.7276935577393, "max": 1858.7276935577393}}}[0m
[34m[11/16/2021 06:57:41 INFO 140589165307712] #progress_metric: host=algo-1, completed 77.10843373493977 % of

[34m[2021-11-16 06:57:53.472] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 140, "duration": 1951, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:57:53 INFO 140589165307712] #quality_metric: host=algo-1, epoch=69, train rmse <loss>=0.23750686021914735[0m
[34m[11/16/2021 06:57:53 INFO 140589165307712] #quality_metric: host=algo-1, epoch=69, train mse <loss>=0.0564095086511576[0m
[34m[11/16/2021 06:57:53 INFO 140589165307712] #quality_metric: host=algo-1, epoch=69, train absolute_loss <loss>=0.11886215094800265[0m
[34m#metrics {"StartTime": 1637045871.5174327, "EndTime": 1637045873.472996, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1954.0772438049316, "count": 1, "min": 1954.0772438049316, "max": 1954.0772438049316}}}[0m
[34m[11/16/2021 06:57:53 INFO 140589165307712] #progress_metric: host=algo-1, completed 84.33734939759036 % of e


2021-11-16 06:58:25 Uploading - Uploading generated training model[34m[2021-11-16 06:58:09.773] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 156, "duration": 1965, "num_examples": 106, "num_bytes": 13773364}[0m
[34m[11/16/2021 06:58:09 INFO 140589165307712] #quality_metric: host=algo-1, epoch=77, train rmse <loss>=0.1918738970520568[0m
[34m[11/16/2021 06:58:09 INFO 140589165307712] #quality_metric: host=algo-1, epoch=77, train mse <loss>=0.03681559236994329[0m
[34m[11/16/2021 06:58:09 INFO 140589165307712] #quality_metric: host=algo-1, epoch=77, train absolute_loss <loss>=0.08953972409806162[0m
[34m#metrics {"StartTime": 1637045887.8045583, "EndTime": 1637045889.7741752, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 1968.2698249816895, "count": 1, "min": 1968.2698249816895, "max": 1968.2698249816895}}}[0m
[34m[11/16/2021 06:58:09 INFO 14058916530771

[34m[2021-11-16 06:58:19.834] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/test", "epoch": 0, "duration": 165555, "num_examples": 1, "num_bytes": 129532}[0m
[34m[2021-11-16 06:58:20.740] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/test", "epoch": 1, "duration": 906, "num_examples": 27, "num_bytes": 3449396}[0m
[34m#metrics {"StartTime": 1637045899.8340564, "EndTime": 1637045900.7409708, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training", "Meta": "test_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 26409.0, "count": 1, "min": 26409, "max": 26409}, "Total Batches Seen": {"sum": 27.0, "count": 1, "min": 27, "max": 27}, "Max Records Seen Between Resets": {"sum": 26409.0, "count": 1, "min": 26409, "max": 26409}, "Max Batches Seen Between Resets": {"sum": 27.0, "count": 1, "min": 27, "max": 27}, "Reset Count": {"sum": 1.0, "count": 1, "min": 1, "max": 1}, "Number of Records Since Last Res

In [24]:
# From here on job_name holds the full, timestamp-suffixed training job name.
# It overwrites the base name assigned earlier and is later reused as the
# endpoint name.
job_name = estimator.latest_training_job.job_name

# Fetch the full job description through the low-level SageMaker client.
sagemaker_boto_client = boto3.Session(region_name=region).client("sagemaker")
training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName = job_name)
training_job_info

{'TrainingJobName': 'fm-job-v5-2021-11-16-06-52-20-631',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-2:802795124455:training-job/fm-job-v5-2021-11-16-06-52-20-631',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-us-east-2-802795124455/fm/output/fm-job-v5-2021-11-16-06-52-20-631/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'epochs': '83',
  'feature_dim': '156638',
  'mini_batch_size': '1000',
  'num_factors': '64',
  'predictor_type': 'regressor'},
 'AlgorithmSpecification': {'TrainingImage': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:1',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:rmse:epoch',
    'Regex': '#quality_metric: host=\\S+, epoch=\\S+, train rmse <loss>=(\\S+)'},
   {'Name': 'train:progress',
    'Regex': '#progress_metric: host=\\S+, completed (\\S+) %'},
   {'Name': 'test:binary_f_beta',
    'Regex': '#quality_metric: host=\\S+, test binary_f_

# Deploy Model

In [25]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer
import json

class fm_json_serializer(JSONSerializer):
    """JSON serializer that wraps rows as ``{"instances": [{"features": [...]}, ...]}``."""

    def serialize(self, data):
        """Convert an iterable of rows into the JSON request body.

        Args:
            data: Iterable of rows; each row must provide ``tolist()``
                (for example, the rows of a 2-D NumPy array).

        Returns:
            A JSON string with one ``{"features": [...]}`` entry per row
            under the ``"instances"`` key.
        """
        instances = [{"features": row.tolist()} for row in data]
        return json.dumps({"instances": instances})

In [26]:
# Host the trained model on a real-time endpoint named after the training job.
# Requests are encoded by the custom serializer above and responses are parsed
# from JSON. The endpoint exists until predictor.delete_endpoint() is called
# in the final cell of this notebook.
predictor = estimator.deploy(initial_instance_count = 1,
                             instance_type = "ml.m5.xlarge",
                             endpoint_name = job_name,
                             serializer = fm_json_serializer(),
                             deserializer = JSONDeserializer(),
                            )

----!

# Model Inference

### Top Customers

In [27]:
pd.set_option("display.max_colwidth", 100)  # default is 50
df = pd.read_csv("fm_preprocessed.csv")

# Replace missing titles with empty strings. The result is assigned back
# rather than calling fillna(inplace=True) on the selected column: the
# in-place form is chained assignment, which newer pandas versions warn about
# and which does not update df when copy-on-write is enabled.
df["product_title"] = df["product_title"].fillna("")

# The list of top customers (customers with the most product reviews).
df.groupby("customer_id")["product_id"].count().sort_values(ascending=False).head(30)

customer_id
3949232     8
6660431     7
42568080    6
16430672    6
19967343    6
3610901     6
36000245    6
18167714    5
44619977    5
43577817    5
52082728    5
42799904    5
22606168    5
16528195    5
5626633     5
26016557    5
13299352    5
16856119    5
15532654    5
13096644    5
1522693     5
50623001    5
1597288     5
35178127    5
14789580    5
39958911    5
46994542    5
30882451    5
9303980     5
18222132    4
Name: product_id, dtype: int64

In [28]:
# Select one of the top customers from above.
# This customer has records for both product categories.
sample_customer = 42799904 # other candidates: 42799904, 50623001, 16528195, 35178127, 18167714

# The existing product ratings given by the selected customer.
df[df["customer_id"] == sample_customer]  

Unnamed: 0,customer_id,product_id,product_title,product_category,star_rating
13023,42799904,B0030DEWSK,Intimore Plus B3 Feminine Wash - 7.44 oz,Personal_Care_Appliances,5
20311,42799904,B008320WGO,INFINITE ALOE SKIN CARE - ORIGINAL - (1 - 8oz jar) + (2 - 0.5oz jars),Personal_Care_Appliances,3
51497,42799904,B0047PF5MM,Miele : Dishwasher Conditioner 8.5 oz (06848160 / 09042920),Major Appliances,5
54611,42799904,B00DSICT1Y,uComfy Shiatsu Foot Massager,Personal_Care_Appliances,1
94490,42799904,B000FAR33M,Danby DAR195BL 1.8 cu.ft. All Refrigerator - Black,Major Appliances,5


### Trending Products

Trending products are the products with the highest number of unique customers.

In [29]:
# Count the distinct customers of every product, most popular first.
# "customer_id" is selected before nunique() so distinct counts are not
# computed for every other column, and no defensive copy of df is made because
# groupby does not modify it.
trending = (
    df.groupby(["product_id", "product_title", "product_category"])["customer_id"]
    .nunique()
    .sort_values(ascending=False)
    .reset_index(name="unique_customers")
)
trending

Unnamed: 0,product_id,product_title,product_category,unique_customers
0,B00H9L7VIW,"boostULTIMATE - 60 Capsules - Increase Workout Stamina, Muscle Size, Energy & More 1 Month Supply",Personal_Care_Appliances,1937
1,B0006VJ6TO,Body Back Company’s Body Back Buddy Trigger Point Therapy Self Massage Tool - PARENT,Personal_Care_Appliances,1694
2,B00KJ07SEM,GE MWF SmartWater Compatible Water Filter Cartridge - Refrigerator,Major Appliances,1164
3,B00HES9CMS,"Viva Naturals #1 Best Selling Certified Organic Cacao Powder from Superior Criollo Beans, 1 LB Bag",Personal_Care_Appliances,946
4,B000JLNBW4,Koolatron Coke Personal Mini Fridge,Major Appliances,789
...,...,...,...,...
20899,B003N4YD8U,"Whirlpool Part Number 3608F091-70: TRIM, KICKPLATE",Major Appliances,1
20900,B003N3S922,Whirlpool Part Number 3385931: PANEL-CNTL,Major Appliances,1
20901,B003N3N1XO,Whirlpool Part Number 4449751: THERMOSTAT (TOD),Major Appliances,1
20902,B003N2NYK0,Whirlpool Part Number 9708175: Planetary (Metallic Chrome),Major Appliances,1


### Pool of Trending Products - Consists of top trending products for each category

In [37]:
# Select the 4 highest-ranked products of each category
# (trending is already sorted by unique_customers, descending).
is_personal_care = trending["product_category"] == "Personal_Care_Appliances"
is_major_appliance = trending["product_category"] == "Major Appliances"
tr1 = trending.loc[is_personal_care].iloc[:4]
tr2 = trending.loc[is_major_appliance].iloc[:4]
trending_pool = pd.concat([tr1, tr2])
trending_pool

Unnamed: 0,product_id,product_title,product_category,unique_customers
0,B00H9L7VIW,"boostULTIMATE - 60 Capsules - Increase Workout Stamina, Muscle Size, Energy & More 1 Month Supply",Personal_Care_Appliances,1937
1,B0006VJ6TO,Body Back Company’s Body Back Buddy Trigger Point Therapy Self Massage Tool - PARENT,Personal_Care_Appliances,1694
3,B00HES9CMS,"Viva Naturals #1 Best Selling Certified Organic Cacao Powder from Superior Criollo Beans, 1 LB Bag",Personal_Care_Appliances,946
8,B000SOQ30E,"MedMobile® BATHTUB TRANSFER BENCH / BATH CHAIR WITH BACK, WIDE SEAT, ADJUSTABLE SEAT HEIGHT, SUR...",Personal_Care_Appliances,578
2,B00KJ07SEM,GE MWF SmartWater Compatible Water Filter Cartridge - Refrigerator,Major Appliances,1164
4,B000JLNBW4,Koolatron Coke Personal Mini Fridge,Major Appliances,789
5,B004MX8XO6,SPT Countertop Dishwasher,Major Appliances,604
6,B002MPLYEW,"Danby 120 Can Beverage Center, Stainless Steel DBC120BLS",Major Appliances,603


### Prepare data for inference

Pick the sample customer to be used for inference.

In [38]:
# Attach the selected customer's id to every candidate product so that each
# row is one (customer, product) pair to score.
trending_pool = trending_pool.assign(customer_id=sample_customer)
trending_pool

Unnamed: 0,product_id,product_title,product_category,unique_customers,customer_id
0,B00H9L7VIW,"boostULTIMATE - 60 Capsules - Increase Workout Stamina, Muscle Size, Energy & More 1 Month Supply",Personal_Care_Appliances,1937,42799904
1,B0006VJ6TO,Body Back Company’s Body Back Buddy Trigger Point Therapy Self Massage Tool - PARENT,Personal_Care_Appliances,1694,42799904
3,B00HES9CMS,"Viva Naturals #1 Best Selling Certified Organic Cacao Powder from Superior Criollo Beans, 1 LB Bag",Personal_Care_Appliances,946,42799904
8,B000SOQ30E,"MedMobile® BATHTUB TRANSFER BENCH / BATH CHAIR WITH BACK, WIDE SEAT, ADJUSTABLE SEAT HEIGHT, SUR...",Personal_Care_Appliances,578,42799904
2,B00KJ07SEM,GE MWF SmartWater Compatible Water Filter Cartridge - Refrigerator,Major Appliances,1164,42799904
4,B000JLNBW4,Koolatron Coke Personal Mini Fridge,Major Appliances,789,42799904
5,B004MX8XO6,SPT Countertop Dishwasher,Major Appliances,604,42799904
6,B002MPLYEW,"Danby 120 Can Beverage Center, Stainless Steel DBC120BLS",Major Appliances,603,42799904


In [39]:
# One-hot encode the categorical columns of the candidate rows. The encoder is
# fitted on the full preprocessed dataset.
# NOTE(review): this presumably reproduces the column layout used to build the
# training matrices in Part 1 — confirm.
ohe_cols = ["customer_id", "product_id", "product_category"]
ohe = OneHotEncoder(handle_unknown="ignore").fit(df[ohe_cols])
ohe_features = ohe.transform(trending_pool[ohe_cols])
ohe_features

<8x147967 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [40]:
# TF-IDF features of the product titles; the vocabulary is learned from the
# distinct titles in the dataset.
# min_df=2: ignore terms that appear in less than 2 documents.
vectorizer = TfidfVectorizer(min_df=2).fit(df["product_title"].unique())
tfidf_features = vectorizer.transform(trending_pool["product_title"])
tfidf_features

<8x8671 sparse matrix of type '<class 'numpy.float64'>'
	with 76 stored elements in Compressed Sparse Row format>

In [41]:
# Concatenate the one-hot and TF-IDF blocks column-wise into one float32 CSR matrix.
X_trending = hstack([ohe_features, tfidf_features], format="csr", dtype="float32")

# Fail fast if the encoders fitted in this notebook produce a different number
# of columns than the feature_dim the model was trained with. This check is
# clearer than an error coming back from the endpoint.
assert X_trending.shape[1] == feature_dim, (
    f"inference features have {X_trending.shape[1]} columns, "
    f"but the model was trained with feature_dim={feature_dim}"
)
X_trending

<8x156638 sparse matrix of type '<class 'numpy.float32'>'
	with 100 stored elements in Compressed Sparse Row format>

In [42]:
# Preview the dense form of the candidate features; the same dense array is
# what gets passed to predictor.predict() in the prediction cell.
X_trending.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Get recommendation for the selected customer from the pool of trending products

In [43]:
# Send the dense feature rows to the endpoint. The custom serializer wraps
# them as {"instances": [{"features": [...]}, ...]} and the JSON response is
# deserialized into a dict.
result = predictor.predict(X_trending.toarray())
result

{'predictions': [{'score': 1.9962997436523438},
  {'score': 2.53277587890625},
  {'score': 3.1168642044067383},
  {'score': 2.2905712127685547},
  {'score': 4.798834800720215},
  {'score': 4.610613822937012},
  {'score': 4.791792869567871},
  {'score': 4.648616790771484}]}

In [44]:
# Flatten the endpoint response into a plain list of predicted scores,
# in the same order as the rows of X_trending.
predictions = [entry["score"] for entry in result["predictions"]]
predictions

[1.9962997436523438,
 2.53277587890625,
 3.1168642044067383,
 2.2905712127685547,
 4.798834800720215,
 4.610613822937012,
 4.791792869567871,
 4.648616790771484]

In [45]:
# Indices that sort the scores in ascending order:
# the lowest score comes first, the highest score last.
index_array = np.argsort(predictions)
index_array

array([0, 3, 1, 2, 5, 7, 6, 4])

In [47]:
# Recover the product_id of each candidate row: column 1 of the decoded
# one-hot matrix corresponds to "product_id", the second entry of ohe_cols.
products = ohe.inverse_transform(ohe_features)[:, 1]
products

array(['B00H9L7VIW', 'B0006VJ6TO', 'B00HES9CMS', 'B000SOQ30E',
       'B00KJ07SEM', 'B000JLNBW4', 'B004MX8XO6', 'B002MPLYEW'],
      dtype=object)

## Top 3 recommendations for the customer

In [48]:
# Order the products by ascending score, flip to descending order, and keep
# the first 3 (i.e. index 4 followed by index 6, and 7).
top_3_recommended = products[index_array][::-1][:3]
top_3_recommended

array(['B00KJ07SEM', 'B004MX8XO6', 'B002MPLYEW'], dtype=object)

In [84]:
# Wrap the recommended product ids in a one-column dataframe.
df_3 = pd.DataFrame({"product_id": top_3_recommended})
df_3

Unnamed: 0,product_id
0,B00KJ07SEM
1,B004MX8XO6
2,B002MPLYEW


In [87]:
# Map the dataframe's product_id to get product_title and product_category.
# These are the top-3 recommended products for the selected customer.

columns = ["product_id", "product_title", "product_category"]
# Join on product_id to pull in each product's title and category, then keep
# only the columns meant for display.
df_recommend = df_3.merge(trending_pool, on="product_id")[columns]
df_recommend

Unnamed: 0,product_id,product_title,product_category
0,B00KJ07SEM,GE MWF SmartWater Compatible Water Filter Cartridge - Refrigerator,Major Appliances
1,B004MX8XO6,SPT Countertop Dishwasher,Major Appliances
2,B002MPLYEW,"Danby 120 Can Beverage Center, Stainless Steel DBC120BLS",Major Appliances


In [50]:
# Tear down the inference endpoint now that the recommendations have been retrieved.
predictor.delete_endpoint()