# Movie recommendation on Amazon SageMaker with Factorization Machines

### Download ml-100k dataset

In [1]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

--2018-08-29 04:07:00--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.34.235
Connecting to files.grouplens.org (files.grouplens.org)|128.101.34.235|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2018-08-29 04:07:01 (21.1 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

In [2]:
%cd ml-100k
!shuf ua.base -o ua.base.shuffled
!head -10 ua.base.shuffled

/home/ec2-user/SageMaker/pilho-lab/ml-100k
343	25	2	876402814
659	212	4	891387227
858	515	4	880932911
387	196	2	886484012
854	762	2	882812905
421	466	4	892241459
436	895	4	887768717
605	462	5	881016176
346	276	1	874950029
40	294	4	889041671


In [3]:
!head -10 ua.test

1	20	4	887431883
1	33	4	878542699
1	61	4	878542420
1	117	3	874965739
1	155	2	878542201
1	160	4	875072547
1	171	5	889751711
1	189	3	888732928
1	202	5	875072442
1	265	4	878542441


In [4]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

### Build training set and test set

In [5]:
nbUsers=943
nbMovies=1682
nbFeatures=nbUsers+nbMovies

nbRatingsTrain=90570
nbRatingsTest=9430

In [6]:
# For each user, build a list of rated movies.
# We'd need this to add random negative samples.
moviesByUser = {}
for userId in range(nbUsers):
    moviesByUser[str(userId)]=[]
 
with open('ua.base.shuffled','r') as f:
    samples=csv.reader(f,delimiter='\t')
    for userId,movieId,rating,timestamp in samples:
        moviesByUser[str(int(userId)-1)].append(int(movieId)-1) 

In [7]:
def loadDataset(filename, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,movieId,rating,timestamp in samples:
            X[line,int(userId)-1] = 1
            X[line,int(nbUsers)+int(movieId)-1] = 1
            if int(rating) >= 4:
                Y.append(1)
            else:
                Y.append(0)
            line=line+1
            
    Y=np.array(Y).astype('float32')
    return X,Y

In [8]:
X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDataset('ua.test',nbRatingsTest,nbFeatures)

In [30]:
print(X_train[1000])

  (0, 591)	1.0
  (0, 1403)	1.0


In [9]:
print(X_train.shape)
print(Y_train.shape)
assert X_train.shape == (nbRatingsTrain, nbFeatures)
assert Y_train.shape == (nbRatingsTrain, )
zero_labels = np.count_nonzero(Y_train)
print("Training labels: %d zeros, %d ones" % (zero_labels, nbRatingsTrain-zero_labels))

print(X_test.shape)
print(Y_test.shape)
assert X_test.shape  == (nbRatingsTest, nbFeatures)
assert Y_test.shape  == (nbRatingsTest, )
zero_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, nbRatingsTest-zero_labels))

(90570, 2625)
(90570,)
Training labels: 49906 zeros, 40664 ones
(9430, 2625)
(9430,)
Test labels: 5469 zeros, 3961 ones


### Convert to protobuf and save to S3

In [10]:
bucket = 'pilho-sagemaker-ai-workshop'
prefix = 'sagemaker/fm-movielens'

train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [11]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://pilho-sagemaker-ai-workshop/sagemaker/fm-movielens/train3/train.protobuf
s3://pilho-sagemaker-ai-workshop/sagemaker/fm-movielens/test3/test.protobuf
Output: s3://pilho-sagemaker-ai-workshop/sagemaker/fm-movielens/output


### Run training job

In [12]:
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}

In [13]:
fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                   get_execution_role(), 
                                   train_instance_count=1, 
                                   train_instance_type='ml.c4.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

fm.set_hyperparameters(feature_dim=nbFeatures,
                      predictor_type='binary_classifier',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=100)

fm.fit({'train': train_data, 'test': test_data})

INFO:sagemaker:Creating training-job with name: factorization-machines-2018-08-29-04-07-20-066


.....................
[31mDocker entrypoint called with argument(s): train[0m
[31m[08/29/2018 04:10:43 INFO 139937213605696] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', u'bias_wd': u'0.01', u'use_linear': u'true', u'bias_lr': u'0.1', u'mini_batch_size': u'1000', u'_use_full_symbolic': u'true', u'batch_metrics_publish_interval': u'500', u'bias_init_sigma': u'0.01', u'_num_gpus': u'auto', u'_data_format': u'record', u'factors_wd': u'0.00001', u'linear_wd': u'0.001', u'_kvstore': u'auto', u'_learning_rate': u'1.0', u'_optimizer': u'adam'}[0m
[31m[08/29/2018 04:10:43 

[31m[08/29/2018 04:10:50 INFO 139937213605696] #quality_metric: host=algo-1, epoch=8, train binary_classification_accuracy <score>=0.70167032967[0m
[31m[08/29/2018 04:10:50 INFO 139937213605696] #quality_metric: host=algo-1, epoch=8, train binary_classification_cross_entropy <loss>=0.620262380076[0m
[31m[08/29/2018 04:10:50 INFO 139937213605696] #quality_metric: host=algo-1, epoch=8, train binary_f_1.000 <score>=0.762426491179[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 730.7350635528564, "sum": 730.7350635528564, "min": 730.7350635528564}}, "EndTime": 1535515850.122713, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1535515849.391502}
[0m
[31m[08/29/2018 04:10:50 INFO 139937213605696] #progress_metric: host=algo-1, completed 9 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 91, "sum": 91.0, "min": 91}, "Number of Batches Since Last Reset": {"co

[31m[08/29/2018 04:10:59 INFO 139937213605696] #quality_metric: host=algo-1, epoch=21, train binary_classification_accuracy <score>=0.733659340659[0m
[31m[08/29/2018 04:10:59 INFO 139937213605696] #quality_metric: host=algo-1, epoch=21, train binary_classification_cross_entropy <loss>=0.572988758129[0m
[31m[08/29/2018 04:10:59 INFO 139937213605696] #quality_metric: host=algo-1, epoch=21, train binary_f_1.000 <score>=0.771906379695[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 757.8160762786865, "sum": 757.8160762786865, "min": 757.8160762786865}}, "EndTime": 1535515859.90967, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1535515859.151411}
[0m
[31m[08/29/2018 04:10:59 INFO 139937213605696] #progress_metric: host=algo-1, completed 22 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 91, "sum": 91.0, "min": 91}, "Number of Batches Since Last Reset": 

[31m[08/29/2018 04:11:10 INFO 139937213605696] #quality_metric: host=algo-1, epoch=35, train binary_classification_accuracy <score>=0.740021978022[0m
[31m[08/29/2018 04:11:10 INFO 139937213605696] #quality_metric: host=algo-1, epoch=35, train binary_classification_cross_entropy <loss>=0.550261639606[0m
[31m[08/29/2018 04:11:10 INFO 139937213605696] #quality_metric: host=algo-1, epoch=35, train binary_f_1.000 <score>=0.773447225787[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 737.7021312713623, "sum": 737.7021312713623, "min": 737.7021312713623}}, "EndTime": 1535515870.157512, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1535515869.419337}
[0m
[31m[08/29/2018 04:11:10 INFO 139937213605696] #progress_metric: host=algo-1, completed 36 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 91, "sum": 91.0, "min": 91}, "Number of Batches Since Last Reset":

[31m[08/29/2018 04:11:19 INFO 139937213605696] #quality_metric: host=algo-1, epoch=48, train binary_classification_accuracy <score>=0.742824175824[0m
[31m[08/29/2018 04:11:19 INFO 139937213605696] #quality_metric: host=algo-1, epoch=48, train binary_classification_cross_entropy <loss>=0.537705274645[0m
[31m[08/29/2018 04:11:19 INFO 139937213605696] #quality_metric: host=algo-1, epoch=48, train binary_f_1.000 <score>=0.774465870653[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 746.4599609375, "sum": 746.4599609375, "min": 746.4599609375}}, "EndTime": 1535515879.79548, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1535515879.048524}
[0m
[31m[08/29/2018 04:11:19 INFO 139937213605696] #progress_metric: host=algo-1, completed 49 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 91, "sum": 91.0, "min": 91}, "Number of Batches Since Last Reset": {"count":

[31m[08/29/2018 04:11:29 INFO 139937213605696] #quality_metric: host=algo-1, epoch=61, train binary_classification_accuracy <score>=0.744538461538[0m
[31m[08/29/2018 04:11:29 INFO 139937213605696] #quality_metric: host=algo-1, epoch=61, train binary_classification_cross_entropy <loss>=0.529024258456[0m
[31m[08/29/2018 04:11:29 INFO 139937213605696] #quality_metric: host=algo-1, epoch=61, train binary_f_1.000 <score>=0.775358747645[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 731.8141460418701, "sum": 731.8141460418701, "min": 731.8141460418701}}, "EndTime": 1535515889.646829, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1535515888.913151}
[0m
[31m[08/29/2018 04:11:29 INFO 139937213605696] #progress_metric: host=algo-1, completed 62 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 91, "sum": 91.0, "min": 91}, "Number of Batches Since Last Reset":

[31m[08/29/2018 04:11:39 INFO 139937213605696] #quality_metric: host=algo-1, epoch=75, train binary_classification_accuracy <score>=0.746835164835[0m
[31m[08/29/2018 04:11:39 INFO 139937213605696] #quality_metric: host=algo-1, epoch=75, train binary_classification_cross_entropy <loss>=0.522135612404[0m
[31m[08/29/2018 04:11:39 INFO 139937213605696] #quality_metric: host=algo-1, epoch=75, train binary_f_1.000 <score>=0.777035789637[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 709.3179225921631, "sum": 709.3179225921631, "min": 709.3179225921631}}, "EndTime": 1535515899.953237, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1535515899.24344}
[0m
[31m[08/29/2018 04:11:39 INFO 139937213605696] #progress_metric: host=algo-1, completed 76 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 91, "sum": 91.0, "min": 91}, "Number of Batches Since Last Reset": 

[31m[08/29/2018 04:11:50 INFO 139937213605696] #quality_metric: host=algo-1, epoch=89, train binary_classification_accuracy <score>=0.748967032967[0m
[31m[08/29/2018 04:11:50 INFO 139937213605696] #quality_metric: host=algo-1, epoch=89, train binary_classification_cross_entropy <loss>=0.516686616332[0m
[31m[08/29/2018 04:11:50 INFO 139937213605696] #quality_metric: host=algo-1, epoch=89, train binary_f_1.000 <score>=0.778832003718[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 734.490156173706, "sum": 734.490156173706, "min": 734.490156173706}}, "EndTime": 1535515910.163265, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1535515909.428314}
[0m
[31m[08/29/2018 04:11:50 INFO 139937213605696] #progress_metric: host=algo-1, completed 90 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 91, "sum": 91.0, "min": 91}, "Number of Batches Since Last Reset": {"


Billable seconds: 172


### Deploy model

In [14]:
fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)

INFO:sagemaker:Creating model with name: factorization-machines-2018-08-29-04-12-33-551
INFO:sagemaker:Creating endpoint with name factorization-machines-2018-08-29-04-07-20-066


--------------------------------------------------------------------------!

In [15]:
def fm_serializer(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    #print js
    return json.dumps(js)

fm_predictor.content_type = 'application/json'
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer

### Run predictions

In [21]:
print(X_test[1000:1010])

  (0, 100)	1.0
  (0, 1164)	1.0
  (1, 100)	1.0
  (1, 1194)	1.0
  (2, 100)	1.0
  (2, 1223)	1.0
  (3, 100)	1.0
  (3, 1224)	1.0
  (4, 100)	1.0
  (4, 1246)	1.0
  (5, 100)	1.0
  (5, 1311)	1.0
  (6, 100)	1.0
  (6, 1347)	1.0
  (7, 100)	1.0
  (7, 1413)	1.0
  (8, 100)	1.0
  (8, 1538)	1.0
  (9, 100)	1.0
  (9, 1771)	1.0


In [24]:
result = fm_predictor.predict(X_test[2000:2010].toarray())
print(result)

{u'predictions': [{u'score': 0.2745290696620941, u'predicted_label': 0.0}, {u'score': 0.4928366541862488, u'predicted_label': 0.0}, {u'score': 0.18388329446315765, u'predicted_label': 0.0}, {u'score': 0.523980438709259, u'predicted_label': 1.0}, {u'score': 0.2758810222148895, u'predicted_label': 0.0}, {u'score': 0.32127004861831665, u'predicted_label': 0.0}, {u'score': 0.24220161139965057, u'predicted_label': 0.0}, {u'score': 0.4599955677986145, u'predicted_label': 0.0}, {u'score': 0.1501358151435852, u'predicted_label': 0.0}, {u'score': 0.2602457106113434, u'predicted_label': 0.0}]}


In [25]:
print (Y_test[2000:2010])

[0. 1. 1. 0. 0. 0. 1. 0. 0. 1.]


In [31]:
print(X_test[2000])

  (0, 200)	1.0
  (0, 1088)	1.0


In [23]:
result = fm_predictor.predict(X_test[2000:2001].toarray())
print(result)
print(Y_test[2000:2001])

{u'predictions': [{u'score': 0.2745290696620941, u'predicted_label': 0.0}]}
[0.]
