# Factorization Machines on MovieLens

### Download ml-100k dataset

In [18]:
%%sh
# Download the MovieLens 100k archive only when it is not already present, so
# re-running this cell does not leave a stale ml-100k.zip next to a fresh
# ml-100k.zip.1. -nv keeps errors visible without the per-chunk progress log.
[ -f ml-100k.zip ] || wget -nv https://files.grouplens.org/datasets/movielens/ml-100k.zip
# -o overwrites any previous extraction without prompting.
unzip -o ml-100k.zip

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


--2021-06-19 05:39:24--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’

     0K .......... .......... .......... .......... ..........  1%  540K 9s
    50K .......... .......... .......... .......... ..........  2% 1014K 7s
   100K .......... .......... .......... .......... ..........  3% 98.3M 4s
   150K .......... .......... .......... .......... ..........  4%  150M 3s
   200K .......... .......... .......... .......... ..........  5% 1023K 3s
   250K .......... .......... .......... .......... ..........  6% 90.5M 3s
   300K .......... .......... .......... .......... ..........  7%  125M 2s
   350K .......... .......... .......... .......... ..........  8%  186M 2s
   400K .......... .......... ......

In [19]:
%cd ml-100k
!shuf ua.base -o ua.base.shuffled
!head -5 ua.base.shuffled

/root/Learn-Amazon-SageMaker/sdkv2/ch4/ml-100k
894	332	3	879896233
413	471	4	879969642
276	288	4	874786392
450	336	3	882370464
151	1006	1	879524974


### Build training set and test set

In [20]:
# Sizing constants for the dataset.
num_users, num_movies = 943, 1682
# Total number of feature columns: the user count plus the movie count.
num_features = num_users + num_movies

# Number of ratings in the training split and in the test split.
num_ratings_train, num_ratings_test = 90570, 9430

In [21]:
# Load the dataset into a sparse matrix using a custom function
import csv
import numpy as np
from scipy.sparse import lil_matrix

def loadDataset(filename, lines, columns, user_count=None):
    """Load a tab-separated ratings file into a sparse feature matrix and labels.

    Each non-empty input row is ``userId<TAB>movieId<TAB>rating<TAB>timestamp``.
    Row ``i`` of the matrix gets a 1 in column ``userId - 1`` and a 1 in column
    ``user_count + movieId - 1``; the timestamp is ignored.

    Args:
        filename: Path of the tab-separated ratings file.
        lines: Number of rows to allocate in the feature matrix.
        columns: Number of columns to allocate in the feature matrix.
        user_count: Number of user columns that precede the movie columns.
            When omitted, the notebook-level ``num_users`` is used, which is
            what the original implementation always did.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a float32 ``lil_matrix`` of shape
        ``(lines, columns)`` and ``Y`` is a float32 numpy array holding one
        rating per row read from the file.
    """
    if user_count is None:
        # Backward-compatible fallback to the notebook global.
        user_count = num_users
    movie_offset = int(user_count)

    # Allocate directly as float32 instead of building float64 and converting.
    X = lil_matrix((lines, columns), dtype='float32')
    Y = []
    row_index = 0
    # newline='' is what the csv module expects for files it reads.
    with open(filename, 'r', newline='') as f:
        for record in csv.reader(f, delimiter='\t'):
            if not record:
                # Blank lines yield an empty record; skip instead of crashing.
                continue
            userId, movieId, rating, _timestamp = record
            X[row_index, int(userId) - 1] = 1
            X[row_index, movie_offset + int(movieId) - 1] = 1
            Y.append(int(rating))
            row_index += 1
    Y = np.array(Y).astype('float32')
    return X, Y

In [22]:
# Training data comes from the shuffled ua.base split, test data from ua.test.
X_train, Y_train = loadDataset(
    filename='ua.base.shuffled',
    lines=num_ratings_train,
    columns=num_features,
)
X_test, Y_test = loadDataset(
    filename='ua.test',
    lines=num_ratings_test,
    columns=num_features,
)

In [23]:
# Print and verify the shapes of each split: features are
# (ratings, num_features) and labels are one value per rating.
for features, labels, expected_rows in (
    (X_train, Y_train, num_ratings_train),
    (X_test, Y_test, num_ratings_test),
):
    print(features.shape)
    print(labels.shape)
    assert features.shape == (expected_rows, num_features)
    assert labels.shape == (expected_rows, )

(90570, 2625)
(90570,)
(9430, 2625)
(9430,)


### Convert to protobuf and save to S3

In [24]:
import sagemaker

# Default bucket of the SageMaker session, plus a common key prefix.
bucket = sagemaker.Session().default_bucket()
prefix = 'adeelml-fm-movielens'

# Object names and key prefixes for the two protobuf datasets.
train_key, test_key = 'train.protobuf', 'test.protobuf'
train_prefix = f'{prefix}/train'
test_prefix = f'{prefix}/test'

# S3 URI under which output is stored.
output_prefix = f's3://{bucket}/{prefix}/output'

In [25]:
import io, boto3
import sagemaker.amazon.common as smac

def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    """Serialize a sparse dataset to protobuf and upload it to S3.

    Args:
        X: Sparse sample matrix.
        Y: Label vector.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix inside the bucket.
        key: Object name appended to ``prefix``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    # Create an in-memory binary stream.
    buf = io.BytesIO()
    # Write the sample matrix and the label vector to that buffer in protobuf format.
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    # use smac.write_numpy_to_dense_tensor(buf, feature, label) for numpy arrays
    # Rewind so the upload reads the buffer from its first byte.
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    # Use boto3 to upload the buffer to S3.
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
    
# Upload both splits and keep the S3 URI returned for each.
train_data = writeDatasetToProtobuf(
    X=X_train, Y=Y_train, bucket=bucket, prefix=train_prefix, key=train_key
)
test_data = writeDatasetToProtobuf(
    X=X_test, Y=Y_test, bucket=bucket, prefix=test_prefix, key=test_key
)

# Show where the datasets were uploaded and where output will be written.
for uri in (train_data, test_data):
    print(uri)
print('Output: {}'.format(output_prefix))

<_io.BytesIO object at 0x7fbc31dd5170>
<_io.BytesIO object at 0x7fbc3828e350>
s3://sagemaker-us-west-1-886035371869/adeelml-fm-movielens/train/train.protobuf
s3://sagemaker-us-west-1-886035371869/adeelml-fm-movielens/test/test.protobuf
Output: s3://sagemaker-us-west-1-886035371869/adeelml-fm-movielens/output


### Configure and run the training job

In [26]:
import boto3
from sagemaker import image_uris

# Look up the Factorization Machines container image for the current region.
region = boto3.Session().region_name
container = image_uris.retrieve(framework='factorization-machines', region=region)

In [27]:
# Estimator wrapping the algorithm container; one ml.c5.xlarge instance,
# with output written under output_prefix.
fm = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.c5.xlarge',
    output_path=output_prefix,
)

fm_hyperparameters = {
    'feature_dim': num_features,
    'predictor_type': 'regressor',
    'num_factors': 64,
    'epochs': 10,
}
fm.set_hyperparameters(**fm_hyperparameters)

# Protobuf is the default input format for Factorization Machines, so the
# protobuf S3 paths are passed to fit() directly rather than being wrapped
# in TrainingInput channels.
channels = {'train': train_data, 'test': test_data}
fm.fit(channels)

2021-06-19 11:48:18 Starting - Starting the training job...
2021-06-19 11:48:23 Starting - Launching requested ML instancesProfilerReport-1624103297: InProgress
......
2021-06-19 11:49:47 Starting - Preparing the instances for training......
2021-06-19 11:50:46 Downloading - Downloading input data
2021-06-19 11:50:46 Training - Downloading the training image...
2021-06-19 11:51:16 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[06/19/2021 11:51:08 INFO 139807005935424] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.000

### Deploy model

In [28]:
# Deploy the trained model to a named endpoint backed by one ml.t2.medium instance.
endpoint_name = 'fm-movielens-100k'
fm_predictor = fm.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=endpoint_name,
)

---------------!

In [29]:
# send samples to the endpoint in JSON format
import json
# Serialization converts an object into a stream of bytes so it can be sent to the endpoint
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

class FMSerializer(JSONSerializer):
    """JSON serializer that wraps rows as ``{'instances': [{'features': [...]}, ...]}``."""

    def serialize(self, data):
        """Return a JSON string with one ``features`` entry per row of ``data``.

        Every row of ``data`` must provide a ``tolist()`` method.
        """
        instances = [{'features': row.tolist()} for row in data]
        return json.dumps({'instances': instances})

# Responses are decoded with the stock JSON deserializer; requests are encoded
# with the custom FMSerializer.
fm_predictor.deserializer = JSONDeserializer()
fm_predictor.serializer = FMSerializer()

### Run predictions

In [30]:
# Densify the first three test rows and send them to the endpoint.
sample_batch = X_test[:3].toarray()
result = fm_predictor.predict(sample_batch)
print(result)

{'predictions': [{'score': 3.3874545097351074}, {'score': 3.429487943649292}, {'score': 3.6385748386383057}]}


In [31]:
# Delete the endpoint once the predictions are done.
# NOTE(review): presumably this is to stop incurring hosting charges — confirm.
fm_predictor.delete_endpoint()