## Download dataset

In [2]:
# download csv
!wget https://s3-us-west-2.amazonaws.com/smart-offer-us-west-2-20181108/ctr-kansadr/topkUkP_train_ctr_rating.csv

--2018-11-09 08:09:16--  https://s3-us-west-2.amazonaws.com/smart-offer-us-west-2-20181108/ctr-kansadr/topkUkP_train_ctr_rating.csv
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.248.200
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.248.200|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6914604 (6.6M) [text/csv]
Saving to: ‘topkUkP_train_ctr_rating.csv’


2018-11-09 08:09:17 (6.28 MB/s) - ‘topkUkP_train_ctr_rating.csv’ saved [6914604/6914604]



In [5]:
!wc -l topkUkP_train_ctr_rating.csv

47423 topkUkP_train_ctr_rating.csv


In [8]:
!head -2 topkUkP_train_ctr_rating.csv

3ec8aa377965b54b265720d2c99557a94b4bdf7c08b10617ec1127f6248a6489	c00fc80efc385b791c8bf81c6cdbf158977480f7841172391db4f2ed788e037f	0.0	1464747000
3ec8aa377965b54b265720d2c99557a94b4bdf7c08b10617ec1127f6248a6489	02110e9c29f3828f5d2624a3ab1c77a35a9f24575eac606d8fd3a54980efad3b	0.0	1464747723


In [3]:
!wget https://s3-us-west-2.amazonaws.com/smart-offer-us-west-2-20181108/ctr-kansadr/topkUkP_test_ctr_rating.csv

--2018-11-09 08:10:07--  https://s3-us-west-2.amazonaws.com/smart-offer-us-west-2-20181108/ctr-kansadr/topkUkP_test_ctr_rating.csv
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.245.80
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.245.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 629459 (615K) [text/csv]
Saving to: ‘topkUkP_test_ctr_rating.csv’


2018-11-09 08:10:08 (1.75 MB/s) - ‘topkUkP_test_ctr_rating.csv’ saved [629459/629459]



In [6]:
!wc -l topkUkP_test_ctr_rating.csv

4302 topkUkP_test_ctr_rating.csv


In [9]:
!head -2 topkUkP_test_ctr_rating.csv

001fecc308b147cbd9837051c62f035fd75ab42b3ef19c5e83eea404042af885	10698b6475abd54c5c6d1724d6f51cb795234c23a23daf1bdef1886a8ae522b5	0.0	1465993358
001fecc308b147cbd9837051c62f035fd75ab42b3ef19c5e83eea404042af885	154f65f908a7406826ed6408a156db1bdb82f8f514dffc9c344dfba31ace8520	0.0	1465969073


## Build training set and test set

In [4]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

In [11]:
import pandas as pd

In [16]:
trainDF = pd.read_csv('topkUkP_train_ctr_rating.csv', sep='\t')
trainDF.columns = ['userid', 'productid', 'Click rate', 'timestamp']
userIDs = list(set(trainDF.userid))
productIDs = list(set(trainDF.productid))

In [17]:
print(len(userIDs))
print(len(productIDs))

1000
430


In [7]:
nbUsers=1000
nbProducts=430
nbFeatures=nbUsers+nbProducts

nbRatingsTrain=47423
nbRatingsTest=4302

In [18]:
# For each user, build a list of rated movies.
# We'd need this to add random negative samples.
productsByUser = {}
for userId in userIDs:
    productsByUser[userId]=[]
 
with open('topkUkP_train_ctr_rating.csv','r') as f:
    samples=csv.reader(f,delimiter='\t')
    for userId,productId,rating,timestamp in samples:
        productsByUser[userId].append(productId)

In [46]:
def loadDataset(filename, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,productId,rating,timestamp in samples:
            X[line,userIDs.index(userId)] = 1
            X[line,int(nbUsers)+productIDs.index(productId)] = 1
            
            if float(rating) >= 0.5:
                Y.append(1)
            else:
                Y.append(0)
#             Y.append(rating)
            line=line+1
            
    Y=np.array(Y).astype('float32')
    return X,Y

In [47]:
X_train, Y_train = loadDataset('topkUkP_train_ctr_rating.csv', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDataset('topkUkP_test_ctr_rating.csv',nbRatingsTest,nbFeatures)

In [48]:
Y_train

array([0., 0., 0., ..., 0., 0., 1.], dtype=float32)

In [49]:
print(X_train.shape)
print(Y_train.shape)
assert X_train.shape == (nbRatingsTrain, nbFeatures)
assert Y_train.shape == (nbRatingsTrain, )
zero_labels = np.count_nonzero(Y_train)
print("Training labels: %d zeros, %d ones" % (zero_labels, nbRatingsTrain-zero_labels))

print(X_test.shape)
print(Y_test.shape)
assert X_test.shape  == (nbRatingsTest, nbFeatures)
assert Y_test.shape  == (nbRatingsTest, )
zero_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, nbRatingsTest-zero_labels))

(47423, 1430)
(47423,)
Training labels: 2119 zeros, 45304 ones
(4302, 1430)
(4302,)
Test labels: 1458 zeros, 2844 ones


## Convert to protobuf and save to S3

In [65]:
bucket = 'xgboost-tut'
prefix = 'ctr-kansadr'

train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [66]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://xgboost-tut/ctr-kansadr/train3/train.protobuf
s3://xgboost-tut/ctr-kansadr/test3/test.protobuf
Output: s3://xgboost-tut/ctr-kansadr/output


In [67]:
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}

In [91]:
boto3.Session().region_name

'us-east-2'

In [70]:
fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                   get_execution_role(), 
                                   train_instance_count=1, 
                                   train_instance_type='ml.m4.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

In [72]:
fm.set_hyperparameters(feature_dim=nbFeatures,
                      predictor_type='binary_classifier',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=100)

In [73]:
fm.fit({'train': train_data, 'test': test_data})

INFO:sagemaker:Creating training-job with name: factorization-machines-2018-11-09-09-11-17-529


2018-11-09 09:11:17 Starting - Starting the training job...
2018-11-09 09:11:18 Starting - Launching requested ML instances...
2018-11-09 09:12:16 Starting - Preparing the instances for training......
2018-11-09 09:13:02 Downloading - Downloading input data
2018-11-09 09:13:02 Failed - Training job failed
..

ValueError: Error training factorization-machines-2018-11-09-09-11-17-529: Failed Reason: ClientError: Data download failed:403 Forbidden (403): Forbidden

In [74]:
fm.fit({'train': train_data, 'test': test_data})

INFO:sagemaker:Creating training-job with name: factorization-machines-2018-11-09-09-15-36-137


2018-11-09 09:15:36 Starting - Starting the training job...
2018-11-09 09:15:38 Starting - Launching requested ML instances...
2018-11-09 09:16:36 Starting - Preparing the instances for training.........
2018-11-09 09:18:00 Downloading - Downloading input data...
2018-11-09 09:18:23 Training - Training image download completed. Training in progress.
[31mDocker entrypoint called with argument(s): train[0m
[31m[11/09/2018 09:18:25 INFO 139784636045120] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', u'bias_wd': u'0.01', u'use_linear': u'true', u'bias_lr': u'0.1', u'mini_b

## Deploy model

In [75]:
fm_predictor = fm.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

INFO:sagemaker:Creating model with name: factorization-machines-2018-11-09-09-20-34-716


ClientError: An error occurred (ValidationException) when calling the CreateModel operation: Could not access model data at s3://xgboost-tut/ctr-kansadr/output/factorization-machines-2018-11-09-09-15-36-137/output/model.tar.gz. Please ensure that the role "arn:aws:iam::383961943887:role/service-role/AmazonSageMaker-ExecutionRole-20181109T113978" exists and that its trust relationship policy allows the action "sts:AssumeRole" for the service principal "sagemaker.amazonaws.com". Also ensure that the role has "s3:GetObject" permissions and that the object is located in us-east-2.

In [86]:
fm_predictor = fm.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

INFO:sagemaker:Creating model with name: factorization-machines-2018-11-09-09-40-26-172
INFO:sagemaker:Creating endpoint with name factorization-machines-2018-11-09-09-15-36-137


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateEndpoint operation: The account-level service limit 'ml.m4.xlarge for endpoint usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit.

In [87]:
def fm_serializer(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    #print js
    return json.dumps(js)

fm_predictor.content_type = 'application/json'
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer

NameError: name 'fm_predictor' is not defined

## Run predictions

In [None]:
result = fm_predictor.predict(X_test[1000:1010].toarray())
print(result)
print (Y_test[1000:1010])

## Try to use another resource

In [92]:
fm1 = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                   get_execution_role(), 
                                   train_instance_count=1, 
                                   train_instance_type='ml.m5.large',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

In [93]:
fm1.set_hyperparameters(feature_dim=nbFeatures,
                      predictor_type='binary_classifier',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=100)

In [94]:
fm1.fit({'train': train_data, 'test': test_data})

INFO:sagemaker:Creating training-job with name: factorization-machines-2018-11-09-09-54-21-132


2018-11-09 09:54:21 Starting - Starting the training job...
2018-11-09 09:54:22 Starting - Launching requested ML instances...
2018-11-09 09:55:17 Starting - Preparing the instances for training.........
2018-11-09 09:56:50 Downloading - Downloading input data
2018-11-09 09:56:50 Training - Training image download completed. Training in progress..
[31mDocker entrypoint called with argument(s): train[0m
[31m[11/09/2018 09:56:51 INFO 140034636285760] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', u'bias_wd': u'0.01', u'use_linear': u'true', u'bias_lr': u'0.1', u'mini_bat

In [97]:
fm_predictor = fm1.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

INFO:sagemaker:Creating model with name: factorization-machines-2018-11-09-10-07-01-574
INFO:sagemaker:Creating endpoint with name factorization-machines-2018-11-09-09-54-21-132


--------------------------------------------------------------!

In [98]:
def fm_serializer(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    #print js
    return json.dumps(js)

fm_predictor.content_type = 'application/json'
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer

In [99]:
result = fm_predictor.predict(X_test[1000:1010].toarray())
print(result)
print (Y_test[1000:1010])

{'predictions': [{'score': 0.06603201478719711, 'predicted_label': 0.0}, {'score': 0.08403120189905167, 'predicted_label': 0.0}, {'score': 0.03857986256480217, 'predicted_label': 0.0}, {'score': 0.5291479825973511, 'predicted_label': 1.0}, {'score': 0.03790748491883278, 'predicted_label': 0.0}, {'score': 0.04214948043227196, 'predicted_label': 0.0}, {'score': 0.04747305065393448, 'predicted_label': 0.0}, {'score': 0.5040233731269836, 'predicted_label': 1.0}, {'score': 0.4355510473251343, 'predicted_label': 0.0}, {'score': 0.03687872737646103, 'predicted_label': 0.0}]}
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]


## Accuracy

In [100]:
full_res = fm_predictor.predict(X_test.toarray())

ConnectionClosedError: Connection was closed before we received a valid response from endpoint URL: "https://runtime.sagemaker.us-east-2.amazonaws.com/endpoints/factorization-machines-2018-11-09-09-54-21-132/invocations".

In [104]:
predict_arr = {}
for i in range(44):
    res = fm_predictor.predict(X_test[i*100:i*100+100].toarray())
    predict_arr['pred'+str(i)] = res

In [106]:
import json

In [108]:
# with open('predicted.json', 'w') as fout:
#     json.(fout, predict_arr)

TypeError: Object of type 'TextIOWrapper' is not JSON serializable

In [109]:
predict_arr

{'pred0': {'predictions': [{'score': 0.13062921166419983,
    'predicted_label': 0.0},
   {'score': 0.04038452357053757, 'predicted_label': 0.0},
   {'score': 0.042863793671131134, 'predicted_label': 0.0},
   {'score': 0.0701277107000351, 'predicted_label': 0.0},
   {'score': 0.09378378093242645, 'predicted_label': 0.0},
   {'score': 0.039486952126026154, 'predicted_label': 0.0},
   {'score': 0.5786311626434326, 'predicted_label': 1.0},
   {'score': 0.0388571061193943, 'predicted_label': 0.0},
   {'score': 0.04356440529227257, 'predicted_label': 0.0},
   {'score': 0.048476334661245346, 'predicted_label': 0.0},
   {'score': 0.5586037039756775, 'predicted_label': 1.0},
   {'score': 0.49002909660339355, 'predicted_label': 0.0},
   {'score': 0.03802670165896416, 'predicted_label': 0.0},
   {'score': 0.05768006667494774, 'predicted_label': 0.0},
   {'score': 0.24910573661327362, 'predicted_label': 0.0},
   {'score': 0.09023089706897736, 'predicted_label': 0.0},
   {'score': 0.08006390929222

In [115]:
pre_label = []
for key in predict_arr.keys():
    for item in predict_arr[key]['predictions']:
        pre_label.append(item['predicted_label'])        

In [117]:
np.sum(np.array(pre_label) == Y_test)/len(Y_test)

0.7735936773593677

In [119]:
for i in range(44):
    print(Y_test[i*100:i*100+100])

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0.]
[1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1.
 1. 1. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0.
 0. 0. 0. 0.]
[1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 0.

In [120]:
for i in range(44):
    print(np.array(pre_label)[i*100:i*100+100])

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0.]
[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0.
 0. 0. 0. 0.]
[1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 0. 0.