In [None]:
!pip install smart_open

In [None]:
import math
import os
import pickle
import shutil
import tempfile
from datetime import datetime

import boto3
import numpy as np
import tensorflow as tf
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from smart_open import smart_open
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout
from tensorflow.keras.metrics import AUC, Precision
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [7]:
#role = get_execution_role()

## Training Data Import

In [8]:
# S3 bucket (including key prefix) and object name of the training file.
bucket = 'reco-mind-data/mind_large_train'
train = 'large_train_npratio4.tsv'
#test_negatives = 'large_test_negatives.tsv'

# Full S3 URI of the training data.
data_location_train = f's3://{bucket}/{train}'
#data_location_test = 's3://{}/{}'.format(bucket, test)
#data_location_test_negatives = 's3://{}/{}'.format(bucket, test_negatives)

In [None]:
# Read the training file into three parallel lists. Each non-blank line is
# split on tabs and its first three fields are parsed as integers:
# user id, article id, label.
user_train, article_train, labels_train = [], [], []
with smart_open(data_location_train, "r") as f:
    # Iterate over the stream so every line is consumed exactly once and the
    # loop ends at end of file (reading one line up front and never advancing
    # would loop forever on that first line).
    for line in f:
        if not line.strip():
            continue  # skip blank lines, e.g. a trailing newline at end of file
        user, article, label = line.split("\t")[:3]
        user_train.append(int(user))
        article_train.append(int(article))
        labels_train.append(int(label))

## Initialize Model Parameters

In [198]:
# --- Architecture ---
# layers[0] sets the combined embedding width (split evenly between user and
# article); the remaining entries are the Dense layer sizes.
layers = [64, 32, 16, 8]
num_layer = len(layers)
reg_layers = [0, 0, 0, 0]          # L2 regularisation factor per layer
dropout = True                     # toggle Dropout after each Dense layer
dropout_rates = [0, 0.2, 0.2, 0]   # Dropout rates, indexed by the tower loop

# --- Optimisation ---
loss = 'binary_crossentropy'
learning_rate = 0.001
batch_size = 256
epochs = 10

# --- Reproducibility ---
SEED = 420

## Train Validation Split

In [None]:
# Pair each user id with its article id (one row per interaction) so both
# columns are shuffled and split together with the labels: 80% train, 20% validation.
user_article_pairs = np.column_stack((user_train, article_train))
ua_train, ua_val, label_train, label_val = train_test_split(
    user_article_pairs,
    labels_train,
    test_size=0.2,
    random_state=SEED,
)

In [None]:
# Column 0 holds user ids and column 1 article ids; transposing lets each
# two-column matrix be unpacked straight into its per-column arrays.
# Note: this rebinds `user_train` / `article_train` to the training slice only.
user_train, article_train = ua_train.T
user_val, article_val = ua_val.T

## Build Model

In [199]:
# One integer id per sample for each of the two model inputs.
user_input = Input(name='user_input', shape=(1,), dtype='int32')
article_input = Input(name='article_input', shape=(1,), dtype='int32')

In [200]:
# `num_users` is not defined by any earlier cell, so derive it from the ids that
# are actually fed to the model. An Embedding's input_dim must be larger than the
# largest index it looks up, hence max id + 1 over both the train and validation ids.
# NOTE(review): user ids outside this range (e.g. in a separate test set) would
# not be covered by this table - confirm against the data that will be scored.
num_users = int(max(np.max(user_train), np.max(user_val))) + 1

# User embedding table; it contributes half of the first layer width (layers[0] // 2).
MLP_Embedding_User = Embedding(input_dim=num_users, output_dim=layers[0]//2,
                               embeddings_regularizer=l2(reg_layers[0]),
                               name='user_embedding', input_length=1)

In [201]:
# `num_articles` is not defined by any earlier cell, so derive it from the ids that
# are actually fed to the model. An Embedding's input_dim must be larger than the
# largest index it looks up, hence max id + 1 over both the train and validation ids.
# NOTE(review): article ids outside this range (e.g. in a separate test set) would
# not be covered by this table - confirm against the data that will be scored.
num_articles = int(max(np.max(article_train), np.max(article_val))) + 1

# Article embedding table; it contributes half of the first layer width (layers[0] // 2).
MLP_Embedding_Article = Embedding(input_dim=num_articles, output_dim=layers[0]//2,
                                  embeddings_regularizer=l2(reg_layers[0]),
                                  name='article_embedding', input_length=1)

In [202]:
# Look up each id's embedding, then flatten the result into one vector per sample.
user_embedded = MLP_Embedding_User(user_input)
user_latent = Flatten()(user_embedded)

article_embedded = MLP_Embedding_Article(article_input)
article_latent = Flatten()(article_embedded)

In [203]:
# Join the user and article vectors along the last axis to form the MLP input.
latent_pair = [user_latent, article_latent]
vector = Concatenate(axis=-1)(latent_pair)

In [209]:
# Stack the hidden layers of the MLP on top of the concatenated embeddings.
# layers[0] is consumed by the embeddings, so the Dense layers start at index 1.
# This cell feeds `vector` back into itself, so re-running it without rebuilding
# `vector` stacks further layers on top of the existing ones.
for idx in range(1, num_layer):
    vector = Dense(layers[idx], activation='relu',
                   kernel_regularizer=l2(reg_layers[idx]), name=f'layer{idx}')(vector)
    # Only insert Dropout when it is enabled; with `dropout = False` the Dense
    # output is passed on unchanged instead of relying on an unbound `drop` layer.
    if dropout:
        vector = Dropout(dropout_rates[idx-1], seed=SEED)(vector)

In [205]:
# Single sigmoid unit that produces the model's output for each (user, article) pair.
prediction_layer = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name='prediction')
prediction = prediction_layer(vector)

In [206]:
# Tie the two id inputs to the sigmoid output to form the trainable model.
model_inputs = [user_input, article_input]
model = Model(inputs=model_inputs, outputs=prediction)

In [207]:
# Compile with Adam and the configured loss, tracking precision and AUC.
# `learning_rate` is the supported keyword for the optimizer's step size; the
# old `lr` alias is deprecated and rejected by newer Keras releases.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss=loss,
              metrics=[Precision(), AUC()])

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

## Fit the Model

In [68]:
# The earlier split cells rebind `user_train` / `article_train` to the training
# slice, while `labels_train` still holds every label. Splitting again in that
# state would pass arrays of different lengths to train_test_split (ValueError),
# so only split here when the ids are still the full, unsplit lists.
if len(user_train) == len(labels_train):
    ua_train, ua_val, label_train, label_val = train_test_split(np.column_stack((user_train, article_train)),
                                                                labels_train, random_state=SEED, test_size=0.2)

In [78]:
# Pull the user-id column (index 0) and article-id column (index 1) out of the
# train and validation matrices.
USER_COL, ARTICLE_COL = 0, 1
user_train = ua_train[:, USER_COL]
article_train = ua_train[:, ARTICLE_COL]
user_val = ua_val[:, USER_COL]
article_val = ua_val[:, ARTICLE_COL]

In [None]:
# Assemble the (user, article) inputs and labels for both splits, then train.
# Batch order is kept fixed between epochs (shuffle=False).
train_inputs = [np.array(user_train), np.array(article_train)]
train_labels = np.array(label_train)
val_inputs = [np.array(user_val), np.array(article_val)]
val_labels = np.array(label_val)

hist = model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=epochs,
    batch_size=batch_size,
    shuffle=False,
    verbose=1,
)

## Save the Model

In [None]:
history_out_file = f's3://{bucket}/ncf-large/history-'
model_out_file = f's3://{bucket}/ncf-large/ncf-model-large-'

# Timestamp used to make the output object names unique. `now` must be called,
# and the format avoids '/' and ':' so the timestamp stays a single, plain
# segment of the S3 key instead of introducing extra key prefixes.
date_time = datetime.now().strftime("%Y%m%d_%H%M%S")

# Model.save writes to a filesystem path, not to an open stream, so save the
# model to a temporary local HDF5 file first and then copy its bytes to S3.
with tempfile.TemporaryDirectory() as tmp_dir:
    local_model_path = os.path.join(tmp_dir, 'ncf-model-large.h5')
    model.save(local_model_path)
    with open(local_model_path, 'rb') as src, \
            smart_open(model_out_file + date_time + '.h5', 'wb') as dst:
        shutil.copyfileobj(src, dst)

In [None]:
# Pickle the per-epoch metrics recorded by `model.fit` (stored in `hist`) to S3,
# reusing the timestamp generated when the model was saved.
history_out_file = f's3://{bucket}/ncf-large/history-'
with smart_open(history_out_file + date_time, 'wb') as file:
    pickle.dump(hist.history, file)