# Neural Collaborative Filtering Model

In [1]:
import numpy as np
import math
from time import time
import tensorflow as tf
from datetime import datetime
import scipy.sparse as sp

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential, Model

In [3]:
from tensorflow.keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.optimizers import Adam

In [4]:
from tensorflow.keras import initializers

In [5]:
# Layer spec kept as a string here; it is interpolated into model_out_file in
# the next cell. NOTE(review): `layers` is rebound to a list of ints in a later
# cell (In [17]) before the network is built.
layers = "[64,32,16,8]"

# Capture the wall-clock time when this cell runs.
now = datetime.now()
today=datetime.today()  # NOTE(review): not referenced in the visible cells

# Print the time of day (HH:MM:SS) as a simple run marker.
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 15:46:47


In [6]:
# Output path tagged with the current value of `layers` (a string at this point).
# NOTE(review): not referenced again in the visible cells — confirm whether
# saving the model was intended.
model_out_file = f"pretrain/small_MLP_{layers}"

In [7]:
# Training interactions CSV. The cells below read the integer codes from the
# third and fourth comma-separated columns of each data row.
train_filename = "small_train.csv"

In [8]:
# Scan the training CSV once to find the largest user code and article code.
num_users, num_articles = 0, 0
with open(train_filename, "r") as f:
    header = f.readline()  # consume the column-name row
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing on the column index below
            continue
        line_list = line.split(",")
        u, i = int(line_list[2]), int(line_list[3])
        num_users = max(num_users, u)
        num_articles = max(num_articles, i)

# The codes are used directly as row/column indices of the interaction matrix,
# so each dimension must be (largest code + 1).
num_users += 1
num_articles += 1

In [9]:
num_users, num_articles

(40331, 31415)

In [10]:
# Sparse user x article interaction matrix; 1.0 marks an observed training pair.
train = sp.dok_matrix((num_users, num_articles), dtype=np.float32)

with open(train_filename, "r") as f:
    header = f.readline()
    print(header)
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing on the column index below
            continue
        line_list = line.split(",")
        # Third and fourth columns hold the integer user / article codes
        user, article = int(line_list[2]), int(line_list[3])
        train[user, article] = 1.0

user_id,article,user_id_code,article_id_code



In [11]:
# Held-out positive interactions CSV; read below using the same third/fourth
# column layout as the training file.
test_filename = "small_test.csv"

In [12]:
# Collect the held-out positives as [user_code, article_code] pairs,
# one entry per data row of the test CSV.
test_positives = []
with open(test_filename, "r") as f:
    header = f.readline()
    print(header)
    line = f.readline()
    print(line)  # echo the first data row as a quick format check
    while line:
        fields = line.split(",")
        test_positives.append([int(fields[2]), int(fields[3])])
        line = f.readline()

user_id,article,user_id_code,article_id_code

U13740,N31801,1810,11677



In [13]:
len(test_positives)

39846

In [14]:
# Tab-separated file of sampled negative article codes, one row per test case.
# The loader below ignores the first field of each row and parses the rest as ints.
test_neg_filename = "small_test_negatives.tsv"

In [15]:
# One list of negative article codes per row of the TSV. The first
# tab-separated field of every row is ignored; the remaining fields are
# parsed as integers. Entry k is later paired with test_positives[k].
test_negatives = []
with open(test_neg_filename, "r") as f:
    for line in f:
        columns = line.split("\t")
        test_negatives.append([int(neg) for neg in columns[1:]])

In [16]:
len(test_negatives)

39846

In [17]:
# Network hyperparameters.
# layers[0] is the width of the concatenated user+article embedding (each
# embedding table gets layers[0]//2 dimensions); layers[1:] are the widths of
# the hidden Dense layers stacked on top.
layers = [16, 8]
# NOTE(review): reg_layers is not referenced by the visible model-building cells.
reg_layers = [0, 0]
num_layer = len(layers)
# Passed to the Adam optimizer at compile time.
learning_rate = 0.001
# Mini-batch size passed to model.fit.
batch_size=256

In [18]:
# Two scalar integer inputs per example: one user code and one article code.
user_input = Input(shape=(1,), dtype='int32', name='user_input')
article_input = Input(shape=(1,), dtype='int32', name='article_input')

In [19]:
# User lookup table: num_users rows, each a vector of size layers[0]//2.
MLP_Embedding_User = Embedding(input_dim=num_users, output_dim=layers[0]//2, 
                               name='user_embedding', input_length=1)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [20]:
# Article lookup table: num_articles rows, each a vector of size layers[0]//2.
MLP_Embedding_Article = Embedding(input_dim=num_articles, output_dim=layers[0]//2, 
                               name='article_embedding', input_length=1)

In [21]:
# The embedding outputs have shape (None, 1, dim) (see model.summary() output);
# Flatten collapses the length-1 axis so each example is a flat vector.
user_latent = Flatten()(MLP_Embedding_User(user_input))
article_latent = Flatten()(MLP_Embedding_Article(article_input))

In [22]:
# Join the user and article vectors along the last axis to form the MLP input.
vector = Concatenate(axis=-1)([user_latent, article_latent])

In [23]:
# Stack the hidden layers. The loop starts at 1 because layers[0] is already
# consumed as the combined embedding width; each remaining entry becomes one
# ReLU Dense layer named 'layer<idx>'.
for idx in range(1, num_layer):
    layer = Dense(layers[idx], activation='relu', name='layer%d' %idx)
    vector = layer(vector)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
# Output head: a single sigmoid unit producing one score per (user, article) pair.
prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(vector)

In [25]:
# Wire the two Input tensors to the prediction head. This must run while
# user_input / article_input still refer to the Keras Input tensors (they are
# rebound to plain lists in the training-data cell further down).
model = Model(inputs=[user_input, article_input], outputs=prediction)

In [26]:
# Binary cross-entropy matches the 0/1 labels built for training below.
# `learning_rate` is the supported keyword for the tf.keras Adam optimizer;
# the legacy `lr` alias is deprecated and rejected by current Keras releases.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy')

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [27]:
# Number of unobserved articles sampled as negatives for each observed training pair.
num_negatives = 4

In [28]:
# Build the flat training arrays: every observed (user, article) pair becomes a
# label-1 example followed by `num_negatives` label-0 examples for the same user.
# NOTE(review): this rebinds user_input / article_input, which previously held
# the Keras Input tensors; re-running the Model(...) cell after this one will
# not work without re-running the Input cell first.
user_input, article_input, labels = [], [], []
for user, pos_article in train.keys():
    # The observed interaction is the positive example
    user_input.append(user)
    article_input.append(pos_article)
    labels.append(1)
    # Rejection-sample articles this user has no recorded interaction with
    accepted = 0
    while accepted < num_negatives:
        candidate = np.random.randint(num_articles)
        if (user, candidate) in train.keys():
            continue  # already an observed pair: draw again
        user_input.append(user)
        article_input.append(candidate)
        labels.append(0)
        accepted += 1

In [29]:
# Train for a single epoch on the sampled instances, timing the call.
# NOTE(review): t1 / t2 are recorded but the elapsed time is never reported
# in the visible cells.
t1 = time()
hist = model.fit([np.array(user_input), np.array(article_input)], # inputs: user codes, article codes
                         np.array(labels), # 0/1 targets
                         batch_size=batch_size, epochs=1, verbose=1, shuffle=True)
t2 = time()



In [30]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
article_input (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 8)         322648      user_input[0][0]                 
__________________________________________________________________________________________________
article_embedding (Embedding)   (None, 1, 8)         251320      article_input[0][0]              
______________________________________________________________________________________________

In [31]:
test_positives[0]

[1810, 11677]

In [32]:
import heapq

In [33]:
# Length of the top-ranked list examined by eval_one_rating.
K = 10

In [34]:
def eval_one_rating(idx):
    """Rank one held-out article against its sampled negatives.

    Scores the positive article of ``test_positives[idx]`` together with the
    negatives in ``test_negatives[idx]`` using the global ``model`` and checks
    whether the positive appears among the ``K`` highest-scored candidates.

    Args:
        idx: Index into ``test_positives`` / ``test_negatives``.

    Returns:
        Tuple ``(hr, ndcg)``. ``hr`` is 1 when the positive article is in the
        top-``K`` list and 0 otherwise. ``ndcg`` is ``log(2) / log(rank + 2)``
        for the article's zero-based rank in that list, or 0 when it is absent.
    """
    rating = test_positives[idx]
    u = rating[0]
    get_item = rating[1]
    # Build a fresh candidate list rather than append/pop on the shared
    # test_negatives entry, so a failure in predict() cannot leave it modified.
    items = list(test_negatives[idx]) + [get_item]

    # Score every candidate for this user in one predict() call
    users = np.full(len(items), u, dtype='int32')
    predictions = model.predict([users, np.array(items)],
                                batch_size=100, verbose=0)
    # Flatten to one plain float per candidate so ranking compares scalars
    scores = np.asarray(predictions).reshape(-1)
    map_item_score = {item: float(score) for item, score in zip(items, scores)}

    # Evaluate the top-K rank list
    ranklist = heapq.nlargest(K, map_item_score, key=map_item_score.get)

    if get_item in ranklist:
        hr = 1
        i = ranklist.index(get_item)
        ndcg = math.log(2) / math.log(i+2)
    else:
        hr = 0
        ndcg = 0

    return (hr, ndcg)

In [35]:
# Evaluate every held-out test case, collecting the per-case hit flag and NDCG.
hits, ndcgs = [], []
for idx in range(len(test_positives)):
    hr, ndcg = eval_one_rating(idx)
    hits.append(hr)
    # Record this case's NDCG value (the list itself was being appended before)
    ndcgs.append(ndcg)

In [36]:
# Hit ratio: the mean of the per-case 0/1 hit flags, i.e. the fraction of test
# cases whose positive article was ranked in the top K. This rebinds `hr`,
# which held the last per-case flag inside the evaluation loop.
hr = np.array(hits).mean()

In [37]:
hr

0.7562114139436832