# Generalized Matrix Factorization (GMF)

In [1]:
import numpy as np
import math
import tensorflow as tf
import scipy.sparse as sp
import heapq

In [2]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential, Model

In [1]:
from tensorflow.keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout
from tensorflow.keras.layers import Concatenate, Multiply
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import initializers

from tensorflow.keras.metrics import MeanSquaredError, Precision, AUC

## Import Train and Test Data

In [7]:
train_filename = "small_train.csv"

# First pass over the training CSV: find the largest user and article id
# codes so the interaction matrix can be sized before it is filled.
num_users, num_articles = 0, 0
with open(train_filename, "r") as f:
    header = f.readline()  # consume the column-name row
    # Iterating the file object stops at EOF by itself, so no explicit
    # `line != None and line != ""` sentinel test is needed
    # (readline() returns "" at EOF, never None).
    for line in f:
        line_list = line.split(",")
        # Columns 2 and 3 are parsed as the integer user / article codes.
        u, i = int(line_list[2]), int(line_list[3])
        num_users = max(num_users, u)
        num_articles = max(num_articles, i)

# The codes are used directly as matrix indices, so each dimension must be
# one larger than the largest code seen.
num_users += 1
num_articles += 1

In [8]:
# Display the inferred matrix dimensions (largest id code + 1 on each axis).
num_users, num_articles

(40331, 31415)

In [9]:
# Sparse user x article interaction matrix; a stored 1.0 marks a
# (user, article) pair that appears in the training file.
train = sp.dok_matrix((num_users, num_articles), dtype=np.float32)

with open(train_filename, "r") as f:
    header = f.readline()
    print(header)
    # Iterate the file object directly instead of a manual readline() loop
    # guarded by `line != None` (readline() never returns None).
    for line in f:
        line_list = line.split(",")
        user, article = int(line_list[2]), int(line_list[3])
        train[user, article] = 1.0

user_id,article,user_id_code,article_id_code



In [10]:
test_filename = "small_test.csv"

# Held-out positives: one [user, article] pair per row of the test CSV.
test_positives = []
with open(test_filename, "r") as f:
    header = f.readline()
    print(header)
    line = f.readline()
    print(line)  # peek at the first data row as a sanity check
    # readline() returns "" at EOF (never None), so truthiness is the
    # complete end-of-file test.
    while line:
        line_list = line.split(",")
        user, article = int(line_list[2]), int(line_list[3])
        test_positives.append([user, article])
        line = f.readline()

user_id,article,user_id_code,article_id_code

U13740,N31801,1810,11677



In [11]:
# Number of held-out positive [user, article] pairs read from the test file.
len(test_positives)

39846

In [12]:
test_neg_filename = "small_test_negatives.tsv"

# One list of negative article ids per row of the TSV file.
test_negatives = []
with open(test_neg_filename, "r") as f:
    # Iterate the file object directly; the original `line != None` guard
    # could never trigger because readline() returns "" at EOF, not None.
    for line in f:
        line_list = line.split("\t")
        # Field 0 is skipped and every remaining field is parsed as an int.
        # NOTE(review): presumably field 0 identifies the matching test
        # positive - confirm against the file format.
        test_negatives.append([int(neg) for neg in line_list[1:]])

In [13]:
# eval_one_rating pairs test_positives[idx] with test_negatives[idx], so the
# two lists must line up one-to-one; fail here rather than mid-evaluation.
assert len(test_negatives) == len(test_positives), (
    "test negatives and test positives must have the same number of rows"
)
len(test_negatives)

39846

## Initialize Model Parameters

In [5]:
# Training / model hyper-parameters.
# NOTE(review): EPOCHS is not referenced in the cells shown here; the
# model.fit call passes the literal epochs=19 instead - confirm which is intended.
EPOCHS = 20
# Mini-batch size passed to model.fit.
BATCH_SIZE = 256
# Width (output_dim) of the user and article embedding vectors.
NUM_FACTORS = 8
# NOTE(review): REGS is not referenced in the cells shown here; presumably
# regularization strengths for the two embeddings - confirm.
REGS = [0, 0]
# Number of negative articles sampled for every observed training pair.
NUM_NEG = 4
# Learning rate handed to the Adam optimizer.
LR = 0.001
# NOTE(review): LEARNER is not referenced in the cells shown here; the
# optimizer is constructed as Adam directly in model.compile.
LEARNER = "adam"

In [6]:
# Length of the ranked list evaluated by eval_one_rating (top-K cut-off).
topK = 10

## Build Model

In [14]:
# Model inputs: each sample carries a single int32 id for the user and a
# single int32 id for the article.
user_input = Input(shape=(1,), dtype='int32', name='user_input')
article_input = Input(shape=(1,), dtype='int32', name='article_input')

In [15]:
# User embedding table: one NUM_FACTORS-wide vector for each of the
# num_users possible user ids.
MF_Embedding_User = Embedding(input_dim=num_users, 
                              output_dim=NUM_FACTORS, 
                              name='user_embedding',
                              input_length=1)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [16]:
# Article embedding table: one NUM_FACTORS-wide vector for each of the
# num_articles possible article ids.
MF_Embedding_Article = Embedding(input_dim=num_articles, 
                                 output_dim=NUM_FACTORS, 
                                 name = 'article_embedding',
                                 input_length=1)

In [17]:
# Look up the embedding for each id and flatten away the length-1 axis so
# each sample is a plain NUM_FACTORS-wide vector.
user_latent = Flatten()(MF_Embedding_User(user_input))
article_latent = Flatten()(MF_Embedding_Article(article_input))

In [36]:
# Element-wise product of the user and article latent vectors.
predict_vector = Multiply()([user_latent, article_latent])

In [37]:
# Single sigmoid unit on top of the element-wise product; its output is the
# predicted interaction score in (0, 1).
prediction = Dense(1, activation='sigmoid', 
                   kernel_initializer='lecun_uniform', 
                   name = 'prediction')(predict_vector)

In [41]:
# Assemble the graph: (user id, article id) inputs -> sigmoid prediction.
model = Model([user_input, article_input], prediction)

In [48]:
from tensorflow.keras.metrics import MeanSquaredError, Precision, AUC

In [50]:
# Binary cross-entropy on the 0/1 interaction labels, optimized with Adam.
# `learning_rate` is the supported keyword for the step size; the `lr` alias
# is deprecated in tf.keras and rejected by newer releases.
model.compile(optimizer=Adam(learning_rate=LR), loss='binary_crossentropy',
              metrics=[MeanSquaredError(), Precision(), AUC()])

In [51]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
article_input (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 8)         322648      user_input[0][0]                 
__________________________________________________________________________________________________
article_embedding (Embedding)   (None, 1, 8)         251320      article_input[0][0]              
____________________________________________________________________________________________

In [52]:
# Build the training triples: every observed (user, article) pair becomes a
# label-1 row followed by NUM_NEG label-0 rows whose articles are drawn
# uniformly at random from those the user has no training interaction with.
#
# NOTE(review): these lists reuse the names `user_input` / `article_input`,
# which earlier referred to the Keras Input tensors; re-running the
# model-building cells after this one would pick up the lists instead.
user_input, article_input, labels = [], [], []
for u, i in train.keys():
    sampled_negatives = []
    while len(sampled_negatives) < NUM_NEG:
        j = np.random.randint(num_articles)
        # Rejection sampling: keep the draw only if (u, j) is unobserved.
        if (u, j) not in train.keys():
            sampled_negatives.append(j)
    # Positive row first, then its negatives, all for the same user.
    user_input.extend([u] * (1 + len(sampled_negatives)))
    article_input.extend([i] + sampled_negatives)
    labels.extend([1] + [0] * len(sampled_negatives))

In [63]:
# Pack the sampled training rows into arrays; the id arrays are supplied in
# the same order as the model's inputs (user first, then article).
fit_inputs = [np.array(user_input), np.array(article_input)]
fit_targets = np.array(labels)

# NOTE(review): epochs is the literal 19 here while the config cell defines
# EPOCHS = 20 - confirm which value is intended.
hist = model.fit(fit_inputs, fit_targets,
                 batch_size=BATCH_SIZE,
                 epochs=19,
                 verbose=1,
                 shuffle=True)

Epoch 1/19
Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epoch 11/19
Epoch 12/19
Epoch 13/19
Epoch 14/19
Epoch 15/19
Epoch 16/19
Epoch 17/19
Epoch 18/19
Epoch 19/19


In [59]:
def eval_one_rating(idx, topK, *, positives=None, negatives=None, scorer=None):
    """Rank one held-out positive article against its negatives.

    The positive article of test case ``idx`` is scored together with that
    test case's negative articles, the ``topK`` highest-scoring articles are
    kept, and the metrics are derived from the positive's place in that list.

    Parameters
    ----------
    idx : int
        Index of the test case; selects ``positives[idx]`` and
        ``negatives[idx]``.
    topK : int
        Length of the ranked list that is evaluated.
    positives : sequence of [user, article], optional
        Held-out positive pairs. Defaults to the notebook-level
        ``test_positives``.
    negatives : sequence of list of int, optional
        Negative article ids per test case. Defaults to the notebook-level
        ``test_negatives``. The list is never modified.
    scorer : object, optional
        Anything with a Keras-style ``predict([users, items], batch_size=...,
        verbose=...)`` method. Defaults to the notebook-level ``model``.

    Returns
    -------
    tuple
        ``(hr, ndcg, rr)``: ``hr`` is 1 if the positive article is in the
        top-``topK`` list, else 0; ``ndcg`` is ``log(2) / log(rank + 2)`` and
        ``rr`` is ``1 / (rank + 1)`` for its 0-based ``rank`` in that list,
        both 0 when it is not in the list.
    """
    # Fall back to the notebook-level objects when no override is given.
    if positives is None:
        positives = test_positives
    if negatives is None:
        negatives = test_negatives
    if scorer is None:
        scorer = model

    u, get_item = positives[idx]
    # Build a fresh candidate list rather than append()/pop() on the shared
    # negatives list, so that list stays intact even if predict() raises.
    items = list(negatives[idx]) + [get_item]

    # Get prediction scores
    users = np.full(len(items), u, dtype='int32')
    predictions = scorer.predict([users, np.array(items)],
                                 batch_size=100, verbose=0)
    # Flatten the (n, 1) prediction array to plain floats so the sort keys
    # are scalars instead of one-element arrays.
    map_item_score = {}
    for item, score in zip(items, np.ravel(predictions)):
        map_item_score[item] = float(score)

    # Evaluate top rank list
    ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)

    if get_item in ranklist:
        hr = 1
        i = ranklist.index(get_item)
        ndcg = math.log(2) / math.log(i+2)
        rr = 1/(i+1)
    else:
        hr = 0
        ndcg = 0
        rr = 0

    return (hr, ndcg, rr)

In [64]:
# Score every test case and spread each (hr, ndcg, rr) result across the
# three per-metric lists.
hits, ndcgs, rrs = [], [], []
for idx in range(len(test_positives)):
    outcome = eval_one_rating(idx, topK)
    for bucket, value in zip((hits, ndcgs, rrs), outcome):
        bucket.append(value)

In [65]:
# Average the per-test-case metrics over the whole test set.
hr = np.array(hits).mean()
mrr = np.array(rrs).mean()
ndcg = np.array(ndcgs).mean()

# The metrics are computed on the top-`topK` list, so the NDCG label is
# derived from topK instead of hard-coding "@10"; ljust(22) pads it to the
# same width as the other two labels so the values stay aligned.
print("Hit ratio:            ", hr)
print("Mean reciprocal rank: ", mrr)
print("NDCG@{}:".format(topK).ljust(22), ndcg)