In [2]:
import os
import sys
sys.path.append(os.path.join('..', 'src'))

import pickle
import time
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K

import warnings
warnings.filterwarnings("ignore")

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Add, Flatten, Dot, Input
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

### Load and prepare data

In [3]:
DATA_DIR = os.path.join('..', 'data', 'processed', 'filtering')

In [4]:
with open(os.path.join(DATA_DIR, 'train_ratings.pickle'), 'rb') as file:
    train_ratings = pickle.load(file)

with open(os.path.join(DATA_DIR, 'test_ratings.pickle'), 'rb') as file:
    test_ratings = pickle.load(file)

In [5]:
# get number of users and items
M = 1 + max(
    max([i[0] for i in train_ratings.keys()]),
    max([i[0] for i in test_ratings.keys()])
)
N = 1 + max(
    max([i[1] for i in train_ratings.keys()]),
    max([i[1] for i in test_ratings.keys()])
)
M, N

(9166, 1998)

In [6]:
X_train = np.array(list(zip(*train_ratings.keys())))
y_train = np.array(list(train_ratings.values()))
               
X_test = np.array(list(zip(*test_ratings.keys())))
y_test = np.array(list(test_ratings.values()))

In [7]:
# compute global mean
mean = np.mean(y_train)
mean

4.239058234363317

### Build the model

In [7]:
def build_model(M, N, K, lmbda=0.):
    """
    Build a model for matrix factorization
    Args:
        M (int): number of users
        N (int): number of items
        K (int): size of inner (latent) dimension
        lmbda (float): L2-regularization parameter
    Returns:
        keras.models.Model: model for MF
    """
    w = Input((1, )) 
    u = Input((1, ))
    w_vec = Embedding(M, K, embeddings_regularizer=l2(lmbda))(w)  # user embeddings
    u_vec = Embedding(N, K, embeddings_regularizer=l2(lmbda))(u)  # item embeddings
    
    w_bias = Embedding(M, 1, embeddings_regularizer=l2(lmbda))(w)  # user bias
    u_bias = Embedding(N, 1, embeddings_regularizer=l2(lmbda))(u)  # item bias
    
    r_pred = Dot(axes=2)([w_vec, u_vec]) # predicted rating w/0 biases
    r_pred = Add()([r_pred, w_bias, u_bias])
    r_pred = Flatten()(r_pred)
    
    model = Model(inputs=[w, u], outputs=[r_pred])
    return model

### Example of keras-model architecture

In [None]:
model = build_model(M, N, K=10)

In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 10)        91660       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 10)        19980       input_2[0][0]                    
______________________________________________________________________________________________

### Train the model

In [10]:
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [28]:
LOSS = 'mse'
LR = 0.1
RATE_DECAY = 1e-8
EPOCHS = 100
BATCH_SIZE = 128

In [29]:
def search_k(min_k, max_k, step_k=1, l2_reg=0):
    test_scores = []
    
    for k in range(min_k, max_k + 1, step_k):
        print(f"Number of latent factors: {k}", end='  ')
        model = build_model(M, N, k, lmbda=l2_reg)
        model.compile(
            loss=LOSS,
            optimizer=SGD(lr=LR, decay=RATE_DECAY),
            metrics=[rmse]
        )
        
        history = model.fit(
            x=[*X_train],
            y=y_train-mean,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            validation_data=([*X_test], y_test - mean),
            verbose=0
        )
        
        loss, rmse_score = model.evaluate([*X_test], y_test - mean, verbose=0)
        print(f"RMSE: {rmse_score}")
        test_scores.append((k, rmse_score)) 
    return test_scores

In [19]:
search_k(1, 20, step_k=2)

Number of latent factors: 1  RMSE: 0.8520964980125427
Number of latent factors: 3  RMSE: 0.8520238399505615
Number of latent factors: 5  RMSE: 0.8518393635749817
Number of latent factors: 7  RMSE: 0.8522348999977112
Number of latent factors: 9  RMSE: 0.852024257183075
Number of latent factors: 11  RMSE: 0.8518834114074707
Number of latent factors: 13  RMSE: 0.8517428040504456
Number of latent factors: 15  RMSE: 0.8519430160522461
Number of latent factors: 17  RMSE: 0.85152667760849
Number of latent factors: 19  RMSE: 0.8518016338348389
