In [20]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Multiply, Concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

# from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.models import load_model

# Set the output path for saving the model
os.environ['CURRENT_PATH'] = os.getcwd()
output_path = os.path.join(os.environ['CURRENT_PATH'], 'output')

# visualization imports
import matplotlib.pyplot as plt
%matplotlib inline

# Load the datasets
posts_df = pd.read_csv('datasets/post_data.csv')
views_df = pd.read_csv('datasets/view_data.csv')
users_df = pd.read_csv('datasets/user_data.csv')

# 2. DATA PRE-PROCESSING
## 2.1. Generating Score Data

In [21]:
views_df_with_scores = pd.DataFrame(views_df)

# Generate random probabilities for 1, 2, and 3
probs = np.random.dirichlet(np.ones(3))

# Assign probabilities for the scores
scores = np.random.choice(
    [1, 2, 3],
    size=len(views_df),
    p=probs
)

# Add the score column to the dataframe
views_df_with_scores['score'] = scores

views_df_with_scores.head()

Unnamed: 0,user_id,post_id,time_stamp,score
0,5eece14ffc13ae660900008b,136781766,01/01/2019 01:30 PM,3
1,5eece14efc13ae660900003c,43094523,01/01/2019 01:33 PM,2
2,5eece14efc13ae6609000025,42428071,01/01/2019 01:43 PM,3
3,5eece14ffc13ae66090001d4,76472880,01/01/2019 01:54 PM,3
4,5eece14ffc13ae66090000ac,202721843,01/01/2019 02:00 PM,3


## 2.2. Data Cleaning and Transformation for Model Training

In [22]:
# Drop the columns that are not needed and remove the rows with missing title values
cleaned_data = views_df_with_scores.drop(['time_stamp'], axis=1)

# Drop the duplicates
cleaned_data = cleaned_data.drop_duplicates()

# Map user_id and post_id to numeric indices
cleaned_data["user_id"] = cleaned_data["user_id"].astype("category").cat.codes + 1
cleaned_data["post_id"] = cleaned_data["post_id"].astype("category").cat.codes + 1

cleaned_data.head()

Unnamed: 0,user_id,post_id,score
0,140,813,3
1,61,202,2
2,38,195,3
3,469,418,3
4,173,1217,3


In [23]:
unique_users_num = len(cleaned_data.user_id.unique())
unique_posts_num = len(cleaned_data.post_id.unique())

print('There are {} unique users and {} unique posts in this data set'.format(unique_users_num, unique_posts_num))

There are 501 unique users and 6001 unique posts in this data set


In [24]:
users_max_id = cleaned_data.user_id.max()
posts_max_id = cleaned_data.post_id.max()

print('There are {} distinct users, and the max of user_id is also {}'.format(unique_users_num, users_max_id))
print('There are {} distinct posts, and the max of post_id is also {}'.format(unique_posts_num, posts_max_id))

There are 501 distinct users, and the max of user_id is also 501
There are 6001 distinct posts, and the max of post_id is also 6001


## 2.3. Splitting Data into Train and Test Sets

In [25]:
df_train, df_test = train_test_split(cleaned_data, test_size=0.2, shuffle=True, random_state=99)
print('shape of training data set:')
print(df_train.shape)
print('shape of test data set:')
print(df_test.shape)

shape of training data set:
(56946, 3)
shape of test data set:
(14237, 3)


# 3. Model Training, Evaluation, and Weights Loading Functions

In [26]:
# Define the model training function
def train_model(model, optimizer, batch_size, num_epochs, validation_split, input_data, target_data, output_model_name):
    # Define a custom metric for Root Mean Squared Error (RMSE)
    def compute_rmse(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_true - y_pred)))

    # Compile the model with the specified optimizer and loss function
    model.compile(
        optimizer=optimizer.lower(),
        loss='mean_squared_error',
        metrics=['mean_squared_error', compute_rmse]
    )

    # Define callbacks for early stopping and model checkpointing
    early_stopping = EarlyStopping(monitor='val_compute_rmse', patience=10, verbose=1)
    model_checkpoint = ModelCheckpoint(
        filepath=os.path.join(output_path, output_model_name),
        monitor='val_compute_rmse',
        save_best_only=True,
        save_weights_only=True
    )

    # Train the model and return the history
    history = model.fit(
        x=input_data,
        y=target_data,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_split=validation_split,
        callbacks=[early_stopping, model_checkpoint]
    )

    return history

# Define the model's weights loading function
def load_model_weights(model, weights_file_path):
    model.load_weights(weights_file_path)
    return model

# Define the model evaluation function
calculate_rmse = lambda true_values, predicted_values: np.sqrt(
    np.mean(np.square(np.squeeze(predicted_values) - np.squeeze(true_values)))
)

# 4. Training and Testing the Generalized Matrix Factorization Model
## 4.1. Defining the GMF Model Architecture

In [27]:
def build_gmf_model(num_users, num_posts, latent_dim, user_reg, post_reg):
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    post_input = Input(shape=(1,), dtype='int32', name='post_input')

    # Embedding layers
    user_embedding = Embedding(
        input_dim=num_users + 1,
        output_dim=latent_dim,
        embeddings_initializer='uniform',
        name='user_embedding',
        embeddings_regularizer=l2(user_reg),
        input_length=1
    )
    post_embedding = Embedding(
        input_dim=num_posts + 1,
        output_dim=latent_dim,
        embeddings_initializer='uniform',
        name='post_embedding',
        embeddings_regularizer=l2(post_reg),
        input_length=1
    )

    # Flatten embedding vectors
    user_latent = Flatten()(user_embedding(user_input))
    post_latent = Flatten()(post_embedding(post_input))

    # Combine user and post embeddings
    interaction_vector = Multiply()([user_latent, post_latent])

    # Output layer
    prediction = Dense(1, kernel_initializer='glorot_uniform', name='prediction')(interaction_vector)

    # Create and return the model
    model = Model([user_input, post_input], prediction)
    return model

In [28]:
latent_dim = 8
user_reg = 0.01
post_reg = 0.01

GMF_model = build_gmf_model(users_max_id, posts_max_id, latent_dim, user_reg, post_reg)
GMF_model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 post_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 user_embedding (Embedding)  (None, 1, 8)                 4016      ['user_input[0][0]']          
                                                                                                  
 post_embedding (Embedding)  (None, 1, 8)                 48016     ['post_input[0][0]']          
                                                                                            

## 4.2. Training the GMF Model

In [29]:
# model config
batch_size = 64
num_epochs = 100
validation_split = 0.25

# train model
history = train_model(GMF_model, 'adam', batch_size, num_epochs, validation_split, 
                      input_data=[df_train.user_id.values, df_train.post_id.values],
                      target_data=df_train.score.values,
                        output_model_name='best_gmf_model.hdf5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 27: early stopping


## 4.3. Loading the Trained GMF Model and Evaluating Performance

In [30]:
# Load the pre-trained GMF model with the best weights
gmf_model = build_gmf_model(users_max_id, posts_max_id, latent_dim, user_reg, post_reg)
gmf_model = load_model_weights(gmf_model, os.path.join(output_path, 'best_gmf_model.hdf5'))

# Generate predictions using the test data
predicted_scores = gmf_model.predict([df_test.user_id.values, df_test.post_id.values])

# Calculate the RMSE for the predictions
rmse_error = calculate_rmse(df_test.score.values, predicted_scores)

# Print the RMSE result
print('The out-of-sample RMSE of rating predictions is', round(rmse_error, 4))

The out-of-sample RMSE of rating predictions is 0.7025


# 5. Training and Testing the Multi-Layer Perceptron Model
## 5.1. Defining the MLP Model Architecture

In [31]:
def build_mlp_model(num_users, num_posts, layers, reg_layers):
    # Ensure the number of layers matches the number of regularization parameters
    assert len(layers) == len(reg_layers)
    num_layer = len(layers)  # Number of layers in the MLP

    # Define input layers for user and post IDs
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    post_input = Input(shape=(1,), dtype='int32', name='post_input')

    # Define embedding layers for users and posts
    user_embedding = Embedding(
        input_dim=num_users + 1,           
        output_dim=layers[0] // 2,        
        embeddings_initializer='uniform', 
        name='user_embedding',           
        embeddings_regularizer=l2(reg_layers[0]),  
        input_length=1)                   
    
    post_embedding = Embedding(
        input_dim=num_posts + 1,         
        output_dim=layers[0] // 2,       
        embeddings_initializer='uniform', 
        name='post_embedding',           
        embeddings_regularizer=l2(reg_layers[0]), 
        input_length=1)                   

    # Flatten the embeddings to prepare for concatenation
    user_latent = Flatten()(user_embedding(user_input))  
    item_latent = Flatten()(post_embedding(post_input))  

    # Concatenate the user and item embedding vectors
    vector = Concatenate(axis=-1)([user_latent, item_latent])

    # Add fully connected (dense) layers
    for idx in range(1, num_layer): 
        layer = Dense(
            units=layers[idx],                   
            activation='relu',                   
            kernel_initializer='glorot_uniform',
            kernel_regularizer=l2(reg_layers[idx]), 
            name=f'layer{idx}')                 
        vector = layer(vector) 
    
    # Add the final prediction layer with a single output
    prediction = Dense(1, kernel_initializer='glorot_uniform', name='prediction')(vector)

    # Create the model with user and post inputs and prediction as the output
    model = Model([user_input, post_input], prediction)

    return model

In [32]:
layers = [64, 32, 16, 8]
reg_layers = [0.01, 0.01, 0.01, 0.01]

MLP_model = build_mlp_model(users_max_id, posts_max_id, layers, reg_layers)
MLP_model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 post_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 user_embedding (Embedding)  (None, 1, 32)                16064     ['user_input[0][0]']          
                                                                                                  
 post_embedding (Embedding)  (None, 1, 32)                192064    ['post_input[0][0]']          
                                                                                            

## 5.2. Training the MLP Model

In [33]:
# model config
batch_size = 64
num_epochs = 100
validation_split = 0.25

# train model
history = train_model(MLP_model, 'adam', batch_size, num_epochs, validation_split, 
                      input_data=[df_train.user_id.values, df_train.post_id.values],
                      target_data=df_train.score.values,
                        output_model_name='best_mlp_model.hdf5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 15: early stopping


## 5.3. Loading the Trained MLP Model and Evaluating Performance

In [34]:
# Load the pre-trained MLP model with the best weights
mlp_model = build_mlp_model(users_max_id, posts_max_id, layers, reg_layers)
mlp_model = load_model_weights(mlp_model, os.path.join(output_path, 'best_mlp_model.hdf5'))

# Generate predictions using the test data
predicted_scores = mlp_model.predict([df_test.user_id.values, df_test.post_id.values])

# Calculate the RMSE for the predictions
rmse_error = calculate_rmse(df_test.score.values, predicted_scores)

# Print the RMSE result
print('The out-of-sample RMSE of rating predictions is', round(rmse_error, 4))

The out-of-sample RMSE of rating predictions is 0.7027


# 6. Training and Testing the Neural Matrix Factorization Model
## 6.1. Defining the NeuMF Model Architecture

In [35]:
def build_neumf_model(num_users, num_posts, MF_dim, MF_reg, MLP_layers, MLP_regs):
    # Ensure the number of layers matches the number of regularization parameters
    assert len(MLP_layers) == len(MLP_regs)
    num_MLP_layer = len(MLP_layers) # Number of layers in the MLP

    # Define input layers for user and post IDs
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    post_input = Input(shape=(1,), dtype='int32', name='post_input')

    # Embedding layers for MF
    mf_user_embedding = Embedding(
        input_dim=num_users + 1,
        output_dim=MF_dim,
        embeddings_initializer='uniform',
        name='mf_user_embedding',
        embeddings_regularizer=l2(MF_reg[0]),
        input_length=1)
    mf_post_embedding = Embedding(
        input_dim=num_posts + 1,
        output_dim=MF_dim,
        embeddings_initializer='uniform',
        name='mf_post_embedding',
        embeddings_regularizer=l2(MF_reg[1]),
        input_length=1)
    
    # Embedding layers for MLP
    mlp_user_embedding = Embedding(
        input_dim=num_users + 1,
        output_dim=MLP_layers[0] // 2,
        embeddings_initializer='uniform',
        name='mlp_user_embedding',
        embeddings_regularizer=l2(MLP_regs[0]),
        input_length=1)
    mlp_post_embedding = Embedding(
        input_dim=num_posts + 1,
        output_dim=MLP_layers[0] // 2,
        embeddings_initializer='uniform',
        name='mlp_post_embedding',
        embeddings_regularizer=l2(MLP_regs[0]),
        input_length=1) 
    
    # Flatten the embeddings to prepare for concatenation
    mf_user_latent = Flatten()(mf_user_embedding(user_input))
    mf_post_latent = Flatten()(mf_post_embedding(post_input))
    mf_vector = Multiply()([mf_user_latent, mf_post_latent])

    # Flatten the embeddings to prepare for concatenation
    mlp_user_latent = Flatten()(mlp_user_embedding(user_input))
    mlp_post_latent = Flatten()(mlp_post_embedding(post_input))
    mlp_vector = Concatenate(axis=-1)([mlp_user_latent, mlp_post_latent])
    
    # Concatenate the two latent vectors
    predict_vector = Concatenate(axis=-1)([mf_vector, mlp_vector])
    
    # Add fully connected (dense) layers
    for idx in range(1, num_MLP_layer):
        layer = Dense(
            units=MLP_layers[idx],
            activation='relu',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=l2(MLP_regs[idx]),
            name = 'layer%d' %idx)
        mlp_vector = layer(mlp_vector)
    

    # Concatenate the two latent vectors
    prediction = Dense(1, kernel_initializer='glorot_uniform', name='prediction')(predict_vector)
    
    # Create the model with user and post inputs and prediction as the output
    model = Model([user_input, post_input], prediction)
    
    return model

In [36]:
MF_dim = 8
MF_reg = (0.01, 0.01)
MLP_layers = [64, 32, 16, 8]
MLP_regs = [0.01, 0.01, 0.01, 0.01]

NeuMF_model = build_neumf_model(
    num_users=users_max_id,
    num_posts=posts_max_id,
    MF_dim=MF_dim,
    MF_reg=MF_reg,
    MLP_layers=MLP_layers,
    MLP_regs=MLP_regs
)
NeuMF_model.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 post_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 mf_user_embedding (Embeddi  (None, 1, 8)                 4016      ['user_input[0][0]']          
 ng)                                                                                              
                                                                                                  
 mf_post_embedding (Embeddi  (None, 1, 8)                 48016     ['post_input[0][0]']   

## 6.2. Training the NeuMF Model

In [37]:
# model config
batch_size = 64
num_epochs = 100
validation_split = 0.25

# train model
history = train_model(NeuMF_model, 'adam', batch_size, num_epochs, validation_split, 
                        input_data=[df_train.user_id.values, df_train.post_id.values],
                        target_data=df_train.score.values,
                        output_model_name='best_neumf_model.hdf5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 33: early stopping


## 6.3. Loading the Trained NeuMF Model and Evaluating Performance

In [38]:
# Load the pre-trained NeuMF model with the best weights
neumf_model = build_neumf_model(
    num_users=users_max_id,
    num_posts=posts_max_id,
    MF_dim=MF_dim,
    MF_reg=MF_reg,
    MLP_layers=MLP_layers,
    MLP_regs=MLP_regs
)
neumf_model = load_model_weights(neumf_model, os.path.join(output_path, 'best_neumf_model.hdf5'))

# Generate predictions using the test data
predicted_scores = neumf_model.predict([df_test.user_id.values, df_test.post_id.values])

# Calculate the RMSE for the predictions
rmse_error = calculate_rmse(df_test.score.values, predicted_scores)

# Print the RMSE result
print('The out-of-sample RMSE of rating predictions is', round(rmse_error, 4))

The out-of-sample RMSE of rating predictions is 0.7025
