# Solution: MF of an explicit feedback (ratings) matrix

The goal of this exercise is to compare a simple Matrix Factorization (MF) model with a Neural Network Matrix Factorization model.


**Given:**
- data loader and matrix initialization code 
- default params settings, training and evaluation 


**Original sources:** 


http://hameddaily.blogspot.fr/2016/12/simple-matrix-factorization-with.html

https://nipunbatra.github.io/blog/2017/recommend-keras.html

In [None]:
!pip install -q tensorflow==2.0.0-beta0
!pip install -q matplotlib
!pip install -q pandas
!pip install -q numpy

In [None]:
import tensorflow as tf
import numpy
import pandas as pd
import matplotlib
%matplotlib inline

# Report the TensorFlow version in use and whether eager execution is enabled.
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))


### Load the data

In [None]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Download the MovieLens 100k archive and read it from memory
# (nothing is written to disk).
resp = urlopen("http://files.grouplens.org/datasets/movielens/ml-100k.zip")
zipfile = ZipFile(BytesIO(resp.read()))
file = 'ml-100k/u.data'
# u.data is tab-separated and has no header line: the column names come from
# `names`, and no row may be skipped or the first rating would be lost.
df = pd.read_csv(zipfile.open(file), low_memory=False, sep='\t', names=['user', 'item', 'rate', 'time'])
df.head()

In [None]:
numpy.random.seed(42)
# split data into train and test set: every row goes to the train set with
# probability 0.7, otherwise to the test set
msk = numpy.random.rand(len(df)) < 0.7
df_train = df[msk]
df_test = df[~msk]

# Shift the ids down by one so they can be used as 0-based embedding rows.
# Kept as numpy arrays (not Python lists) so they can be fed to Keras directly.
# assumes user/item ids start at 1 — TODO confirm
user_index = df_train.user.values - 1
item_index = df_train.item.values - 1
user_index_test = df_test.user.values - 1
item_index_test = df_test.item.values - 1

rates = df_train.rate.values
rates_test = df_test.rate.values

num_ratings = len(rates)
num_ratings_test = len(rates_test)
mean_rating = numpy.mean(rates)
mean_rating_test = numpy.mean(rates_test)

# Centre the ratings so the models learn the deviation from the mean.
# NOTE(review): the test ratings are centred with the *test* mean while the
# models are trained against the train mean; later cells add mean_rating_test
# back, so this is left unchanged here — confirm whether mean_rating should be
# used for both.
rates = rates - mean_rating
rates_test = rates_test - mean_rating_test


print ("Mean (train) rating = " + str(mean_rating))
# the second number is the size of the held-out test set
print ("Number of ratings (train/test/total) = " + str(num_ratings) + "/" + str(num_ratings_test) + "/" + str(num_ratings + num_ratings_test))

# Matrix Factorization

### MF model: define the user and item embeddings
Define/initialize the User and Item matrices and use their product to compute ratings R


In [None]:
# variables
# length of the latent vector learnt for every user and every item
feature_len = 10

# Number of distinct ids in the full data set (train + test); used below as
# the row count of the embedding tables.
# assumes ids are contiguous and start at 1, so id - 1 is a valid row — TODO confirm
num_users = len(numpy.unique(df.user.values)) 
num_items = len(numpy.unique(df.item.values)) 

print("Number of users is {}".format(num_users))
print("Number of movies is {}".format(num_items))


### Model

In [None]:
# product embedding: one item index in, one feature_len-sized vector out
item_input = tf.keras.layers.Input(shape=[1],name='Item')
item_embedding = tf.keras.layers.Embedding(num_items, feature_len, name='Item-Embedding')(item_input)
# drop the length-1 sequence axis added by the embedding lookup
item_vec = tf.keras.layers.Flatten(name='FlattenItems')(item_embedding)

# user embedding: same construction for the user index
user_input = tf.keras.layers.Input(shape=[1],name='User')
user_embedding = tf.keras.layers.Embedding(num_users, feature_len, name='User-Embedding')(user_input)
user_vec = tf.keras.layers.Flatten(name='FlattenUsers')(user_embedding)

# rating: the predicted value is the dot product of the item and user vectors
#user_vec_transp = tf.transpose(user_vec)
result = tf.keras.layers.dot([item_vec, user_vec], axes=1, name='DotProduct')

# initialize Keras model; note the input order is [user, item]
model = tf.keras.Model([user_input, item_input], result)


### Loss and optimizer

In [None]:
# choose the loss: mean squared error between predicted and given ratings
loss = tf.keras.losses.MeanSquaredError()

# learning rate: starts at lr and is multiplied by 0.96 every 100000
# optimizer steps (staircase=True makes the decay happen in discrete jumps)
lr = 0.001
learning_rate = tf.optimizers.schedules.ExponentialDecay(lr, decay_steps=100000,
    decay_rate=0.96, staircase=True)


optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
# alternative: plain SGD with a constant learning rate
#optimizer = tf.optimizers.SGD(learning_rate=lr)


### Metrics

In [None]:
# Accuracy metric: a prediction counts as correct when it lies strictly
# closer than `threshold` to the desired rating.
threshold = 1.0

def accuracy(desired_rates, predicted_rates):
    """Return the fraction of predictions within `threshold` of the target.

    Args:
        desired_rates: tensor of target ratings.
        predicted_rates: tensor of model outputs.

    Returns:
        A scalar float32 tensor: the mean of the per-element hit indicator.
    """
    abs_error = tf.abs(tf.subtract(predicted_rates, desired_rates, name='trainig_diff'))
    # 1.0 where the absolute error is below the module-level threshold, else 0.0
    within_threshold = tf.cast(tf.less(abs_error, threshold), tf.float32)

    return tf.reduce_mean(within_threshold)


### Compile the model

In [None]:
# compile the model with the optimizer, the loss and the tracked metric
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

# print the layers and parameter counts
model.summary()

### Train the model

In [None]:
# Train on the (user, item) -> centred rating pairs; validation_split=0.1
# makes Keras hold out 10% of the training samples for validation.
numberEpochs = 10
history = model.fit([user_index, item_index], rates, epochs=numberEpochs, verbose=1, validation_split=0.1)

### Visualize the training and validation loss 

In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
    """Plot training and validation loss per epoch on a log-scaled y axis.

    Args:
        history: object whose ``history`` dict holds the 'loss' and
            'val_loss' sequences (as returned by ``model.fit``).
    """
    curves = (('loss', 'Training loss'), ('val_loss', 'Validation loss'))
    for key, label in curves:
        pd.Series(history.history[key]).plot(logy=True, label=label)

    plt.legend()
    plt.xlabel("Epoch")
    plt.ylabel("Train Error")
    plt.show()
    
# loss curves of the MF model trained above
plot_history(history)

### Evaluate on test dataset and compute RMSE

In [None]:
# example: print a few stored ratings next to the model's prediction
nr_sampled_users = 10

# The trained weights do not change inside the loop, so the embeddings, the
# full prediction matrix and the ratings table are computed once.
learnt_product_embedding = model.get_layer(name='Item-Embedding').get_weights()[0]
learnt_user_embedding = model.get_layer(name='User-Embedding').get_weights()[0]
# rows are users, columns are items (user embeddings times item embeddings transposed)
predicted_rates = numpy.dot(learnt_user_embedding, learnt_product_embedding.T)
ratings_table = df[['user', 'item', 'rate']].values

# NOTE(review): the sampled values are used as ROW positions of df and are
# drawn from range(num_users), i.e. only from the first num_users rows —
# confirm whether sampling from all rows was intended.
for index in numpy.random.choice(range(num_users), size=nr_sampled_users):
    u, p, r = ratings_table[index]

    # ids are shifted by one to address the embedding rows; the train mean is
    # added back because the model was trained on centred ratings
    rhat = predicted_rates[u - 1, p - 1] + mean_rating
    print ("rating for user " + str(u) + " for item " + str(p) + " is " + str(r) + " and our prediction is: " + str(rhat))


In [None]:
from sklearn.metrics import mean_absolute_error

def compute_RMSE(model, user_index_test, item_index_test, true_rates=None):
    """Score `model` on the given (user, item) pairs.

    NOTE: despite its name, this function returns the mean absolute error,
    not the root mean squared error. The name is kept because other cells
    call it.

    Args:
        model: object with a Keras-style ``predict`` taking ``[users, items]``.
        user_index_test: user indices to predict for.
        item_index_test: item indices to predict for, aligned with the users.
        true_rates: ratings to compare against, aligned with the indices.
            Defaults to the module-level ``rates_test`` (the previous,
            implicit behaviour).

    Returns:
        The mean absolute error between ``true_rates`` and the predictions.
    """
    if true_rates is None:
        # backward-compatible default: the centred test ratings of the notebook
        true_rates = rates_test

    predicted_rates_test = model.predict([user_index_test, item_index_test])

    return mean_absolute_error(true_rates, predicted_rates_test)

# evaluate the MF model on the held-out test pairs (the value is a mean absolute error)
err_test = compute_RMSE(model, user_index_test, item_index_test)
print("Mean absolute error on the test set: {}".format(err_test))

### Generate recommendations from the trained model for a list of users

In [None]:
def create_recommendations(model, df, listOfUsers, nrRecommendations=20):
    """Rank the items of `df` for every user and keep the best ones.

    Args:
        model: object with a Keras-style ``predict`` taking ``[users, items]``
            (0-based indices) and returning one score per pair.
        df: DataFrame with an ``item`` column of item ids; only these items
            are ranked.
        listOfUsers: iterable of 0-based user indices.
        nrRecommendations: maximum number of items returned per user.

    Returns:
        dict mapping each user index to an array of item ids (the same ids as
        in ``df.item``), ordered from highest to lowest predicted score.
    """
    # Keep the ids as stored in df and derive the 0-based model indices from
    # them. The ids, not the indices, are returned so the result can be
    # compared directly with item ids taken from the data.
    item_ids = numpy.unique(df.item.values)
    item_index = item_ids - 1

    recommendations_set = {}

    for user in listOfUsers:
        # score every candidate item for this one user
        user_index = numpy.ones(len(item_index)) * user
        predicted_rates = model.predict([user_index, item_index])

        # ravel (instead of squeeze) keeps a 1-d result even for a single item
        ranked_items_idx = numpy.argsort(numpy.ravel(predicted_rates))[::-1]

        recommendations_set[user] = item_ids[ranked_items_idx][:nrRecommendations]

    return recommendations_set

### Define and compute Precision@K score

We first create a validation set for every user which consists of all the products that the user rated higher than 3.5 (the value of the mean rate).

We then compute precision@K for our recommendations.

### Create validation set for every user

In [None]:
def create_validation_set(df, minRate=3.5):
    """Collect, per user, the items that the user rated above `minRate`.

    Args:
        df: DataFrame with ``user``, ``item`` and ``rate`` columns.
        minRate: only ratings strictly greater than this value count.

    Returns:
        dict mapping the 0-based user index (user id - 1) to an array of the
        item ids that user rated above `minRate`. Users without such a rating
        are left out.
    """
    validation_set = {}

    # Group by the user id stored in df, so the ratings are looked up with the
    # same id they were stored under. Only the dictionary key is shifted to
    # the 0-based index.
    for user_id, user_ratings in df.groupby('user'):
        best_ranked_items = user_ratings.loc[user_ratings['rate'] > minRate, 'item'].values
        if len(best_ranked_items) > 0:
            validation_set[user_id - 1] = best_ranked_items

    return validation_set


### Compute precision@k using the recommendations and the validation set

In [None]:
def precisionAtK(validations_set, recommendations_set, k=3):
    """Return the mean precision@k over all users of `validations_set`.

    For one user, precision@k is the number of items among the first `k`
    recommendations that also appear in the user's validation items, divided
    by `k`.

    Args:
        validations_set: dict mapping user -> sequence of relevant item ids.
        recommendations_set: dict mapping user -> sequence of recommended
            item ids, best first. Must contain every user of
            `validations_set`.
        k: number of top recommendations taken into account.

    Returns:
        The mean of the per-user precision values, or 0.0 when
        `validations_set` is empty.

    Raises:
        ValueError: if `k` is not positive.
        KeyError: if a user has no entry in `recommendations_set`.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    precision = []
    for user, relevant_items in validations_set.items():
        # plain Python values, so lists and numpy arrays are handled alike
        relevant = set(numpy.asarray(relevant_items).tolist())
        top_k = numpy.asarray(recommendations_set[user]).tolist()[:k]

        hits = sum(1 for item in top_k if item in relevant)
        # always divide by k, even when fewer than k items were recommended
        precision.append(hits / k)

    if not precision:
        return 0.0

    return numpy.mean(precision)

In [None]:
def compute_precisionAtK_from_recommendations(model, df_test, validation_set=None, nrRecommendations=20, k=10):
    """Generate recommendations with `model` and score them with precision@k.

    Args:
        model: model handed on to ``create_recommendations``.
        df_test: ratings frame used for the candidate items and, when no
            validation set is given, to build one (ratings above 3.5).
        validation_set: optional precomputed dict user -> relevant items.
        nrRecommendations: number of items recommended per user.
        k: cut-off passed to ``precisionAtK``.

    Returns:
        The value returned by ``precisionAtK``.
    """
    relevant_items = create_validation_set(df_test, minRate=3.5) if validation_set is None else validation_set

    # recommendations are only produced for users that have validation items
    recommended_items = create_recommendations(
        model, df_test, relevant_items.keys(), nrRecommendations=nrRecommendations
    )

    return precisionAtK(relevant_items, recommended_items, k=k)
    

In [None]:
# Build the per-user validation items once; the later evaluation cells reuse them.
validation_set = create_validation_set(df_test, minRate=3.5)
# NOTE(review): the commented-out call below does not match the current
# signature create_recommendations(model, df, listOfUsers, nrRecommendations).
# recommendations_set = create_recommendations(model, user_index_test, item_index_test, \
#                                               validation_set.keys(), nrRecommendations=20)

# precision@5 of the MF model, computed from its 20 best items per user
precision = compute_precisionAtK_from_recommendations(model, df_test, validation_set=validation_set, nrRecommendations=20, k=5)
print("MF: Precision@{} is {}".format(5, precision))



In [None]:
# ### how does the precision@k vary for k

# recommendations_set = create_recommendations(model, df_test, \
#                                               validation_set.keys(), nrRecommendations=100)
# prec = []
# kvect = range(1, 100, 10)
# for k in kvect:
#     print(k)
#     precision = precisionAtK(validation_set, recommendations_set, k=k)
#     print("MF: Precision@{} is {}".format(k, precision))
#     prec.append(precision)

# plt.plot(kvect, prec)
# plt.xlabel("k")
# plt.ylabel("Precision@k")
# plt.show()

## Exercise
Experiment with the parameter choice of the MF model and evaluate the setting in RMSE and Precision@5. Compare various settings in the ResultsTable.

In [None]:
class ResultsTable():
    """Small table collecting one row of scores per named experiment."""

    def __init__(self):
        # column order of the table; "RMSE" holds whatever error value is
        # passed to add()
        self.columns = ["name", "RMSE", "Precision@5"]
        self.df = pd.DataFrame(columns=self.columns)

    def add(self, name="experimentName", rmse=None, precision=None, overwrite=False):
        """Add the scores of one experiment.

        Args:
            name: experiment name; unique within the table.
            rmse: error value stored in the "RMSE" column.
            precision: value stored in the "Precision@5" column.
            overwrite: when False and `name` already exists, the table is left
                unchanged and a message is printed. When True, the existing
                row is replaced and the new row is placed at the end.
        """
        if (self.df["name"] == name).any() and not overwrite:
            print("Error: name already exists. Change name or set overwrite to True.")
            return

        row = pd.DataFrame(
            [{"name": name, "RMSE": rmse, "Precision@5": precision}],
            columns=self.columns,
        )
        # DataFrame.append was removed in pandas 2.0; rows are combined with
        # pd.concat instead, after dropping any row with the same name.
        kept = self.df[self.df["name"] != name]
        if kept.empty:
            self.df = row
        else:
            self.df = pd.concat([kept, row], ignore_index=True)

    def show(self):
        """Render the table (rich display in a notebook, plain text otherwise)."""
        try:
            display(self.df)
        except NameError:
            # `display` only exists inside IPython/Jupyter sessions
            print(self.df)

table = ResultsTable()

Compute the RMSE and Precision@5 and add it to the results table.

In [None]:
# score the plain MF model and store both values under the name "MF"
# NOTE(review): compute_RMSE returns a mean absolute error although it is
# labelled "RMSE" here and in the table.
err_test = compute_RMSE(model, user_index_test, item_index_test)

precision = compute_precisionAtK_from_recommendations(model, df_test, validation_set=validation_set, nrRecommendations=20, k=5)
print("MF: Precision@{} is {}".format(5, precision))
print("MF: RMSE is {}".format(err_test))

# overwrite=False: re-running this cell leaves the existing "MF" row untouched
table.add(name="MF", rmse=err_test, precision=precision, overwrite=False)
table.show()



### Regularization 


### Q1: choose the regularization
Regularizers: see https://keras.io/regularizers/


In [None]:
## example:
# name under which the regularized run is stored in the results table
experiment_name = "MF_" + "regUserItemL2"

In [None]:
# regularizers: candidate penalties for the embedding weights
no_regularizer = None
regularizer_l2 = tf.keras.regularizers.l2(0.0001)
regularizer_l1 = tf.keras.regularizers.l1(0.0001)
# Both factors are given explicitly: a single positional argument would only
# set l1 and leave l2 at the library default.
regularizer_l1l2 = tf.keras.regularizers.l1_l2(l1=0.0001, l2=0.0001)

# penalties actually applied to the user and item embeddings below
regularizer_user = regularizer_l2
regularizer_product = regularizer_l2

In [None]:
# product embedding
item_input = tf.keras.layers.Input(shape=[1],name='Item')

################ EMBEDDING AND REGULARIZER ##########################################################################
# NOTE(review): this table has num_items + 1 rows while the first MF model
# uses num_items; the extra row looks unused with 0-based indices — confirm.
item_embedding = tf.keras.layers.Embedding(num_items + 1, feature_len, name='Item-Embedding', \
                                          embeddings_regularizer=regularizer_product)(item_input)
#################################################################################################

item_vec = tf.keras.layers.Flatten(name='FlattenItems')(item_embedding)

# user embedding
user_input = tf.keras.layers.Input(shape=[1],name='User')
################ EMBEDDING AND REGULARIZER ##########################################################################
user_embedding = tf.keras.layers.Embedding(num_users + 1, feature_len,name='User-Embedding', \
                                          embeddings_regularizer=regularizer_user)(user_input)
#################################################################################################

user_vec = tf.keras.layers.Flatten(name='FlattenUsers')(user_embedding)

# rating: dot product of the item and user vectors
result = tf.keras.layers.dot([item_vec, user_vec], axes=1, name='DotProduct')

# initialize Keras model; this rebinds `model`, replacing the unregularized one
model = tf.keras.Model([user_input, item_input], result)


In [None]:
# choose the loss 
## MeanAbsoluteError, MeanSquaredError, MeanSquaredLogarithmicError; see https://keras.io/losses/
loss = tf.keras.losses.MeanSquaredError()

# learning rate: starts at lr and is multiplied by 0.96 every 100000 optimizer steps
lr = 0.001
learning_rate = tf.optimizers.schedules.ExponentialDecay(lr, decay_steps=100000,
    decay_rate=0.96, staircase=True)

# choose the optimizer
optimizer = tf.optimizers.Adam(learning_rate=learning_rate)
#optimizer = tf.optimizers.SGD(learning_rate=lr)


# compile the model with the optimizer, the loss and the tracked metric
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

#model.summary()

# train with the same settings as the unregularized model and plot the loss curves
numberEpochs = 10
history = model.fit([user_index, item_index], rates, epochs=numberEpochs, verbose=1, validation_split=0.1)
plot_history(history)


#### Evaluate the trained model

In [None]:
# score the regularized MF model on the test pairs
# (compute_RMSE returns a mean absolute error)
err_test = compute_RMSE(model, user_index_test, item_index_test)

precision = compute_precisionAtK_from_recommendations(model, df_test, validation_set=validation_set, nrRecommendations=20, k=5)
print("MF: Precision@{} is {}".format(5, precision))
print("MF: RMSE is {}".format(err_test))



#### Add results into the result table

In [None]:
# overwrite=True so the cell can be re-run after changing the settings
table.add(name=experiment_name, rmse=err_test, precision=precision, overwrite=True)
table.show()

### Q4: Benchmark several experiments with different hyperparameters 

In [None]:
# template: store further experiments under their own name, e.g.
#table.add(name="experiment_l1regularization", rmse=err_test, precision=precision)
table.show()

# Neural networks for recommendation
Create a simple neural network for recommendation, or for estimating rating! This model is very similar to the earlier matrix factorisation models, but differs in the following ways:

- Instead of taking a dot product of the user and the item embedding, we concatenate them and use them as features for our neural network. Thus, we are not constrained to the dot product way of combining the embeddings, and can learn complex non-linear relationships.
- We can now have a different dimension of user and item embeddings. This can be useful if one dimension is larger than the other.

In [None]:
# name under which the neural-network run is stored in the results table
experiment_name = "NNMF"

In [None]:
# user and item embeddings may have different sizes in this model
n_latent_factors_user = 5
n_latent_factors_item = 8

# item branch: embedding lookup -> flat vector -> dropout
item_input = tf.keras.layers.Input(shape=[1],name='Item')
item_embedding = tf.keras.layers.Embedding(num_items, n_latent_factors_item, name='Item-Embedding')(item_input)
item_vec = tf.keras.layers.Flatten(name='FlattenItems')(item_embedding)
item_vec = tf.keras.layers.Dropout(0.2)(item_vec)

# user branch: same construction
user_input = tf.keras.layers.Input(shape=[1],name='User')
user_embedding = tf.keras.layers.Embedding(num_users, n_latent_factors_user,name='User-Embedding')(user_input)
user_vec = tf.keras.layers.Flatten(name='FlattenUsers')(user_embedding)
user_vec = tf.keras.layers.Dropout(0.2)(user_vec)

# Fully connected stack on the concatenated embeddings. Every layer consumes
# the output of the layer directly before it, so the dropout layers and the
# first dense layer are part of the network. The dropout layers get distinct
# names, because layer names must be unique within one model.
concat = tf.keras.layers.concatenate([item_vec, user_vec], name='Concat')
concat_dropout = tf.keras.layers.Dropout(0.2)(concat)
dense = tf.keras.layers.Dense(200,name='FullyConnected', activation='relu')(concat_dropout)
dropout_1 = tf.keras.layers.Dropout(0.2,name='Dropout-1')(dense)
dense_2 = tf.keras.layers.Dense(100,name='FullyConnected-1', activation='relu')(dropout_1)
dropout_2 = tf.keras.layers.Dropout(0.2,name='Dropout-2')(dense_2)
dense_3 = tf.keras.layers.Dense(50,name='FullyConnected-2', activation='relu')(dropout_2)
dropout_3 = tf.keras.layers.Dropout(0.2,name='Dropout-3')(dense_3)
dense_4 = tf.keras.layers.Dense(20,name='FullyConnected-3', activation='relu')(dropout_3)

# Linear output: the training targets are mean-centred ratings and therefore
# also negative, which a ReLU output could never produce.
result = tf.keras.layers.Dense(1, activation='linear',name='Activation')(dense_4)

# `learning_rate` is the supported argument name (`lr` is a deprecated alias)
adam = optimizer = tf.optimizers.Adam(learning_rate=0.0001)
model_NN = tf.keras.Model([user_input, item_input], result)
model_NN.compile(optimizer=adam,loss= 'mean_absolute_error')
model_NN.summary()

In [None]:
# train the neural network; print_log = 0 silences the per-epoch output
numberEpochs = 20
print_log = 0
history_NN = model_NN.fit([user_index, item_index], rates, epochs=numberEpochs, verbose=print_log, validation_split=0.1)

In [None]:
# loss curves of the neural-network model
plot_history(history_NN)

In [None]:
# score the neural-network model on the test pairs
# (compute_RMSE returns a mean absolute error)
err_test = compute_RMSE(model_NN, user_index_test, item_index_test)

precision = compute_precisionAtK_from_recommendations(model_NN, df_test, validation_set=validation_set, nrRecommendations=20, k=5)
# label the output with the model that was evaluated here, not with "MF"
print("NNMF: Precision@{} is {}".format(5, precision))
print("NNMF: RMSE is {}".format(err_test))

# overwrite=True so the cell can be re-run after changing the network
table.add(name=experiment_name, rmse=err_test, precision=precision, overwrite=True)
table.show()


### Exercise NN:
Change the model parameters, for example:
- Change the number of layers of the NN. 
- Remove/add dropout




### Design a simple baseline:  predict average rate per item and recommend items with the highest rating

In [None]:
# Baseline: predict, for every test rating, the mean rating of its item.
# The item means come from the training split only, so no test rating is used
# to predict itself.
gbo = df_train[["item", "rate"]].groupby("item").mean().reset_index()
# how="left" keeps every test row, in the row order of df_test
df_gbo = pd.merge(df_test, gbo, on="item", how="left", suffixes=('', '_gbo'))
# items without any training rating fall back to the overall train mean
df_gbo["rate_gbo"] = df_gbo["rate_gbo"].fillna(mean_rating)
display(df_gbo[df_gbo.item == 1].head())
predicted_rates_gbo_test = df_gbo.rate_gbo.values

# Truth and prediction are read from the same merged rows, on the raw
# (uncentred) rating scale.
err_gbo_test = mean_absolute_error(df_gbo.rate.values, predicted_rates_gbo_test)
print("Mean absolute error on the test set: {}".format(err_gbo_test))

### Generate recommendations

In [None]:
def recommend_highest_rates(df, userList, nrRecommendations=10):
    """Recommend the same best-rated items to every user.

    Args:
        df: DataFrame with ``item`` and ``rate`` columns.
        userList: iterable of user keys to create recommendations for.
        nrRecommendations: number of items to recommend.

    Returns:
        dict mapping every user of `userList` to the array of item ids with
        the highest mean rating, best first. All users share the same array.
    """
    # Mean rating per item. The column is selected before aggregating, so
    # other (possibly non-numeric) columns of df are never averaged.
    mean_rates = df.groupby("item")["rate"].mean().reset_index()
    gbos = mean_rates.sort_values(by="rate", ascending=False)["item"].values[:nrRecommendations]

    return {user: gbos for user in userList}

In [None]:
# Rank the items on the training split only: the recommendations are scored
# against items taken from the test split, which must not influence the ranking.
highest_rates_baseline = recommend_highest_rates(df_train, validation_set.keys(), nrRecommendations=20)

precision_baseline = precisionAtK(validation_set, highest_rates_baseline, k=5)
print("Baseline: Precision@{} is {}".format(5, precision_baseline))
# stored without overwrite: re-running the cell keeps the first "gbo" row
table.add(name="gbo", rmse=err_gbo_test, precision=precision_baseline)

### Final scores

In [None]:
#display(table.df.sort_values(by="RMSE"))


In [None]:
#display(table.df.sort_values(by="Precision@5", ascending=False))