<h1>Collaborative Filtering in Tensorflow using a Latent Factor Model</h1>

<img src="img/user_item.svg">
<img src="img/latent_factors.svg">
<img src="img/predict.svg">

Prediction:
$$
\hat { r } _ { u i } = \mu + b _ { u } + b _ { i } + p _ { u } ^ { T } q _ { i }
$$

Loss Function:
$$
\min _ { p _ { * } , q _ { * } , b _ { * } } \sum _ { ( u , i ) \in \mathcal { K } } \left( r _ { u i } - \mu - b _ { u } - b _ { i } - p _ { u } ^ { T } q _ { i } \right) ^ { 2 } + \lambda \left( \| p _ { u } \| ^ { 2 } + \| q _ { i } \| ^ { 2 } + b _ { u } ^ { 2 } + b _ { i } ^ { 2 } \right)
$$

<br><br><br><br>
Further Reading:<br>
<a href="http://sifter.org/simon/journal/20061211.html">Basic Latent Factor Model, original source</a><br>
<a href="http://www.cs.rochester.edu/twiki/pub/Main/HarpSeminar/Factorization_Meets_the_Neighborhood-_a_Multifaceted_Collaborative_Filtering_Model.pdf">Advanced Latent Factor Models</a><br>
<a href="http://www.cs.ubbcluj.ro/~gabis/DocDiplome/SistemeDeRecomandare/Recommender_systems_handbook.pdf">The Recommender Systems Handbook</a><br>
<a href="https://arxiv.org/pdf/1708.05031.pdf">Neural Collaborative Filtering</a><br>





In [129]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.initializers import glorot_normal
from tensorflow.python.keras.regularizers import l2
from tensorflow.python.keras.layers import Input, Embedding, concatenate, Dense, Flatten, Dropout


from collections import Counter
from tensorflow.contrib.tensorboard.plugins import projector
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [114]:
def load_movie_lens_1m():
    """
    Loads the MovieLens 1M ratings from "datasets/ml-1m/ratings.dat".

    Returns:
        x: array of shape (samples, 2) holding (user index, item index),
           where users are indexed from 0 to num_users - 1 and items
           from 0 to num_items - 1 (in sorted order of the raw ids).
        y: array of shape (samples,) holding the ratings (third column).
        user_dict: dict mapping user index -> raw user id.
        item_dict: dict mapping item index -> raw item id.
    """
    movie_lens_1m = pd.read_csv("datasets/ml-1m/ratings.dat", sep='::', header=None, engine='python')

    raw_ids = movie_lens_1m.iloc[:, :2].values
    y = movie_lens_1m.iloc[:, 2].values

    # np.unique yields the sorted raw ids and, for every row, the position of
    # its id in that sorted list -- i.e. exactly the contiguous index we need.
    # This replaces rebuilding an inverse dict for every single row.
    unique_users, user_index = np.unique(raw_ids[:, 0], return_inverse=True)
    unique_items, item_index = np.unique(raw_ids[:, 1], return_inverse=True)

    user_dict = dict(enumerate(unique_users))
    item_dict = dict(enumerate(unique_items))

    # Build a new array instead of writing into the DataFrame's buffer.
    x = np.column_stack((user_index, item_index)).astype(raw_ids.dtype, copy=False)

    return x, y, user_dict, item_dict

In [118]:
def load_movie_lens_100k_norm():
    """
    Loads the MovieLens 100k ratings from "datasets/ml-100k/u.data" and
    min-max scales the ratings with sklearn's MinMaxScaler.

    Returns:
        x: array of shape (samples, 2) holding (user index, item index),
           where users are indexed from 0 to num_users - 1 and items
           from 0 to num_items - 1 (in sorted order of the raw ids).
        y: the scaled ratings; the scaler is fed a (samples, 1) column, so the
           result is expected to keep that 2-D shape.
        user_dict: dict mapping user index -> raw user id.
        item_dict: dict mapping item index -> raw item id.
    """
    movie_lens_100k = pd.read_csv("datasets/ml-100k/u.data", sep='\t', header=None, engine='python')

    raw_ids = movie_lens_100k.iloc[:, :2].values
    y = movie_lens_100k.iloc[:, 2].values

    min_max_scaler = preprocessing.MinMaxScaler()
    y = min_max_scaler.fit_transform(y.reshape(-1, 1))

    # np.unique yields the sorted raw ids and, for every row, the position of
    # its id in that sorted list -- i.e. exactly the contiguous index we need.
    # This replaces rebuilding an inverse dict for every single row.
    unique_users, user_index = np.unique(raw_ids[:, 0], return_inverse=True)
    unique_items, item_index = np.unique(raw_ids[:, 1], return_inverse=True)

    user_dict = dict(enumerate(unique_users))
    item_dict = dict(enumerate(unique_items))

    # Build a new array instead of writing into the DataFrame's buffer.
    x = np.column_stack((user_index, item_index)).astype(raw_ids.dtype, copy=False)

    return x, y, user_dict, item_dict

In [116]:
def load_movie_lens_100k():
    """
    Loads the MovieLens 100k ratings from "datasets/ml-100k/u.data".

    Returns:
        x: array of shape (samples, 2) holding (user index, item index),
           where users are indexed from 0 to num_users - 1 and items
           from 0 to num_items - 1 (in sorted order of the raw ids).
        y: array of shape (samples,) holding the ratings (third column).
        user_dict: dict mapping user index -> raw user id.
        item_dict: dict mapping item index -> raw item id.
    """
    movie_lens_100k = pd.read_csv("datasets/ml-100k/u.data", sep='\t', header=None, engine='python')

    raw_ids = movie_lens_100k.iloc[:, :2].values
    y = movie_lens_100k.iloc[:, 2].values

    # np.unique yields the sorted raw ids and, for every row, the position of
    # its id in that sorted list -- i.e. exactly the contiguous index we need.
    # This replaces rebuilding an inverse dict for every single row.
    unique_users, user_index = np.unique(raw_ids[:, 0], return_inverse=True)
    unique_items, item_index = np.unique(raw_ids[:, 1], return_inverse=True)

    user_dict = dict(enumerate(unique_users))
    item_dict = dict(enumerate(unique_items))

    # Build a new array instead of writing into the DataFrame's buffer.
    x = np.column_stack((user_index, item_index)).astype(raw_ids.dtype, copy=False)

    return x, y, user_dict, item_dict

In [119]:
# Load MovieLens 100k with min-max scaled ratings. X holds the re-indexed
# (user, item) pairs; user_dict / item_dict map each index back to its raw id.
X, Y, user_dict, item_dict = load_movie_lens_100k_norm()



In [104]:
# model dimensions, derived from the re-indexed interactions in X
parameters = {
    "num_users": np.unique(X[:, 0]).size,
    "num_items": np.unique(X[:, 1]).size,
    "num_factors": 100,
}

# L2 regularization strengths for the biases (b_u, b_i) and latent factors (p_u, q_i)
hyperparameters = {
    "reg_b_u": 0.0001,
    "reg_b_i": 0.0001,
    "reg_p_u": 0.005,
    "reg_q_i": 0.005,
}

In [6]:
def create_constants(mu):
    """
    Builds the constant part of the model.

    `mu` (the mean over all ratings) is wrapped in a scalar float32
    constant created inside the 'constants' variable scope and returned.
    """
    with tf.variable_scope('constants'):
        global_mean = tf.constant(mu, dtype=tf.float32, shape=[])

    return global_mean

In [7]:
def create_user_variables(users, parameters, hyperparameters):
    """
    Creates the latent user factors and the user bias.

    Both variables are created in the 'users' variable scope with a Xavier
    initializer and an L2 regularizer (strengths "reg_p_u" and "reg_b_u").
    Returns the look-up OPs (p_u, b_u) for the given user indices.
    """
    num_users = parameters["num_users"]
    num_factors = parameters["num_factors"]

    with tf.variable_scope('users'):
        # one row of latent factors per user
        factor_table = tf.get_variable(
            name='embedding',
            shape=[num_users, num_factors],
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(hyperparameters["reg_p_u"]))

        # one scalar bias per user
        bias_table = tf.get_variable(
            name='bias',
            shape=[num_users, ],
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(hyperparameters["reg_b_u"]))

        p_u = tf.nn.embedding_lookup(factor_table, users, name='p_u')
        b_u = tf.nn.embedding_lookup(bias_table, users, name='b_u')

    return p_u, b_u

In [86]:
def create_item_variables(items, parameters, hyperparameters):
    """
    Creates the latent item factors and the item bias.

    Both variables are created in the 'items' variable scope with a Xavier
    initializer and an L2 regularizer (strengths "reg_q_i" and "reg_b_i").
    Returns the look-up OPs (q_i, b_i) for the given item indices.
    """
    num_items = parameters["num_items"]
    num_factors = parameters["num_factors"]

    with tf.variable_scope('items'):
        # one row of latent factors per item
        factor_table = tf.get_variable(
            name='embedding',
            shape=[num_items, num_factors],
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(hyperparameters["reg_q_i"]))

        # one scalar bias per item
        bias_table = tf.get_variable(
            name='bias',
            shape=[num_items, ],
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(hyperparameters["reg_b_i"]))

        q_i = tf.nn.embedding_lookup(factor_table, items, name='q_i')
        b_i = tf.nn.embedding_lookup(bias_table, items, name='b_i')

    return q_i, b_i

In [87]:
def create_prediction(mu, b_u, b_i, p_u, q_i):
    """
    Returns the prediction, which is defined as:
    r_hat = mu + b_u + b_i + p_u * q_i
    where p_u * q_i is the element-wise product summed over axis 1.
    """
    with tf.variable_scope('prediction'):
        # row-wise dot product of user and item factors
        interaction = tf.reduce_sum(tf.multiply(p_u, q_i), axis=1)

        with_biases = tf.add_n([b_u, b_i, interaction])

        rating = tf.add(with_biases, mu, name='pred')

    return rating

In [88]:
def create_loss(pred, ratings):
    """
    Returns the L2 loss (tf.nn.l2_loss) of the residual `ratings - pred`.

    Both tensors are expected to share the same shape; tf.subtract
    broadcasts otherwise.
    """
    with tf.variable_scope('loss'):
        residual = tf.subtract(ratings, pred)
        l2 = tf.nn.l2_loss(residual, name='loss')

    return l2

In [89]:
def create_metrics(pred, ratings):
    """
    Creates the evaluation metrics MAE and RMSE.

    Returns, in this order: the MAE value tensor, the MAE update OP,
    the RMSE value tensor and the RMSE update OP.
    """
    with tf.variable_scope('metrics'):
        mae_pair = tf.metrics.mean_absolute_error(ratings, pred, name ="mae")

        rmse_pair = tf.metrics.root_mean_squared_error(
            tf.cast(ratings, tf.float32),
            tf.cast(pred, tf.float32),
            name ="rmse")

    mae, mae_update_op = mae_pair
    rmse, rmse_update_op = rmse_pair

    return mae, mae_update_op, rmse, rmse_update_op

In [90]:
def create_optimizer(loss):
    """
    Returns the training OP.

    An Adam optimizer with default settings minimizes the objective,
    which is the sum of `loss` and all losses registered in the
    REGULARIZATION_LOSSES collection.
    """
    with tf.variable_scope('optimizer'):
        regularizer_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization = tf.add_n(regularizer_losses)
        objective = tf.add(loss, regularization, name='objective')

        # alternative: tf.train.GradientDescentOptimizer(learning_rate = 0.01)
        train_op = tf.train.AdamOptimizer().minimize(objective, name='optimizer')

    return train_op

In [91]:
def build_graph(users, items, ratings, mu, parameters, hyperparameters):
    """
    Assembles the latent factor model from its building blocks.

    Returns the training OP, the loss tensor and the prediction tensor.
    """
    global_mean = create_constants(mu)

    user_factors, user_bias = create_user_variables(users, parameters, hyperparameters)
    item_factors, item_bias = create_item_variables(items, parameters, hyperparameters)

    pred = create_prediction(global_mean, user_bias, item_bias, user_factors, item_factors)
    loss = create_loss(pred, ratings)

    return create_optimizer(loss), loss, pred

In [92]:
def _split_interactions(x, y):
    """
    Validates one (x, y) data set and splits it into the three 1-D arrays
    fed to the graph: user indices and item indices (int32), ratings (float32).
    """
    y_is_vector = y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1)
    if x.ndim != 2 or x.shape[1] != 2 or not y_is_vector or x.shape[0] != y.shape[0]:
        raise ValueError('The shape of x should be (samples, 2) and '
                             'the shape of y should be (samples, 1).')
    if x.shape[0] == 0:
        raise ValueError('x and y must contain at least one sample.')

    # The ratings are flattened on purpose: the prediction is 1-D per batch,
    # so a (batch, 1) ratings tensor would broadcast `ratings - pred` to a
    # (batch, batch) matrix and the loss would be computed over all pairs.
    return (x[:, 0].astype(np.int32),
            x[:, 1].astype(np.int32),
            y.astype(np.float32).reshape(-1))


def _sum_loss_over_dataset(sess, loss, extra_ops=()):
    """
    Runs `loss` (together with `extra_ops`) until the currently initialized
    dataset is exhausted and returns the summed loss over all batches.
    """
    fetches = [loss] + list(extra_ops)
    total = 0.0
    while True:
        try:
            total += float(sess.run(fetches)[0])
        except tf.errors.OutOfRangeError:
            # after last batch
            return total


def train(x, y, parameters, hyperparameters, epochs=30, batch_size=64, validation_data=None):
    """
    Trains the latent factor model and saves it to "logdir/model.ckpt".

    Args:
        x: array of shape (samples, 2) with (user index, item index) rows.
        y: ratings, shape (samples,) or (samples, 1).
        parameters: dict with "num_users", "num_items" and "num_factors".
        hyperparameters: dict with the regularization strengths
            "reg_b_u", "reg_b_i", "reg_p_u" and "reg_q_i".
        epochs: number of passes over the training data.
        batch_size: number of samples per batch.
        validation_data: optional tuple (valid_x, valid_y), shaped like (x, y);
            when given, loss, MAE and RMSE on it are printed after every epoch.

    The printed losses are the summed tf.nn.l2_loss of an epoch divided by
    the number of samples.

    Raises:
        ValueError: if a data set is empty or its shapes do not match.
    """
    import os  # local import: only needed to create the checkpoint directory

    train_arrays = _split_interactions(x, y)
    num_train = train_arrays[0].shape[0]

    valid_arrays = None
    if validation_data is not None:
        valid_x, valid_y = validation_data
        valid_arrays = _split_interactions(valid_x, valid_y)
        num_valid = valid_arrays[0].shape[0]

    # Build everything in a fresh graph so that calling train() again (e.g.
    # re-running the cell) does not collide with variables of a previous call.
    with tf.Graph().as_default():
        # create datasets
        training_dataset = tf.data.Dataset.from_tensor_slices(train_arrays).batch(batch_size)
        if valid_arrays is not None:
            validation_dataset = tf.data.Dataset.from_tensor_slices(valid_arrays).batch(batch_size)

        # create dataset iterator (re-initializable, shared by both datasets)
        data_iter = tf.data.Iterator.from_structure(training_dataset.output_types,
                                                    training_dataset.output_shapes)

        users, items, ratings = data_iter.get_next()

        training_init_op = data_iter.make_initializer(training_dataset)
        if valid_arrays is not None:
            validation_init_op = data_iter.make_initializer(validation_dataset)

        # build model
        optimizer, loss, pred = build_graph(users, items, ratings, np.mean(y), parameters, hyperparameters)

        if valid_arrays is not None:
            mae, mae_update_op, rmse, rmse_update_op = create_metrics(pred, ratings)

        # created once, so the graph does not grow with every epoch
        global_init_op = tf.global_variables_initializer()
        local_init_op = tf.local_variables_initializer()

        with tf.Session() as sess:
            sess.run(global_init_op)

            print('Training...')
            for e in range(0, epochs):
                print('Epoch {}/{}'.format(e + 1, epochs))

                # training
                sess.run(training_init_op)
                train_loss = _sum_loss_over_dataset(sess, loss, [optimizer])
                print("Train loss: {:.4f}".format(train_loss / num_train))

                # validation
                if valid_arrays is not None:
                    # reset the streaming metrics before every evaluation pass
                    sess.run(local_init_op)
                    sess.run(validation_init_op)
                    valid_loss = _sum_loss_over_dataset(sess, loss, [mae_update_op, rmse_update_op])

                    mae_val, rmse_val = sess.run([mae, rmse])
                    print("Validation loss: {:.4f} | MAE: {:.4f} | RMSE: {:.4f}".format(
                        valid_loss / num_valid, mae_val, rmse_val))

            saver = tf.train.Saver()
            # make sure the checkpoint directory exists before saving
            if not os.path.isdir("logdir"):
                os.makedirs("logdir")
            saver.save(sess, "logdir/model.ckpt")

        


In [131]:
def get_neural_cf_model(num_users, num_items, num_factors):
    """
    Builds a neural collaborative filtering model with the Keras functional API.

    User and item indices are embedded into `num_factors` dimensions each
    (L2-regularized), flattened and concatenated, then passed through four
    ReLU layers (128, 128, 64, 64 units), each followed by Dropout(0.5),
    and a final single-unit sigmoid layer named 'prediction'.
    Returns the (uncompiled) Model taking [user_input, item_input].
    """
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    user_embeddings = Embedding(
        input_dim = num_users, output_dim = num_factors, name = 'user_embeddings',
        embeddings_initializer='glorot_normal', embeddings_regularizer=l2(0.01), input_length=1)
    item_embeddings = Embedding(
        input_dim = num_items, output_dim = num_factors, name = 'item_embeddings',
        embeddings_initializer='glorot_normal', embeddings_regularizer=l2(0.01), input_length=1)

    user_vector = Flatten()(user_embeddings(user_input))
    item_vector = Flatten()(item_embeddings(item_input))
    hidden = concatenate([user_vector, item_vector])

    # Dense -> Dropout tower, created in the same order as the layer sizes
    for units in (128, 128, 64, 64):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.5)(hidden)

    pred_layer = Dense(1, activation='sigmoid', name='prediction')(hidden)

    return Model(inputs=[user_input, item_input], outputs=pred_layer)

In [132]:
# Build the neural CF model with the same user/item/factor counts as the latent factor model.
model = get_neural_cf_model(parameters["num_users"], parameters["num_items"], parameters["num_factors"])

# The model ends in a sigmoid unit and is trained with binary cross-entropy;
# this assumes Y lies in [0, 1], i.e. comes from load_movie_lens_100k_norm -- TODO confirm.
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='binary_crossentropy',
              metrics=['mae'])

# Hold out 10% of the interactions for validation (fixed random_state for a repeatable split).
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.1, random_state=0)

# User and item indices are passed as two separate inputs, matching the model's two Input layers.
model.fit([X_train[:, 0], X_train[:, 1]], Y_train, epochs=10, batch_size=32,
          validation_data=([X_valid[:, 0], X_valid[:, 1]], Y_valid))

Train on 90000 samples, validate on 10000 samples
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

KeyboardInterrupt: 

In [None]:
# 90/10 train/validation split with a fixed random_state, so the split is repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.1, random_state=0)
# Train the latent factor model with the default epochs / batch size and validate after every epoch.
train(X_train, Y_train, parameters, hyperparameters, validation_data=(X_valid, Y_valid))

In [16]:
# Column names for the header-less, '|'-separated u.item file:
# movie metadata followed by one 0/1 flag column per genre.
headers = ["movie id", "movie title", "release date", "video release date",
              "IMDb URL", "Genre unknown", "Action", "Adventure", "Animation",
              "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
              "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
              "Thriller", "War", "Western"]

# index_col=0 makes "movie id" the index, so raw item ids (the values of item_dict) can be used with .loc.
# NOTE(review): u.item may not be UTF-8 encoded; if decoding fails, try encoding='latin-1' -- confirm.
items_info = pd.read_csv("datasets/ml-100k/u.item", sep='|', names = headers, index_col=0, engine='python')

In [17]:
# Preview only the first rows instead of dumping the whole table into the notebook.
items_info.head(10)

Unnamed: 0_level_0,movie title,release date,video release date,IMDb URL,Genre unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
#items_info.loc[item_dict[379]]

In [19]:
#items_info.loc[item_dict[228]]

In [20]:
#items_info.loc[item_dict[226]]

In [21]:
#items_info.loc[item_dict[229]]

In [22]:
#items_info.loc[item_dict[221]]

In [23]:
#items_info.loc[item_dict[227]]

In [24]:
#items_info.loc[item_dict[449]]

In [25]:
#items_info.loc[item_dict[180]]

In [26]:
#items_info.loc[item_dict[171]]

In [27]:
#items_info.loc[item_dict[49]]