In [1]:
import numpy as np
import pandas as pd
from os import path
from collections import OrderedDict
from tqdm import tqdm
from typing import Dict

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score

from recommenders.datasets import movielens

In [2]:
# Display the installed TensorFlow version so the recorded outputs can be tied to it.
tf.__version__

'2.8.0'

In [3]:
# One seed shared by every random number generator the notebook touches.
SEED = 42

# TensorFlow draws the initial embedding weights; NumPy drives the in-place
# shuffles used when sampling training triplets. The two generators are
# independent, so the order of these calls does not matter.
tf.random.set_seed(SEED)
np.random.seed(SEED)

---
Dataset

In [32]:
# Load MovieLens-100k with the column names used throughout this notebook.
data = movielens.load_pandas_df(
    size='100k',
    header=['user', 'movie', 'rating', 'Timestamp'],
    title_col='title'
)

# Plain column assignment replaces the column together with its dtype.
# `data.loc[:, 'rating'] = ...` writes into the existing column in place on
# pandas >= 2.0, which would silently keep the column's previous dtype.
data['rating'] = data['rating'].astype(np.int32)

# "<user>_<movie>" key per interaction, built with vectorised string
# concatenation instead of a Python-level row-wise apply.
data['id'] = data['user'].astype(str) + "_" + data['movie'].astype(str)

# Explicit copy: BPRModel adds columns to the frame it is given, and those
# writes should never be tied to (or warn about) `data`.
df_full = data.loc[:, ['user', 'movie', 'rating', 'id']].copy()
df_full.head()

100%|██████████| 4.81k/4.81k [00:12<00:00, 385KB/s]


Unnamed: 0,user,movie,rating,id
0,196,242,3,196_242
1,63,242,3,63_242
2,226,242,5,226_242
3,154,242,3,154_242
4,306,242,5,306_242


---
Dataset prep and BPR model (triplet sampling, training, inference)

In [46]:
class BPRModel(object):
    """
    Pairwise ranking recommender built from Keras user/item embeddings.

    The class owns the whole workflow:

    * encode raw user / item ids into contiguous integer ids,
    * sample (user, positive item, negative item) triplets from the ratings,
    * train a model whose output *is* the per-triplet loss,
    * score and rank items for a user from the learned embeddings.

    :param df: interactions frame; ``user_id`` and ``movie_id`` columns are
        added to this very object (it is not copied)
    :param user_col: name of the raw user id column
    :param item_col: name of the raw item id column
    :param rating_col: name of the rating column
    """

    def __init__(self, df, user_col, item_col, rating_col):
        self.df_full = df
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        # Populated by train(); lets inference() fail with a clear message.
        self.model = None
        self.build_mappings(self.user_col, self.item_col)
        self.add_id_cols()

    def build_mappings(self, user_col, item_col):
        """
        Build raw-id <-> encoded-id dictionaries for users and items.

        Encoded ids are assigned in order of first appearance in ``df_full``,
        starting at 0.
        """
        unique_users = self.df_full[user_col].unique()
        user_ids = np.arange(unique_users.shape[0], dtype=np.int32)
        self.user2id_map = dict(zip(unique_users, user_ids))
        self.id2user_map = dict(zip(user_ids, unique_users))

        unique_movies = self.df_full[item_col].unique()
        movie_ids = np.arange(unique_movies.shape[0], dtype=np.int32)
        self.item2id_map = dict(zip(unique_movies, movie_ids))
        self.id2item_map = dict(zip(movie_ids, unique_movies))

    def add_id_cols(self):
        """Add the encoded ``user_id`` / ``movie_id`` columns to ``df_full``."""
        # Every value is a key of the mapping (built from the same column),
        # so the dictionary lookup cannot produce missing values.
        self.df_full['user_id'] = self.df_full[self.user_col].map(self.user2id_map)
        self.df_full['movie_id'] = self.df_full[self.item_col].map(self.item2id_map)

    def get_triplets(self, df_train, threshold=3, take_nonexisting=False, item_limit=50):
        """
        Sample (user, positive item, negative item) training triplets.

        For every user, items rated above ``threshold`` are positives and
        items rated at or below it are negatives. Both lists are shuffled with
        the global NumPy RNG, truncated to ``item_limit`` entries, and combined
        as a full cross product.

        :param df_train: frame with ``user_id``, ``movie_id`` and the rating column
        :param threshold: ratings strictly above this value count as positive
        :param take_nonexisting: also treat items the user never rated as negatives
        :param item_limit: maximum number of positives and of negatives per user
        :return: DataFrame with columns ``user_id``, ``positive_m_id``, ``negative_m_id``
        """
        columns = ['user_id', 'positive_m_id', 'negative_m_id']

        # The catalogue of encoded item ids is identical for every user, so it
        # is computed once instead of once per loop iteration.
        all_movies = df_train['movie_id'].unique() if take_nonexisting else None

        user_parts, positive_parts, negative_parts = [], [], []

        # One pass over the frame. sort=False keeps users in order of first
        # appearance, which also keeps the order of RNG calls stable.
        grouped = df_train.groupby('user_id', sort=False)
        for user_id, user_rows in tqdm(grouped, total=grouped.ngroups):
            ratings = user_rows[self.rating_col].to_numpy()
            movies = user_rows['movie_id'].to_numpy()

            # Boolean indexing returns fresh arrays, so the in-place shuffles
            # below never touch df_train.
            positive_movies = movies[ratings > threshold]
            negative_movies = movies[ratings <= threshold]

            if take_nonexisting:
                # Items *this* user never interacted with, in the same encoded
                # id space as the rated negatives.
                nonext_movies = np.setdiff1d(all_movies, movies)
                negative_movies = np.concatenate((negative_movies, nonext_movies), axis=0)

            # A user needs at least one positive and one negative to form a triplet.
            if negative_movies.shape[0] == 0 or positive_movies.shape[0] == 0:
                continue

            np.random.shuffle(positive_movies)
            positive_movies = positive_movies[:item_limit]

            np.random.shuffle(negative_movies)
            negative_movies = negative_movies[:item_limit]

            # Cross product, positives varying slowest (same order as a nested
            # "for positive: for negative" loop).
            n_positive = positive_movies.shape[0]
            n_negative = negative_movies.shape[0]
            user_parts.append(np.full(n_positive * n_negative, user_id))
            positive_parts.append(np.repeat(positive_movies, n_negative))
            negative_parts.append(np.tile(negative_movies, n_positive))

        if not user_parts:
            return pd.DataFrame(columns=columns)

        # Build the frame once from flat arrays rather than appending records.
        return pd.DataFrame(
            {
                'user_id': np.concatenate(user_parts),
                'positive_m_id': np.concatenate(positive_parts),
                'negative_m_id': np.concatenate(negative_parts),
            },
            columns=columns,
        )

    def bpr_predict(self, model: Model, user_id: int, item_ids: list, user_layer='user_embedding', item_layer='item_embedding'):
        """
        Score items for a user as the dot product of their embeddings.

        :param model: trained model containing the two embedding layers
        :param user_id: encoded user id (row of the user embedding matrix)
        :param item_ids: encoded item id(s) (rows of the item embedding matrix)
        :param user_layer: name of the user embedding layer
        :param item_layer: name of the item embedding layer
        :return: the score(s), one per requested item
        """
        user_vector = model.get_layer(user_layer).get_weights()[0][user_id]
        item_matrix = model.get_layer(item_layer).get_weights()[0][item_ids]

        scores = (np.dot(user_vector, item_matrix.T))

        return scores

    def ranking(self, model, user_id, item_ids):
        """
        Rank the given items for a user, best score first.

        :param model: trained model
        :param user_id: encoded user id
        :param item_ids: encoded item ids
        :return: DataFrame with ``item_id`` and ``score``, sorted by descending score
        """
        item_ids = list(item_ids)
        # Score all items in one call: bpr_predict is a method (it must be
        # reached through self), and a single call reads the embedding
        # weights once instead of once per item.
        item_scores = self.bpr_predict(model, user_id, item_ids)
        res_df = pd.DataFrame({'item_id': item_ids, 'score': item_scores}).sort_values(by='score', ascending=False)
        return res_df

    @tf.function
    def identity_loss(self, _, y_pred):
        """
        Loss that ignores the targets and averages the model output.

        The model output already is the per-triplet loss, so the target passed
        to ``fit`` is only a placeholder.
        """
        return tf.math.reduce_mean(y_pred)

    @tf.function
    def bpr_triplet_loss(self, X: list):
        """
        Per-triplet loss: ``1 - sigmoid(score(user, positive) - score(user, negative))``.

        The larger the margin by which the positive item outscores the negative
        one, the closer the loss gets to 0.

        :param X: ``[positive_item_latent, negative_item_latent, user_latent]``,
            in exactly this order
        :return: tensor with the loss of every triplet (last axis kept with size 1)
        """
        positive_item_latent, negative_item_latent, user_latent = X

        # Dot products written as an element-wise product summed over the last axis.
        positive_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, positive_item_latent), axis=-1, keepdims=True)
        negative_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, negative_item_latent), axis=-1, keepdims=True)

        return tf.math.subtract(tf.constant(1.0), tf.sigmoid(tf.math.subtract(positive_interactions, negative_interactions)))

    def out_shape(self, shapes):
        """Output-shape callback for the Lambda layer: returns the first input shape."""
        return shapes[0]

    def build_model(self, num_users: int, num_items: int, latent_dim: int) -> Model:
        """
        Build the triplet model.

        Inputs are named ``user_input``, ``positive_item_input`` and
        ``negative_item_input``; the single output is the per-triplet loss
        computed by ``bpr_triplet_loss``.

        :param num_users: number of unique users
        :param num_items: number of unique items
        :param latent_dim: length of the latent vectors
        :return: Model
        """
        user_input = Input((1,), name='user_input')

        positive_item_input = Input((1,), name='positive_item_input')
        negative_item_input = Input((1,), name='negative_item_input')
        # One embedding layer is shared between positive and negative items
        item_embedding_layer = Embedding(num_items, latent_dim, name='item_embedding', input_length=1)

        positive_item_embedding = Flatten()(item_embedding_layer(positive_item_input))
        negative_item_embedding = Flatten()(item_embedding_layer(negative_item_input))

        user_embedding = Embedding(num_users, latent_dim, name='user_embedding', input_length=1)(user_input)
        user_embedding = Flatten()(user_embedding)

        # The list order must match the unpacking order in bpr_triplet_loss.
        triplet_loss = Lambda(self.bpr_triplet_loss, output_shape=self.out_shape)([positive_item_embedding,
                                                                negative_item_embedding,
                                                                user_embedding])

        model = Model(inputs=[positive_item_input, negative_item_input, user_input], outputs=triplet_loss)

        return model

    def train(self, latent_dim=350, batch_size=256, num_epochs=3, lr=0.002):
        """
        Build the model, sample triplets from ``df_full`` and fit.

        The fitted model is stored in ``self.model``.

        :param latent_dim: length of the latent vectors
        :param batch_size: number of triplets per gradient step
        :param num_epochs: number of passes over the sampled triplets
        :param lr: Adam learning rate
        :raises ValueError: if no training triplet could be sampled
        """
        num_users = len(self.user2id_map)
        num_items = len(self.item2id_map)
        model = self.build_model(num_users, num_items, latent_dim)
        model.compile(loss=self.identity_loss, optimizer=Adam(learning_rate=lr))

        # parameter space logging
        # np.sum of an empty list is the float 0.0; cast so the counts are
        # always printed as integers.
        trainable_count = int(np.sum([K.count_params(w) for w in model.trainable_weights]))
        non_trainable_count = int(np.sum([K.count_params(w) for w in model.non_trainable_weights]))

        print('Total number of parameters: {:,}'.format(trainable_count + non_trainable_count))
        print('Trainable number of parameters: {:,}'.format(trainable_count))
        print('Non-trainable number of parameters: {:,}'.format(non_trainable_count))

        # model dataset init
        df_triplest = self.get_triplets(self.df_full)
        print('Training data length: {:,}'.format(df_triplest.shape[0]))
        if df_triplest.shape[0] == 0:
            raise ValueError("No training triplets could be built from the data")

        # Keys must match the names of the model's Input layers. The columns
        # are converted to int32 arrays first so the tensor dtype is explicit.
        X = {
            'user_input': tf.convert_to_tensor(df_triplest['user_id'].to_numpy(dtype=np.int32), dtype=tf.int32),
            'positive_item_input': tf.convert_to_tensor(df_triplest['positive_m_id'].to_numpy(dtype=np.int32), dtype=tf.int32),
            'negative_item_input': tf.convert_to_tensor(df_triplest['negative_m_id'].to_numpy(dtype=np.int32), dtype=tf.int32)
        }

        # model training
        # The targets are placeholders: identity_loss ignores them.
        model.fit(X,
                tf.ones(df_triplest.shape[0]),
                batch_size=batch_size,
                epochs=num_epochs)

        self.model = model

    def inference(self, user, items):
        """
        Rank raw item ids for a raw user id.

        Items unknown to the mapping are reported and skipped. If the user is
        unknown, or none of the items is known, a message is printed and
        ``None`` is returned.

        :param user: raw user id
        :param items: iterable of raw item ids
        :return: DataFrame with ``item_id``, ``score`` and raw ``item``, best first,
            or ``None`` when there is nothing to rank
        :raises RuntimeError: if called before ``train()``
        """
        # None marks "unknown": 0 is a valid encoded id (the first user seen)
        # and therefore cannot double as a sentinel.
        user_id = self.user2id_map.get(user)
        if user_id is None:
            print(f"User; {user}, does not exists in the training dataset")

        item_ids = []
        for i in items:
            item_id = self.item2id_map.get(i)
            if item_id is None:
                print(f"Item {i}, does not exists in the training dataset")
            else:
                item_ids.append(item_id)

        if user_id is None or not item_ids:
            print("Not enough data for the inference")
            return None

        model = getattr(self, 'model', None)
        if model is None:
            raise RuntimeError("The model is not trained yet; call train() first")

        result_df = self.ranking(model, user_id, item_ids)
        # Translate encoded ids back to the raw item ids for display.
        result_df['item'] = result_df['item_id'].map(self.id2item_map)
        return result_df


In [47]:
# Build the id mappings; this also adds `user_id` / `movie_id` columns to `df_full` itself.
bpr = BPRModel(df_full, 'user', 'movie', 'rating')

In [48]:
# Train with the defaults: latent_dim=350, batch_size=256, num_epochs=3, lr=0.002.
bpr.train()

Total number of parameters: 918,750.0
Trainable number of parameters: 918,750
Non-trainable number of parameters: 0.0


100%|██████████| 943/943 [00:02<00:00, 426.67it/s]


Training data length: 1,109,044
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [49]:
# Rank six raw movie ids for raw user id 12 (both are translated to encoded ids internally).
bpr.inference(12, [123, 122, 1, 43, 54, 76])

Unnamed: 0,item_id,score,item
2,24,1.937564,1
0,861,1.48106,123
5,1009,-1.258224,76
1,510,-2.343066,122
3,615,-3.429435,43
4,79,-4.794955,54
