In [13]:
import numpy as np
import pandas as pd
from os import path
from collections import OrderedDict
from tqdm import tqdm
from typing import Dict

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score

from recommenders.datasets import movielens

In [14]:
tf.__version__

'2.6.0'

In [15]:
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

---
Dataset

In [5]:
data = movielens.load_pandas_df(
    size='100k',
    header=['user', 'movie', 'rating', 'Timestamp'],
    title_col='title'
)

data.loc[:, 'rating'] = data['rating'].astype(np.int32)
data['id'] = data.apply(lambda x: str(x['user'])+"_"+str(x['movie']), axis=1)
df_full = data.loc[:, ['user', 'movie', 'rating', 'id']]
df_full.head()

100%|██████████| 4.81k/4.81k [00:07<00:00, 625KB/s]  


Unnamed: 0,user,movie,rating,id
0,196,242,3,196_242
1,63,242,3,63_242
2,226,242,5,226_242
3,154,242,3,154_242
4,306,242,5,306_242


In [17]:
data1 = pd.read_csv('../data/lastfm.csv')
data1.head()

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


In [7]:
data0 = pd.read_csv('../data/movielens.csv')
data0.head()
data = data0.loc[:, ['UserId', 'MovieId', 'Rating']]
data.rename(columns={'UserId': 'user', 'MovieId': 'movie', 'Rating': 'rating'}, inplace=True)
data.loc[:, 'rating'] = data['rating'].astype(np.int32)
data['id'] = data.apply(lambda x: str(x['user'])+"_"+str(x['movie']), axis=1)

---
Dataset prep

In [8]:
class BPRModel(object):

    def __init__(self, df, user_col, item_col, rating_col=None, explicit=True, save_dir='../tmp/models/exp-002'):
        self.explicit = explicit
        self.df_full = df
        self.user_col = user_col
        self.item_col = item_col
        self.rating_col = rating_col
        self.save_dir = save_dir
        self.build_mappings(self.user_col, self.item_col)
        self.add_id_cols()

    def build_mappings(self, user_col, item_col):
        unique_users = self.df_full[user_col].unique()
        self.user2id_map = dict(zip(unique_users, np.arange(unique_users.shape[0], dtype=np.int32)))
        self.id2user_map = dict(zip(np.arange(unique_users.shape[0], dtype=np.int32), unique_users))

        unique_movies = self.df_full[item_col].unique()
        self.item2id_map = dict(zip(unique_movies, np.arange(unique_movies.shape[0], dtype=np.int32)))
        self.id2item_map = dict(zip(np.arange(unique_movies.shape[0], dtype=np.int32), unique_movies))

    def add_id_cols(self):
        self.df_full['user_id'] = self.df_full[self.user_col].apply(lambda u: self.user2id_map[u])
        self.df_full['movie_id'] = self.df_full[self.item_col].apply(lambda m: self.item2id_map[m])

    def get_triplets_exp(self, df_train, cfg):

        try:
            threshold = cfg['threshold']
        except:
            threshold = 3
        
        try:
            take_nonexisting = cfg['take_nonexisiting']
        except:
            take_nonexisting = False 
        
        try:
            item_limit = cfg['item_limit']
        except:
            item_limit = 50

        df_triplest = pd.DataFrame(columns=['user_id', 'positive_m_id', 'negative_m_id'])
        data = []
        users_without_data = []

        for user_id in tqdm(df_train.user_id.unique()):
            positive_movies = df_train[(df_train.user_id == user_id) & (df_train[self.rating_col] > threshold)].movie_id.values
            negative_movies = df_train[(df_train.user_id == user_id) & (df_train[self.rating_col] <= threshold)].movie_id.values
            if take_nonexisting:
                all_movies = df_train.movie_id.unique()
                nonext_movies = np.setdiff1d(all_movies, df_train.loc[df_train[self.user_col]==user_id].movie_id.values)
                negative_movies = np.concatenate((negative_movies, nonext_movies), axis=0)

            if negative_movies.shape[0] == 0 or positive_movies.shape[0] == 0:
                users_without_data.append(user_id)
                continue
            
            np.random.shuffle(positive_movies)
            positive_movies = positive_movies[:item_limit]

            np.random.shuffle(negative_movies)
            negative_movies = negative_movies[:item_limit]

            for positive_movie in positive_movies:
                for negative_movie in negative_movies:
                    data.append({'user_id': user_id, 'positive_m_id': positive_movie, 'negative_m_id': negative_movie})

        df_triplest = df_triplest.append(data, ignore_index=True)
        return df_triplest

    def get_triplets_imp(self, df_train, cfg):

        try:
            item_limit = cfg['item_limit']
        except:
            item_limit = 50

        df_triplest = pd.DataFrame(columns=['user_id', 'positive_m_id', 'negative_m_id'])
        data = []
        users_without_data = []

        for user_id in tqdm(df_train.user_id.unique()):
            all_movies = df_train.movie_id.unique()
            positive_movies = df_train.loc[df_train[self.user_col]==user_id].movie_id.values
            negative_movies = np.setdiff1d(all_movies, positive_movies)

            if negative_movies.shape[0] == 0 or positive_movies.shape[0] == 0:
                users_without_data.append(user_id)
                continue

            np.random.shuffle(positive_movies)
            positive_movies = positive_movies[:item_limit]

            np.random.shuffle(negative_movies)
            negative_movies = negative_movies[:item_limit]

            for positive_movie in positive_movies:
                for negative_movie in negative_movies:
                    data.append({'user_id': user_id, 'positive_m_id': positive_movie, 'negative_m_id': negative_movie})

        df_triplest = df_triplest.append(data, ignore_index=True)
        return df_triplest

    def bpr_predict(self, model: Model, user_id: int, item_ids: list, user_layer='user_embedding', item_layer='item_embedding'):
        """
        Predict by multiplication user vector by item matrix
        
        :return: list of the scores
        """
        user_vector = model.get_layer(user_layer).get_weights()[0][user_id]
        item_matrix = model.get_layer(item_layer).get_weights()[0][item_ids]

        scores = (np.dot(user_vector, item_matrix.T))

        return scores

    def ranking(self, model, user_id, item_ids):
        item_scores = [self.bpr_predict(model, user_id, i) for i in item_ids]
        res_df = pd.DataFrame({'item_id': item_ids, 'score': item_scores}).sort_values(by='score', ascending=False)
        return res_df

    @tf.function
    def identity_loss(self, _, y_pred):
        return tf.math.reduce_mean(y_pred)

    @tf.function
    def bpr_triplet_loss(self, X: dict):
        """
        Calculate triplet loss - as higher the difference between positive interactions
        and negative interactions as better

        :param X: X contains the user input, positive item input, negative item input
        :return:
        """
        positive_item_latent, negative_item_latent, user_latent = X

        positive_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, positive_item_latent), axis=-1, keepdims=True)
        negative_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, negative_item_latent), axis=-1, keepdims=True)

        return tf.math.subtract(tf.constant(1.0), tf.sigmoid(tf.math.subtract(positive_interactions, negative_interactions)))

    def out_shape(self, shapes):
        return shapes[0]
        

    def build_model(self, num_users: int, num_items: int, latent_dim: int) -> Model:
        """
        Build a model for Bayesian personalized ranking

        :param num_users: a number of the unique users
        :param num_items: a number of the unique movies
        :param latent_dim: vector length for the latent representation
        :return: Model
        """
        user_input = Input((1,), name='user_input')

        positive_item_input = Input((1,), name='positive_item_input')
        negative_item_input = Input((1,), name='negative_item_input')
        # One embedding layer is shared between positive and negative items
        item_embedding_layer = Embedding(num_items, latent_dim, name='item_embedding', input_length=1)

        positive_item_embedding = Flatten()(item_embedding_layer(positive_item_input))
        negative_item_embedding = Flatten()(item_embedding_layer(negative_item_input))

        user_embedding = Embedding(num_users, latent_dim, name='user_embedding', input_length=1)(user_input)
        user_embedding = Flatten()(user_embedding)

        triplet_loss = Lambda(self.bpr_triplet_loss, output_shape=self.out_shape)([positive_item_embedding,
                                                                negative_item_embedding,
                                                                user_embedding])

        model = Model(inputs=[positive_item_input, negative_item_input, user_input], outputs=triplet_loss)
    
        return model

    def train(self, save_dir, latent_dim=350, batch_size=256, num_epochs=3, lr=0.002, dataset_cfg={}):
        """model building & training

        Args:
            latent_dim (int, optional): embedding vector length for each lookup. Defaults to 350.
            batch_size (int, optional): batch size. Defaults to 256.
            num_epochs (int, optional): number of epochs. Defaults to 3.
            lr (float, optional): learning rate. Defaults to 0.002.
            dataset_cfg (dict, optional): triplate dataset building function parameters. Defaults to {}.

        Returns:
            status : 1 > success , 0 > failed, 
            message : the status message
        """
        result_cfg = {'status': 1, 'message': 'Model Trained Successfully'}
        # try:
        num_users = len(self.user2id_map)
        num_items = len(self.item2id_map)
        model = self.build_model(num_users, num_items, latent_dim)
        model.compile(loss=self.identity_loss, optimizer=Adam(learning_rate=lr))

        # parameter space logging
        trainable_count = np.sum([K.count_params(w) for w in model.trainable_weights])
        non_trainable_count = np.sum([K.count_params(w) for w in model.non_trainable_weights])

        print('Total number of parameters: {:,}'.format(trainable_count + non_trainable_count))
        print('Trainable number of parameters: {:,}'.format(trainable_count))
        print('Non-trainable number of parameters: {:,}'.format(non_trainable_count))
        

        # model dataset init
        if self.explicit:
            df_triplest = self.get_triplets_exp(self.df_full, dataset_cfg)
        else:
            df_triplest = self.get_triplets_imp(self.df_full, dataset_cfg)
        print('Training data length: {:,}'.format(df_triplest.shape[0]))
        X = {
            'user_input': tf.convert_to_tensor(df_triplest.user_id, dtype=tf.int32),
            'positive_item_input': tf.convert_to_tensor(df_triplest.positive_m_id, dtype=tf.int32),
            'negative_item_input': tf.convert_to_tensor(df_triplest.negative_m_id, dtype=tf.int32)
        }

        # model training
        model.fit(X, 
                tf.ones(df_triplest.shape[0]), 
                batch_size=batch_size,
                epochs=num_epochs)
        model.save_weights(save_dir)
        self.model = model 
        # except:
        #     result_cfg = {'status': 0, 'message': 'There was an error in the model training process. Try Again..'}
        return None, result_cfg

    def inference(self, user, items, save_dir):
        """inference / rank the items for the given user

        Args:
            user (int/ str): user_id according to training dataset
            items (List[int], List[str]): list of item_id according to training dataset

        Returns:
            res_df (pd.DataFrame) : sorted list of items
            status : 2 > Warning , 1 > success , 0 > failed
            message : status info message
        """
        result_cfg = {'status': 1, 'message': ''}
        num_users = len(self.user2id_map)
        num_items = len(self.item2id_map)
        model = self.build_model(num_users, num_items, 350)
        model.load_weights(save_dir)
        user_id = 0
        try:
            user_id = self.user2id_map[user]
        except:
            result_cfg = {'status': 0, 'message': f"User; {user}, does not exists in the training dataset"}
            return None, result_cfg
        
        item_ids = []
        for i in items:
            try:
                item_ids.append(self.item2id_map[i])
            except:
                result_cfg['status'] = 2, 
                if result_cfg['message'] == '':
                    result_cfg['message'] =  f' {i} does not exists in the training dataset!'
                else:
                    result_cfg['message'] = f' {i} ' + result_cfg['message']

        if item_ids==[]:
            result_cfg = {'status': 0, 'message': 'None of the items given exists in the training dataset!'}
            return None, result_cfg
        else:
            result_df = self.ranking(model, user_id, item_ids)
            result_df['item'] = result_df['item_id'].map(self.id2item_map)
            result_df['user'] = user 
            result_df = result_df.loc[:, ['user', 'score', 'item']]
            return result_df, result_cfg

    def batch_pred(self, df):
        result_cfg = {'status': 1, 'message': ''}
        user_Kdf = df.groupby('user')['item'].apply(list).reset_index()
        res_dfs = []
        for _, row in user_Kdf.iterrows():
            rdf, sms = bpr.inference(int(row['user']), row['item'])
            if sms['status'] == 1:
                res_dfs.append(rdf)
            else:
                result_cfg['status'] = 2
                result_cfg['message'] += sms['message']
        res_df = pd.concat(res_dfs, axis=0)
        return res_df, result_cfg


In [9]:
model_cfg = {
    'df': data,
    'user_col': 'user',
    'item_col': 'movie',
    'rating_col': 'rating',
    'explicit': True,
    'threshold': 2,
    'take_nonexisting': True,
    'item_limit': 50,
    'latent_dim': 350,
    'bs': 256,
    'n': 3,
    'lr': 0.001
}

In [10]:
save_path = 'tmp/models/exp-002'
bpr = BPRModel(model_cfg['df'], model_cfg['user_col'], model_cfg['item_col'], model_cfg['rating_col'])

In [11]:
bpr.train(save_path)

2022-05-12 09:50:38.425968: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Total number of parameters: 918,750.0
Trainable number of parameters: 918,750
Non-trainable number of parameters: 0.0


100%|██████████| 943/943 [00:01<00:00, 506.64it/s]


Training data length: 1,109,044
Epoch 1/3


2022-05-12 09:50:41.802958: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/3
Epoch 3/3


(None, {'status': 1, 'message': 'Model Trained Successfully'})

In [12]:
bpr.inference(12, [123, 122, 1, 43, 54, 76], save_path)



(   user     score  item
 2    12  3.028188     1
 4    12  0.780196    54
 5    12  0.297961    76
 0    12 -1.533885   123
 1    12 -2.067558   122
 3    12 -2.133079    43,
 {'status': 1, 'message': ''})

In [11]:
sample_dfs = pd.DataFrame({'user': [12, 12, 12, 12, 34, 34, 34, 56, 56, 56, 56, 56], 'item': [123, 352, 45, 65, 123, 43, 34, 23, 43, 12, 14, 233]})
sample_dfs.head()

Unnamed: 0,user,item
0,12,123
1,12,352
2,12,45
3,12,65
4,34,123


In [20]:
list_df = sample_dfs.groupby('user')['item'].apply(list).reset_index()
list_df.head()

Unnamed: 0,user,item
0,12,"[123, 352, 45, 65]"
1,34,"[123, 43, 34]"
2,56,"[23, 43, 12, 14, 233]"


In [24]:
res_dfs = []
for index, row in list_df.iterrows():
    # print(type(row['item']))
    rdf, sms = bpr.inference(int(row['user']), row['item'])
    if sms['status'] == 1:
        res_dfs.append(rdf)


In [26]:
res_df = pd.concat(res_dfs, axis=0)
res_df.head()

Unnamed: 0,user,score,item
2,12,-0.069623,45
1,12,-1.143507,352
0,12,-1.533491,123
3,12,-2.775054,65
1,34,-0.462985,43


In [7]:
model_cfg = {
    'df': data1,
    'user_col': 'user',
    'item_col': 'artist',
    'rating_col': None,
    'explicit': True,
    'threshold': 2,
    'take_nonexisting': True,
    'item_limit': 50,
    'latent_dim': 350,
    'bs': 256,
    'n': 3,
    'lr': 0.001
}

In [10]:
bpr = BPRModel(model_cfg['df'], model_cfg['user_col'], model_cfg['item_col'], model_cfg['rating_col'], explicit=False)

In [11]:
bpr.train()

2022-05-09 13:09:04.038527: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Total number of parameters: 5,601,400.0
Trainable number of parameters: 5,601,400
Non-trainable number of parameters: 0.0


100%|██████████| 15000/15000 [00:34<00:00, 438.73it/s]


Training data length: 11,039,400
Epoch 1/3
Epoch 2/3
Epoch 3/3


(None, {'status': 1, 'message': 'Model Trained Successfully'})

In [16]:
sample_df = pd.DataFrame({'user': [12, 12, 34, 34, 34, 34, 32, 32, 32],
                          'item': ['edguy', 'jack johnson', 'the killers', 'judas priest', 'the who', 'le tigre', 'le tigre', 'aphex twin', 'edguy']})

In [17]:
sample_df

Unnamed: 0,user,item
0,12,edguy
1,12,jack johnson
2,34,the killers
3,34,judas priest
4,34,the who
5,34,le tigre
6,32,le tigre
7,32,aphex twin
8,32,edguy


In [18]:
sample_df.to_csv('../data/test_lastfm.csv', index=False)

In [22]:
sample_df = pd.DataFrame({'user': [12, 12, 12, 12, 43 ,43, 43, 43, 43, 43, 53, 53, 53, 53, 53, 53, 53, 112, 112, 112],
                         'item': [12, 23, 112, 43, 58, 97, 38, 233, 675, 73, 94, 67, 232, 226, 113, 756, 234, 12, 31, 43]})

In [23]:
sample_df

Unnamed: 0,user,item
0,12,12
1,12,23
2,12,112
3,12,43
4,43,58
5,43,97
6,43,38
7,43,233
8,43,675
9,43,73


In [24]:
sample_df.to_csv('../data/test_movielens.csv', index=False)