In [1]:
import tensorflow as tf
from tensorflow.keras import Model
import numpy as np
from tensorflow.keras.initializers import TruncatedNormal
from tqdm import tqdm
from time import time
import pandas as pd
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm

In [19]:
# base_folder = 'F:\\Projects\\train\\episerver\\data\\rs\\'
base_folder = 'E:\\Projects\\Train\\episerver\\data\\rs\\'
model_folder = 'E:\\Projects\\Train\\episerver\\model\\rs\\'

In [3]:
class RSModel(Model):

    def __init__(self, args):
        super(RSModel, self).__init__()
        self.embedding_size = args['embedding_size']
        self.keyword_embedding_size = args['keyword_embedding_size']
        self.alpha = args['alpha']
        self.beta = args['beta']
        self.gamma = args['gamma']
        self.num_items = args['num_items']
        self.num_users = args['num_users']
        self.num_keywords = args['num_keywords']
        self.item_keywords = tf.constant(args['item_keywords'], dtype=tf.int32)
        self.keyword_embedding = tf.keras.layers.Embedding(input_dim=self.num_keywords + 1, output_dim=self.keyword_embedding_size,
                                                           embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1),
                                                           mask_zero=True,
                                                           embeddings_regularizer=tf.keras.regularizers.L2(self.alpha))
        self.user_embedding = tf.keras.layers.Embedding(input_dim=self.num_users + 1, output_dim=self.embedding_size,
                                                        embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1),
                                                        embeddings_regularizer=tf.keras.regularizers.L2(self.beta))
        self.item_embedding = tf.keras.layers.Embedding(input_dim=self.num_items, output_dim=self.embedding_size,
                                                        embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1),
                                                        embeddings_regularizer=tf.keras.regularizers.L2(self.beta))
        self.bias_u = tf.keras.layers.Embedding(input_dim=self.num_users + 1, output_dim=1,
                                                embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1),
                                                embeddings_regularizer=tf.keras.regularizers.L2(self.gamma))
        self.bias_i = tf.keras.layers.Embedding(input_dim=self.num_items, output_dim=1,
                                                embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1),
                                                embeddings_regularizer=tf.keras.regularizers.L2(self.gamma))
        self.mlp_dense = tf.keras.layers.Dense(units=1)

    def call(self, user_ids, item_ids):
        user_bias = self.bias_u(user_ids)
        item_bias = self.bias_i(item_ids)
        # matrix factorization
        users_embedding = self.user_embedding(user_ids)
        items_embedding = self.item_embedding(item_ids)
        mf = tf.math.multiply(users_embedding, items_embedding)
        # mlp
        item_keyword = tf.nn.embedding_lookup(self.item_keywords, item_ids)
        item_keyword_embedding = self.keyword_embedding(item_keyword)
        item_encode = tf.reduce_sum(item_keyword_embedding, axis=1)
        item_encode = self.mlp_dense(item_encode)
        # rating score
        r = tf.squeeze(user_bias) + tf.squeeze(item_bias) + tf.reduce_sum(mf, axis=1) + tf.reduce_sum(item_encode, axis=1)
        
#         r = tf.squeeze(user_bias) + tf.squeeze(item_bias) + tf.reduce_sum(mf, axis=1)
        return r

    def loss_fn_rmse(self, predictions, labels):
        loss = tf.reduce_sum(tf.math.square(predictions - labels))
        loss += tf.reduce_sum(self.keyword_embedding.losses)
        loss += tf.reduce_sum(rsmodel.user_embedding.losses) + tf.reduce_sum(rsmodel.item_embedding.losses)
#         loss += tf.reduce_sum(self.bias_u.losses) + tf.reduce_sum(self.bias_i.losses)
        return loss

def get_val_rmse(rs_model, val_dataset):
    all_ratings = []
    all_predictions = []
    for i in tqdm(range(val_dataset.num_batch)):
        user_ids, item_ids, ratings = val_dataset.get_batch(i)
        predictions = rs_model(user_ids, item_ids)
        all_predictions.append(predictions.numpy())
        all_ratings.append(ratings)
    val_predictions = np.concatenate(all_predictions, axis=0)
    val_ratings = np.concatenate(all_ratings, axis=0)
    return np.sqrt(np.mean((val_predictions - val_ratings) ** 2))

In [4]:
class DataSet:

    def __init__(self, ratings, batch_size=128):
        self.ratings = ratings
        self.batch_size = batch_size
        self.num_batch = self.ratings.shape[0] // self.batch_size

    def shuffle(self):
        np.random.shuffle(self.ratings)

    def get_batch(self, i):
        user_ids = self.ratings[i * self.batch_size: (i + 1) * self.batch_size, 0]
        item_ids = self.ratings[i * self.batch_size: (i + 1) * self.batch_size, 1]
        rates = self.ratings[i * self.batch_size: (i + 1) * self.batch_size, 2]
        return (np.array(user_ids, dtype=np.int32),
                np.array(item_ids, dtype=np.int32),
                np.array(rates, dtype=np.float32))

### load data

In [5]:
train = pickle.load(open(model_folder + 'train.pkl', 'rb'))
val = pickle.load(open(model_folder + 'val.pkl', 'rb'))
test = pickle.load(open(model_folder + 'test.pkl', 'rb'))

movie_id_idx_map = pickle.load(open(model_folder + 'movie_id_idx_map.pkl', 'rb'))
idx_movie_id_map = pickle.load(open(model_folder + 'idx_movie_id_map.pkl', 'rb'))
meta_data = pickle.load(open(model_folder + 'meta_data.pkl', 'rb'))

item_keywords = pickle.load(open(model_folder + 'item_keywords.pkl', 'rb'))

In [6]:
args = dict()
args['embedding_size'] = 64
args['keyword_embedding_size'] = 32
args['alpha'] = 0.005
args['beta'] = 0.005
args['gamma'] = 0.000
args['num_items'] = meta_data['num_items']
args['num_users'] = meta_data['num_users']
args['num_keywords'] = meta_data['num_keywords']
args['item_keywords'] = item_keywords

In [7]:
rec_model = RSModel(args)

epoch_step = tf.Variable(0, dtype=tf.int32)
ckpt = tf.train.Checkpoint(rec_model=rec_model, epoch_step=epoch_step)
manager = tf.train.CheckpointManager(checkpoint=ckpt, directory='./rsmodel_ckpt', max_to_keep=3)
print('load pretrained model at: ' + manager.latest_checkpoint)
ckpt.restore(manager.latest_checkpoint)

load pretrained model at: ./rsmodel_ckpt\ckpt-3


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1bf631e7fa0>

In [8]:
test_dataset = DataSet(test[['userId', 'itemId', 'rating']].values, batch_size=1024)
get_val_rmse(rec_model, test_dataset)

100%|█████████████████████████████████████████████████████████████████████████████| 1270/1270 [00:03<00:00, 414.71it/s]


1.0526156

In [9]:
top_n = 100
item_ids = np.array(range(meta_data['num_items']))

In [10]:
users_embedding = rec_model.user_embedding.weights[0].numpy()
items_encode = rec_model.item_embedding.weights[0].numpy()
items_bias = np.squeeze(rec_model.bias_i.weights[0].numpy())
users_bias = rec_model.bias_u.weights[0].numpy()

In [11]:
item_keyword = tf.nn.embedding_lookup(rec_model.item_keywords, item_ids)
item_keyword_embedding = rec_model.keyword_embedding(item_keyword)
item_keyword_encode = tf.reduce_sum(item_keyword_embedding, axis=1)
item_keyword_encode = rec_model.mlp_dense(item_keyword_encode)
item_keyword_encode = np.squeeze(item_keyword_encode.numpy())

In [15]:
predict_user_dict = dict()
# list_predict_users = []
for user_id in tqdm(range(1, meta_data['num_users'])):
    user_embedded = users_embedding[user_id]
    user_bias = users_bias[user_id]
    predicts = np.squeeze(np.matmul(user_embedded.reshape(1, -1), items_encode.T)) + items_bias + user_bias + item_keyword_encode
    best = np.argpartition(predicts, -top_n)[-top_n:]    
#     predict_user_dict[user_id] = sorted(zip(best, predicts[best]), key=lambda x: -x[1])
    predict_user_dict[user_id] = best.astype(np.int32)
#     list_predict_users.append(best.astype(np.int32))

100%|████████████████████████████████████████████████████████████████████████| 270895/270895 [03:05<00:00, 1459.14it/s]


In [16]:
predict_user_dict.__len__()

270895

In [17]:
pickle.dump(predict_user_dict, open(model_folder + 'predict_user_dict.pkl', 'wb'))

In [14]:
best.astype(np.int32)

array([ 8112,  3331,  2979,  2956,   544, 15720,   762,   358,    51,
       13281,  6519, 11381,   289,   272,  1194,  2343,   564,   758,
        6095,  5331,  1173, 12805, 12999,   656,   160,   625, 17311,
        2415,  2397,   350,    35, 17852,   749, 10976,   737,   116,
        1562,  2390,  5014,   117,  1656,  4749,  5511, 30234, 32448,
        8057,  1780,  6089,   344,  3588,  4032, 15953,  2283,  1374,
       13005, 15708,  1203,  6099,  6934,   587,   589,  1227,  6307,
        1777,  6359,    34,  1230,   365,  2977,    57,  1454,  5516,
        9133,   300, 18354, 10021,   627, 10482,    77,   534,  1600,
         115,  1622,  1654,   225,   616,  2382,  1705, 16323,   125,
           8,  1707,  2268,   579,  3999,  3681,     4,  1775, 15700,
         287])

In [28]:
import sys

In [32]:
sys.getsizeof(list_predict_users)/2**32

4.663877189159393e-05

In [31]:
list_predict_users = np.concatenate()

[array([ 8112,  3331,  2979,  2956,   544, 15720,   762,   358,    51,
        13281,  6519, 11381,   289,   272,  1194,  2343,   564,   758,
         6095,  5331,  1173, 12805, 12999,   656,   160,   625, 17311,
         2415,  2397,   350,    35, 17852,   749, 10976,   737,   116,
         1562,  2390,  5014,   117,  1656,  4749,  5511, 30234, 32448,
         8057,  1780,  6089,   344,  3588,  4032, 15953,  2283,  1374,
        13005, 15708,  1203,  6099,  6934,   587,   589,  1227,  6307,
         1777,  6359,    34,  1230,   365,  2977,    57,  1454,  5516,
         9133,   300, 18354, 10021,   627, 10482,    77,   534,  1600,
          115,  1622,  1654,   225,   616,  2382,  1705, 16323,   125,
            8,  1707,  2268,   579,  3999,  3681,     4,  1775, 15700,
          287], dtype=int64),
 array([ 8112,  3331,  2979,  2956,   544, 15720,   762,   358,    51,
        13281,  6519, 11381,   289,   272,  1194,  2343,   564,   758,
         6095,  5331,  1173, 12805, 12999,   65

### GUI prediction

In [20]:
user_id = 100