In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras import Model
import pickle
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import os
import random
from time import time
from tqdm.notebook import tqdm
tqdm.pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [2]:
class DataSet:

    def __init__(self, train_file, test_file, negative_sample=3, batch_size=100):
        self.train_data = pd.read_csv(train_file)
        self.test_data = pd.read_csv(test_file)
        self.num_users = self.train_data['userID'].max() + 1
        self.num_items = self.train_data['itemID'].max() + 1
        self.negative_sample = negative_sample
        self.batch_size = batch_size
        self.user_rated_items = self.get_user_rated_items()

    def get_user_rated_items(self):
        rated_data = self.get_rated_data()
        user_rated_items = dict()
        for user_id, item_id, rate in rated_data:
            if not user_rated_items.__contains__(user_id):
                user_rated_items[user_id] = []
            user_rated_items[user_id].append(item_id)
        return user_rated_items

    def generate_train_data(self):
        rated_data = self.get_rated_data()
#         print('done load data')
        np.random.shuffle(rated_data)
        all_train_data = self.negative_sampling(rated_data)
        all_batch_data = self.get_all_batch_data(all_train_data, self.user_rated_items)
        return all_batch_data

    def get_rated_data(self):
        return [(user_id, item_id, rate) for user_id, item_id, rate in self.train_data[['userID', 'itemID', 'rating']].values]

    def negative_sampling(self, rated_data):
        user_ids = []
        item_ids = []
        labels = []
        set_rated = {(user_id, item_id) for user_id, item_id, rate in rated_data}
        for user_id, item_id, rate in rated_data:
            user_ids.append(user_id)
            item_ids.append(item_id)
            labels.append((1, rate))
            for j in range(self.negative_sample):
                random_item = np.random.randint(self.num_items)
                while set_rated.__contains__((user_id, random_item)):
                    random_item = np.random.randint(self.num_items)
                user_ids.append(user_id)
                item_ids.append(random_item)
                labels.append((0, 0))
        return user_ids, item_ids, labels

    def get_all_batch_data(self, all_train_data, user_rated_items):
        user_ids, item_ids, labels = all_train_data
        mask = self.num_items
        num_iter = user_ids.__len__()//self.batch_size
        batch_data = []
        for i in range(num_iter):
            batch_user_descriptions = []
            batch_item_ids = []
            batch_user_ids = []
            batch_num_items = []
            batch_labels = []
            batch_ratings = []
            for j in range(self.batch_size):
                idx = i * self.batch_size + j
                user_id = user_ids[idx]
                item_id = item_ids[idx]
                label = labels[idx][0]
                rating = labels[idx][1]
                rated_items = user_rated_items[user_id].copy()
                user_description = self.get_user_description(rated_items, item_id)
                batch_user_descriptions.append(user_description)
                batch_num_items.append(user_description.__len__())
                batch_user_ids.append(user_id)
                batch_item_ids.append(item_id)
                batch_labels.append(label)
                batch_ratings.append(rating)
            max_user_des = max(batch_num_items)
            batch_user_descriptions = self.padding_user_description(batch_user_descriptions, mask, max_user_des)
            batch_data.append((np.array(batch_user_descriptions, dtype=np.int32),
                               np.array(batch_user_ids, dtype=np.int32),
                               np.array(batch_item_ids, dtype=np.int32),
                               np.array(batch_num_items, dtype=np.float32),
                               np.array(batch_labels, dtype=np.float32),
                               np.array(batch_ratings, dtype=np.float32)
                               ))
#             if i % 1000 == 0:
#                 print(f'done %d/%d iter' % (i, num_iter))
#             if i > 1000:
#                 break
        return batch_data, num_iter

    def get_user_description(self, rated_items, item_id):
        for i in range(rated_items.__len__()):
            if rated_items[i] == item_id:
                rated_items[i] = rated_items[-1]
                del rated_items[-1]
                break
        return rated_items

    def padding_user_description(self, batch_user_descriptions, mask, max_len):
        for i in range(batch_user_descriptions.__len__()):
            batch_user_descriptions[i] = batch_user_descriptions[i] + [mask] * (max_len - batch_user_descriptions[i].__len__())
        return batch_user_descriptions

    def get_batch(self):
        pass

    def reset(self):
        pass




### Model

In [3]:
class FISM(Model):

    def __init__(self, args):
        super(FISM, self).__init__()
        self.embedding_size = args['embedding_size']
        self.alpha = args['alpha']
        self.beta = args['beta']
        self.gamma = args['gamma']
        self.lambda_ = args['lambda_']
        self.verbose = args['verborse']
        self.num_items = args['num_items']
        self.num_users = args['num_users']
        self.confidence_factor = args['confidence_factor']
        self.Q_norms = None
        self.P_norms = None
        self.item_norms = None
        self.item_vectors = None
        # self.P = tf.keras.layers.Embedding(input_dim=self.num_items + 1, output_dim=self.embedding_size,
        #                                    embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1), trainable=True)

        # p_var = tf.Variable(tf.keras.initializers.TruncatedNormal(mean=0, stddev=0.1)(shape=[self.num_items, self.embedding_size]))
        # mask_value = tf.constant(0, shape=(1, self.embedding_size), dtype=tf.float32)
        # p_var = tf.concat((p_var, mask_value), axis=0)
        # self.P = tf.keras.layers.Embedding(input_dim=23607, output_dim=50, trainable=True, weights=[p_var.numpy()])

        self.P = tf.Variable(
            tf.random.truncated_normal(shape=[self.num_items, self.embedding_size], mean=0, stddev=0.1))
        self.mask_value = tf.constant(0, shape=(1, self.embedding_size), dtype=tf.float32)
        self.Q = tf.Variable(
            tf.random.truncated_normal(shape=[self.num_items, self.embedding_size], mean=0, stddev=0.1))
        self.bias_u = tf.keras.layers.Embedding(input_dim=self.num_users, output_dim=1,
                                                embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1))
        self.bias_i = tf.keras.layers.Embedding(input_dim=self.num_items, output_dim=1,
                                                embeddings_initializer=TruncatedNormal(mean=0., stddev=0.1))
        #         self.optimizer = tf.optimizers.Adagrad(learning_rate=self.learning_rate, initial_accumulator_value=1)
        # self.optimizer = tf.optimizers.Adam(learning_rate=self.learning_rate)

    def call(self, user_descriptions, user_ids, item_ids, num_items):
        user_bias = self.bias_u(user_ids)
        item_bias = self.bias_i(item_ids)
        # user_rated_items_embedding = self.P(user_descriptions)
        P_with_mask = tf.concat([self.P, self.mask_value], axis=0)
        user_rated_items_embedding = tf.nn.embedding_lookup(P_with_mask, user_descriptions)
        # items_embedding = self.Q(item_ids)
        items_embedding = tf.nn.embedding_lookup(self.Q, item_ids)
        user_des = tf.reduce_sum(user_rated_items_embedding, axis=1)
        coefficient = tf.pow(num_items, -tf.constant(self.alpha, dtype=tf.float32))
        r = tf.squeeze(user_bias) + tf.squeeze(item_bias) + tf.math.multiply(coefficient, tf.reduce_sum(
            tf.math.multiply(user_des, items_embedding), axis=1))
        return r

    def loss_fn_old(self, predictions, labels, ratings):
        confidences = 1 + self.confidence_factor * ratings
        loss = tf.reduce_sum(tf.math.multiply(confidences, tf.math.square(predictions - labels)))
        loss += self.beta * (tf.reduce_sum(tf.math.square(self.P)) + tf.reduce_sum(
            tf.math.square(self.Q)))
        loss += self.lambda_ * tf.reduce_sum(tf.math.square(self.bias_u.embeddings)) + self.gamma * tf.reduce_sum(
            tf.math.square(self.bias_i.embeddings))
        return loss

    def loss_fn(self, predictions, labels, ratings):
        predictions = tf.math.sigmoid(predictions)
        #         predictions = tf.cast(predictions, tf.float64)
        predictions = tf.clip_by_value(predictions, clip_value_min=1e-7, clip_value_max=1 - 1e-7)
        cross_entropy_elements = -(tf.math.multiply(labels, tf.math.log(predictions)) +
                                   tf.math.multiply(1 - labels, tf.math.log(1 - predictions)))
        confidences = 1 + self.confidence_factor * ratings
        loss = tf.reduce_sum(tf.math.multiply(confidences, cross_entropy_elements))
        loss += self.beta * (tf.reduce_sum(tf.math.square(self.P)) + tf.reduce_sum(tf.math.square(self.Q)))
        loss += self.lambda_ * tf.reduce_sum(tf.math.square(self.bias_u.embeddings)) + self.gamma * tf.reduce_sum(
            tf.math.square(self.bias_i.embeddings))
        return loss

    def prepare_for_prediction(self):
        self.Q_norms = tf.sqrt(tf.reduce_sum(tf.square(self.Q), axis=1))
        self.P_norms = tf.sqrt(tf.reduce_sum(tf.square(self.P), axis=1))
        self.item_vectors = tf.concat([self.P, self.Q], axis=1)
        self.item_norms = tf.sqrt(tf.reduce_sum(tf.square(self.item_vectors), axis=1))

    def sim_items(self, item_id, top_n: int = 100):
        item_embedded = tf.nn.embedding_lookup(self.P, item_id)
        item_embedded = tf.reshape(item_embedded, shape=(self.embedding_size, -1))
        scores = tf.matmul(self.Q, item_embedded)
        scores = tf.squeeze(scores)
        scores = scores / (self.Q_norms * self.P_norms[item_id])
        scores = scores.numpy()
        best = np.argpartition(scores, -top_n)[-top_n:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])

    def sim_items_concat_pq(self, item_id, top_n: int = 100):
        item_embedded = tf.nn.embedding_lookup(self.item_vectors, item_id)
        item_embedded = tf.reshape(item_embedded, shape=(2 * self.embedding_size, -1))
        scores = tf.matmul(self.item_vectors, item_embedded)
        scores = tf.squeeze(scores)
        scores = scores / (self.item_norms * self.item_norms[item_id])
        scores = scores.numpy()
        best = np.argpartition(scores, -top_n)[-top_n:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])

In [4]:
# @tf.function
def predict_top_n(model, user_id, user_rated_items, top_n=100, batch_size=512):
    rated_items = set(user_rated_items[user_id])
    predicts = []
    user_descriptions = []
    user_ids = []
    item_ids = []
    num_items = []
    for item_id in range(model.num_items):
        if rated_items.__contains__(item_id):
            user_descriptions.append(list(rated_items.difference([item_id])) + [model.num_items])
            user_ids.append(user_id)
            item_ids.append(item_id)
            num_items.append(rated_items.__len__() - 1)
        else:
            user_descriptions.append(list(rated_items.difference([item_id])))
            user_ids.append(user_id)
            item_ids.append(item_id)
            num_items.append(rated_items.__len__())
        if user_descriptions.__len__() >= batch_size:
            batch_predict = model(np.array(user_descriptions, dtype=np.int32),
                                  np.array(user_ids, dtype=np.int32),
                                  np.array(item_ids, dtype=np.int32),
                                  np.array(num_items, dtype=np.float32))
            predicts += list(batch_predict.numpy())
            user_descriptions = []
            user_ids = []
            item_ids = []
            num_items = []
    batch_predict = model(np.array(user_descriptions, dtype=np.int32),
                          np.array(user_ids, dtype=np.int32),
                          np.array(item_ids, dtype=np.int32),
                          np.array(num_items, dtype=np.float32))
    predicts += list(batch_predict.numpy())
    items_score = [(iid, score) for iid, score in enumerate(predicts)]
    items_score.sort(key=lambda x: x[1], reverse=True)
    return items_score[:top_n]


def hit_rate_evaluate(fism_model, user_rated_items):
    total_items = 0
    in_train_count = 0
    count = 0
    count_hit = 0
    for test_user in tqdm(test_users_in_train):
        user_session_id = test_user[0]
        if user_idx_dict.__contains__(user_session_id):
            user_id = user_idx_dict[user_session_id]
            rated_items = test_user[1]
            rec_top_n = predict_top_n(fism_model, user_id, user_rated_items, batch_size=4096, top_n=100)
            top_item_ids = {rec_item[0] for rec_item in rec_top_n}
            for rated_item in rated_items:
                if item_idx_dict.__contains__(rated_item):
                    item_id = item_idx_dict[rated_item]
                    in_train_count += 1
                    if top_item_ids.__contains__(item_id):
                        count_hit += 1
        total_items += rated_items.__len__()
        count += 1
        if count > 300:
            break
    in_train_rate = in_train_count / total_items
    hit_rate = count_hit / total_items
    return in_train_rate, hit_rate


def rank_score_evaluate(fism_model, user_rated_items):
    count = 0
    list_user_ranks = []
    num_item = item_idx_dict.__len__()
    total_pred = 0
    pred_hit = 0
    for user_session_id, list_cluster_ids in tqdm(test_users_in_train):
        user_id = user_idx_dict[user_session_id]
        list_rec_items = predict_top_n(fism_model, user_id, user_rated_items, batch_size=4096, top_n=-1)
        rec_items_idx = {item_id: idx + 1 for idx, (item_id, score) in enumerate(list_rec_items)}
        user_ranks = []
        for cluster_id in list_cluster_ids:
            total_pred += 1
            if item_idx_dict.__contains__(cluster_id):
                pred_hit += 1
                if rec_items_idx.__len__() > 0:
                    item_idx = item_idx_dict[cluster_id]
                    if rec_items_idx.__contains__(item_idx):
                        pred_rank = rec_items_idx[item_idx] / num_item
                        user_ranks.append(pred_rank)
        list_user_ranks.append(user_ranks)

        count += 1
        if count > 300:
            break
    rank_mean_users = []
    for user_ranks in list_user_ranks:
        if user_ranks.__len__() > 0:
            rank_mean_users.append(np.mean(user_ranks))
    return np.mean(rank_mean_users), pred_hit / total_pred


def item_sim_rec_top_n(fism_model, user_cluster_ids=[], top_n=100):
    dict_max_score = {}
    for his_cluster_id in user_cluster_ids:
        if item_idx_dict.__contains__(his_cluster_id):
            item_idx = item_idx_dict[his_cluster_id]
            rec_items = fism_model.sim_items_concat_pq(item_id=item_idx, top_n=top_n)
            for item_idx, score in rec_items:
                if not dict_max_score.__contains__(item_idx):
                    dict_max_score[item_idx] = []
                dict_max_score[item_idx].append(score)
    for item_idx, list_scores in dict_max_score.items():
        dict_max_score[item_idx] = max(list_scores)
    list_rec_items = list(dict_max_score.items())
    list_rec_items.sort(key=lambda x: x[1], reverse=True)
    return list_rec_items[:top_n]


def item_sim_hit_rate_evaluate(fism_model, top_n=100, num_his=3):
    need_pred = 0
    in_train_set = 0

    list_user_hits = []
    count = 0
    for uid, list_cluster_ids in tqdm(test_users):
        random_split = random.randint(num_his, list_cluster_ids.__len__() - 1)
        history_cluster_ids = list_cluster_ids[random_split - num_his:random_split]
        predict_cluster_ids = list_cluster_ids[random_split: random_split + 10]
        top_n_rec = item_sim_rec_top_n(fism_model, user_cluster_ids=history_cluster_ids, top_n=top_n)
        rec_items_idx = {item_id: idx + 1 for idx, (item_id, distance) in enumerate(top_n_rec)}
        #     print(top_n_rec)
        user_hit = 0
        for pred_cluster_id in predict_cluster_ids:
            need_pred += 1
            if item_idx_dict.__contains__(pred_cluster_id):
                in_train_set += 1
                if rec_items_idx.__contains__(item_idx_dict[pred_cluster_id]):
                    user_hit += 1
        list_user_hits.append((user_hit, predict_cluster_ids.__len__()))
        count += 1
        if count > 3000:
            break
    hit_sum = sum([hit for hit, pred_len in list_user_hits])
    pred_sum = sum([pred_len for hit, pred_len in list_user_hits])
    hit_rate = hit_sum / pred_sum
    in_train_set_rate = in_train_set / need_pred
    return need_pred, hit_rate, in_train_set_rate


def item_sim_rank_score(fism_model):
    count = 0
    list_user_ranks = []
    num_item = item_idx_dict.__len__()
    total_pred = 0
    pred_hit = 0
    for uid, list_cluster_ids in tqdm(test_users):
        random_split = random.randint(3, list_cluster_ids.__len__() - 1)
        history_cluster_ids = list_cluster_ids[random_split - 3:random_split]
        pred_cluster_ids = list_cluster_ids[random_split: random_split + 10]
        dict_max_score = {}
        for his_cluster_id in history_cluster_ids:
            if item_idx_dict.__contains__(his_cluster_id):
                item_idx = item_idx_dict[his_cluster_id]
                rec_items = fism_model.sim_items_concat_pq(item_id=item_idx, top_n=num_item)
                for item_idx, score in rec_items:
                    if not dict_max_score.__contains__(item_idx):
                        dict_max_score[item_idx] = []
                    dict_max_score[item_idx].append(score)
        for item_idx, list_scores in dict_max_score.items():
            dict_max_score[item_idx] = max(list_scores)

        list_rec_items = list(dict_max_score.items())
        list_rec_items.sort(key=lambda x: x[1], reverse=True)
        rec_items_idx = {item_id: idx + 1 for idx, (item_id, score) in enumerate(list_rec_items)}

        user_ranks = []
        for pred_cluster_id in pred_cluster_ids:
            total_pred += 1
            if item_idx_dict.__contains__(pred_cluster_id):
                pred_hit += 1
                if rec_items_idx.__len__() > 0:
                    item_idx = item_idx_dict[pred_cluster_id]
                    pred_rank = rec_items_idx[item_idx] / num_item
                    user_ranks.append(pred_rank)
        list_user_ranks.append(user_ranks)

        count += 1
        if count > 500:
            break
    rank_mean_users = []
    for user_ranks in list_user_ranks:
        if user_ranks.__len__() > 0:
            rank_mean_users.append(np.mean(user_ranks))
    return np.mean(rank_mean_users), pred_hit / total_pred

In [5]:
base_folder = 'F:\Projects\ListingRecommendation\data\oto.com.vn\\ncf\\'

auto_id_cluster_dict = pickle.load(open(base_folder + 'auto_id_cluster_dict.pkl', 'rb'))
cluster_auto_ids_dict = pickle.load(open(base_folder + 'cluster_auto_ids_dict.pkl', 'rb'))
idx_cluster_dict = pickle.load(open(base_folder + 'idx_cluster_dict.pkl', 'rb'))
idx_user_dict = pickle.load(open(base_folder + 'idx_user_dict.pkl', 'rb'))
idx_item_dict = pickle.load(open(base_folder + 'idx_item_dict.pkl', 'rb'))
test_users = pickle.load(open(base_folder + 'test_users.pkl', 'rb'))

user_idx_dict = {user_id: idx for idx, user_id in idx_user_dict.items()}
item_idx_dict = {item_id: idx for idx, item_id in idx_item_dict.items()}

test_users_in_train = []
for user_session_id, items in test_users:
    if user_idx_dict.__contains__(user_session_id):
        test_users_in_train.append((user_session_id, items))

In [6]:
@tf.function
def train_step(model, optimizer, user_descriptions, user_ids, item_ids, num_items, labels, ratings):
    with tf.GradientTape() as tape:
        predictions = model(user_descriptions, user_ids, item_ids, num_items)
        loss = model.loss_fn(predictions, labels, ratings)
    gradients = tape.gradient(target=loss, sources=model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


def training(fism_model, optimizer, dataset, num_epochs, pretrained=False):
    start_epoch = tf.Variable(0, dtype=tf.int32)
    ckpt = tf.train.Checkpoint(fism_model=fism_model, start_epoch=start_epoch)
    manager = tf.train.CheckpointManager(checkpoint=ckpt, directory='./fism_ckpt', max_to_keep=3)
    if pretrained:
        ckpt.restore(manager.latest_checkpoint)
    rated_data = dataset.get_rated_data()
    user_rated_items = dict()
    for user_id, item_id, rate in rated_data:
        if not user_rated_items.__contains__(user_id):
            user_rated_items[user_id] = []
        user_rated_items[user_id].append(item_id)

    for epoch in range(num_epochs):
        train_loss = tf.constant(0, tf.float32)
        # print('start training epoch: ' + str(epoch))
        start_load_data = time()
        all_batch_data, num_iter = dataset.generate_train_data()
        load_data_time = time() - start_load_data
        start_train_time = time()
        for user_descriptions, user_ids, item_ids, num_items, labels, ratings in tqdm(all_batch_data):
            loss_step = train_step(fism_model, optimizer, user_descriptions, user_ids, item_ids, num_items, labels, ratings)
            #             print('loss_step: ', loss_step.numpy())
            train_loss += loss_step
        train_time = time() - start_train_time
        print('epoch: ', epoch, '. load data time: ', load_data_time, '. train time: ', train_time ,'. train loss: ', train_loss.numpy() / (all_batch_data.__len__()))
        if epoch % 2 == 0:
            fism_model.prepare_for_prediction()
            need_pred, item_sim_hit_rate, in_train_set_rate = item_sim_hit_rate_evaluate(fism_model)
            item_sim_rank, rank_in_train_set_rate = item_sim_rank_score(fism_model)

            ### user
            in_train_rate, user_hit_rate = hit_rate_evaluate(fism_model, user_rated_items)
            user_rank_score, rank_in_train_set = rank_score_evaluate(fism_model, user_rated_items)

            score = {'need_pred': need_pred,
                     'item_sim_hit_rate': item_sim_hit_rate,
                     'in_train_set_rate': in_train_set_rate,
                     'item_sim_rank': item_sim_rank,


                     'cf_hit_rate': user_hit_rate,
                     'cf_in_train_set_rate': in_train_rate,
                     'cf_rank': user_rank_score}

            print('epoch: {}, score: {}'.format(epoch, score))
            # start_epoch = tf.Variable(start_epoch.numpy() + epoch + 1, dtype=tf.int32)
            ckpt.start_epoch.assign_add(epoch + 1)
            manager.save()
            print('done save at epoch: ', start_epoch.numpy())

In [7]:
dataset = DataSet(base_folder + 'train.csv', base_folder + 'test.csv', negative_sample=3, batch_size=1024)

In [8]:
args = dict()
args['embedding_size'] = 50
args['alpha'] = 0.6
args['beta'] = 0.0005
# args['gamma'] = 0.001
# args['lambda_'] = 0.001
args['gamma'] = 0.000
args['lambda_'] = 0.000
args['verborse'] = 1
args['num_items'] = dataset.num_items
args['num_users'] = dataset.num_users
args['confidence_factor'] = 1

fism = FISM(args)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

In [9]:
training(fism, optimizer, dataset, num_epochs=100, pretrained=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
W0827 10:13:30.513999  8260 def_function.py:598] 5 out of the last 8 calls to <function train_step at 0x0000000055D7A9D8> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. Please refer to https://www.tensorflow.org/tutorials/customization/performance#python_or_tensor_args and https://www.tensorflow.org/api_docs/python/tf/function for more details.
W0827 10:13:31.013028  8260 def_function.py:598] 5 out of the last 11 calls to <function train_step at 0x0000000055D7A9D8> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings is likely due to passing python objects instead of tensors. Also, tf.function has experimental_relax_shapes=True option that relaxes ar


epoch:  0 . load data time:  11.828676700592041 . train time:  53.63206744194031 . train loss:  700.7676579925651


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 0, score: {'need_pred': 3057, 'item_sim_hit_rate': 0.27641478573765127, 'in_train_set_rate': 0.7595682041216879, 'item_sim_rank': 0.13751833120494375, 'cf_hit_rate': 0.1968390804597701, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.10465055133530193}
done save at epoch:  1


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  1 . load data time:  14.030802726745605 . train time:  47.53571891784668 . train loss:  427.82992565055764


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  2 . load data time:  11.976684808731079 . train time:  47.65972590446472 . train loss:  343.689968633829


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 2, score: {'need_pred': 3006, 'item_sim_hit_rate': 0.30538922155688625, 'in_train_set_rate': 0.7717897538256819, 'item_sim_rank': 0.10465495917765187, 'cf_hit_rate': 0.23515325670498086, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.08992122158762109}
done save at epoch:  4


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  3 . load data time:  12.139694213867188 . train time:  47.46071481704712 . train loss:  312.93892309479554


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  4 . load data time:  14.646837711334229 . train time:  49.38682460784912 . train loss:  297.0112976301115


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 4, score: {'need_pred': 3025, 'item_sim_hit_rate': 0.2955371900826446, 'in_train_set_rate': 0.7649586776859504, 'item_sim_rank': 0.1161446226447485, 'cf_hit_rate': 0.26580459770114945, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.09863025606742123}
done save at epoch:  9


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  5 . load data time:  12.103692054748535 . train time:  47.321706771850586 . train loss:  288.7306284851301


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  6 . load data time:  11.744671821594238 . train time:  50.21387219429016 . train loss:  283.55248025092936


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 6, score: {'need_pred': 2983, 'item_sim_hit_rate': 0.32215890043580286, 'in_train_set_rate': 0.7703654039557493, 'item_sim_rank': 0.11138210306473398, 'cf_hit_rate': 0.27155172413793105, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.09987771134145011}
done save at epoch:  16


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  7 . load data time:  11.75167202949524 . train time:  48.345765113830566 . train loss:  279.6188719795539


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  8 . load data time:  14.06880497932434 . train time:  45.29659080505371 . train loss:  275.76577021375465


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 8, score: {'need_pred': 3030, 'item_sim_hit_rate': 0.30066006600660067, 'in_train_set_rate': 0.76006600660066, 'item_sim_rank': 0.12160092634440826, 'cf_hit_rate': 0.2825670498084291, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.10360479386649928}
done save at epoch:  25


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  9 . load data time:  12.893737316131592 . train time:  46.02763271331787 . train loss:  273.10484433085503


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  10 . load data time:  11.766672849655151 . train time:  45.10357999801636 . train loss:  269.92718982342006


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 10, score: {'need_pred': 3019, 'item_sim_hit_rate': 0.31069890692282215, 'in_train_set_rate': 0.7687976151043392, 'item_sim_rank': 0.12502236170477296, 'cf_hit_rate': 0.2792145593869732, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.1028874701652126}
done save at epoch:  36


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  11 . load data time:  13.033745527267456 . train time:  50.30887722969055 . train loss:  267.9796119888476


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  12 . load data time:  12.696726322174072 . train time:  48.23975896835327 . train loss:  266.14155436802974


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 12, score: {'need_pred': 3085, 'item_sim_hit_rate': 0.2985413290113452, 'in_train_set_rate': 0.7614262560777958, 'item_sim_rank': 0.11920139164111163, 'cf_hit_rate': 0.29118773946360155, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.10865541398109826}
done save at epoch:  49


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  13 . load data time:  13.400766611099243 . train time:  48.56577777862549 . train loss:  265.0007841542751


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  14 . load data time:  11.862678289413452 . train time:  47.7607319355011 . train loss:  263.2503775557621


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=320.0), HTML(value='')))


epoch: 14, score: {'need_pred': 3089, 'item_sim_hit_rate': 0.3072191647782454, 'in_train_set_rate': 0.7730657170605374, 'item_sim_rank': 0.12534601729691683, 'cf_hit_rate': 0.2792145593869732, 'cf_in_train_set_rate': 0.7825670498084292, 'cf_rank': 0.10860631937568954}
done save at epoch:  64


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  15 . load data time:  14.054803609848022 . train time:  54.60412335395813 . train loss:  261.4638998605948


HBox(children=(FloatProgress(value=0.0, max=1076.0), HTML(value='')))


epoch:  16 . load data time:  13.87579345703125 . train time:  56.855252265930176 . train loss:  260.08460153345726


HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1369.0), HTML(value='')))




KeyboardInterrupt: 

#### bpr_mf score:
    {'need_pred': 2264, 'hit_rate': 0.28931095406360424, 'in_train_set_rate': 0.7628091872791519, 'rank': 0.11048865212027534, 'cf_need_pred': 2221, 'cf_hit_rate': 0.14633048176497074, 'cf_in_train_set_rate': 0.7757766771724448, 'cf_rank': 0.12322572834338055

### prediction

In [10]:
rated_data = dataset.get_rated_data()
user_rated_items = dict()
for user_id, item_id, rate in rated_data:
    if not user_rated_items.__contains__(user_id):
        user_rated_items[user_id] = []
    user_rated_items[user_id].append(item_id)

In [11]:
for i in tqdm(range(10)):
    predict_top_n(fism, i, user_rated_items, batch_size=4096)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [12]:
fism.num_items

23606

In [13]:
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [14]:
sigmoid(11.5), sigmoid(9.9), sigmoid(4)

(0.9999898700090192, 0.9999498278353162, 0.9820137900379085)

In [15]:
predict_top_n(fism, 2, user_rated_items, batch_size=4096)

[(3685, 12.186567),
 (13, 10.799454),
 (13168, 9.855587),
 (12801, 9.417419),
 (12855, 9.372033),
 (14, 9.280977),
 (6248, 9.12025),
 (1686, 8.478687),
 (14536, 8.3661785),
 (1773, 8.123233),
 (5509, 8.041772),
 (2496, 7.95879),
 (2603, 7.692417),
 (6505, 7.6681337),
 (1008, 7.4739704),
 (17838, 7.378082),
 (15, 7.3428006),
 (17566, 7.290766),
 (5706, 7.2192183),
 (13345, 7.164782),
 (17894, 7.150196),
 (6199, 7.13529),
 (4730, 7.1103134),
 (16, 7.0262804),
 (12, 6.917858),
 (18, 6.881666),
 (11161, 6.765689),
 (13875, 6.6233025),
 (1687, 6.592203),
 (21115, 6.575879),
 (17314, 6.4467955),
 (8981, 6.3982387),
 (1650, 6.23897),
 (18866, 6.2379932),
 (9207, 6.07539),
 (13746, 6.067525),
 (8420, 6.053816),
 (21997, 6.026434),
 (12826, 5.9899263),
 (13438, 5.9805274),
 (17696, 5.7239513),
 (5864, 5.7172956),
 (488, 5.7093563),
 (3275, 5.6911182),
 (12416, 5.6675043),
 (21240, 5.6552343),
 (2577, 5.503698),
 (1369, 5.488298),
 (2464, 5.480237),
 (6398, 5.4657464),
 (5510, 5.437488),
 (6587,

In [16]:
raw_u_matrix_df = pickle.load(open(base_folder + 'raw_u_matrix_df.pkl', 'rb'))
u_matrix_df = pickle.load(open(base_folder + 'u_matrix_df.pkl', 'rb'))
auto_id_cluster_dict = pickle.load(open(base_folder + 'auto_id_cluster_dict.pkl', 'rb'))
cluster_auto_ids_dict = pickle.load(open(base_folder + 'cluster_auto_ids_dict.pkl', 'rb'))
idx_cluster_dict = pickle.load(open(base_folder + 'idx_cluster_dict.pkl', 'rb'))
idx_user_dict = pickle.load(open(base_folder + 'idx_user_dict.pkl', 'rb'))
idx_item_dict = pickle.load(open(base_folder + 'idx_item_dict.pkl', 'rb'))

In [17]:
cluster_model = pickle.load(open('F:\Projects\ListingRecommendation\data\oto.com.vn' + os.sep + 'cluster_model.pkl', 'rb'))
idx_year_dict = dict()
for year_item in cluster_model['year_value_ids']:
    value = ''
    if year_item.__contains__('gte'):
        value += (str(year_item['gte']) + '-')
    if year_item.__contains__('lt'):
        value += ('-' +str(year_item['lt']))
    idx_year_dict[year_item['id']] = value

idx_price_dict = dict()
for price_item in cluster_model['price_value_ids']:
    value = ''
    if price_item.__contains__('gte'):
        value += (str(price_item['gte']) + '-')
    if price_item.__contains__('lt'):
        value += ('-' +str(price_item['lt']))
    idx_price_dict[price_item['id']] = value
# idx_exterior_color_dict = {idx:color for color, idx in cluster_model['exterior_color_value_ids'].items()}

In [18]:
base_folder = 'F:\Projects\ListingRecommendation\data\oto.com.vn\\ncf\\'

In [19]:
def random_test_items_rec():
    clear_output()
    random_user_idx = random.randint(0, fism.num_users)    
    rec_items = predict_top_n(fism, random_user_idx, user_rated_items, batch_size=4096)
    count_display = 0
    for item_id in user_rated_items[random_user_idx][:50]:
        cluster_key = idx_cluster_dict[idx_item_dict[item_id]]    
        list_keys = list(cluster_key)
        list_keys[2] = idx_year_dict[list_keys[2]]
        list_keys[3] = idx_price_dict[list_keys[3]]
#         list_keys[5] = idx_exterior_color_dict[list_keys[5]]
        print(str(cluster_key) + '\t----\t' + str(list_keys))
        count_display += 1
        if count_display> 50:
            break
    count_display = 0
    print('-' * 60)
    print('-' * 60)
    print('-' * 60)
    for item_id, score in rec_items:
        cluster_key = idx_cluster_dict[idx_item_dict[item_id]]    
        list_keys = list(cluster_key)
        list_keys[2] = idx_year_dict[list_keys[2]]
        list_keys[3] = idx_price_dict[list_keys[3]]
#         list_keys[5] = idx_exterior_color_dict[list_keys[5]]
        print(str(cluster_key) + '\t----\t' + str(list_keys))
        count_display += 1
        if count_display> 20:
            break

button_other = widgets.Button(description='random test')
out_other = widgets.Output()
def on_button_clicked(_):
      # "linking function with output"
      with out_other:
          # what happens when we press the button
          random_test_items_rec()
# linking button and function together using a button's method
button_other.on_click(on_button_clicked)
# displaying button and its output together
widgets.VBox([button_other,out_other])

VBox(children=(Button(description='random test', style=ButtonStyle()), Output()))

### Evaluation

In [16]:
test_users = pickle.load(open(base_folder + 'test_users.pkl', 'rb'))

In [18]:
test_users.__len__()

1369

In [21]:
user_idx_dict = {user_id:idx for idx, user_id in idx_user_dict.items()}
item_idx_dict = {item_id:idx for idx, item_id in idx_item_dict.items()}

In [26]:
test_user = test_users[0]

In [28]:
user_id = user_idx_dict[test_user[0]]

In [30]:
predict_top_n(fism, user_id, user_rated_items, batch_size=4096, top_n=-1)

[(2025, 1.5633965),
 (1803, 1.5580299),
 (19434, 1.5227872),
 (11960, 1.4938323),
 (19256, 1.410286),
 (13423, 1.3933122),
 (727, 1.3816149),
 (3343, 1.3751199),
 (16868, 1.3733063),
 (2314, 1.3621482),
 (4538, 1.3539121),
 (4610, 1.3500317),
 (13000, 1.3422687),
 (5623, 1.3413368),
 (11710, 1.3382826),
 (21291, 1.3306535),
 (4522, 1.3277571),
 (9044, 1.3231375),
 (6481, 1.3223011),
 (1292, 1.3207104),
 (21022, 1.311219),
 (867, 1.307471),
 (15436, 1.30653),
 (3369, 1.2994673),
 (17754, 1.2991552),
 (6203, 1.2987642),
 (2834, 1.295771),
 (5221, 1.2947232),
 (799, 1.2946851),
 (2535, 1.2936642),
 (17040, 1.2841288),
 (1491, 1.2779826),
 (14158, 1.2696521),
 (9145, 1.2670518),
 (6437, 1.2665544),
 (12965, 1.2644998),
 (2023, 1.2618456),
 (16546, 1.2596278),
 (22397, 1.2583109),
 (14248, 1.2575527),
 (19925, 1.2527559),
 (17842, 1.2497922),
 (10605, 1.2466345),
 (17641, 1.245878),
 (23262, 1.2438996),
 (11351, 1.242455),
 (325, 1.2423756),
 (2069, 1.2418395),
 (2214, 1.2403322),
 (2836, 1

In [31]:
test_user

('01pqZKkmGrZvh5OD0qh8DGaeGbCpEjGT',
 [53541.0, 291027.0, 138234.0, 66036.0, 62454.0, 346291.0])

In [38]:
test_users_in_train = []
for user_session_id, items in test_users:
    if user_idx_dict.__contains__(user_session_id):
        test_users_in_train.append((user_session_id, items))

In [40]:
test_users_in_train.__len__()

320

In [60]:
in_train_rate = in_train_count/total_items
hit_rate = count_hit/total_items
in_train_rate, hit_rate

(0.7823613086770982, 0.06401137980085349)

In [56]:
top_item_ids = {rec_item[0] for rec_item in rec_top_n}

In [57]:
top_item_ids

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [49]:
rec_top_n

[(14909, 1.8628606),
 (1955, 1.8406407),
 (16909, 1.7795709),
 (21015, 1.7754593),
 (17611, 1.7729967),
 (8274, 1.75195),
 (20495, 1.7392932),
 (21263, 1.7232723),
 (5623, 1.7203313),
 (20069, 1.7122546),
 (12613, 1.7060773),
 (4208, 1.7003525),
 (20078, 1.683277),
 (20638, 1.6806306),
 (6215, 1.6736042),
 (953, 1.651202),
 (16038, 1.6423681),
 (14522, 1.6128026),
 (16727, 1.601887),
 (20172, 1.5859454),
 (1831, 1.5849069),
 (10734, 1.5782744),
 (11629, 1.5761318),
 (21583, 1.5742289),
 (18201, 1.5654683),
 (1981, 1.5629481),
 (8552, 1.556633),
 (445, 1.5542078),
 (1116, 1.5529963),
 (5323, 1.5479565),
 (20610, 1.5455303),
 (3243, 1.5426302),
 (10971, 1.5361725),
 (10330, 1.5350013),
 (11754, 1.5233514),
 (20219, 1.520942),
 (20073, 1.5188262),
 (15745, 1.5156312),
 (14564, 1.5056658),
 (135, 1.5047414),
 (6061, 1.503298),
 (21756, 1.5020897),
 (9235, 1.5018246),
 (6871, 1.4878578),
 (4875, 1.4869689),
 (9543, 1.4816445),
 (21623, 1.4801532),
 (15390, 1.4778388),
 (14679, 1.477642),
 (

In [73]:
t1 = tf.Variable([[1, 2, 3], [4, 5, 6]], dtype=tf.float32)
t2 = tf.Variable([[7, 8, 9], [10, 11, 12]], dtype=tf.float32)
t3 = tf.concat([t1, t2], 0)

In [74]:
with tf.GradientTape() as tape:
    out = tf.reduce_sum(t3)

In [75]:
tape.gradient(t3, [t1, t2])

[None, None]