In [1]:
import collections
import os
import numpy as np
import pandas as pd
import pickle
import time
import multiprocessing as mp
from functools import partial
import json
from collections import OrderedDict
import random

In [2]:
def load_rating(d):
    """Load the interaction file of dataset ``d`` and group items by user.

    Reads ``../data/<d>/ratings_final.npy`` and uses its first two columns
    as (user, item) pairs. The remaining column(s) are not consulted, so
    every row contributes to the history regardless of its label
    (NOTE(review): presumably column 2 is a rating/label -- confirm whether
    rows should be filtered on it).

    Args:
        d: dataset directory name under ``../data/``.

    Returns:
        dict mapping each user id to the list of item ids found in that
        user's rows, in file order, one entry per row.
    """
    print('reading rating file ...')

    # reading rating file
    rating_np = np.load('../data/' + d + '/ratings_final' + '.npy')

    n_user = len(set(rating_np[:, 0]))
    n_item = len(set(rating_np[:, 1]))
    print('u %d, i %d' % (n_user, n_item))

    user_history_dict = dict()
    for user, item in rating_np[:, :2]:
        # setdefault creates the list once and appends exactly once per row;
        # the previous "init with [item], then append(item)" form stored the
        # first item of every user twice.
        user_history_dict.setdefault(user, []).append(item)

    return user_history_dict

def load_kg(d):
    """Load the knowledge-graph triples of dataset ``d`` as an adjacency map.

    Reads ``../data/<d>/kg_final.npy``, whose rows are unpacked as
    (head, relation, tail), prints the entity and relation counts, and
    returns a ``defaultdict(list)`` mapping each head to a list of
    ``(tail, relation)`` tuples in file order.
    """
    print('reading KG file ...')
    # reading kg file
    triples = np.load('../data/' + d + '/kg_final.npy')

    head_ids = set(triples[:, 0])
    tail_ids = set(triples[:, 2])
    relation_ids = set(triples[:, 1])
    print('e %d, r %d' % (len(head_ids.union(tail_ids)), len(relation_ids)))

    adjacency = collections.defaultdict(list)
    for row in triples:
        head, relation, tail = row
        # Stored tail-first, matching how consumers index the pair.
        adjacency[head].append((tail, relation))
    return adjacency


In [3]:
# Dataset directory name under ../data/ that both loaders read from.
d = 'amazon-book'
# user -> list of items; read as a module-level global by get_ripple_set.
user_history_dict = load_rating(d)
# head -> list of (tail, relation) pairs; read as a module-level global by _get_ripple_set.
kg = load_kg(d)

reading rating file ...
u 6969, i 9854
reading KG file ...
e 113487, r 39


In [None]:
# NOTE: get_ripple_set is defined in a later cell of this notebook; that cell
# has to be executed before this one on a fresh kernel.
t_start = time.time()
h = 2   # forwarded as get_ripple_set's h (number of hops); reused by the next cell's hop loop
m = 32  # forwarded as get_ripple_set's m (triples kept per hop)
c = 8   # forwarded as get_ripple_set's c (upper bound on pool processes)
r, entity_interaction_dict = get_ripple_set(h, m, c)
print('cost: %.3f' % (time.time() - t_start))

# Flatten the per-user entity lists returned alongside the ripple sets.
# user_total starts empty here and is filled by the following cell.
user_total = []
user_total_interact = []
for e_i in entity_interaction_dict.values():
    user_total_interact += e_i

constructing ripple set ...
cost: 63.265


In [None]:
t_start = time.time()
# Collect every entity reachable from each user's items within `h` hops of the
# full KG (no sampling), to compare against the entities that the sampled
# ripple sets actually touched (user_total_interact).
# Sets are used throughout: only the distinct counts are reported, and a flat
# list with one entry per traversed edge grows very large.
reachable = set(user_total)
for its in user_history_dict.values():
    frontier = set(its)
    reachable |= frontier
    for _ in range(h):
        next_frontier = set()
        for i in frontier:
            # kg is a defaultdict: kg[i] would insert an empty list for every
            # entity without outgoing triples; .get() leaves kg untouched.
            next_frontier.update(t for t, _rel in kg.get(i, ()))
        reachable |= next_frontier
        frontier = next_frontier
# Keep the name bound to a list for any later cell that expects one.
user_total = list(reachable)

print(len(set(user_total_interact)))
print(len(reachable))
print('cost: %.3f' % (time.time() - t_start))

In [4]:
def get_ripple_set(h=2, m=32, c=12):
    """Build the ripple set of every user in ``user_history_dict`` in parallel.

    Each (user, history) pair of the module-level ``user_history_dict`` is
    handed to ``_get_ripple_set`` through a process pool.

    Args:
        h: number of hops, forwarded as ``n_hop``.
        m: triples kept per hop, forwarded as ``n_memory``.
        c: upper bound on the number of pool processes (also capped by
           ``mp.cpu_count()``).

    Returns:
        (ripple_set, entity_interaction_dict), both ``defaultdict(list)``:
        ripple_set maps user -> int32 array built from
        [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, ...), ...];
        entity_interaction_dict maps user -> list of entities appearing in
        that user's ripple set.
    """
    print('constructing ripple set ...')

    ripple_set = collections.defaultdict(list)
    entity_interaction_dict = collections.defaultdict(list)

    n_workers = min(mp.cpu_count(), c)
    with mp.Pool(processes=n_workers) as pool:
        worker = partial(_get_ripple_set, n_hop=h, n_memory=m)
        # starmap unpacks each (user, history) item into positional arguments.
        results = pool.starmap(worker, user_history_dict.items())
        for user, hops, touched_entities in results:
            ripple_set[user] = np.array(hops, dtype=np.int32)
            entity_interaction_dict[user] = touched_entities

    return ripple_set, entity_interaction_dict

def _get_ripple_set(user, history, n_hop=2, n_memory=32, n_neighbor=16, seed=None):
    """Build the multi-hop ripple set of a single user.

    Hop 0 expands the entities in ``history``; every later hop expands the
    tails kept by the previous hop. For each expanded entity at most
    ``n_neighbor`` of its ``(tail, relation)`` pairs from the module-level
    ``kg`` are sampled, and the hop is then resampled to exactly
    ``n_memory`` triples (with replacement only when fewer than
    ``n_memory`` candidates exist).

    Args:
        user: user id, returned unchanged so pool results can be matched up.
        history: iterable of the user's item/entity ids.
        n_hop: number of hops to expand.
        n_memory: number of triples kept per hop.
        n_neighbor: cap on the neighbours sampled per expanded entity.
        seed: optional seed for this call's private generator. ``None``
            seeds from OS entropy. Passing the same seed for every user
            makes all users share one sampling stream.

    Returns:
        ``[user, hops, entities]`` where ``hops`` is a list of
        ``[heads, relations, tails]`` (three lists of length ``n_memory``)
        per hop and ``entities`` lists the distinct heads and tails kept.

    Raises:
        IndexError: if hop 0 yields no triples at all, because there is no
            previous hop to copy.
    """
    # A private generator per call instead of the global `random` / `np.random`
    # state: the caller's seeds are never clobbered, and pool workers started
    # by fork (which inherit identical copies of numpy's global state) cannot
    # end up drawing the same index stream. This also replaces the former
    # per-entity `random.seed(time.time())` reseeding.
    rng = random.Random(seed)

    ret = []
    touched = set()
    for hop in range(n_hop):
        sources = history if hop == 0 else ret[-1][2]

        candidates = []
        for entity in sources:
            # .get() so that entities without outgoing triples do not get an
            # empty list inserted into the defaultdict.
            neighbors = kg.get(entity)
            if not neighbors:
                continue
            for tail, relation in rng.sample(neighbors, min(len(neighbors), n_neighbor)):
                candidates.append((entity, relation, tail))

        # if the current ripple set of the given user is empty, we simply copy
        # the ripple set of the last hop here; the original note states this is
        # not expected for hop 0 (see Raises)
        if not candidates:
            ret.append(ret[-1])
            continue

        n_candidates = len(candidates)
        if n_candidates >= n_memory:
            # enough candidates: draw without replacement
            picked = rng.sample(range(n_candidates), n_memory)
        else:
            # too few candidates: draw with replacement to reach n_memory
            picked = [rng.randrange(n_candidates) for _ in range(n_memory)]

        memories_h = [candidates[i][0] for i in picked]
        memories_r = [candidates[i][1] for i in picked]
        memories_t = [candidates[i][2] for i in picked]
        touched.update(memories_h)
        touched.update(memories_t)
        ret.append([memories_h, memories_r, memories_t])
    return [user, ret, list(touched)]