In [None]:
!pip -q install scipy tqdm
import torch, sys, types, textwrap, importlib
# Report the accelerator without aborting the cell on CPU-only runtimes:
# torch.cuda.get_device_name(0) raises when no CUDA device is visible.
print('CUDA device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'not available (running on CPU)')

CUDA device: NVIDIA L4


In [None]:
def register_module(name: str, src: str):
    """
    Build a module from source text and register it in sys.modules.

    The source is dedented, executed in a fresh ModuleType namespace and then
    stored under ``name`` so that later ``import`` statements resolve to it.
    For dotted names every ancestor package is linked as well: a missing
    ancestor is created as an empty module, and each level is attached to its
    parent as an attribute (``a.b.c`` gives ``a.b`` and ``a`` with
    ``a.b.c is sys.modules['a.b.c']``).

    Args:
        name: dotted module name to register, e.g. ``'constants.consts'``.
        src: Python source code of the module (may be uniformly indented).

    Raises:
        Whatever executing ``src`` raises; in that case nothing is registered.
    """
    module = types.ModuleType(name)
    exec(textwrap.dedent(src), module.__dict__)
    sys.modules[name] = module

    # Walk upwards one dot at a time so names deeper than 'pkg.mod' are linked
    # correctly instead of setting an attribute literally called 'sub.mod'.
    child_name, child_module = name, module
    while '.' in child_name:
        parent_name, _, attr = child_name.rpartition('.')
        parent_module = sys.modules.get(parent_name)
        if parent_module is None:
            parent_module = types.ModuleType(parent_name)
            sys.modules[parent_name] = parent_module
        setattr(parent_module, attr, child_module)
        child_name, child_module = parent_name, parent_module


In [None]:
consts_src = '''
"""DATASET_DIR = 'Digital_Music_dataset/'
DATA_DIR = 'Digital_Music_data/'
DATA_IX_DIR = 'Digital_Music_data_ix/'
DATA_IX_MAPPING_DIR = 'Digital_Music_ix_mapping/'
PATH_DATA_DIR = 'Digital_Music_path_data/'
DATA_FILE = 'reviews_Digital_Music_5.json.gz'"""

# Active dataset: MovieLens 100k. The neighbouring triple-quoted blocks hold the
# same six settings for other datasets; being bare string literals they are
# inert, so switching dataset means moving the quotes.
DATASET_DIR = 'MovieLens_100k_dataset/'
DATA_DIR = 'MovieLens_100k_data/'
DATA_IX_DIR = 'MovieLens_100k_data_ix/'
DATA_IX_MAPPING_DIR = 'MovieLens_100k_ix_mapping/'
PATH_DATA_DIR = 'MovieLens_100k_path_data/'
DATA_FILE = 'u.data'

"""DATASET_DIR = 'Douban_dataset/'
DATA_DIR = 'Douban_data/'
DATA_IX_DIR = 'Douban_data_ix/'
DATA_IX_MAPPING_DIR = 'Douban_ix_mapping/'
PATH_DATA_DIR = 'Douban_path_data/'
DATA_FILE = 'training_test_dataset.mat'"""

"""DATASET_DIR = 'Jester_dataset/'
DATA_DIR = 'Jester_data/'
DATA_IX_DIR = 'Jester_data_ix/'
DATA_IX_MAPPING_DIR = 'Jester_ix_mapping/'
PATH_DATA_DIR = 'Jester_path_data/'
DATA_FILE = 'Jester.csv'"""

# File names of the pickled relation dictionaries. The preprocessing functions
# join them onto an export/import directory with plain string concatenation,
# which is why the directory constants above end in '/'.
USER_SIM_DICT = 'user_sim.dict'
ITEM_SIM_DICT = 'item_sim.dict'
USER_ITEM_DICT = 'user_item.dict'
ITEM_USER_DICT = 'item_user.dict'
ITEM_DIRECTOR_DICT = 'item_director.dict'
DIRECTOR_ITEM_DICT = 'director_item.dict'
ITEM_ACTOR_DICT = 'item_actor.dict'
ACTOR_ITEM_DICT = 'actor_item.dict'
TRAIN_USER_ITEM_DICT = 'train_user_item.dict'
TRAIN_ITEM_USER_DICT = 'train_item_user.dict'
VALID_USER_ITEM_DICT = 'valid_user_item.dict'
VALID_ITEM_USER_DICT = 'valid_item_user.dict'
TEST_USER_ITEM_DICT = 'test_user_item.dict'
TEST_ITEM_USER_DICT = 'test_item_user.dict'
# Per-rating-value dictionaries (rating 1..5); data_prep writes these with a
# 'train_' prefix in front of the name.
USER_ITEM_1_DICT = 'user_item_1.dict'
USER_ITEM_2_DICT = 'user_item_2.dict'
USER_ITEM_3_DICT = 'user_item_3.dict'
USER_ITEM_4_DICT = 'user_item_4.dict'
USER_ITEM_5_DICT = 'user_item_5.dict'
ITEM_USER_1_DICT = 'item_user_1.dict'
ITEM_USER_2_DICT = 'item_user_2.dict'
ITEM_USER_3_DICT = 'item_user_3.dict'
ITEM_USER_4_DICT = 'item_user_4.dict'
ITEM_USER_5_DICT = 'item_user_5.dict'
# Path file names; presumably joined onto PATH_DATA_DIR by code outside this
# section -- TODO confirm.
TRAIN_PATH_FILE = 'train_path_file.txt'
VALID_PATH_FILE = 'valid_path_file.txt'
TEST_PATH_FILE = 'test_path_file.txt'

# File names of the vocabulary mappings pickled by ix_mapping.
TYPE_TO_IX = 'type_to_ix.dict'
RELATION_TO_IX = 'relation_to_ix.dict'
ENTITY_TO_IX = 'entity_to_ix.dict'
IX_TO_TYPE = 'ix_to_type.dict'
IX_TO_RELATION = 'ix_to_relation.dict'
IX_TO_ENTITY = 'ix_to_entity.dict'

# Padding key shared by the entity, type and relation vocabularies.
PAD_TOKEN = '#PAD_TOKEN'
# Entity type ids. ix_mapping registers only user, item and the pad type;
# the director/actor ids are not referenced by the code in this section.
USER_TYPE = 0
ITEM_TYPE = 1
PAD_TYPE = 2
DIRECTOR_TYPE = 3
ACTOR_TYPE = 4

# Relation ids stored in the third slot of a path step. 0-4 are user->item
# edges for ratings 1..5, 5-9 the reverse item->user edges.
USER_ITEM_1_REL = 0
USER_ITEM_2_REL = 1
USER_ITEM_3_REL = 2
USER_ITEM_4_REL = 3
USER_ITEM_5_REL = 4
ITEM_USER_1_REL = 5
ITEM_USER_2_REL = 6
ITEM_USER_3_REL = 7
ITEM_USER_4_REL = 8
ITEM_USER_5_REL = 9
USER_SIM_REL = 10
ITEM_SIM_REL = 11
# END_REL marks the last step of a path (no outgoing relation); PAD_REL is the
# relation id of padding steps.
END_REL = 12
PAD_REL = 13
# Director/actor relations: not referenced by the code in this section.
ITEM_DIRECTOR_REL = 14
DIRECTOR_ITEM_REL = 15
ITEM_ACTOR_REL = 16
ACTOR_ITEM_REL = 17

# Presumably embedding / hidden sizes of the model; the model code is not part
# of this section -- TODO confirm.
ENTITY_EMB_DIM = 128
TYPE_EMB_DIM = 32
REL_EMB_DIM = 32
HIDDEN_DIM = 256
ATTENTION_DIM = 128
# format_paths pads every path to this many steps.
MAX_PATH_LEN = 5
# Presumably the number of paths kept per user-item pair (the value passed to
# format_paths as its last argument) -- TODO confirm against the caller.
SAMPLES = 30
# SAMPLES = 100
'''
register_module('constants.consts', consts_src)
from constants import consts   


In [None]:
format_src = '''
import constants.consts as consts

"""
functions used for converting path data into format for the model
"""

def format_paths(paths, e_to_ix, t_to_ix, r_to_ix, sampels):
    """
    Pads each path to consts.MAX_PATH_LEN steps and pads the list of paths up
    to ``sampels`` entries, returning (padded_path, path length) tuples.

    Every input path is copied before it is padded, so the caller's lists are
    left untouched; calling this twice on the same ``paths`` therefore reports
    the same original lengths instead of MAX_PATH_LEN on the second call.

    Args:
        paths: list of paths, each a list of [entity, type, relation] steps.
        e_to_ix, t_to_ix, r_to_ix: index dicts that contain consts.PAD_TOKEN.
        sampels: target number of entries; when fewer paths are given, all-pad
            filler paths are appended. Extra paths are NOT truncated.

    Returns:
        list of (padded_path, path_len) tuples. Filler entries report a length
        of 1 (kept from the original behaviour; presumably because downstream
        sequence packing cannot take zero-length sequences -- TODO confirm).
    """
    # Built unconditionally so a missing PAD_TOKEN fails fast, as before.
    padding_path = pad_path([], e_to_ix, t_to_ix, r_to_ix, consts.MAX_PATH_LEN, consts.PAD_TOKEN)

    new_paths = []
    for path in paths:
        # Pad a shallow copy: pad_path appends to the list it is given.
        padded = pad_path(list(path), e_to_ix, t_to_ix, r_to_ix, consts.MAX_PATH_LEN, consts.PAD_TOKEN)
        new_paths.append((padded, len(path)))

    for _ in range(sampels - len(paths)):
        # Each filler gets its own lists so entries never alias one another.
        new_paths.append(([list(step) for step in padding_path], 1))
    return new_paths


def pad_path(seq, e_to_ix, t_to_ix, r_to_ix, max_len, padding_token):
    """
    Extends ``seq`` in place with padding steps until it holds ``max_len`` steps.

    A padding step is [entity_pad, type_pad, relation_pad], each value looked
    up under ``padding_token`` in the matching index dict. A sequence that is
    already at or beyond ``max_len`` is left as it is. The same list object
    that was passed in is returned.
    """
    pad_rel = r_to_ix[padding_token]
    pad_type = t_to_ix[padding_token]
    pad_entity = e_to_ix[padding_token]

    shortfall = max_len - len(seq)
    # range() of a non-positive number is empty, so long sequences are untouched.
    seq.extend([pad_entity, pad_type, pad_rel] for _ in range(shortfall))
    return seq
'''
register_module('data.format', format_src)
from data.format import format_paths, pad_path


In [None]:
path_extr_src = """
import sys
from os import path
from collections import defaultdict
import copy
import random
sys.path.append(path.dirname(path.dirname(path.abspath('./constants'))))
import constants.consts as consts


class PathState:
    '''
    One partial path held on the search stack.

    path     -- list of [entity, entity type, relation to next] triplets
    length   -- length value supplied by the caller for this path
    entities -- set of entities already on the path, kept to avoid cycles
    '''

    def __init__(self, path, length, entities):
        self.path, self.length, self.entities = path, length, entities


def get_random_index(nums, max_length):
    '''
    Returns the first nums entries of a random permutation of range(max_length).

    The permutation is produced with random.shuffle, so the result depends on
    the state of the module-level random generator. When nums is at least
    max_length every index is returned.
    '''
    shuffled = [position for position in range(max_length)]
    random.shuffle(shuffled)
    picked = shuffled[:nums]
    return picked


def _push_sampled_neighbours(front, neighbours, relation, neighbour_type, sample_nums, stack):
    '''
    Pushes one extended PathState per sampled, not-yet-visited neighbour.

    Up to sample_nums positions of neighbours are drawn with get_random_index.
    For each drawn neighbour that is not already on the path, the path is
    deep-copied, the relation slot of its last step is set to relation, and a
    new last step [neighbour, neighbour_type, END_REL] is appended.
    '''
    for index in get_random_index(sample_nums, len(neighbours)):
        neighbour = neighbours[index]
        if neighbour in front.entities:
            continue    # would create a cycle
        new_path = copy.deepcopy(front.path)
        new_path[-1][2] = relation
        new_path.append([neighbour, neighbour_type, consts.END_REL])
        stack.append(PathState(new_path, front.length + 1, front.entities | {neighbour}))


def find_paths_user_to_items(start_user, user_sim, item_sim, user_item_1, user_item_2, user_item_3, user_item_4,
                             user_item_5, item_user_1,  item_user_2, item_user_3, item_user_4, item_user_5, max_length,
                             sample_nums):
    '''
    Finds sampled paths of exactly max_length edges from a user to items.

    A depth-first search (explicit stack) starts at start_user. At every step
    each adjacency dict that applies to the current entity type is consulted in
    a fixed order, and at most sample_nums randomly chosen neighbours per dict
    are followed. Entities already on a path are never revisited.

    Args:
        start_user: entity the search starts from (typed as a user).
        user_sim, item_sim: similar-user / similar-item adjacency dicts.
        user_item_1..5: user -> items adjacency dicts, one per rating value.
        item_user_1..5: item -> users adjacency dicts, one per rating value.
        max_length: number of edges a recorded path must have.
        sample_nums: maximum neighbours followed per adjacency dict per step.

    Returns:
        defaultdict(list) mapping each reached item to the list of paths that
        end on it; a path is a list of [entity, entity type, relation] steps
        whose last step carries END_REL.
    '''
    # (adjacency dict, relation id, type of the neighbour), listed in the order
    # the expansions are performed. The order matters: it fixes both the
    # sequence of random draws and the order in which states are stacked.
    user_edges = (
        (user_sim, consts.USER_SIM_REL, consts.USER_TYPE),
        (user_item_1, consts.USER_ITEM_1_REL, consts.ITEM_TYPE),
        (user_item_2, consts.USER_ITEM_2_REL, consts.ITEM_TYPE),
        (user_item_3, consts.USER_ITEM_3_REL, consts.ITEM_TYPE),
        (user_item_4, consts.USER_ITEM_4_REL, consts.ITEM_TYPE),
        (user_item_5, consts.USER_ITEM_5_REL, consts.ITEM_TYPE),
    )
    item_edges = (
        (item_sim, consts.ITEM_SIM_REL, consts.ITEM_TYPE),
        (item_user_1, consts.ITEM_USER_1_REL, consts.USER_TYPE),
        (item_user_2, consts.ITEM_USER_2_REL, consts.USER_TYPE),
        (item_user_3, consts.ITEM_USER_3_REL, consts.USER_TYPE),
        (item_user_4, consts.ITEM_USER_4_REL, consts.USER_TYPE),
        (item_user_5, consts.ITEM_USER_5_REL, consts.USER_TYPE),
    )

    item_to_paths = defaultdict(list)
    stack = [PathState([[start_user, consts.USER_TYPE, consts.END_REL]], 0, {start_user})]
    while stack:
        front = stack.pop()
        entity, entity_type = front.path[-1][0], front.path[-1][1]

        if front.length == max_length:
            # Only full-length paths that end on an item are recorded;
            # full-length paths are never extended further.
            if entity_type == consts.ITEM_TYPE:
                item_to_paths[entity].append(front.path)
            continue

        if entity_type == consts.USER_TYPE:
            edges = user_edges
        elif entity_type == consts.ITEM_TYPE:
            edges = item_edges
        else:
            edges = ()

        for adjacency, relation, neighbour_type in edges:
            if entity in adjacency:
                _push_sampled_neighbours(front, adjacency[entity], relation, neighbour_type, sample_nums, stack)

    return item_to_paths
"""
register_module('path_extraction', path_extr_src)
from path_extraction import find_paths_user_to_items


In [None]:
prep_src = r'''
# ---------- start of the main file ----------
import pandas as pd, numpy as np, gzip, json, pickle, argparse, random, sys, torch
from tqdm import tqdm
from scipy.sparse import csr_matrix
from path_extraction import find_paths_user_to_items     # unchanged
from data.format import format_paths                     # ← needed as-is, without changes

import random
import sys
from os import path, mkdir
sys.path.append(path.dirname(path.abspath('../constants')))
import constants.consts as consts


def parse_args():
    """
    Builds the preprocessing options and returns them with their defaults.

    The parser is always run on an empty argument list (args=[]), so the
    contents of sys.argv are ignored and every option takes its default value.
    """
    option_table = [
        ('--data_file', {'default': consts.DATA_FILE,
                         'help': 'Path to the json.gz file containing rating information'}),
        ('--rating_data_file', {'default': 'rating_data.csv',
                                'help': 'Path to the csv file containing rating data'}),
        ('--rating_train_data_file', {'default': 'rating_train.csv',
                                      'help': 'Path to the csv file containing training data'}),
        ('--rating_valid_data_file', {'default': 'rating_valid.csv',
                                      'help': 'Path to the csv file containing validating data'}),
        ('--rating_test_data_file', {'default': 'rating_test.csv',
                                     'help': 'Path to the csv file containing testing data'}),
        ('--split_data', {'default': False,
                          'help': 'whether to split the data'}),
        ('--alpha', {'type': float,
                     'default': 0.3,
                     'help': 'alpha for constructing similarity'}),
    ]

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args(args=[])


def read_rating_data(data_file):
    """
    Converts a gzip-compressed file of one JSON object per line into
    rating_data.csv inside consts.DATASET_DIR.

    Only the 'reviewerID', 'asin' and 'overall' fields are kept; they are
    written out under the column names user_id, item_id and ratings.
    """
    with gzip.open(data_file) as stream:
        records = [json.loads(raw_line.strip()) for raw_line in stream]

    ratings = pd.DataFrame(records)[['reviewerID', 'asin', 'overall']]
    ratings.columns = ['user_id', 'item_id', 'ratings']

    # save data
    ratings.to_csv(consts.DATASET_DIR + 'rating_data.csv', index=False)


def train_valid_test_split(rating_data_file, dir, training_data_file, validating_data_file, testing_data_file):
    """
    Randomly splits the rating csv into train / validation / test csv files.

    The input is read from consts.DATASET_DIR + rating_data_file. 20% of the
    rows are held out, and that held-out part is split in half into test and
    validation rows; the rest is training data. The three parts are written
    to ``dir`` under the given file names. Row selection uses the global
    numpy random generator, so the split changes between runs unless the
    caller seeds it.
    """
    def boolean_mask(size, chosen_rows):
        # True at the chosen row positions, False everywhere else
        mask = np.zeros(size, dtype=bool)
        mask[chosen_rows] = True
        return mask

    with open(consts.DATASET_DIR + rating_data_file, 'r', encoding='utf8') as fp:
        ratings = pd.read_csv(fp)

    # data split: first carve off the combined validation + test rows
    held_out_rows = np.random.choice(len(ratings), size=int(0.2 * len(ratings)), replace=False)
    is_held_out = boolean_mask(len(ratings), held_out_rows)
    held_out = ratings[is_held_out]
    rating_train = ratings[~is_held_out]

    # then split the held-out rows in half
    held_out_count = held_out.shape[0]
    test_rows = np.random.choice(held_out_count, size=int(0.50 * held_out_count), replace=False)
    is_test = boolean_mask(held_out_count, test_rows)
    rating_test = held_out[is_test]
    rating_valid = held_out[~is_test]

    print("The number of training ratings is %d" % (len(rating_train)))
    print("The number of validating ratings is %d" % (len(rating_valid)))
    print("The number of testing ratings is %d" % (len(rating_test)))

    # save data
    for part, file_name in ((rating_train, training_data_file),
                            (rating_valid, validating_data_file),
                            (rating_test, testing_data_file)):
        part.to_csv(dir + file_name, index=False)


def create_directory(dir):
    """
    Creates ``dir`` and reports on stdout when something already exists there.

    Only the FileExistsError raised by mkdir is handled; any other failure
    (for example a missing parent directory) propagates to the caller.
    """
    print("Creating directory %s" % dir)
    try:
        mkdir(dir)
    except FileExistsError:
        already_present = True
    else:
        already_present = False

    if already_present:
        print("Directory already exists")


def _group_as_lists(frame, key_col, value_col):
    """Return {key: [values, ...]} collecting value_col for every key_col entry of frame."""
    return frame.set_index(key_col).groupby(key_col)[value_col].apply(list).to_dict()


def _write_pickle(obj, file_path):
    """Pickle obj to file_path with the highest available protocol."""
    with open(file_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def _similar_entities(observations, variances, alpha, ix_to_id):
    """
    Return ({id: [similar ids, ...]}, number of directed similar pairs).

    observations holds one row per entity and variances the variance of each
    row. Two entities count as similar when cov(i, j) / sqrt(var_i * var_j)
    is at least alpha; pairs of an entity with itself are skipped.
    """
    similar = torch.cov(observations) / torch.sqrt(torch.mm(variances.unsqueeze(1),
                                                            variances.unsqueeze(0))) >= alpha
    sim_dict = {}
    directed_pairs = 0
    for row, col in csr_matrix(similar).todok().keys():
        if row != col:
            directed_pairs += 1
            sim_dict.setdefault(ix_to_id[row], []).append(ix_to_id[col])
    return sim_dict, directed_pairs


def relation_cons(data_file, training_data_file, validating_data_file, testing_data_file, alpha, export_dir):
    """
    return: Write out python dictionaries for the edge of graph

    Reads the full rating csv and its train / validation / test splits, then
    pickles into export_dir:
      * user->items and item->users dictionaries for the full data and for
        each of the three splits;
      * user-similarity and item-similarity dictionaries computed from the
        TRAINING ratings only.

    Similarity is computed on a dense users x items rating matrix in which
    unrated cells are 0 (so they take part in the covariance); two rows or
    columns are similar when cov / sqrt(var_i * var_j) >= alpha.

    Args:
        data_file: csv with columns user_id, item_id, ratings (all ratings).
        training_data_file, validating_data_file, testing_data_file: csv files
            of the three splits, same columns.
        alpha: similarity threshold.
        export_dir: output directory, joined by string concatenation (so it
            must end with a path separator).

    Raises:
        KeyError: if the training split contains a user or item that does not
            occur in data_file.
    """
    with open(data_file, 'r', encoding='utf8') as fp:
        data = pd.read_csv(fp)
    with open(training_data_file, 'r', encoding='utf8') as fp:
        rating_train = pd.read_csv(fp)
    with open(validating_data_file, 'r', encoding='utf8') as fp:
        rating_valid = pd.read_csv(fp)
    with open(testing_data_file, 'r', encoding='utf8') as fp:
        rating_test = pd.read_csv(fp)

    # All grouped dictionaries are built before anything is written, and
    # before the training ids are remapped below.
    exports = (
        (consts.USER_ITEM_DICT, _group_as_lists(data, 'user_id', 'item_id')),
        (consts.ITEM_USER_DICT, _group_as_lists(data, 'item_id', 'user_id')),
        (consts.TRAIN_USER_ITEM_DICT, _group_as_lists(rating_train, 'user_id', 'item_id')),
        (consts.TRAIN_ITEM_USER_DICT, _group_as_lists(rating_train, 'item_id', 'user_id')),
        (consts.VALID_USER_ITEM_DICT, _group_as_lists(rating_valid, 'user_id', 'item_id')),
        (consts.VALID_ITEM_USER_DICT, _group_as_lists(rating_valid, 'item_id', 'user_id')),
        (consts.TEST_USER_ITEM_DICT, _group_as_lists(rating_test, 'user_id', 'item_id')),
        (consts.TEST_ITEM_USER_DICT, _group_as_lists(rating_test, 'item_id', 'user_id')),
    )
    for file_name, mapping in exports:
        _write_pickle(mapping, export_dir + file_name)

    # id_count
    unique_users = data["user_id"].unique()
    unique_items = data["item_id"].unique()
    users_id_num = unique_users.shape[0]
    items_id_num = unique_items.shape[0]
    ratings_num = data["ratings"].shape[0]
    print("the number of users: ", users_id_num)
    print("the number of items: ", items_id_num)
    print("the number of ratings: ", ratings_num)

    # from id to num
    # this id just used to construct the rating matrix
    user2id = {uid: i for i, uid in enumerate(unique_users)}
    item2id = {sid: i for i, sid in enumerate(unique_items)}
    id2user = dict(enumerate(unique_users))
    id2item = dict(enumerate(unique_items))
    rating_train['user_id'] = [user2id[uid] for uid in rating_train['user_id']]
    rating_train['item_id'] = [item2id[sid] for sid in rating_train['item_id']]

    # construct rating matrix
    # this matrix is used to construct the similarity of user pairs and item pairs
    # Filled with one vectorised assignment instead of a per-row
    # rating_train.iloc[i][k] loop: that loop built a Series per rating and
    # relied on positional Series[int] lookup, which pandas has deprecated.
    # The same positional columns 0 / 1 / 2 (user, item, rating) are read.
    # NOTE: if a (user, item) pair occurred more than once the loop kept the
    # last rating; NumPy does not formally guarantee which one wins here.
    row_ix = rating_train.iloc[:, 0].to_numpy().astype(int)
    col_ix = rating_train.iloc[:, 1].to_numpy().astype(int)
    rating_matrix_arr = np.zeros((users_id_num, items_id_num), dtype=float)
    rating_matrix_arr[row_ix, col_ix] = rating_train.iloc[:, 2].to_numpy(dtype=float)
    rating_matrix = torch.tensor(rating_matrix_arr, dtype=torch.float)

    # construct similarity
    user_sim_dict, user_sim_nums = _similar_entities(
        rating_matrix, torch.var(rating_matrix, 1), alpha, id2user)
    # every unordered pair shows up in both directions, hence the halving
    print("user similar pair numbers: ", int(user_sim_nums/2))
    _write_pickle(user_sim_dict, export_dir + consts.USER_SIM_DICT)

    item_sim_dict, item_sim_nums = _similar_entities(
        rating_matrix.transpose(1, 0), torch.var(rating_matrix, 0), alpha, id2item)
    print("item similar pair numbers: ", int(item_sim_nums/2))
    _write_pickle(item_sim_dict, export_dir + consts.ITEM_SIM_DICT)


def data_prep(data_file, export_dir):
    """
    return: Write out python dictionaries for the edges of graph

    For every rating value k in 1..5 two dictionaries are pickled into
    export_dir, each file name prefixed with 'train_':
      * user -> list of items that user rated k
      * item -> list of users who rated that item k
    Rows whose rating is not exactly one of 1..5 end up in no dictionary.
    """
    with open(data_file, 'r', encoding='utf8') as fp:
        ratings = pd.read_csv(fp)

    def grouped(score, key_col, value_col):
        # {key: [values]} over the rows whose rating equals score
        subset = ratings[ratings["ratings"] == score]
        return subset.set_index(key_col).groupby(key_col)[value_col].apply(list).to_dict()

    def dump_all(file_names, mappings):
        for file_name, mapping in zip(file_names, mappings):
            with open(export_dir + 'train_' + file_name, 'wb') as handle:
                pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

    scores = (1, 2, 3, 4, 5)

    # train_user_item_k.dict
    # dict where key = a user, value = list of item be ranked k by this user
    user_item_dicts = [grouped(score, 'user_id', 'item_id') for score in scores]
    dump_all((consts.USER_ITEM_1_DICT, consts.USER_ITEM_2_DICT, consts.USER_ITEM_3_DICT,
              consts.USER_ITEM_4_DICT, consts.USER_ITEM_5_DICT), user_item_dicts)

    # train_item_user_k.dict
    # dict where key = a item, value = list of user rank k to this item
    item_user_dicts = [grouped(score, 'item_id', 'user_id') for score in scores]
    dump_all((consts.ITEM_USER_1_DICT, consts.ITEM_USER_2_DICT, consts.ITEM_USER_3_DICT,
              consts.ITEM_USER_4_DICT, consts.ITEM_USER_5_DICT), item_user_dicts)


def ix_mapping(data_file, mapping_export_dir):
    """
    Builds the type, relation and entity vocabularies and pickles them,
    together with their inverse mappings, into mapping_export_dir.

    Entities are keyed by (raw id, entity type): users take indices
    0..len(users)-1, items follow directly after, and the pad token (keyed by
    the bare token string) takes the last index. Users and items are held in
    sets, so the index each one receives follows the sets' iteration order.
    """
    pad_token = consts.PAD_TOKEN
    type_to_ix = {
        'user': consts.USER_TYPE,
        'item': consts.ITEM_TYPE,
        pad_token: consts.PAD_TYPE,
    }
    relation_to_ix = {
        'user_sim': consts.USER_SIM_REL,
        'item_sim': consts.ITEM_SIM_REL,
        'user_item_1': consts.USER_ITEM_1_REL,
        'user_item_2': consts.USER_ITEM_2_REL,
        'user_item_3': consts.USER_ITEM_3_REL,
        'user_item_4': consts.USER_ITEM_4_REL,
        'user_item_5': consts.USER_ITEM_5_REL,
        'item_user_1': consts.ITEM_USER_1_REL,
        'item_user_2': consts.ITEM_USER_2_REL,
        'item_user_3': consts.ITEM_USER_3_REL,
        'item_user_4': consts.ITEM_USER_4_REL,
        'item_user_5': consts.ITEM_USER_5_REL,
        '#END_RELATION': consts.END_REL,
        pad_token: consts.PAD_REL,
    }

    # entity vocab set is combination of users and items
    with open(data_file, 'r', encoding='utf8') as fp:
        ratings = pd.read_csv(fp)
    users = set(ratings["user_id"].unique())
    items = set(ratings["item_id"].unique())

    # Id-ix mappings
    entity_to_ix = {}
    for ix, user in enumerate(users):
        entity_to_ix[(user, consts.USER_TYPE)] = ix
    for ix, item in enumerate(items, start=len(users)):
        entity_to_ix[(item, consts.ITEM_TYPE)] = ix
    entity_to_ix[pad_token] = len(entity_to_ix)

    # Ix-id mappings
    def inverted(mapping):
        return dict((ix, name) for name, ix in mapping.items())

    ix_to_type = inverted(type_to_ix)
    ix_to_relation = inverted(relation_to_ix)
    ix_to_entity = inverted(entity_to_ix)

    # Export mappings
    # eg. Musical_Instruments_ix_mapping/type_to_ix.dict
    exports = (
        (consts.TYPE_TO_IX, type_to_ix),
        (consts.RELATION_TO_IX, relation_to_ix),
        (consts.ENTITY_TO_IX, entity_to_ix),
        (consts.IX_TO_TYPE, ix_to_type),
        (consts.IX_TO_RELATION, ix_to_relation),
        (consts.IX_TO_ENTITY, ix_to_entity),
    )
    for file_name, mapping in exports:
        with open(mapping_export_dir + file_name, 'wb') as handle:
            pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)


def convert_to_ids(entity_to_ix, rel_dict, start_type, end_type):
    """
    Translates a relation dictionary from raw ids to entity indices.

    Every key of rel_dict is looked up in entity_to_ix as (key, start_type)
    and every value in its list as (value, end_type). A KeyError is raised
    when a pair is missing from entity_to_ix.
    """
    return {
        entity_to_ix[(source, start_type)]: [entity_to_ix[(target, end_type)] for target in targets]
        for source, targets in rel_dict.items()
    }


def ix_update(import_dir, mapping_dir, export_dir):
    """
    Re-keys the pickled relation dictionaries from raw ids to entity indices.

    The dictionaries are read from import_dir, translated with the entity_to_ix
    mapping stored in mapping_dir and written to export_dir under the same file
    name prefixed with 'ix_'. The five numbered train_user_item_<k>.dict and
    train_item_user_<k>.dict files are converted the same way.
    """
    def read_pickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def write_pickle(obj, path):
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

    entity_to_ix = read_pickle(mapping_dir + consts.ENTITY_TO_IX)
    user_type, item_type = consts.USER_TYPE, consts.ITEM_TYPE

    # (file name, entity type of the keys, entity type of the values)
    relations = [
        (consts.USER_SIM_DICT, user_type, user_type),
        (consts.ITEM_SIM_DICT, item_type, item_type),
        (consts.USER_ITEM_DICT, user_type, item_type),
        (consts.ITEM_USER_DICT, item_type, user_type),
        (consts.TRAIN_USER_ITEM_DICT, user_type, item_type),
        (consts.TRAIN_ITEM_USER_DICT, item_type, user_type),
        (consts.VALID_USER_ITEM_DICT, user_type, item_type),
        (consts.VALID_ITEM_USER_DICT, item_type, user_type),
        (consts.TEST_USER_ITEM_DICT, user_type, item_type),
        (consts.TEST_ITEM_USER_DICT, item_type, user_type),
    ]

    # load everything, then convert, then dump: nothing is written unless every
    # input could be read and converted
    loaded = [read_pickle(import_dir + file_name) for file_name, _, _ in relations]
    converted = [convert_to_ids(entity_to_ix, rel, key_type, value_type)
                 for rel, (_, key_type, value_type) in zip(loaded, relations)]
    for (file_name, _, _), rel_ix in zip(relations, converted):
        write_pickle(rel_ix, export_dir + 'ix_' + file_name)

    # numbered training dictionaries: all user->item files first, then all item->user files
    numbered = (('train_user_item_', user_type, item_type), ('train_item_user_', item_type, user_type))
    for prefix, key_type, value_type in numbered:
        for number in range(1, 6):
            numbered_file = prefix + str(number) + '.dict'
            numbered_ix = convert_to_ids(entity_to_ix, read_pickle(import_dir + numbered_file),
                                         key_type, value_type)
            write_pickle(numbered_ix, export_dir + 'ix_' + numbered_file)


def sample_paths(paths, samples):
    """
    Returns up to `samples` randomly chosen elements of paths, without repetition.

    A shuffled list of positions is cut to its first `samples` entries, so fewer
    elements come back when paths is shorter than `samples`.
    """
    order = list(range(len(paths)))
    random.shuffle(order)
    return [paths[position] for position in order[:samples]]


def construct_paths(data_file, training_data_file, validating_data_file, testing_data_file, import_dir, path_dir,
                    mapping_dir, samples):
    """
    Constructs paths from the target user to the target item

    For every (user, item) interaction of the indexed user-item dictionary, paths of
    length 2, 3 and 4 are searched and at most `samples` of them are kept. The formatted
    interaction is then appended to the train, validation or test path file, depending on
    which rating split contains the pair (checked in that order; a pair found in no split
    is not written). Interactions without any path get a single user -> item padding path.
    Summary statistics are printed at the end.

    :param data_file: csv with all ratings (columns user_id, item_id, ratings)
    :param training_data_file: csv with the training ratings (columns user_id, item_id)
    :param validating_data_file: csv with the validation ratings (columns user_id, item_id)
    :param testing_data_file: csv with the test ratings (columns user_id, item_id)
    :param import_dir: directory holding the 'ix_' prefixed relation dictionaries
    :param path_dir: directory the path files are written to (created if needed)
    :param mapping_dir: directory holding the id/index mapping dictionaries
    :param samples: maximum number of paths kept per interaction
    """
    def load_csv(path):
        with open(path, 'r', encoding='utf8') as handle:
            return pd.read_csv(handle)

    def load_pickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def id_pairs(frame):
        # set of (user_id, item_id) pairs for constant-time split membership tests
        return set(zip(frame["user_id"], frame["item_id"]))

    create_directory(path_dir)

    # load data
    rating_data = load_csv(data_file)
    train_pairs = id_pairs(load_csv(training_data_file))
    valid_pairs = id_pairs(load_csv(validating_data_file))
    test_pairs = id_pairs(load_csv(testing_data_file))
    entity_to_ix = load_pickle(mapping_dir + consts.ENTITY_TO_IX)
    type_to_ix = load_pickle(mapping_dir + consts.TYPE_TO_IX)
    relation_to_ix = load_pickle(mapping_dir + consts.RELATION_TO_IX)
    ix_to_entity = load_pickle(mapping_dir + consts.IX_TO_ENTITY)
    user_sim = load_pickle(import_dir + 'ix_' + consts.USER_SIM_DICT)
    item_sim = load_pickle(import_dir + 'ix_' + consts.ITEM_SIM_DICT)
    user_item = load_pickle(import_dir + 'ix_' + consts.USER_ITEM_DICT)
    user_item_1 = load_pickle(import_dir + 'ix_train_' + consts.USER_ITEM_1_DICT)
    user_item_2 = load_pickle(import_dir + 'ix_train_' + consts.USER_ITEM_2_DICT)
    user_item_3 = load_pickle(import_dir + 'ix_train_' + consts.USER_ITEM_3_DICT)
    user_item_4 = load_pickle(import_dir + 'ix_train_' + consts.USER_ITEM_4_DICT)
    user_item_5 = load_pickle(import_dir + 'ix_train_' + consts.USER_ITEM_5_DICT)
    item_user_1 = load_pickle(import_dir + 'ix_train_' + consts.ITEM_USER_1_DICT)
    item_user_2 = load_pickle(import_dir + 'ix_train_' + consts.ITEM_USER_2_DICT)
    item_user_3 = load_pickle(import_dir + 'ix_train_' + consts.ITEM_USER_3_DICT)
    item_user_4 = load_pickle(import_dir + 'ix_train_' + consts.ITEM_USER_4_DICT)
    item_user_5 = load_pickle(import_dir + 'ix_train_' + consts.ITEM_USER_5_DICT)

    # rating of every (user_id, item_id) pair, built once instead of filtering the frame for
    # each interaction; setdefault keeps the first row when a pair occurs more than once
    rating_lookup = {}
    for raw_user, raw_item, value in zip(rating_data["user_id"], rating_data["item_id"], rating_data["ratings"]):
        rating_lookup.setdefault((raw_user, raw_item), float(value))

    # relation dictionaries handed to every path search, in the positional order it expects
    graph_args = (user_sim, item_sim, user_item_1, user_item_2, user_item_3, user_item_4, user_item_5,
                  item_user_1, item_user_2, item_user_3, item_user_4, item_user_5)

    # trackers for statistics
    not_found = {'train': 0, 'valid': 0, 'test': 0}
    total_interactions = 0
    avg_num_paths = 0

    train_path = path_dir + consts.TRAIN_PATH_FILE
    valid_path = path_dir + consts.VALID_PATH_FILE
    test_path = path_dir + consts.TEST_PATH_FILE

    # the context manager closes all three files even when path construction fails midway
    with open(train_path, 'w') as train_fp, open(valid_path, 'w') as valid_fp, open(test_path, 'w') as test_fp:
        # (pairs of the split, output file, key of the not-found counter), checked in this order
        splits = (
            (train_pairs, train_fp, 'train'),
            (valid_pairs, valid_fp, 'valid'),
            (test_pairs, test_fp, 'test'),
        )

        for user, items in tqdm(list(user_item.items())):
            total_interactions += len(items)
            if len(items) == 0:
                continue

            # the searches start at the user, so they run once per user: (path length, sample count)
            # pairs of (2, 20), (3, 10) and (4, 5), with the longer paths merged into the length-2 result
            item_to_paths = find_paths_user_to_items(user, *graph_args, 2, 20)
            longer_paths = (find_paths_user_to_items(user, *graph_args, 3, 10),
                            find_paths_user_to_items(user, *graph_args, 4, 5))
            for extra_paths in longer_paths:
                for target, target_paths in extra_paths.items():
                    item_to_paths[target].extend(target_paths)

            raw_user = ix_to_entity[user][0]
            for item in items:
                pair = (raw_user, ix_to_entity[item][0])

                # add paths for interaction
                item_paths = sample_paths(item_to_paths[item], samples)
                rating = rating_lookup[pair]
                paths_found = len(item_paths) > 0
                if paths_found:
                    avg_num_paths += len(item_paths)
                else:
                    # no path connects the pair: fall back to a direct user -> item padding path
                    item_paths = [[[user, consts.USER_TYPE, consts.PAD_REL],
                                   [item, consts.ITEM_TYPE, consts.END_REL]]]
                interaction = (format_paths(item_paths, entity_to_ix, type_to_ix, relation_to_ix, samples),
                               user, item, len(item_paths), rating)

                for pairs, out_file, split_name in splits:
                    if pair in pairs:
                        if not paths_found:
                            not_found[split_name] += 1
                        out_file.write(repr(interaction))
                        out_file.write("\n")
                        break

    # guard the average: every interaction may have ended up without a path
    interactions_with_paths = total_interactions - sum(not_found.values())
    avg_num_paths = avg_num_paths / interactions_with_paths if interactions_with_paths > 0 else 0.0

    print("number of paths attempted to find:", total_interactions)
    print("number of train paths not found:", not_found['train'])
    print("number of valid paths not found:", not_found['valid'])
    print("number of test paths not found:", not_found['test'])
    print("avg num paths per interaction:", avg_num_paths)


def main():
    """
    Runs the whole data preparation pipeline: optional train/valid/test split,
    relation construction, id-to-index mapping and user-to-item path construction.
    """
    print("Data preparation:")
    args = parse_args()
    print("Forming knowledge graph...")
    create_directory(consts.DATA_DIR)

    # train_valid_test_split
    if args.split_data:
        train_valid_test_split(args.rating_data_file, consts.DATASET_DIR, args.rating_train_data_file,
                               args.rating_valid_data_file, args.rating_test_data_file)

    # rating files, all located in the dataset directory
    all_ratings = consts.DATASET_DIR + args.rating_data_file
    train_ratings = consts.DATASET_DIR + args.rating_train_data_file
    valid_ratings = consts.DATASET_DIR + args.rating_valid_data_file
    test_ratings = consts.DATASET_DIR + args.rating_test_data_file

    relation_cons(all_ratings, train_ratings, valid_ratings, test_ratings, args.alpha, consts.DATA_DIR)
    data_prep(train_ratings, consts.DATA_DIR)

    print("Mapping ids to indices...")
    for directory in (consts.DATA_IX_DIR, consts.DATA_IX_MAPPING_DIR):
        create_directory(directory)
    ix_mapping(all_ratings, consts.DATA_IX_MAPPING_DIR)
    ix_update(consts.DATA_DIR, consts.DATA_IX_MAPPING_DIR, consts.DATA_IX_DIR)

    print("Constructing paths from user to item...")
    construct_paths(all_ratings, train_ratings, valid_ratings, test_ratings,
                    consts.DATA_IX_DIR, consts.PATH_DATA_DIR, consts.DATA_IX_MAPPING_DIR, consts.SAMPLES)


# Script entry point. When this source is loaded through register_module, __name__ is the
# registered module name rather than '__main__', so main() is not run on registration.
if __name__ == '__main__':
    main()
'''
register_module('data_preparation', prep_src)


In [None]:
train_src = r'''
import torch
import torch.nn as nn
import torch.optim as optim
import linecache
import numpy as np
import math
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class InteractionData(Dataset):
    """
    Dataset over a path file that holds one interaction per line.

    With in_memory=True every line is parsed up front and kept in a list; otherwise
    only the lines are counted and each interaction is parsed from its line on access.
    """

    def __init__(self, train_path_file, in_memory=True):
        self.in_memory = in_memory
        self.file = train_path_file
        # NOTE(review): lines are parsed with eval(), which executes whatever the file
        # contains - only use path files produced by a trusted pipeline.
        with open(self.file, "r") as handle:
            if in_memory:
                self.interactions = [eval(row.rstrip("\n")) for row in handle]
                self.num_interactions = len(self.interactions)
            else:
                self.interactions = []
                self.num_interactions = sum(1 for _ in handle)

    def __getitem__(self, idx):
        # serve the parsed interaction from memory when available
        if self.in_memory:
            return self.interactions[idx]
        # linecache numbers lines from 1, dataset indices start at 0
        row = linecache.getline(self.file, idx + 1)
        return eval(row.rstrip("\n"))

    def __len__(self):
        return self.num_interactions


def my_collate(batch):
    """
    Collate function for interaction tuples (paths, user, item, valid length, rating).

    The path lists stay plain Python lists because they are nested lists of paths;
    users, items and valid lengths become LongTensors and the ratings a float Tensor.
    """
    paths, users, items, valid_lens, ratings = ([row[col] for row in batch] for col in range(5))
    return [
        paths,
        torch.LongTensor(users),
        torch.LongTensor(items),
        torch.LongTensor(valid_lens),
        torch.Tensor(ratings),
    ]


def train(model, train_paths_file, valid_paths_file, batch_size, epochs, model_path, load_checkpoint, not_in_memory,
          lr, l2_reg):
    """
    Trains the model on the training path file and prints training and validation
    RMSE / MAE after every epoch.

    :param model: network to train; it is moved to the module-level `device`
    :param train_paths_file: path file with the training interactions
    :param valid_paths_file: path file with the validation interactions
    :param batch_size: number of interactions per batch
    :param epochs: number of passes over the training data
    :param model_path: checkpoint file; written after every epoch and, when
        load_checkpoint is set, read before training starts
    :param load_checkpoint: whether to restore model and optimizer state first
    :param not_in_memory: when true the path files are read line by line on access
        instead of being loaded completely
    :param lr: learning rate of the Adam optimizer
    :param l2_reg: weight decay of the Adam optimizer
    """
    # use the device selected at import time so the code also runs without a GPU
    model = model.to(device)
    loss_function = nn.MSELoss(reduction='none')

    # l2 regularization is tuned from {10−5 , 10−4 , 10−3 , 10−2 }
    # Learning rate is found from {0.001, 0.002, 0.01, 0.02} with grid search
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=l2_reg)

    if load_checkpoint:
        # map_location lets a checkpoint saved on another device be restored here
        checkpoint = torch.load(model_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # DataLoader used for batches
    interaction_data_train = InteractionData(train_paths_file, in_memory=not not_in_memory)
    train_loader = DataLoader(dataset=interaction_data_train, collate_fn=my_collate, batch_size=batch_size,
                              shuffle=True)
    interaction_data_valid = InteractionData(valid_paths_file, in_memory=not not_in_memory)
    valid_loader = DataLoader(dataset=interaction_data_valid, collate_fn=my_collate, batch_size=batch_size,
                              shuffle=False)

    for epoch in range(epochs):
        model, training_rmse, training_mae = training(model, optimizer, loss_function, train_loader, model_path)
        validating_rmse, validating_mae = predict(model, loss_function, valid_loader, model_path)
        print("Epoch: %d, Training_RMSE: %f, Training_MAE: %f, Validating_RMSE: %f, Validating_MAE: %f"
              % (epoch + 1, training_rmse, training_mae, validating_rmse, validating_mae))


def training(model, optimizer, loss_function, data_loader, model_path):
    """
    Runs one training epoch and saves a checkpoint afterwards.

    :param model: network called as model(paths, lengths, users, items, val_lens, is_training=True)
    :param optimizer: optimizer stepping the model parameters
    :param loss_function: element-wise loss (one value per interaction)
    :param data_loader: iterable of batches (interaction paths, users, items, val_lens, targets)
    :param model_path: file the model and optimizer state dicts are saved to
    :return: (model, rmse, mae) where the metrics are averaged over all interactions of the epoch
    """
    # follow the device the model lives on instead of assuming CUDA
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    rmse_metric = np.zeros(2)  # [sum of squared errors, number of interactions]
    mae_metric = np.zeros(2)   # [sum of absolute errors, number of interactions]
    model.train()
    for interaction_batch, users, items, val_lens, targets in data_loader:
        # flatten the paths of all interactions in the batch into one tensor, with their lengths
        paths = []
        lengths = []
        for interaction_paths in interaction_batch:
            for path, length in interaction_paths:
                paths.append(path)
                lengths.append(length)
        paths = torch.tensor(paths, dtype=torch.long, device=run_device)
        lengths = torch.tensor(lengths, dtype=torch.long, device=run_device)
        targets = targets.to(run_device)

        model.zero_grad()
        prediction_scores = model(paths, lengths, users.to(run_device), items.to(run_device),
                                  val_lens.to(run_device), is_training=True).to(run_device)

        # Compute the loss, gradients, and update the parameters by calling .step()
        loss = loss_function(prediction_scores, targets)
        loss.sum().backward()
        optimizer.step()
        rmse_metric += (float(loss.sum()), len(targets))
        mae_metric += (float(torch.sum(abs(prediction_scores - targets))), len(targets))

    mse = rmse_metric[0] / rmse_metric[1]
    rmse = math.sqrt(mse)
    mae = mae_metric[0] / mae_metric[1]

    # Save model to disk
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, model_path)

    return model, rmse, mae


def predict(model, loss_function, valid_loader, model_path):
    """
    Evaluates the checkpoint stored at model_path on the given batches.

    The model weights are reloaded from model_path before evaluation.

    :param model: network called as model(paths, lengths, users, items, val_lens, is_training=False)
    :param loss_function: element-wise loss (one value per interaction)
    :param valid_loader: iterable of batches (interaction paths, users, items, val_lens, targets)
    :param model_path: checkpoint file containing 'model_state_dict'
    :return: (rmse, mae) averaged over all interactions
    """
    # follow the device the model lives on instead of assuming CUDA
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    checkpoint = torch.load(model_path, map_location=run_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    rmse_metric = np.zeros(2)  # [sum of squared errors, number of interactions]
    mae_metric = np.zeros(2)   # [sum of absolute errors, number of interactions]
    for interaction_batch, users, items, val_lens, targets in valid_loader:
        # flatten the paths of all interactions in the batch into one tensor, with their lengths
        paths = []
        lengths = []
        for interaction_paths in interaction_batch:
            for path, length in interaction_paths:
                paths.append(path)
                lengths.append(length)
        paths = torch.tensor(paths, dtype=torch.long, device=run_device)
        lengths = torch.tensor(lengths, dtype=torch.long, device=run_device)
        targets = targets.to(run_device)

        with torch.no_grad():
            # Run the forward pass
            prediction_scores = model(paths, lengths, users.to(run_device), items.to(run_device),
                                      val_lens.to(run_device), is_training=False).to(run_device)
            # Compute the loss
            loss = loss_function(prediction_scores, targets)
        rmse_metric += (float(loss.sum()), len(targets))
        mae_metric += (float(torch.sum(abs(prediction_scores - targets))), len(targets))

    mse = rmse_metric[0] / rmse_metric[1]
    rmse = math.sqrt(mse)
    mae = mae_metric[0] / mae_metric[1]
    return rmse, mae
'''
register_module('model.train', train_src)


In [None]:
comper_src = '''
import torch
import torch.nn as nn
import torch.nn.functional as F
import constants.consts as consts

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class COMPER(nn.Module):
    """
    Rating predictor that encodes each user-to-item path with an LSTM and fuses the
    paths of an interaction through additive attention.

    The prediction is the output of a two-layer head on the fused path embedding plus
    a learned per-user bias, a learned per-item bias and the constant offset miu.
    """

    def __init__(self, e_emb_dim, t_emb_dim, r_emb_dim, hidden_dim, attention_dim, e_vocab_size, t_vocab_size,
                 r_vocab_size, miu):
        """
        :param e_emb_dim: size of the entity embeddings
        :param t_emb_dim: size of the type embeddings
        :param r_emb_dim: size of the relation embeddings
        :param hidden_dim: size of the LSTM hidden state, i.e. of a path embedding
        :param attention_dim: size of the attention feature space
        :param e_vocab_size: number of rows of the entity embedding and bias tables
        :param t_vocab_size: number of rows of the type embedding table
        :param r_vocab_size: number of rows of the relation embedding table
        :param miu: constant added to every prediction
        """
        super(COMPER, self).__init__()
        self.hidden_dim = hidden_dim
        self.attention_dim = attention_dim
        self.miu = miu

        self.entity_embeddings = nn.Embedding(e_vocab_size, e_emb_dim)
        self.type_embeddings = nn.Embedding(t_vocab_size, t_emb_dim)
        self.rel_embeddings = nn.Embedding(r_vocab_size, r_emb_dim)

        # one scalar bias per entity; looked up for both the user and the item
        self.bias = nn.Embedding(e_vocab_size, 1)

        # The LSTM takes the concatenated entity/type/relation embeddings as inputs,
        # and outputs hidden states with dimensionality hidden_dim.
        self.lstm = nn.LSTM(e_emb_dim + t_emb_dim + r_emb_dim, hidden_dim)

        # The attention parameters: the query is the concatenated user and item embedding,
        # the keys are the path embeddings
        self.attention_W_q = nn.Linear(2 * e_emb_dim, attention_dim, bias=False)
        self.attention_W_k = nn.Linear(hidden_dim, attention_dim, bias=False)
        self.attention_W_b = nn.Linear(attention_dim, 1, bias=False)

        # The linear layers that map from hidden state space to the rating
        self.linear1 = nn.Linear(hidden_dim, 64)
        self.linear2 = nn.Linear(64, 1)

    def forward(self, paths, lengths, users, items, val_lens, is_training):
        """
        Predicts one rating per interaction.

        :param paths: long tensor indexed as (path, step, field) where field 0 is the entity,
            1 the type and 2 the relation index; the paths of one interaction are consecutive
            and every interaction contributes consts.SAMPLES paths
        :param lengths: number of steps of every path; paths of length <= 1 are not encoded
        :param users: entity index of the user of every interaction
        :param items: entity index of the item of every interaction
        :param val_lens: per interaction, the number of leading paths attention may use
        :param is_training: accepted for interface compatibility; not used
        :return: 1-D tensor with the predicted rating of every interaction
        """
        run_device = paths.device

        # interaction id of every path; sort_batch permutes it along with the paths,
        # but the permuted ids are not needed afterwards
        inter_ids, _ = torch.arange(0, paths.shape[0] / consts.SAMPLES,
                                    device=run_device).repeat(consts.SAMPLES).sort()

        paths, _, lengths, perm_idx = self.sort_batch(paths, inter_ids, lengths)

        # lengths are sorted in decreasing order, so the encodable paths (length > 1)
        # form the leading block and the dropped ones the tail
        sample_indexs = (lengths > 1).nonzero().squeeze(-1)
        drop_num = len(lengths) - len(sample_indexs)
        paths = paths[sample_indexs]
        lengths = lengths[sample_indexs]

        # move the field axis in front of the step axis: (path, field, step)
        t_paths = torch.transpose(paths, 1, 2)

        # then concatenate embeddings, batch is index 0, so selecting along index 1
        # right now we do fetch embedding for padding tokens, but these aren't used
        entity_embed = self.entity_embeddings(t_paths[:, 0, :])
        type_embed = self.type_embeddings(t_paths[:, 1, :])
        rel_embed = self.rel_embeddings(t_paths[:, 2, :])
        triplet_embed = torch.cat((entity_embed, type_embed, rel_embed), 2)  # concatenates lengthwise

        # we need dimensions to be input size x batch_size x embedding dim, so transpose first 2 dim
        batch_sec_embed = torch.transpose(triplet_embed, 0, 1)

        # pack padded sequences, so we don't do extra computation
        packed_embed = nn.utils.rnn.pack_padded_sequence(batch_sec_embed, lengths)

        # last_out is the output state before padding for each path, since we only want final output
        packed_out, (last_out, _) = self.lstm(packed_embed)
        path_embedding = last_out[-1]

        # dropped paths get a random placeholder embedding so the tensor keeps one row per path
        # NOTE(review): presumably these rows are always masked out through val_lens - confirm
        placeholder = torch.rand(drop_num, self.hidden_dim, device=path_embedding.device)
        path_embedding = torch.cat((path_embedding, placeholder), 0)

        # undo the length sort, then group the paths by interaction
        path_embedding = self.sort_path_embedding(path_embedding, perm_idx)
        path_embedding = path_embedding.reshape(-1, consts.SAMPLES, self.hidden_dim)

        # attention pooling of the path embeddings, queried by the user and item embeddings
        users_embedding = self.entity_embeddings(users)
        items_embedding = self.entity_embeddings(items)
        queries = torch.cat((users_embedding, items_embedding), dim=1)
        sub_graph_embeddings = self.Attention(queries, path_embedding, path_embedding, val_lens, is_training)

        # pass through linear layers
        predict_scores = self.linear2(F.relu(self.linear1(sub_graph_embeddings)))

        b_u = self.bias(users)
        b_i = self.bias(items)
        output = predict_scores + b_u + b_i + self.miu

        return output.squeeze(-1)

    def Attention(self, queries, keys, values, val_lens, is_training):
        """
        Additive attention over the paths of every interaction.

        :param queries: (batch_size, 2 * e_emb_dim) concatenated user and item embeddings
        :param keys: (batch_size, num_keys, hidden_dim) path embeddings
        :param values: (batch_size, num_keys, hidden_dim) tensor that is averaged with the
            attention weights
        :param val_lens: (batch_size,) number of leading keys that may receive weight
        :param is_training: accepted for interface compatibility; not used
        :return: (batch_size, hidden_dim) weighted sum of the values
        """
        queries, keys = self.attention_W_q(queries), self.attention_W_k(keys)
        # queries = (batch_size,num_hidden) keys = (batch_size,num_keys,num_hidden)
        features = torch.tanh(queries.unsqueeze(1) + keys)
        scores = self.attention_W_b(features).squeeze(-1)  # (batch_size,num_keys)

        # mask: keys at positions >= val_lens get a very low score, i.e. (almost) zero weight
        positions = torch.arange(scores.shape[1], device=scores.device)
        mask = positions[None, :] < val_lens.to(scores.device)[:, None]
        scores = scores.masked_fill(~mask, -1e6)
        attention_weights = nn.functional.softmax(scores, dim=-1).unsqueeze(1)
        fusion_result = torch.bmm(attention_weights, values).squeeze(1)
        return fusion_result

    def sort_path_embedding(self, path_embedding, indexes):
        """
        Reorders path_embedding by the ascending order of indexes. Called with the
        permutation returned by sort_batch, this restores the order the paths had
        before they were sorted by length.
        """
        _, perm_idx = indexes.sort(0, descending=False)
        seq_tensor = path_embedding[perm_idx]
        return seq_tensor

    def sort_batch(self, batch, indexes, lengths):
        """
        sorts a batch of paths by path length, in decreasing order

        :return: (sorted batch, indexes in the same order, sorted lengths on the CPU as
            required for packing, the permutation that was applied); tensors other than
            the lengths stay on the device of their inputs
        """
        seq_lengths, perm_idx = lengths.sort(0, descending=True)
        seq_tensor = batch[perm_idx]
        indexes_tensor = indexes[perm_idx]
        return seq_tensor, indexes_tensor, seq_lengths.cpu(), perm_idx
'''
register_module('model.COMPER', comper_src)


In [None]:
pred_src = r'''
import model.train as _tr
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from model.train import InteractionData, my_collate
import math


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def test(model, test_paths_file, batch_size, model_path, not_in_memory):
    """
    Evaluates the checkpoint stored at model_path on the test path file.

    :param model: network whose weights are replaced by the checkpoint; it is moved to
        the module-level `device`
    :param test_paths_file: path file with the test interactions
    :param batch_size: number of interactions per batch
    :param model_path: checkpoint file containing 'model_state_dict'
    :param not_in_memory: when true the path file is read line by line on access
        instead of being loaded completely
    :return: (rmse, mae) averaged over all test interactions
    """
    model = model.to(device)
    loss_function = nn.MSELoss(reduction='none')
    # map_location lets a checkpoint saved on another device be restored here
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    # DataLoader used for batches
    interaction_data = InteractionData(test_paths_file, in_memory=not not_in_memory)
    data_loader = DataLoader(dataset=interaction_data, collate_fn=my_collate, batch_size=batch_size, shuffle=True)

    model.eval()
    rmse_metric = np.zeros(2)  # [sum of squared errors, number of interactions]
    mae_metric = np.zeros(2)   # [sum of absolute errors, number of interactions]
    for interaction_batch, users, items, val_lens, targets in data_loader:
        # flatten the paths of all interactions in the batch into one tensor, with their lengths
        paths = []
        lengths = []
        for interaction_paths in interaction_batch:
            for path, length in interaction_paths:
                paths.append(path)
                lengths.append(length)

        paths = torch.tensor(paths, dtype=torch.long, device=device)
        lengths = torch.tensor(lengths, dtype=torch.long, device=device)
        targets = targets.to(device)

        with torch.no_grad():
            # Run the forward pass on the same device as the model and the path tensors
            prediction_scores = model(paths, lengths, users.to(device), items.to(device),
                                      val_lens.to(device), is_training=False).to(device)

            # Compute the loss
            loss = loss_function(prediction_scores, targets)
        rmse_metric += (float(loss.sum()), len(targets))
        mae_metric += (float(torch.sum(abs(prediction_scores - targets))), len(targets))

    mse = rmse_metric[0] / rmse_metric[1]
    rmse = math.sqrt(mse)
    mae = mae_metric[0] / mae_metric[1]
    return rmse, mae
'''
# Execute the predictor source and register it as the `model.predictor` module.
register_module('model.predictor', pred_src)

# Replace the `model` entry in sys.modules with a package object that exposes
# the public names main imports with `from model import COMPER, train, test`.
import types, sys
model_pkg = types.ModuleType('model')
sys.modules['model'] = model_pkg
model_pkg.InteractionData = sys.modules['model.train'].InteractionData
model_pkg.my_collate = sys.modules['model.train'].my_collate
model_pkg.COMPER = sys.modules['model.COMPER'].COMPER
# Bind the train *function*, not the model.train module: main calls train(...)
# directly, and a module object is not callable. The submodule itself stays
# reachable through sys.modules['model.train'].
model_pkg.train = sys.modules['model.train'].train
model_pkg.test = sys.modules['model.predictor'].test


In [None]:
main_src = r'''
import pickle
import argparse
import random
import numpy as np
import torch.nn as nn
import pandas as pd

import constants.consts as consts
from model import COMPER, train, test


def parse_args():
    """
    Parse the command-line options that control training and evaluation.

    Training and evaluation are both enabled by default. Because '--train' and
    '--eval' are store_true flags that already default to True, they cannot
    disable a stage; '--skip_train' and '--skip_eval' exist for that purpose
    and write to the same 'train' / 'eval' attributes.

    Returns:
        argparse.Namespace with the attributes train, eval, model,
        load_checkpoint, epochs, batch_size, not_in_memory, lr, l2_reg and
        np_baseline.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        default=True,
                        action='store_true',
                        help='whether to train the model')
    parser.add_argument('--skip_train',
                        dest='train',
                        action='store_false',
                        help='do not train the model')
    parser.add_argument('--eval',
                        default=True,
                        action='store_true',
                        help='whether to evaluate the model')
    parser.add_argument('--skip_eval',
                        dest='eval',
                        action='store_false',
                        help='do not evaluate the model')
    parser.add_argument('--model',
                        type=str,
                        default='model.pt',
                        help='name to save or load model from')
    parser.add_argument('--load_checkpoint',
                        default=False,
                        action='store_true',
                        help='whether to load the current model state before training ')
    parser.add_argument('-e', '--epochs',
                        type=int,
                        default=5,
                        help='number of epochs for training model')
    parser.add_argument('-b', '--batch_size',
                        type=int,
                        default=256,
                        help='batch_size')
    parser.add_argument('--not_in_memory',
                        default=False,
                        action='store_true',
                        help='denotes that the path data does not fit in memory')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='learning rate')
    parser.add_argument('--l2_reg',
                        type=float,
                        default=0.01,
                        help='l2 regularization coefficent')
    parser.add_argument('--np_baseline',
                        default=False,
                        action='store_true',
                        help='Run the model with the number of path baseline if True')

    return parser.parse_args()


def load_string_to_ix_dicts():
    """
    Read the pickled type, relation and entity index mappings from disk.

    Returns:
        (type_to_ix, relation_to_ix, entity_to_ix), in that order.
    """
    mapping_dir = 'data/' + consts.DATA_IX_MAPPING_DIR

    def _unpickle(file_name):
        # One pickle file per mapping, all stored in the same directory
        with open(mapping_dir + file_name, 'rb') as pickled:
            return pickle.load(pickled)

    type_to_ix = _unpickle(consts.TYPE_TO_IX)
    relation_to_ix = _unpickle(consts.RELATION_TO_IX)
    entity_to_ix = _unpickle(consts.ENTITY_TO_IX)
    return type_to_ix, relation_to_ix, entity_to_ix


def get_miu(data_file):
    """
    Return the mean of the third column of the CSV file at data_file.

    The file is read as UTF-8 and its first row is treated as the header.
    """
    with open(data_file, 'r', encoding='utf8') as csv_handle:
        ratings_frame = pd.read_csv(csv_handle)
    third_column = ratings_frame.iloc[:, 2]
    return np.mean(third_column)

def main():
    """
    Main function for model testing and training

    Seeds the random number generators, builds a COMPER model from the saved
    index mappings and the mean rating of rating_train.csv, then trains and/or
    evaluates it according to the parsed command-line flags. Evaluation metrics
    are printed and appended to result/main_result.txt.
    """
    # Function-scope imports: this source only imports torch.nn at module level.
    import os
    import torch

    print("Main Loaded")
    # Seed numpy and torch as well as `random`: torch's generator drives the
    # Xavier initialisation below, so seeding `random` alone does not make a
    # run repeatable.
    random.seed(1000)
    np.random.seed(1000)
    torch.manual_seed(1000)
    args = parse_args()
    model_path = "model/" + args.model
    # Ensure the checkpoint directory exists (train is presumably what writes
    # model_path - confirm against model.train).
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    t_to_ix, r_to_ix, e_to_ix = load_string_to_ix_dicts()

    miu = get_miu('data/' + consts.DATASET_DIR + 'rating_train.csv')
    model = COMPER(consts.ENTITY_EMB_DIM, consts.TYPE_EMB_DIM, consts.REL_EMB_DIM, consts.HIDDEN_DIM,
                   consts.ATTENTION_DIM, len(e_to_ix), len(t_to_ix), len(r_to_ix), miu)

    if args.train:
        print("Training Starting")

        # Initialisation: Xavier-uniform weights for the model's direct
        # Embedding and Linear children (nested submodules are not visited).
        for m in model.children():
            if isinstance(m, (nn.Embedding, nn.Linear)):
                nn.init.xavier_uniform_(m.weight)

        # load paths from disk
        train_paths_file = 'data/' + consts.PATH_DATA_DIR + consts.TRAIN_PATH_FILE
        valid_paths_file = 'data/' + consts.PATH_DATA_DIR + consts.VALID_PATH_FILE
        train(model, train_paths_file, valid_paths_file, args.batch_size, args.epochs, model_path,
              args.load_checkpoint, args.not_in_memory, args.lr, args.l2_reg)

    if args.eval:
        print("Evaluation Starting")

        # load paths from disk
        test_paths_file = 'data/' + consts.PATH_DATA_DIR + consts.TEST_PATH_FILE
        rmse, mae = test(model, test_paths_file, args.batch_size, model_path, args.not_in_memory)
        print("Testing_RMSE: %f, Testing_MAE: %f" % (rmse, mae))
        # Create the output directory on demand so a finished evaluation is
        # not lost to a missing folder.
        os.makedirs('result', exist_ok=True)
        with open('result/main_result.txt', 'a') as fp:
            fp.write('RMSE = %s, MAE = %s\n' % (rmse, mae))


# Script entry point. When this source is executed through register_module the
# module is created with the name 'main', so __name__ is not "__main__" and
# main() has to be called explicitly by the importer.
if __name__ == "__main__":
    main()
'''
# Execute main_src in a fresh module object and store it in sys.modules under
# 'main', so a later `import main` resolves to it without a main.py on disk.
register_module('main', main_src)


In [None]:
import pathlib, urllib.request, zipfile, pandas as pd, os, shutil

# Download MovieLens-100K, extract u.data and derive the rating CSVs.
# Each stage has its own existence check so the cell can be re-run after an
# interruption and will only redo the stages whose outputs are missing.
root = pathlib.Path('.')
ds_dir = root / 'MovieLens_100k_dataset'
ds_dir.mkdir(exist_ok=True)

zip_path = root / 'ml-100k.zip'
if not zip_path.exists():
    print('Downloading MovieLens-100K …')
    # Download under a temporary name and rename on success, so an aborted
    # transfer cannot leave a truncated ml-100k.zip that looks complete.
    partial_zip_path = root / 'ml-100k.zip.part'
    urllib.request.urlretrieve('https://files.grouplens.org/datasets/movielens/ml-100k.zip', partial_zip_path)
    partial_zip_path.replace(zip_path)

raw_ratings_path = ds_dir / 'u.data'
if not raw_ratings_path.exists():
    print('Extracting …')
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(root)
    shutil.move(root / 'ml-100k' / 'u.data', raw_ratings_path)

rating_columns = ['user_id', 'item_id', 'ratings']
csv_names = ('rating_data.csv', 'rating_train.csv', 'rating_valid.csv', 'rating_test.csv')
if not all((ds_dir / csv_name).exists() for csv_name in csv_names):
    print('Converting u.data  →  rating_data.csv …')
    from sklearn.model_selection import train_test_split

    ratings_all = pd.read_csv(raw_ratings_path, sep='\t', header=None,
                              names=['user_id', 'item_id', 'ratings', 'timestamp'])
    # 80 / 10 / 10 split; the fixed random_state makes regeneration reproduce
    # the same partition. The frames are deliberately not called train/test,
    # which are the names of the model's entry points.
    ratings_train, ratings_holdout = train_test_split(ratings_all, test_size=0.2, random_state=42)
    ratings_valid, ratings_test = train_test_split(ratings_holdout, test_size=0.5, random_state=42)

    frames_by_csv = zip(csv_names, (ratings_all, ratings_train, ratings_valid, ratings_test))
    for csv_name, ratings_frame in frames_by_csv:
        ratings_frame[rating_columns].to_csv(ds_dir / csv_name, index=False)

print(' MovieLens-100K ready →', ds_dir)


Downloading MovieLens-100K …
Extracting …
Converting u.data  →  rating_data.csv …
 MovieLens-100K ready → MovieLens_100k_dataset


In [12]:
# Run the data-preparation pipeline. According to the log recorded for this
# cell it forms the knowledge graph, maps ids to indices and constructs the
# user-to-item paths; the path-construction stage alone took about 1h43m in
# that run, so avoid re-executing this cell unnecessarily.
import data_preparation
data_preparation.main()


Data preparation:
Forming knowledge graph...
Creating directory MovieLens_100k_data/
the number of users:  943
the number of items:  1682
the number of ratings:  100000




user similar pair numbers:  15516
item similar pair numbers:  14797
Mapping ids to indices...
Creating directory MovieLens_100k_data_ix/
Creating directory MovieLens_100k_ix_mapping/
Constructing paths from user to item...
Creating directory MovieLens_100k_path_data/


100%|██████████| 943/943 [1:43:41<00:00,  6.60s/it]


number of paths attempted to find: 100000
number of train paths not found: 3
number of valid paths not found: 15
number of test paths not found: 18
avg num paths per interaction: 29.73986635188668


In [14]:
import sys, os, shutil
import constants.consts as consts

# Folders main() writes into: checkpoints under model/, metrics under result/
for output_dir in ('model', 'result'):
    os.makedirs(output_dir, exist_ok=True)

# `from model import train` must yield the train function, so re-bind it on
# the package and on the already-registered main module.
import model
train_module = sys.modules['model.train']
train_func = train_module.train
model.train = train_func
if 'main' in sys.modules:
    setattr(sys.modules['main'], 'train', train_func)

# main() reads everything from data/<dir>; relocate the prepared directories
# there unless an earlier run already did.
os.makedirs('data', exist_ok=True)
prepared_dirs = [
    consts.DATASET_DIR.rstrip('/'),
    consts.DATA_IX_MAPPING_DIR.rstrip('/'),
    consts.PATH_DATA_DIR.rstrip('/'),
]
for dir_name in prepared_dirs:
    target_dir = os.path.join('data', dir_name)
    already_moved = os.path.exists(target_dir)
    if os.path.exists(dir_name) and not already_moved:
        shutil.move(dir_name, target_dir)

# parse_args() reads sys.argv, so emulate the command line here.
cli_options = ['--train', '--epochs', '5', '-b', '256', '--lr', '0.01', '--l2_reg', '0.01']
sys.argv = ['main', *cli_options]

import main
main.main()


Main Loaded
Training Starting
Epoch: 1, Training_RMSE: 0.972978, Training_MAE: 0.773217, Validating_RMSE: 0.957811, Validating_MAE: 0.758035
Epoch: 2, Training_RMSE: 0.935074, Training_MAE: 0.739534, Validating_RMSE: 0.940854, Validating_MAE: 0.736498
Epoch: 3, Training_RMSE: 0.921025, Training_MAE: 0.726850, Validating_RMSE: 0.933321, Validating_MAE: 0.737486
Epoch: 4, Training_RMSE: 0.911654, Training_MAE: 0.717986, Validating_RMSE: 0.930260, Validating_MAE: 0.734418
Epoch: 5, Training_RMSE: 0.904607, Training_MAE: 0.712732, Validating_RMSE: 0.928204, Validating_MAE: 0.729147
Evaluation Starting
Testing_RMSE: 0.919606, Testing_MAE: 0.722576


In [15]:
from google.colab import drive
import os
import ast
import json
from tqdm import tqdm
from constants import consts

# Step 1: mount Google Drive (the export destination lives on it)
drive.mount('/content/drive')

# Step 2: where the path files are read from and where the JSONL files go
LOCAL_PATH_DIR = os.path.join('data', consts.PATH_DATA_DIR)  # e.g., 'data/MovieLens_100k_path_data/'
DEST_ROOT = '/content/drive/MyDrive/COMPER_movie_lens_100k_meta_paths'  # MovieLens-100k output folder
os.makedirs(DEST_ROOT, exist_ok=True)

# Step 3: split name -> source path file for that split
SPLITS = dict(
    train=consts.TRAIN_PATH_FILE,
    val=consts.VALID_PATH_FILE,
    test=consts.TEST_PATH_FILE,
)

# Step 4: helper that reshapes formatted paths for JSON serialisation
def convert_paths(formatted_paths):
    """Turn each (padded_path, real_len) pair into a {'triplets', 'len'} dict for JSONL."""
    json_ready = []
    for padded_path, real_len in formatted_paths:
        json_ready.append({'triplets': padded_path, 'len': real_len})
    return json_ready

# Step 5: convert every split, one JSON object per input line, into
# <split>_movielens100k_meta_paths.jsonl under DEST_ROOT
for split_name, source_name in SPLITS.items():
    source_path = os.path.join(LOCAL_PATH_DIR, source_name)
    target_path = os.path.join(DEST_ROOT, f'{split_name}_movielens100k_meta_paths.jsonl')

    with open(source_path, 'r') as reader, open(target_path, 'w') as writer:
        for raw_line in tqdm(reader, desc=f'Processing {split_name} split'):
            # Each line is a Python literal with five fields; the fourth is not exported
            parsed_fields = ast.literal_eval(raw_line.strip())
            path_entries, user_idx, item_idx, _, rating = parsed_fields
            json_line = json.dumps(dict(
                user_idx=int(user_idx),
                item_idx=int(item_idx),
                rating=float(rating),
                paths=convert_paths(path_entries),
            ))
            writer.write(f'{json_line}\n')

print('JSONL files for MovieLens-100k saved in:', DEST_ROOT)

Mounted at /content/drive


Processing train split: 80000it [02:22, 560.53it/s]
Processing val split: 10000it [00:18, 537.87it/s]
Processing test split: 10000it [00:18, 548.48it/s]

JSONL files for MovieLens-100k saved in: /content/drive/MyDrive/COMPER_movie_lens_100k_meta_paths



