In [7]:
import numpy as np
import os
from pandas import DataFrame
import pickle
import random

In [10]:
# Build the labelled rating matrix for one dataset:
#   1. read every (user, item) interaction from <dataset>/train.txt and test.txt,
#   2. keep interactions whose user AND item occur more than `core` times,
#   3. add sampled negatives (label 0) for each remaining user,
#   4. split into train/eval/test and save the full matrix as <dataset>/ratings_final.npy.
dataset = 'amazon-book'
# dataset = 'yelp2018'
# dataset = 'last-fm'
core = 10

# Seed both RNGs so the sampled negatives (random.sample) and the split
# (np.random inside dataset_split) are reproducible across runs.
seed = 42
random.seed(seed)
np.random.seed(seed)

print('dataset: %s, core: %d' % (dataset, core))
rating_file = dataset + '/ratings_final'
train_file = dataset + '/train'
test_file = dataset + '/test'


def _read_interactions(path):
    """Parse '<user> <item> <item> ...' lines of `path` into [user, item, 1] rows."""
    rows = []
    with open(path) as f:
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # Blank line (e.g. at end of file): nothing to parse.
                continue
            u = int(fields[0])
            rows.extend([u, int(i), 1] for i in fields[1:])
    return rows


# Interactions from both files are pooled and all labelled 1.
rating_arr = _read_interactions(train_file + '.txt') + _read_interactions(test_file + '.txt')
# reshape keeps the array 2-D (n, 3) even if nothing was read.
rating_np = np.array(rating_arr, dtype=np.int64).reshape(-1, 3)

# Occurrence counts per user / item; the inverse arrays map every row back to
# its position in the unique-id arrays so the counts can be looked up per row.
user_ids, user_inverse, user_counts = np.unique(
    rating_np[:, 0], return_inverse=True, return_counts=True)
item_ids, item_inverse, item_counts = np.unique(
    rating_np[:, 1], return_inverse=True, return_counts=True)
user_counter = dict(zip(user_ids.tolist(), user_counts.tolist()))
item_counter = dict(zip(item_ids.tolist(), item_counts.tolist()))

n_user = len(user_ids)
n_item = len(item_ids)
print('before core filter:\t%10d, user(%d), item(%d)' % (len(rating_np), n_user, n_item))

# Unfiltered interactions and their item universe (used for negative sampling below).
all_rating_np = rating_np
all_item_set = set(item_ids.tolist())

# Single-pass filter on the counts computed above (counts are not recomputed
# after filtering). The comparison is strict: exactly `core` occurrences is dropped.
# NOTE(review): confirm that `>` rather than `>=` is the intended threshold.
core_mask = (user_counts[user_inverse] > core) & (item_counts[item_inverse] > core)
core_rating_np = all_rating_np[core_mask]
print('after core filter:\t%10d' % (len(core_rating_np)), end='')
rating_np = core_rating_np

item_set = set(rating_np[:, 1].tolist())
n_user = len(np.unique(rating_np[:, 0]))
n_item = len(item_set)
print(', user(%d), item(%d)' % (n_user, n_item))

# Positive items per user, in order of each user's first appearance after filtering.
user_history_dict = {}
for u, i in rating_np[:, :2].tolist():
    user_history_dict.setdefault(u, set()).add(i)


print('before add neg, # of interaction: %d' % (len(rating_np)))
# For each user, draw as many negatives as the user has positives (fewer if
# there are not enough unseen items).
# NOTE(review): negatives come from all_item_set (items before the core
# filter), not item_set -- confirm this is intended.
neg_data = []
for u, i_set in user_history_dict.items():
    neg_i = list(all_item_set - i_set)
    neg_i = random.sample(neg_i, min(len(neg_i), len(i_set)))
    neg_data.extend([u, i, 0] for i in neg_i)

# reshape keeps the (n, 3) shape so the append works even with no negatives.
neg_data = np.array(neg_data, dtype=np.int64).reshape(-1, 3)
rating_np = np.append(rating_np, neg_data, axis=0)

# NOTE: dataset_split is defined in a separate cell of this notebook; that cell
# has to be executed before this one. user_history_dict is rebound here to the
# dictionary returned by dataset_split.
train_data, eval_data, test_data, user_history_dict = dataset_split(rating_np)
print('add neg, # of interaction: %d' % (len(rating_np)))
np.save(rating_file, rating_np)
print('rating_file saved!')

title = [['user', 'item', 'like']]
def kk(d, t):
    """Write one data split to ``data_filter/<dataset>/<t>_pd.csv``.

    Args:
        d: 2-D array; columns 0, 1 and 2 are written as 'user', 'item'
            and 'like' respectively.
        t: split name used as the file-name prefix (e.g. 'train').

    Reads the module-level ``dataset`` name to build the output path. The
    CSV is written with pandas' default ``to_csv`` settings.
    """
    out_path = f'data_filter/{dataset}/{t}_pd.csv'
    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    data = {
        'user': d[:, 0],
        'item': d[:, 1],
        'like': d[:, 2]
    }
    df = DataFrame(data, columns=['user', 'item', 'like'])
    df.to_csv(out_path)

# Export each split to its own CSV file (same order as before: train, test, eval).
for split_name, split_rows in (('train', train_data),
                               ('test', test_data),
                               ('eval', eval_data)):
    kk(split_rows, split_name)

dataset: amazon-book, core: 10
before core filter:	    846434, user(70679), item(24915)
after core filter:	    467791, user(18441), item(15717)
before add neg, # of interaction: 467791
splitting dataset ...
add neg, # of interaction: 935582
rating_file saved!


In [3]:
def dataset_split(rating_np, eval_ratio=0.2, test_ratio=0.2, seed=None):
    """Randomly split rating rows into train / eval / test subsets.

    Args:
        rating_np: 2-D array whose first three columns are read as
            (user, item, rating).
        eval_ratio: fraction of all rows drawn for the eval split.
        test_ratio: fraction of all rows drawn for the test split.
            The remaining rows form the training split
            (defaults give train:eval:test = 6:2:2).
        seed: optional seed for a private ``np.random.RandomState``. When
            None, the global ``np.random`` state is used.

    Returns:
        (train_data, eval_data, test_data, user_history_dict) where
        user_history_dict maps each user that has at least one rating == 1
        row in the training split to the list of those items. Rows whose
        user is not in user_history_dict are removed from all three splits.

    Raises:
        ValueError: if a ratio is negative or the two ratios sum to more than 1.
    """
    print('splitting dataset ...')

    if eval_ratio < 0 or test_ratio < 0 or eval_ratio + test_ratio > 1:
        raise ValueError(
            'eval_ratio and test_ratio must be non-negative and sum to at most 1, '
            'got %r and %r' % (eval_ratio, test_ratio))

    rng = np.random if seed is None else np.random.RandomState(seed)
    n_ratings = rating_np.shape[0]

    # Draw eval rows first, then test rows from what is left; the rest is train.
    eval_indices = rng.choice(n_ratings, size=int(n_ratings * eval_ratio), replace=False)
    left = set(range(n_ratings)) - set(eval_indices.tolist())
    # Explicit dtype keeps this an integer array even when `left` is empty.
    left_pool = np.array(list(left), dtype=np.int64)
    test_indices = rng.choice(left_pool, size=int(n_ratings * test_ratio), replace=False)
    train_indices = np.array(list(left - set(test_indices.tolist())), dtype=np.int64)

    # traverse training data, only keeping the users with positive ratings
    user_history_dict = dict()
    for row in rating_np[train_indices]:
        user, item, rating = row[0], row[1], row[2]
        if rating == 1:
            if user not in user_history_dict:
                user_history_dict[user] = []
            user_history_dict[user].append(item)

    users = rating_np[:, 0]
    known_users = np.array(list(user_history_dict), dtype=users.dtype)

    def _keep_known(indices):
        # Keep (in order) only the row indices whose user has a training positive.
        indices = np.asarray(indices, dtype=np.int64)
        return indices[np.isin(users[indices], known_users)]

    train_data = rating_np[_keep_known(train_indices)]
    eval_data = rating_np[_keep_known(eval_indices)]
    test_data = rating_np[_keep_known(test_indices)]

    return train_data, eval_data, test_data, user_history_dict