In [4]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import re
import json
import gzip
import torch
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

def load_pickle(filename):
    """Read the pickle file at ``filename`` and return the object stored in it."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, mode="rb") as pkl_file:
        loaded = pickle.load(pkl_file)
    return loaded

def save_pickle(data, filename):
    """Serialize ``data`` with pickle into ``filename``, overwriting any existing file."""
    with open(filename, mode="wb") as pkl_file:
        pickle.dump(data, pkl_file)

def load_json(file_path):
    """
    Parse the JSON document stored at ``file_path`` and return the decoded object.

    The file is read as UTF-8 explicitly so that non-ASCII content decodes the
    same way on every platform instead of depending on the locale's default
    encoding.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
    
def ReadLineFromFile(path):
    """Return every line of the text file at ``path`` as a list, with the trailing newline removed."""
    with open(path, 'r') as fd:
        # Only '\n' is stripped; other trailing whitespace is kept as-is.
        return [raw_line.rstrip('\n') for raw_line in fd]

def parse(path):
    """
    Lazily yield one evaluated record per line of the gzip file at ``path``.

    The file is opened when iteration starts and is closed as soon as the
    generator is exhausted or closed (the original left the handle open).
    """
    # SECURITY: eval() executes each line as Python code, so this must only be
    # used on trusted dumps.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield eval(l)
        
'''
Set seeds
'''
# One shared seed for Python's `random`, NumPy's global RNG and PyTorch.
# The shuffles and random draws later in this notebook use `random` and
# `np.random`, so re-running this cell first resets them to the same state.
seed = 2022
random.seed(seed)
np.random.seed(seed)
# Last expression of the cell: the value returned by torch.manual_seed is what
# the notebook displays as this cell's output.
torch.manual_seed(seed)

<torch._C.Generator at 0x10a01d130>

In [5]:
# Dataset selection: `short_data_name` is the output sub-directory under
# ../../data/, `full_data_name` is the name used in the raw file names.
short_data_name = "beauty" # 'beauty' # "sports" # "clothing" # "games"
full_data_name = "Beauty" # 'Beauty' # "Sports_and_Outdoors" # "Clothing_Shoes_and_Jewelry" # "Video_Games"
# makedirs(exist_ok=True) also creates a missing ../../data/ parent and does
# not fail when the directory already exists (os.mkdir did neither).
os.makedirs(os.path.join("../../data/", short_data_name), exist_ok=True)

In [6]:
# Returns a list of (user, item, timestamp) tuples; they are sorted by time later, in get_interaction.
def Amazon(dataset_name, rating_score=3):
    '''
    Collect the (user, item, timestamp) interactions of one Amazon dataset.

    Two files are read from ../../data/raw_data/: the metadata dump
    (meta_<dataset_name>.json.gz), used only to find out which items have a
    title, and the reviews dump (reviews_<dataset_name>_5.json.gz).
    A review is kept when its rating is strictly greater than `rating_score`
    and its item has a title; for a repeated (user, item) pair only the first
    record encountered is kept.

    Review fields (for reference):
    reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
    asin - ID of the product, e.g. 0000013714
    reviewerName - name of the reviewer
    helpful - helpfulness rating of the review, e.g. 2/3  --"helpful": [2, 3],
    reviewText - text of the review  --"reviewText": "I bought this for my husband who plays the piano. ..."
    overall - rating of the product  --"overall": 5.0,
    summary - summary of the review  --"summary": "Heavenly Highway Hymns",
    unixReviewTime - time of the review (unix time)  --"unixReviewTime": 1252800000,
    reviewTime - time of the review (raw)  --"reviewTime": "09 13, 2009"

    Returns a list of (reviewerID, asin, int(unixReviewTime)) tuples in file order.
    '''
    # NOTE: eval() executes every line as Python code; only run on trusted dumps.
    titled_asins = set()
    meta_file = f"../../data/raw_data/meta_{dataset_name}.json.gz"
    with gzip.open(meta_file, "r") as meta_fp:
        for raw in tqdm(meta_fp, desc="load meta data"):
            record = eval(raw)
            if "title" in record:
                titled_asins.add(record['asin'])

    interactions = []
    seen_pairs = set()  # (user, item) pairs already emitted
    data_file = f"../../data/raw_data/reviews_{dataset_name}_5.json.gz"

    with gzip.open(data_file, "r") as review_fp:
        for raw in tqdm(review_fp, desc="load all interactions"):
            review = eval(raw)
            user = review['reviewerID']
            item = review['asin']
            # Drop low ratings and items that have no title in the metadata.
            if float(review['overall']) <= rating_score or item not in titled_asins:
                continue
            pair = (user, item)
            if pair in seen_pairs:
                continue  # duplicate interaction: the first record wins
            stamp = int(review['unixReviewTime'])
            seen_pairs.add(pair)
            interactions.append((user, item, stamp))
    return interactions

def Amazon_meta(dataset_name, datamaps):
    '''
    Load the metadata of every item that survived preprocessing.

    Reads ../../data/raw_data/meta_<dataset_name>.json.gz and keeps only the
    records whose `asin` is a key of datamaps['item2id'].  For kept records the
    title is cleaned and cut at the first comma, a string description is
    cleaned, and the `related` / `imUrl` fields are removed.

    Metadata fields (for reference):
    asin - ID of the product, e.g. 0000031852
    title - name of the product  --"title": "Girls Ballet Tutu Zebra Hot Pink",
    description
    price - price in US dollars (at time of crawl) --"price": 3.17,
    imUrl - url of the product image (str) --"imUrl": "http://ecx.images-amazon.com/images/I/51fAmVkTbyL._SY300_.jpg",
    related - related products (also bought, also viewed, bought together, buy after viewing)
    salesRank - sales rank information --"salesRank": {"Toys & Games": 211836}
    brand - brand name --"brand": "Coxlures",
    categories - list of categories the product belongs to --"categories": [["Sports & Outdoors", "Other Sports", "Dance"]]

    Args:
        dataset_name: name used in the raw metadata file name.
        datamaps: dict holding at least an 'item2id' mapping raw asin -> mapped id.

    Returns:
        dict mapping each mapped item id to its (cleaned) metadata record.
    '''
    meta_datas = {}
    meta_file = f"../../data/raw_data/meta_{dataset_name}.json.gz"
    # Membership is tested against the dict itself; the original built a list
    # of keys and scanned it linearly for every metadata line.
    item2id = datamaps['item2id']
    with gzip.open(meta_file, "r") as fr:
        for line in tqdm(fr, desc="load meta data"):
            # NOTE: eval() executes the line as Python code; trusted dumps only.
            line = eval(line)
            if line['asin'] not in item2id:
                continue
            # NOTE(review): the pattern r'\n\t' only matches a newline that is
            # immediately followed by a tab; if the intent was to replace every
            # newline or tab, it should be r'[\n\t]' - confirm before changing,
            # since that would alter the generated files.
            if "title" in line:
                line['title'] = re.sub(r'\n\t', ' ', line['title']).encode('UTF-8', 'ignore').decode('UTF-8')
                line['title'] = line['title'].split(",")[0]  # keep only the text before the first comma
            if "description" in line and isinstance(line['description'], str):
                line['description'] = re.sub(r'\n\t', ' ', line['description']).encode('UTF-8', 'ignore').decode('UTF-8')
            # Drop the fields that are not written to metadata.json.
            line.pop('related', None)
            line.pop('imUrl', None)
            meta_datas[item2id[line['asin']]] = line
    return meta_datas

def Amazon_Review(user2id, item2id, dataset_name, rating_score=3):
    """
    Load the review record of every kept (user, item) interaction.

    Reads ../../data/raw_data/reviews_<dataset_name>_5.json.gz and keeps the
    first record of each (user, item) pair whose rating is strictly greater
    than `rating_score` and whose user / item are keys of `user2id` / `item2id`.

    If a pickled list of aspect explanations exists next to the reviews file
    (reviews_<dataset_name>.pickle or reviews_<dataset_name>.pkl), the entry at
    the same line index supplies a 'sentence' list; one entry of that list is
    picked at random and stored on the record as 'explanation' (element 2) and
    'feature' (element 0).

    Args:
        user2id: mapping whose keys are the raw user ids to keep.
        item2id: mapping whose keys are the raw item ids to keep.
        dataset_name: name used in the raw file names.
        rating_score: reviews rated <= this value are skipped.

    Returns:
        dict mapping (raw user id, raw item id) to the review record.
    """
    review_data = {}
    data_file = f"../../data/raw_data/reviews_{dataset_name}_5.json.gz"

    # The original tested for "<name>.pkl" but then loaded "<name>.pickle", so
    # it crashed or silently skipped the explanations depending on which file
    # was present.  Accept either extension and load the file that exists
    # (".pickle" first, which is the path the original actually loaded).
    aspect_explanations = None
    for extension in ("pickle", "pkl"):
        candidate = f"../../data/raw_data/reviews_{dataset_name}.{extension}"
        if os.path.exists(candidate):
            # NOTE: unpickling executes code from the file; trusted files only.
            aspect_explanations = load_pickle(candidate)
            break
    no_sentence = 0

    with gzip.open(data_file, "r") as fr:
        for lidx, line in tqdm(enumerate(fr)):
            # NOTE: eval() executes the line as Python code; trusted dumps only.
            line = eval(line)
            if float(line['overall']) <= rating_score: # remove low rating
                continue
            user = line['reviewerID']
            item = line['asin']
            if (user, item) in review_data or user not in user2id or item not in item2id:
                continue

            if 'reviewText' in line:
                exp_ = line
                if aspect_explanations is not None:
                    # Explanations are looked up by the line index of the review.
                    exp_ = aspect_explanations[lidx]
                    assert exp_['user'] == user and exp_['item'] == item, \
                        f"explanation entry {lidx} does not match review ({user}, {item})"
                sentences = exp_['sentence'] if 'sentence' in exp_ else None
                if sentences:
                    # randomly sample review of only one feature
                    selected_idx = random.randint(0, len(sentences)-1)
                    line['explanation'] = sentences[selected_idx][2]
                    line['feature'] = sentences[selected_idx][0]
                else:
                    # Missing or empty sentence list (an empty list used to
                    # crash random.randint with an empty range).
                    no_sentence += 1
                # NOTE(review): r'\n\t' only matches a newline directly followed
                # by a tab; r'[\n\t]' may have been intended - confirm.
                line['reviewText'] = re.sub(r'\n\t', ' ', line['reviewText']).encode('UTF-8', 'ignore').decode('UTF-8')
            review_data[(user, item)] = line

    # how to obtain better review data?
    print(f"No sentence: {no_sentence}/{len(review_data)}.")
    return review_data
        
def add_comma(num): # 1000000 -> 1,000,000
    """
    Format ``num`` with a comma between every group of three integer digits.

    ``str(num)`` is grouped, so plain ints, NumPy integers and digit strings
    all work.  A leading sign and anything after a decimal point are kept
    unchanged and are not counted when grouping (the original produced
    '-,100' for -100 and '123,4.5' for 1234.5).

    Returns:
        The formatted string, e.g. '1,000,000'.
    """
    text = str(num)
    sign = ''
    if text.startswith(('-', '+')):
        sign, text = text[0], text[1:]
    whole, dot, fraction = text.partition('.')
    # The leftmost group may be shorter than three digits.
    head = len(whole) % 3
    groups = [whole[:head]] if head else []
    groups.extend(whole[i:i + 3] for i in range(head, len(whole), 3))
    return sign + ','.join(groups) + dot + fraction

# get user interaction sequence for sequential recommendation
def get_interaction(datas):
    """
    Group (user, item, time) records into one time-ordered item list per user.

    Users appear in the result in order of first occurrence; items with equal
    timestamps keep their input order (the sort is stable).

    Returns:
        dict mapping user -> list of items sorted by ascending time.
    """
    timeline = {}
    for user, item, time in datas:
        timeline.setdefault(user, []).append((item, time))

    for user in timeline:
        chronological = sorted(timeline[user], key=lambda pair: pair[1])
        timeline[user] = [item for item, _ in chronological]
    return timeline

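# K-core check: count interactions per user and per item; the returned flag is False if any user has fewer than user_core or any item fewer than item_core interactions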
def check_Kcore(user_items, user_core, item_core):
    """
    Count interactions per user and per item and test the k-core condition.

    Users whose item list is empty never get an entry in the user counter, so
    they do not influence the result.

    Returns:
        (user_count, item_count, is_kcore) where the counters are
        ``defaultdict(int)`` and ``is_kcore`` is False as soon as one counted
        user has fewer than ``user_core`` or one item fewer than ``item_core``
        interactions.
    """
    user_count = defaultdict(int)
    item_count = defaultdict(int)
    for uid, interacted in user_items.items():
        for iid in interacted:
            user_count[uid] += 1
            item_count[iid] += 1

    user_violation = any(total < user_core for total in user_count.values())
    item_violation = any(total < item_core for total in item_count.values())
    return user_count, item_count, not (user_violation or item_violation)
 
def filter_Kcore(user_items, user_core, item_core):
    """
    Repeatedly drop weak users and weak items until the k-core condition holds.

    ``user_items`` is modified in place and also returned.  In every pass,
    users with fewer than ``user_core`` interactions are removed and, for the
    remaining users, items with fewer than ``item_core`` interactions are
    removed from their lists; counts are then recomputed with ``check_Kcore``.

    Differences from the original implementation:
    - items are filtered with a rebuilt list instead of ``list.remove`` inside
      a loop over the same list, which skipped the element following every
      removal and therefore needed extra full passes;
    - a user whose list drops below ``user_core`` during a pass (including an
      emptied list, which ``check_Kcore`` never reports) is removed right away
      instead of being left behind.

    Users that already have an empty list on input are not counted by
    ``check_Kcore`` and are left untouched, as before.
    """
    user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
    while not isKcore:
        # Snapshot of the counted users: user_items is modified inside the loop.
        for user in list(user_count):
            if user_count[user] < user_core:
                user_items.pop(user)
                continue
            items = user_items[user]
            # Slice assignment keeps the same list object, like the original
            # in-place removal did.
            items[:] = [item for item in items if item_count[item] >= item_core]
            if len(items) < user_core:
                user_items.pop(user)
        user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
    return user_items

def id_map(user_items): # user_items dict
    """
    Replace raw user / item identifiers by consecutive string ids starting at "1".

    Users are numbered in a random order (one ``random.shuffle`` of the user
    list); items are numbered in the order they are first met while walking
    the shuffled users.

    Returns:
        (final_data, user_num, item_num, data_maps) where ``final_data`` maps
        user id -> list of item ids, the two counts are the number of distinct
        users / items, and ``data_maps`` holds the four lookup dicts
        'user2id', 'item2id', 'id2user' and 'id2item'.
    """
    user2id, id2user = {}, {}  # raw -> uid, uid -> raw
    item2id, id2item = {}, {}  # raw -> iid, iid -> raw
    final_data = {}

    shuffled_users = list(user_items.keys())
    random.shuffle(shuffled_users)

    for raw_user in shuffled_users:
        if raw_user not in user2id:
            new_uid = str(len(user2id) + 1)  # ids start from 1
            user2id[raw_user] = new_uid
            id2user[new_uid] = raw_user

        mapped_items = []
        for raw_item in user_items[raw_user]:
            if raw_item not in item2id:
                new_iid = str(len(item2id) + 1)
                item2id[raw_item] = new_iid
                id2item[new_iid] = raw_item
            mapped_items.append(item2id[raw_item])
        final_data[user2id[raw_user]] = mapped_items

    data_maps = {}
    data_maps['user2id'] = user2id
    data_maps['item2id'] = item2id
    data_maps['id2user'] = id2user
    data_maps['id2item'] = id2item
    return final_data, len(user2id), len(item2id), data_maps

In [7]:
def main(data_name, acronym, data_type='Amazon'):
    """
    Run the whole preprocessing pipeline for one dataset.

    Steps: load the interactions (``Amazon``), order them per user
    (``get_interaction``), apply the 5-core filter (``filter_Kcore``), remap
    raw ids (``id_map``), print statistics, load item metadata
    (``Amazon_meta``) and write everything to ``../../data/<acronym>/``:
    sequential_data.txt, metadata.json, datamaps.json, item_splits.pkl,
    review_splits.pkl and exp_splits.pkl.

    Args:
        data_name: dataset name used in the raw file names.
        acronym: name of the output sub-directory under ../../data/.  All
            output files go there (the original wrote the three pickle files
            under the notebook global ``short_data_name`` instead).
        data_type: must be 'Amazon', 'Yelp' or 'Steam'; it is only validated,
            the Amazon loaders are used regardless.
    """
    assert data_type in {'Amazon', 'Yelp', 'Steam'}
    rating_score = 0.0  # interactions rated <= this score are dropped by the loaders
    # user 5-core item 5-core
    user_core = 5
    item_core = 5

    datas = Amazon(data_name, rating_score)  # list of [user, item, timestamp]

    user_items = get_interaction(datas) # dict of {user: interaction list sorted by time}
    print(f'{data_name} Raw data has been processed! Lower than {rating_score} are deleted!')
    # raw_id user: [item1, item2, item3...]
    user_items = filter_Kcore(user_items, user_core=user_core, item_core=item_core)
    print(f'User {user_core}-core complete! Item {item_core}-core complete!')

    user_items, user_num, item_num, datamaps = id_map(user_items) # get mapping dicts, randomly shuffle
    # check_Kcore is reused here only for its per-user / per-item counters.
    user_count, item_count, _ = check_Kcore(user_items, user_core=user_core, item_core=item_core)
    user_count_list = list(user_count.values()) # user click count
    user_avg, user_min, user_max = np.mean(user_count_list), np.min(user_count_list), np.max(user_count_list)
    item_count_list = list(item_count.values()) # item click count
    item_avg, item_min, item_max = np.mean(item_count_list), np.min(item_count_list), np.max(item_count_list)
    interact_num = np.sum(user_count_list)
    sparsity = (1 - interact_num / (user_num * item_num)) * 100
    show_info = f'Total User: {user_num}, Avg User: {user_avg:.4f}, Min Len: {user_min}, Max Len: {user_max}\n' + \
                f'Total Item: {item_num}, Avg Item: {item_avg:.4f}, Min Inter: {item_min}, Max Inter: {item_max}\n' + \
                f'Iteraction Num: {interact_num}, Sparsity: {sparsity:.2f}%'
    print(show_info)


    print('Begin extracting meta infos...')

    meta_infos = Amazon_meta(data_name, datamaps)

    # One table row of statistics.  The backslash before '%' is written as
    # '\\%' because '\%' is an invalid escape sequence; the printed text is
    # unchanged.
    print(f'{data_name} & {add_comma(user_num)} & {add_comma(item_num)} & {user_avg:.1f}'
          f'& {item_avg:.1f} & {add_comma(interact_num)} & {sparsity:.2f}\\% \\')

    # -------------- Save Data ---------------
    out_dir = '../../data/{}/'.format(acronym)
    os.makedirs(out_dir, exist_ok=True)  # do not rely on another cell having created it
    data_file = out_dir + 'sequential_data.txt'
    metadata_file = out_dir + 'metadata.json'
    datamaps_file = out_dir + 'datamaps.json'

    with open(data_file, 'w') as out:
        for user, items in user_items.items():
            out.write(user + ' ' + ' '.join(items) + '\n')

    item_keys = sorted(meta_infos.keys(), key=lambda x: int(x))
    print(f"item2id: {len(datamaps['item2id'])}, meta_infos: {len(meta_infos)}, item_keys: {item_keys[:100]}")
    with open(metadata_file, 'w') as out:
        for key in item_keys:
            out.write(json.dumps(meta_infos[key]) + '\n')

    json_str = json.dumps(datamaps)
    with open(datamaps_file, 'w') as out:
        out.write(json_str)

    # -------------- Split Train/Valid/Test for Item Import & Tagging ---------------
    # Random 80% / 10% / remainder split over the raw item ids.
    all_items = list(datamaps['item2id'].keys())
    random.shuffle(all_items)
    train_split = int(len(all_items) * 0.8)
    valid_split = int(len(all_items) * 0.1)
    train_items = all_items[:train_split]
    valid_items = all_items[train_split:train_split+valid_split]
    test_items = all_items[train_split+valid_split:]
    outputs = {'train': train_items, 'val': valid_items, 'test': test_items}
    save_pickle(outputs, '../../data/{}/item_splits.pkl'.format(acronym))


    # -------------- Create Train/Valid/Test for Review ---------------
    review_data = Amazon_Review(datamaps['user2id'], datamaps['item2id'], data_name, rating_score)
    review_outputs = {'train': [], 'val': [], 'test': []}
    exp_outputs = {'train': [], 'val': [], 'test': []}  # only reviews that carry an 'explanation'
    id2user, id2item = datamaps['id2user'], datamaps['id2item']
    for user, items in user_items.items():
        user = id2user[user]
        # Leave-one-out per user: last item -> test, second to last -> val,
        # everything before -> train.
        assignments = [('test', items[-1]), ('val', items[-2])]
        assignments.extend(('train', item) for item in items[:-2])
        for split, item in assignments:
            review = review_data[(user, id2item[item])]
            review_outputs[split].append(review)
            if 'explanation' in review:
                exp_outputs[split].append(review)
    save_pickle(review_outputs, '../../data/{}/review_splits.pkl'.format(acronym))
    save_pickle(exp_outputs, '../../data/{}/exp_splits.pkl'.format(acronym))

In [8]:
# Run the whole pipeline for the dataset selected in the configuration cell.
main(data_name=full_data_name, acronym=short_data_name, data_type='Amazon')

FileNotFoundError: [Errno 2] No such file or directory: '../../data/raw_data/meta_Beauty.json.gz'

In [None]:
def sample_test_data(data_name, test_num=99, sample_type='random'):
    """
    Sample negative test items for every user and write them to a text file.

    Reads ../../data/<data_name>/sequential_data.txt (one line per user:
    ``user item item ...``) and, for each user, draws ``test_num`` distinct
    items the user has not interacted with.  The output file has one line per
    user: ``user neg neg ...``.

    sample_type:
        random:  sample `test_num` negative items randomly
                 (written to negative_samples.txt).
        pop: sample `test_num` negative items according to item popularity
             (written to negative_samples_pop.txt).

    Raises:
        ValueError: if ``sample_type`` is unknown (checked before any work is
            done; the original only failed at write time, after sampling), or
            if a user has fewer than ``test_num`` candidate negatives (the
            original looped forever in that case).
    """
    test_files = {
        'random': 'negative_samples.txt',
        'pop': 'negative_samples_pop.txt',
    }
    if sample_type not in test_files:
        raise ValueError(
            f"unknown sample_type {sample_type!r}; expected one of {sorted(test_files)}")

    data_dir = '../../data/{}/'.format(data_name)
    data_file = 'sequential_data.txt'
    test_file = test_files[sample_type]

    item_count = defaultdict(int)
    user_items = {}

    with open(data_dir + data_file) as fin:
        lines = fin.readlines()
    for line in tqdm(lines):
        user, items = line.strip().split(' ', 1)
        items = [int(item) for item in items.split(' ')]
        user_items[user] = items
        for item in items:
            item_count[item] += 1

    all_item = list(item_count.keys())
    count = list(item_count.values())
    sum_value = np.sum(count)
    # Popularity weights; passing p=None makes np.random.choice sample uniformly.
    probability = [value / sum_value for value in count]
    weights = probability if sample_type == 'pop' else None

    user_neg_items = {}

    for user, user_seq in tqdm(user_items.items()):
        # Everything that may not be sampled (again): the user's own items plus
        # the negatives chosen so far.  The original compared numpy ints with
        # the already-stringified samples, so duplicates drawn in different
        # rounds were never detected.
        excluded = set(user_seq)
        if len(all_item) - len(excluded) < test_num:
            raise ValueError(
                f"user {user} has only {len(all_item) - len(excluded)} candidate "
                f"negative items, cannot sample {test_num}")
        test_samples = []
        while len(test_samples) < test_num:
            sample_ids = np.random.choice(all_item, test_num, replace=False, p=weights)
            for item in sample_ids:
                item = int(item)
                if item not in excluded:
                    excluded.add(item)
                    test_samples.append(str(item))
        user_neg_items[user] = test_samples[:test_num]

    with open(data_dir + test_file, 'w') as out:
        for user, samples in user_neg_items.items():
            out.write(user+' '+' '.join(samples)+'\n')

In [None]:
# Sample 99 popularity-weighted negative test items for each user.
sample_test_data(data_name=short_data_name, test_num=99, sample_type="pop")

100%|██████████| 35598/35598 [00:00<00:00, 189051.65it/s]
100%|██████████| 35598/35598 [02:31<00:00, 235.37it/s]


### 