In [1]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import re
import json
import gzip
import torch
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

def load_pickle(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

def save_pickle(data, filename):
    with open(filename, "wb") as f:
        pickle.dump(data, f)

def load_json(file_path):
    with open(file_path, "r") as f:
        return json.load(f)
    
def ReadLineFromFile(path):
    lines = []
    with open(path,'r') as fd:
        for line in fd:
            lines.append(line.rstrip('\n'))
    return lines

def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)
        
'''
Set seeds
'''
seed = 2022
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fbb04906948>

In [2]:
short_data_name = 'ml1m'
full_data_name = 'ml-1m'
if not os.path.exists(os.path.join("../../data/", short_data_name)):
    os.mkdir(os.path.join("../../data/", short_data_name))

In [3]:
# return (user, item, timestamp) sort in get_interaction
def ML1M(rating_score=3):
    datas = []
    data_dict = {}
    data_file = "../../data/raw_data/ml-1m/ratings.dat"

    inter_df = pd.read_csv(data_file, sep='::', header=None)
    inter_df.columns = ["uid", "iid", "rating", "timestamp"]
    inter_df = inter_df.sort_values(by="timestamp")
    inter_df = inter_df.drop_duplicates(["uid", "iid"]).reset_index(drop=True)

    for idx in range(len(inter_df)):
        user = str(inter_df.loc[idx, "uid"])
        item = str(inter_df.loc[idx, "iid"])
        time = inter_df.loc[idx, "timestamp"]
        rating = inter_df.loc[idx, "rating"]
        if rating <= rating_score:
            continue
        data_dict[(user, item)] = int(time)
        datas.append((user, item, int(time)))
    return datas

def ML1M_meta(datamaps):
    meta_datas = {}
    genres, years = defaultdict(int), defaultdict(int)
    meta_file = "../../data/raw_data/ml-1m/movies.dat"
    item_ids = list(datamaps['item2id'].keys())
    item_df = pd.read_csv(meta_file, sep='::', header=None, encoding="ISO-8859-1")
    item_df.columns = ["iid", 'i_title', 'i_genre']
    
    for idx in range(len(item_df)):
        iid = str(item_df.loc[idx, "iid"])
        if iid not in item_ids:
            continue
        title = item_df.loc[idx, "i_title"]
        item_title = title[:-7]
        item_year = title[-5:-1]
        item_genre = item_df.loc[idx, "i_genre"].replace("|", ", ")
        meta = {"id": str(iid), "title": item_title, "year": item_year, "genre": item_genre}
        mapped_id = datamaps['item2id'][iid]
        meta_datas[mapped_id] = meta

    user_datas = {}
    user_file = "../../data/raw_data/ml-1m/users.dat"
    user_ids = list(datamaps['user2id'].keys())
    user_df = pd.read_csv(user_file, sep='::', header=None)
    user_df.columns = ["uid", "gender", "age", "occupation", "zip_code"]
    for idx in range(len(user_df)):
        uid = str(user_df.loc[idx, "uid"])
        if uid not in user_ids:
            continue
        gender = user_df.loc[idx, "gender"]
        age = user_df.loc[idx, "age"]
        occupation = user_df.loc[idx, "occupation"]
        zip_code = user_df.loc[idx, "zip_code"]
        user = {"id": str(uid), "gender": gender, "age": age, "occupation": occupation, "zip_code": zip_code}

    datamaps['genres'] = genres
    return meta_datas, datamaps, user_datas
     
def add_comma(num): # 1000000 -> 1,000,000
    str_num = str(num)
    res_num = ''
    for i in range(len(str_num)):
        res_num += str_num[i]
        if (len(str_num)-i-1) % 3 == 0:
            res_num += ','
    return res_num[:-1]

# get user interaction sequence for sequential recommendation
def get_interaction(datas):
    user_seq = {}
    for data in datas:
        user, item, time = data
        if user in user_seq:
            user_seq[user].append((item, time))
        else:
            user_seq[user] = []
            user_seq[user].append((item, time))

    for user, item_time in user_seq.items():
        item_time.sort(key=lambda x: x[1])  
        items = []
        for t in item_time:
            items.append(t[0])
        user_seq[user] = items
    return user_seq

# K-core user_core item_core, return False if any user/item < core
def check_Kcore(user_items, user_core, item_core):
    user_count = defaultdict(int)
    item_count = defaultdict(int)
    for user, items in user_items.items():
        for item in items:
            user_count[user] += 1
            item_count[item] += 1

    for user, num in user_count.items():
        if num < user_core:
            return user_count, item_count, False
    for item, num in item_count.items():
        if num < item_core:
            return user_count, item_count, False
    return user_count, item_count, True # Kcore guaranteed

# recursively K-core filtering 
def filter_Kcore(user_items, user_core, item_core):  
    user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
    while not isKcore:
        for user, num in user_count.items():
            if user_count[user] < user_core: # remove the user
                user_items.pop(user)
            else:
                for item in user_items[user]:
                    if item_count[item] < item_core:
                        user_items[user].remove(item)
        user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
    return user_items

def id_map(user_items): # user_items dict
    user2id = {} # raw 2 uid
    item2id = {} # raw 2 iid
    id2user = {} # uid 2 raw
    id2item = {} # iid 2 raw
    user_id = 1  # start from 1
    item_id = 1
    final_data = {}
    random_user_list = list(user_items.keys())
    random.shuffle(random_user_list)  # shuffle for re-indexing
    for user in random_user_list:
        items = user_items[user]
        if user not in user2id:
            user2id[user] = str(user_id)
            id2user[str(user_id)] = user
            user_id += 1
        iids = [] # item id lists
        for item in items:
            if item not in item2id:
                item2id[item] = str(item_id)
                id2item[str(item_id)] = item
                item_id += 1
            iids.append(item2id[item])
        uid = user2id[user]
        final_data[uid] = iids
    data_maps = {
        'user2id': user2id,
        'item2id': item2id,
        'id2user': id2user,
        'id2item': id2item
    }
    return final_data, user_id-1, item_id-1, data_maps

In [4]:
def main(data_name, acronym, data_type='ML1M'):
    assert data_type in {'Amazon', 'Yelp', 'Steam', 'ML100k', 'ML1M'}
    rating_score = 3.0  # rating score smaller than this score would be deleted
    # user 5-core item 5-core
    user_core = 5
    item_core = 1
    attribute_core = 0

    datas = ML1M(rating_score)  # list of [user, item, timestamp]

    user_items = get_interaction(datas) # dict of {user: interaction list sorted by time} 
    print(f'{data_name} Raw data has been processed! Lower than {rating_score} are deleted!')
    # raw_id user: [item1, item2, item3...]
    user_items = filter_Kcore(user_items, user_core=user_core, item_core=item_core)
    print(f'User {user_core}-core complete! Item {item_core}-core complete!')

    user_items, user_num, item_num, data_maps = id_map(user_items) # get mapping dicts, randomly shuffle
    user_count, item_count, _ = check_Kcore(user_items, user_core=user_core, item_core=item_core)
    user_count_list = list(user_count.values()) # user click count
    user_avg, user_min, user_max = np.mean(user_count_list), np.min(user_count_list), np.max(user_count_list)
    item_count_list = list(item_count.values()) # item click count
    item_avg, item_min, item_max = np.mean(item_count_list), np.min(item_count_list), np.max(item_count_list)
    interact_num = np.sum([x for x in user_count_list])
    sparsity = (1 - interact_num / (user_num * item_num)) * 100
    show_info = f'Total User: {user_num}, Avg User: {user_avg:.4f}, Min Len: {user_min}, Max Len: {user_max}\n' + \
                f'Total Item: {item_num}, Avg Item: {item_avg:.4f}, Min Inter: {item_min}, Max Inter: {item_max}\n' + \
                f'Iteraction Num: {interact_num}, Sparsity: {sparsity:.2f}%'
    print(show_info)


    print('Begin extracting meta infos...')
    
    meta_infos, datamaps = ML1M_meta(data_maps)

    print(f'{data_name} & {add_comma(user_num)} & {add_comma(item_num)} & {user_avg:.1f}'
          f'& {item_avg:.1f} & {add_comma(interact_num)} & {sparsity:.2f}\% \\')

    # -------------- Save Data ---------------
    data_file = '../../data/{}/'.format(acronym) + 'sequential_data.txt'
    metadata_file = '../../data/{}/'.format(acronym) + 'metadata.json'
    userdata_file = '../../data/{}/'.format(acronym) + 'userdata.json'
    datamaps_file = '../../data/{}/'.format(acronym) + 'datamaps.json'

    with open(data_file, 'w') as out:
        for user, items in user_items.items():
            out.write(user + ' ' + ' '.join(items) + '\n')

    item_keys = sorted(meta_infos.keys(), key=lambda x: int(x))
    print(f"item2id: {len(datamaps['item2id'])}, meta_infos: {len(meta_infos)}, item_keys: {item_keys[:100]}")
    with open(metadata_file, 'w') as out:
        for key in item_keys:
            out.write(json.dumps(meta_infos[key]) + '\n')

    # print("datamap: ", datamaps.keys(), datamaps['item2id'].keys(), datamaps['user2id'].keys(), datamaps['id2item'].keys(), datamaps['id2user'].keys())
    json_str = json.dumps(datamaps)
    with open(datamaps_file, 'w') as out:
        out.write(json_str)

    # -------------- Split Train/Valid/Test for Item Import & Tagging ---------------
    all_items = [item for item in datamaps['item2id'].keys()]
    random.shuffle(all_items)
    train_split = int(len(all_items) * 0.8)
    valid_split = int(len(all_items) * 0.1)
    train_items = all_items[:train_split]
    valid_items = all_items[train_split:train_split+valid_split]
    test_items = all_items[train_split+valid_split:]
    outputs = {'train': train_items, 'val': valid_items, 'test': test_items}
    save_pickle(outputs, '../../data/{}/item_splits.pkl'.format(short_data_name))

In [5]:
main(full_data_name, short_data_name, data_type='ML1M')

  import sys


ml-1m Raw data has been processed! Lower than 3.0 are deleted!
User 5-core complete! Item 1-core complete!
Total User: 6034, Avg User: 95.3384, Min Len: 5, Max Len: 1435
Total Item: 3533, Avg Item: 162.8282, Min Inter: 1, Max Inter: 2853
Iteraction Num: 575272, Sparsity: 97.30%
Begin extracting meta infos...




ml-1m & 6,034 & 3,533 & 95.3& 162.8 & 575,272 & 97.30\% \
item2id: 3533, meta_infos: 3533, item_keys: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100']


In [7]:
def sample_test_data(data_name, test_num=99, sample_type='random'):
    """
    sample_type:
        random:  sample `test_num` negative items randomly.
        pop: sample `test_num` negative items according to item popularity.
    """

    data_file = f'sequential_data.txt'
    if sample_type == 'pop':
        test_file = f'negative_samples_pop.txt'
    else:
        test_file = f'negative_samples.txt'

    item_count = defaultdict(int)
    user_items = defaultdict()

    lines = open('../../data/{}/'.format(data_name) + data_file).readlines()
    for line in tqdm(lines):
        user, items = line.strip().split(' ', 1)
        items = items.split(' ')
        items = [int(item) for item in items]
        user_items[user] = items
        for item in items:
            item_count[item] += 1

    all_item = list(item_count.keys())
    count = list(item_count.values())
    sum_value = np.sum([x for x in count])
    probability = [value / sum_value for value in count]

    user_neg_items = defaultdict()

    for user, user_seq in tqdm(user_items.items()):
        test_samples = []
        while len(test_samples) < test_num:
            if sample_type == 'random':
                sample_ids = np.random.choice(all_item, test_num, replace=False)
            else: # sample_type == 'pop':
                np.random.seed(int(random.random() * 1000))
                sample_ids = np.random.choice(all_item, test_num, replace=False, p=probability)
            sample_ids = [str(item) for item in sample_ids if item not in user_seq and item not in test_samples]
            test_samples.extend(sample_ids)
        test_samples = test_samples[:test_num]
        user_neg_items[user] = test_samples

    with open('../../data/{}/'.format(data_name) + test_file, 'w') as out:
        for user, samples in user_neg_items.items():
            out.write(user+' '+' '.join(samples)+'\n')

In [8]:
sample_test_data(short_data_name, test_num=99, sample_type="pop")  #sample 99 negative testing samples for each user

100%|██████████| 6034/6034 [00:00<00:00, 31907.19it/s]
  0%|          | 0/6034 [00:00<?, ?it/s]


TypeError: Cannot cast scalar from dtype('float64') to dtype('int64') according to the rule 'safe'

### 