In [1]:
%load_ext autoreload
%autoreload 2

import collections
import copy
import os
import pickle
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm

%matplotlib inline
import matplotlib.pyplot as plt

# make the parent of the notebook's working directory importable
HERE = %pwd
sys.path.append(os.path.dirname(HERE))

In [2]:
# preprocessed data
version_input = "20250403_input"

# directory to save preprocessed data to use experiments
version_prep = "20250403_prep"

data_names = ["Movie", "Music", "Grocery", "Clothes", "Book"]

# evaluation set-up: 1 test item + N_NEGATIVES sampled items = 10 candidates
N_NEGATIVES = 9

# Fixed seed, re-created every time this cell runs, so the sampled candidates
# (and therefore the saved pickles) are reproducible.
SEED = 0
rng = np.random.default_rng(SEED)


def _extract(df_user, items_pool, rng, n_negatives=N_NEGATIVES):
    """Build the in-context and candidate item ids for a single user.

    Parameters
    ----------
    df_user : pd.DataFrame
        Records of one user; only the "itemID" column is read.
    items_pool : list
        Every item id of the dataset, in sorted order. A deterministic order
        is required for the seeded sampling to be reproducible.
    rng : np.random.Generator
        Random generator used to sample the negative candidates.
    n_negatives : int
        Number of items to sample from those absent from the user's records.

    Returns
    -------
    dict
        "id_train": sorted unique item ids found in the user's records.
        "id_candi": sorted ids of the test item plus the sampled negatives.
    """
    item_ids = df_user["itemID"]

    # for in-context learning
    # NOTE(review): id_train also contains the test item taken below (the
    # user's last row) -- confirm downstream code excludes it if this is
    # meant to be a held-out target.
    items_train = sorted(set(item_ids.tolist()))
    seen = set(items_train)

    # for evaluation; the test item is the last row of the user's records
    items_test = item_ids.iloc[-1:].tolist()
    items_others = [item for item in items_pool if item not in seen]

    # Sample WITHOUT replacement: sampling with replacement can draw the same
    # item twice, which the set() below would collapse into < 10 candidates.
    n_draw = min(n_negatives, len(items_others))
    if n_draw:
        # draw positions rather than values so the ids keep their original type
        picked = rng.choice(len(items_others), size=n_draw, replace=False)
        items_others = [items_others[i] for i in picked]
    else:
        items_others = []
    items_candi = sorted(set(items_others).union(items_test))

    return {
        "id_train": items_train,
        "id_candi": items_candi,
    }


for data_name in data_names:
    dir_load_data = f"../data/preprocessed_data/{version_input}/Amazon_{data_name}"
    df_items = pd.read_csv(f"{dir_load_data}/items.csv", index_col=0).fillna("")
    items_all = set(df_items.index.values)
    # sorted once per dataset instead of once per user
    items_pool = sorted(items_all)

    dir_save_data = f"../data/preprocessed_data/{version_prep}/Amazon_{data_name}"
    os.makedirs(dir_save_data, exist_ok=True)

    di = dict()
    for type_user in ["light", "heavy"]:
        # load
        df_records = pd.read_csv(f"{dir_load_data}/records_{type_user}.csv", index_col=0).fillna("")
        # sort=False keeps users in order of first appearance in the file
        gb = df_records.groupby("userID", sort=False)

        d_ = {
            user: _extract(df_user, items_pool, rng)
            for user, df_user in tqdm(gb, total=gb.ngroups)
        }

        # items that appeared for experiments
        items_used = set()
        for d in d_.values():
            items_used.update(d["id_train"], d["id_candi"])
        items = sorted(items_used)
        di[type_user] = items

        # save
        with open(f"{dir_save_data}/ids_{type_user}.pickle", 'wb') as f:
            pickle.dump(d_, f)
        print(data_name, type_user, len(items))

    # keep only the item rows referenced by either user type
    items = sorted(set().union(*di.values()))
    df_items_valid = df_items.loc[items]
    df_items_valid.to_csv(f"{dir_save_data}/items_slim.csv")
print("finished")

100%|█████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 208.16it/s]


Movie light 2709


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 226.00it/s]


Movie heavy 6246


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:01<00:00, 192.23it/s]


Music light 2820


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:01<00:00, 197.14it/s]


Music heavy 7063


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 230.26it/s]


Grocery light 2693


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 232.66it/s]


Grocery heavy 5937


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 212.50it/s]


Clothes light 2892


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:01<00:00, 204.89it/s]


Clothes heavy 6239


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:01<00:00, 175.49it/s]


Book light 2998


100%|█████████████████████████████████████████████████████████████████| 205/205 [00:01<00:00, 175.41it/s]


Book heavy 7903
finished
