In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import sys
import os
import collections
from tqdm import tqdm
# `%pwd` is an IPython line magic; its result (the current working directory)
# is captured in HERE.
HERE = %pwd
# Append the parent of the working directory to the import path
# (presumably so project modules located one level up can be imported -- confirm).
sys.path.append(os.path.dirname(HERE))

In [2]:
# token counter
import tiktoken
from tiktoken.core import Encoding
encoding = tiktoken.get_encoding("o200k_base")


def compute_token(text):
    """Return how many tokens `text` is split into by the module-level `encoding`."""
    token_ids = encoding.encode(text)
    n_tokens = len(token_ids)
    return n_tokens

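1. Download, for each dataset, the review file (`{name}_5.json.gz`) and the item metadata file (`meta_{name}.json.gz`) from https://nijianmo.github.io/amazon/index.html  
1. Place both files of each dataset under `dir_save`, e.g. `f"{dir_save}/Music/CDs_and_Vinyl_5.json.gz"` and `f"{dir_save}/Music/meta_CDs_and_Vinyl.json.gz"`. The directory names and file names are defined by `dict_amazon_name` in the cell below.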

In [None]:
# root directory that holds the raw data (one sub-directory per dataset)
dir_save = "../data/raw_data/amazon18"

# sub-directory name under `dir_save` (key) -> raw data file stem (value);
# the loader reads f"{value}_5.json.gz" and f"meta_{value}.json.gz" from that sub-directory
dict_amazon_name = {
    "Music" : "CDs_and_Vinyl",
    "Movie" : "Movies_and_TV",
    "Book" : "Books",
    "Grocery" : "Grocery_and_Gourmet_Food",
    "Clothes" : "Clothing_Shoes_and_Jewelry",
    "Beauty" : "All_Beauty"
}

In [3]:
import gzip
import json
from tqdm import tqdm


def amazon(data_name, dir_save):
    """Load the review records and the item master of one Amazon dataset.

    Two gzip-compressed JSON-lines files are read from
    ``f"{dir_save}/{data_name}"``, where
    ``amazon_name = dict_amazon_name[data_name]``:

    * ``{amazon_name}_5.json.gz``    -- one review per line
    * ``meta_{amazon_name}.json.gz`` -- one item per line

    Parameters
    ----------
    data_name : str
        Key of the module-level ``dict_amazon_name`` (e.g. ``"Music"``).
    dir_save : str
        Root directory that contains one sub-directory per dataset.

    Returns
    -------
    df_records : pandas.DataFrame
        Columns ``userID, itemID, rating, time, review``. ``time`` is a
        ``YYYYMMDD`` string; rows are sorted by ``time`` in descending order
        and, for each ``(userID, itemID)`` pair, only the first row in that
        order is kept. Reviews longer than 300 tokens (or without text) are
        stored as ``""``.
    df_items : pandas.DataFrame
        Indexed by item id (``asin``) with columns
        ``title, categories, description``. Items whose cleaned title is
        empty are removed.

    Raises
    ------
    KeyError
        If ``data_name`` is not a key of ``dict_amazon_name``, or an item
        line has no ``asin`` field.
    """
    amazon_name = dict_amazon_name[data_name]
    dir_path = f"{dir_save}/{data_name}"
    path_records = f"{dir_path}/{amazon_name}_5.json.gz"
    path_items = f"{dir_path}/meta_{amazon_name}.json.gz"

    # cleaning thresholds
    max_review_tokens = 300
    max_title_tokens = 100
    max_category_chars = 50
    max_description_tokens = 300
    html_markers = ("class=", "<br>", "<br />", "<I>", "href", "xml")

    def _records():
        """Read the review file into one row per (user, item) interaction."""
        per_user = {}
        n_skipped = 0
        # `with` guarantees the gzip handle is closed, even on error
        with gzip.open(path_records, 'rb') as g:
            for line in tqdm(g):
                d_ = json.loads(line)

                # `except Exception` (not a bare `except`) keeps the best-effort
                # parsing but lets KeyboardInterrupt / SystemExit through, so
                # this long-running loop stays interruptible.
                try:
                    user = d_['reviewerID']
                    item = d_['asin']
                    rating = d_['overall']

                    # reviewTime is expected to look like "MM D, YYYY";
                    # it is rewritten as "YYYYMMDD" with a zero-padded day
                    t_, year = d_["reviewTime"].split(", ")
                    date_key = f"{year}{t_[0:2]}{t_[3:]:0>2}"

                    try:
                        text = d_["reviewText"]
                        # remove long reviews because they tend to be meaningless texts (e.g., html)
                        if compute_token(text) > max_review_tokens:
                            text = ""
                    except Exception:
                        # missing or un-tokenizable review text
                        text = ""

                    per_user.setdefault(user, []).append([item, rating, date_key, text])
                except Exception:
                    # record lacks a required field or has an unexpected date format
                    n_skipped += 1

        if n_skipped:
            print(f"skipped {n_skipped} malformed records in {path_records}")

        # Flatten to one row per review, grouped by user in first-seen order.
        # Building a single DataFrame avoids creating one frame per user and
        # also works when no record was read.
        rows = [
            [user, *record]
            for user, user_records in per_user.items()
            for record in user_records
        ]
        df_ = pd.DataFrame(rows, columns=['userID', 'itemID', 'rating', 'time', 'review'])

        # newest first ("YYYYMMDD" strings sort chronologically)
        df_ = df_.sort_values(by="time", ascending=False)

        # keep a single row per (user, item): the first one in the order above
        return df_.drop_duplicates(subset=["userID", "itemID"], keep='first')

    def _item():
        """Read the item master and clean title / categories / description."""
        per_item = {}
        with gzip.open(path_items, 'rb') as g:
            for line in tqdm(g):
                d_ = json.loads(line)
                item = d_['asin']

                # title: emptied when it looks like html, is longer than
                # `max_title_tokens`, or has at most 3 distinct characters
                title = d_.get('title', "")
                if ("class=" in title) or (compute_token(title) > max_title_tokens) or (len(set(title)) <= 3):
                    title = ""

                # category: the first entry is dropped, "&amp;" is unescaped and
                # entries longer than `max_category_chars` characters are removed
                try:
                    cat = [c.replace("&amp;", "&") for c in d_['category'][1:]]
                    categories = ", ".join(c for c in cat if len(c) <= max_category_chars)
                except Exception:
                    categories = ""

                # description
                try:
                    description = ", ".join(d_['description'])

                    ## remove weird text (at most 3 distinct characters)
                    if len(set(description)) <= 3:
                        description = ""

                    ## remove html text
                    if any(marker in description for marker in html_markers):
                        description = ""

                    ## remove descriptions of `max_description_tokens` tokens or more
                    if compute_token(description) >= max_description_tokens:
                        description = ""
                except Exception:
                    description = ""

                # when an item appears several times, keep the entry with the
                # longest cleaned description
                previous = per_item.get(item)
                if previous is None or len(description) > len(previous[2]):
                    per_item[item] = [title, categories, description]

        # one row per item; avoids building a frame with one column per item
        # and transposing it
        df_ = pd.DataFrame.from_dict(
            per_item, orient="index", columns=["title", "categories", "description"]
        )
        df_ = df_.dropna()
        return df_[df_["title"] != ""]

    df_records = _records()
    df_items = _item()
    return df_records, df_items


def remove_un_used_items(df_records, df_items):
    """Restrict records and item master to the items they have in common.

    Parameters
    ----------
    df_records : pandas.DataFrame
        Transaction records with an ``itemID`` column.
    df_items : pandas.DataFrame
        Item master whose index holds the item ids.

    Returns
    -------
    df_r : pandas.DataFrame
        Rows of ``df_records`` whose ``itemID`` is in the index of ``df_items``
        (original row order and index are kept).
    df_i : pandas.DataFrame
        Rows of ``df_items`` for the items that occur in ``df_r``, ordered by
        first appearance in ``df_r``.
    """
    # Vectorised membership test. Unlike a row-wise `apply`, `isin` always
    # yields a boolean mask, so empty `df_records` is handled as well.
    in_master = df_records['itemID'].isin(df_items.index)
    df_r = df_records[in_master]

    # restrict the item master to the items that remain in the records
    df_i = df_items.loc[df_r['itemID'].unique()]
    return df_r, df_i

In [None]:
# directory to save preprocessed data to reduce dataset size
version_input = "20250403_input"

# Seeded generator: the user / candidate-item samples below (and therefore the
# saved CSV files) are reproducible across kernel restarts.
seed = 0
rng = np.random.default_rng(seed)

# user type : heavy or light (bounds on the number of records per user)
du = {
    "light" : {
        "min" : 5,
        "max" : 10
    },
    "heavy" : {
        "min" : 30,
        "max" : 50
    }
}

# datasets
data_names = ["Music", "Movie", "Grocery", "Clothes", "Book"]
for data_name in data_names:
    # load
    df_records, df_items = amazon(data_name, dir_save)
    df_records_master, df_item_master = remove_un_used_items(df_records, df_items)

    # output directory of this dataset; defined before the inner loop because
    # items.csv is written after that loop
    dir_save_data = f"../data/preprocessed_data/{version_input}/Amazon_{data_name}"
    os.makedirs(dir_save_data, exist_ok=True)

    # groupby
    gb_master = df_records_master.groupby("userID")
    users_master = df_records_master["userID"].unique()
    L = [gb_master.get_group(user) for user in users_master]

    di = dict()
    for type_user, dn in du.items():
        # +1: one extra record per user, presumably the one held out for evaluation
        n_min = dn["min"] + 1
        n_max = dn["max"] + 1

        # select users whose transactions satisfy the user type constraints
        # NOTE(review): amazon() returns records sorted newest-first, so
        # `iloc[-1]` is each user's OLDEST record here. If the intent is to
        # require a positive rating on the most recent (held-out) record,
        # this should be `iloc[0]` -- confirm before changing.
        selected = [
            df for df in L
            if n_min <= len(df) <= n_max
            and df["rating"].iloc[-1] > 3
        ]
        if not selected:
            raise ValueError(f"no '{type_user}' user satisfies the constraints in {data_name}")
        df_records_ = pd.concat(selected)
        df_records, df_items = remove_un_used_items(df_records_, df_item_master)

        # groupby
        gb = df_records.groupby("userID")
        users = df_records["userID"].unique()

        # select users
        ## +5 is for supplementary
        users_sample = rng.choice(users, size=200+5, replace=False)

        # create transaction records for the selected users (oldest first)
        df_r = pd.concat([
            gb.get_group(user).sort_values(by="time", ascending=True)
            for user in users_sample
        ])

        # select candidate items: every item of the sampled users plus
        # 500 items drawn from the rest of the item master
        items_core = df_r["itemID"].unique()
        items_all = df_item_master.index.values
        items_others = sorted(set(items_all) - set(items_core))
        items_others = rng.choice(items_others, size=500, replace=False)
        items_candi = sorted(set(items_core).union(set(items_others)))
        di[type_user] = items_candi

        d_ = {
            "data" : data_name,
            "user_type" : type_user,
            "#user" : len(users_sample),
            "#item_candi" : len(items_candi),
            "#user_all" : len(users),
            "#item_all" : len(df_item_master)
        }
        print(d_)

        # save
        df_r.to_csv(f"{dir_save_data}/records_{type_user}.csv")

    # items used in both heavy and light to reduce data size
    items = sorted(set(np.concatenate(list(di.values()))))
    df_i = df_item_master.loc[items]
    df_i.to_csv(f"{dir_save_data}/items.csv")

print("finished")

10408381it [25:55, 6727.92it/s] 