In [1]:
import os, re, copy
import numpy as np
import pandas as pd
from tqdm import tqdm
import json, gzip

In [2]:
# Directory that holds the raw review / metadata dumps and receives the cached
# tables, the id map and the `chatbot` output folder written by later cells.
DATA_DIR = os.path.expanduser("~/blob/raw_datasets/amazon-beauty-2014")
# Category name used to build the `reviews_<name>.json.gz` / `meta_<name>.json.gz` file names.
DATASET_NAME = "Beauty"

# Path to an all-category metadata dump; not referenced by any other cell visible here.
ALL_AMAZON_PROD = os.path.expanduser("~/blob/raw_datasets/amazon-all/All_Amazon_Meta.json.gz")

In [53]:
import re

def parse(path):
    """Yield one decoded record per non-empty line of a gzip-compressed dump.

    Each line is expected to hold a single JSON-like object. Unescaped single
    quotes are rewritten to double quotes before the line is handed to
    ``json.loads``, so lines written with Python-style quoting can be decoded.

    Args:
        path: Location of the ``.gz`` file to read.

    Yields:
        The object decoded from each line (a ``dict`` for object-per-line dumps).

    Raises:
        json.JSONDecodeError: If a line is still not valid JSON after the quote
            rewrite (for example when a value contains an unescaped apostrophe).
    """
    # Matches every single quote that is not preceded by a backslash.
    # Compiled once instead of once per line.
    unescaped_quote = re.compile('(?<!\\\\)\'')
    # Text mode is required: in binary mode the lines are ``bytes`` and cannot be
    # processed with a ``str`` pattern. The context manager closes the file even
    # when the consumer stops iterating early.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            if not l.strip():
                # Blank lines carry no record.
                continue
            yield json.loads(unescaped_quote.sub('\"', l))

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame, one row per record."""
    # Key each record by its position so the rows are labelled 0, 1, 2, ...
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')


def load_all_amazon_product_meta(path):
    """Read the metadata dump at ``path`` into a DataFrame (delegates to ``getDF``)."""
    product_meta = getDF(path)
    return product_meta


# Cached copies of the parsed dumps. NOTE: despite the `.tsv` suffix they are
# written and read with pandas' default (comma) separator.
reviews_cache = os.path.join(DATA_DIR, "reviews.tsv")
meta_cache = os.path.join(DATA_DIR, "meta.tsv")

if os.path.exists(reviews_cache) and os.path.exists(meta_cache):
    # Both caches are present: skip the slow JSON parsing.
    print("Load from tsv file.")
    review_df = pd.read_csv(reviews_cache, low_memory=False)
    meta_df = pd.read_csv(meta_cache, low_memory=False)
else:
    # At least one cache is missing: rebuild both from the raw dumps.
    print("Load from json file and save as tsv file.")
    review_df = getDF(os.path.join(DATA_DIR, f'reviews_{DATASET_NAME}.json.gz'))
    meta_df = getDF(os.path.join(DATA_DIR, f'meta_{DATASET_NAME}.json.gz'))
    review_df.to_csv(reviews_cache, index=None)
    meta_df.to_csv(meta_cache, index=None)

# Summary of what was loaded.
print("Columns of reviews: ", review_df.columns)
print("Columns of meta data: ", meta_df.columns)
print("Shape of reviews: ", review_df.shape)
print("Shape of metas: ", meta_df.shape)

Load from tsv file.
Columns of reviews:  Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')
Columns of meta data:  Index(['asin', 'description', 'title', 'imUrl', 'salesRank', 'categories',
       'price', 'related', 'brand'],
      dtype='object')
Shape of reviews:  (2023070, 9)
Shape of metas:  (259204, 9)


In [54]:
# Display the (rows, columns) shape of both loaded frames.
review_df.shape, meta_df.shape

((2023070, 9), (259204, 9))

In [55]:
# Peek at five randomly chosen metadata rows (no seed, so the rows differ per run).
meta_df.sample(5)

Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand
3669,B0001DHH0C,"* 7"" Style Comb with All Fine Teeth * Color: B...",Ace All-purpose Comb 7&quot; Fine-teeth * Black,http://ecx.images-amazon.com/images/I/310G18Qd...,{'Beauty': 39222},"[['Beauty', 'Hair Care', 'Styling Tools', 'Com...",4.99,"{'also_bought': ['B00B7RHYTK', 'B00DAJW0R4', '...",ACE
146045,B005HKEEFK,,Mally Beauty Evercolor Endless Eyeshadow (Twil...,http://ecx.images-amazon.com/images/I/311uUeG5...,{'Beauty': 423529},"[['Beauty', 'Makeup', 'Eyes', 'Eye Shadow']]",18.0,"{'also_bought': ['B003JLOSTG', 'B007TN0RDQ', '...",
43617,B0011DPWT4,Bath & Body Works C.O. Bigelow Rosemary Mint S...,Bath &amp; Body Works C.O. Bigelow Rosemary Mi...,http://ecx.images-amazon.com/images/I/41TNk65l...,{'Beauty': 849088},"[['Beauty', 'Bath & Body', 'Cleansers', 'Body ...",,,
236613,B00EKZ3T3Q,Renowned fashion illustrator Antonio Lopez col...,MAC Antonio Lopez Face/pink Palette,http://ecx.images-amazon.com/images/I/4114jpFa...,{'Beauty': 371488},"[['Beauty', 'Makeup', 'Face', 'Blush']]",52.4,{'also_viewed': ['B00HKA110G']},
53531,B001A9GMSY,Bath & Body Works Cucumber Melon Body Cream 8....,Bath &amp; Body Works Cucumber Melon Body Crea...,http://ecx.images-amazon.com/images/I/21AHFko9...,{'Beauty': 327917},"[['Beauty', 'Skin Care', 'Body', 'Moisturizers...",,"{'also_bought': ['B002G7H1IY', 'B003Q2F2RA'], ...",


In [56]:
import ast


def _first_category_path(raw):
    """Return the first category path of one `categories` cell.

    Cells read back from the cached CSV are the string form of a list of lists
    (e.g. "[['Beauty', 'Hair Care']]"); they are decoded with
    ``ast.literal_eval``, which only accepts Python literals and therefore cannot
    execute code embedded in the data file. Cells that are already Python lists
    (frame built straight from the JSON dump, or this cell being re-run) are
    used as they are.
    """
    parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
    if parsed and not isinstance(parsed[0], (list, tuple)):
        # Already a single flat path: a previous run of this cell reduced it.
        return parsed
    return parsed[0]


# Keep only the first category path of every product.
meta_df['categories'] = meta_df['categories'].apply(_first_category_path)

In [57]:
# Discard products without a title. `reset_index()` is called without `drop`,
# so the previous row labels are kept as an extra `index` column.
meta_df = meta_df[meta_df['title'].notna()].reset_index()
print(meta_df.shape)

(258760, 10)


In [58]:
# Peek at five randomly chosen review rows (no seed, so the rows differ per run).
review_df.sample(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
437374,AA0BJD80IPGU9,B000V26BQY,George C. Reeves,"[0, 0]","I purchased this item for my wife, and she abs...",5.0,Clarisonic,1310601600,"07 14, 2011"
1720292,A18OEWMZXSJ826,B0096DS22U,Hamada,"[0, 0]",I like Fendi too muchGood priceI like it,4.0,Nice,1405036800,"07 11, 2014"
796403,AY7VU86VFTU27,B001V6QL5K,tahisha hall,"[0, 0]",This product is a must if you are trying to ma...,5.0,Natural hair crack!,1396569600,"04 4, 2014"
1573586,A1XQRGA4MG5VW3,B007EF59RS,Susan L. Schwartz,"[1, 1]",I've been using Shalimar since high school (a ...,5.0,Love Shalimar,1361577600,"02 23, 2013"
1037870,A26EHQ52RL9LL1,B003F64FN4,Eric Zuniga,"[1, 1]","Good hold, especially for thick hair without l...",5.0,Great Product,1391644800,"02 6, 2014"


In [59]:
# Columns kept from each raw frame; every other column is dropped here.
used_col = dict(
    review=['reviewerID', 'asin', 'overall', 'unixReviewTime'],
    meta=['asin', 'title', 'categories', 'price', 'description', 'brand'],
)

# Select the columns and switch to the naming used by the rest of the notebook.
review_renames = {'overall': 'rating', 'unixReviewTime': 'timestamp', 'asin': 'item_id', 'reviewerID': 'user_id'}
meta_renames = {'asin': 'item_id', 'categories': 'category'}
review_df = review_df[used_col['review']].rename(columns=review_renames)
meta_df = meta_df[used_col['meta']].rename(columns=meta_renames)

In [60]:
# Print the first three rows of each frame to confirm the column selection and renames.
print(review_df.head(3))
print(meta_df.head(3))

          user_id     item_id  rating   timestamp
0  A39HTATAQ9V7YF  0205616461     5.0  1369699200
1  A3JM6GV9MNOF9X  0558925278     3.0  1355443200
2  A1Z513UWSAAO0F  0558925278     5.0  1404691200
      item_id                                              title  \
0  0205616461  Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...   
1  0558925278  Eco Friendly Ecotools Quality Natural Bamboo C...   
2  0733001998                                Mastiha Body Lotion   

                                            category  price  \
0   [Beauty, Skin Care, Face, Creams & Moisturizers]    NaN   
1  [Beauty, Tools & Accessories, Makeup Brushes &...    NaN   
2   [Beauty, Skin Care, Body, Moisturizers, Lotions]    NaN   

                                         description brand  
0  As we age, our once youthful, healthy skin suc...   NaN  
1  Mineral Powder Brush--Apply powder or mineral ...   NaN  
2  From the Greek island of Chios, this Mastiha b...   NaN  


In [61]:
def get_valid_ids(df, col_name, k):
    """Return the values of ``col_name`` that occur in at least ``k`` rows of ``df``.

    The result is the index of the per-value row counts, so it is named
    ``col_name`` and ordered the way ``groupby`` orders its keys.
    """
    occurrences = df.groupby(col_name)[col_name].count()
    return occurrences[occurrences >= k].index

def keepFirstFilter(df: pd.DataFrame, user_col: str='user_id', item_col: str='item_id', time_col: str='timestamp') -> pd.DataFrame:
    """Keep only the earliest row of every (user, item) pair.

    Rows are ordered by user and then by time, after which repeated
    (user, item) pairs are dropped in favour of their first occurrence.
    The shape before and after is printed; the result has a fresh 0..n-1 index.
    """
    print("*** Only keep the first interaction for duplicated review ***")
    print(f"Begin: {df.shape}")
    chronological = df.sort_values(by=[user_col, time_col], ignore_index=True)
    first_only = chronological.drop_duplicates(subset=[user_col, item_col], keep='first', ignore_index=True)
    print(f"After: {first_only.shape}")
    return first_only


def kCoreFilter(df: pd.DataFrame, user_k: int=10, item_k: int=10, user_col: str='user_id', item_col: str='item_id', max_iter: int=20) -> pd.DataFrame:
    """Repeatedly drop users with fewer than ``user_k`` rows and items with fewer than ``item_k`` rows.

    Each round filters users first and then items. The loop stops as soon as a
    round leaves both counts unchanged, or after ``max_iter`` rounds. Progress
    is printed every round; the returned frame keeps its original row labels.
    """
    def ids_seen_at_least(frame, col, k):
        # Values of `col` that occur in at least `k` rows of `frame`.
        counts = frame.groupby([col])[[col]].count()
        return counts[counts[col] >= k].index

    print(f"*** Users whose interaction < {user_k} and items whose interaction < {item_k}  would be filtered out ***")
    print(f"Begin: {df.shape}")

    users_before = df[user_col].nunique(dropna=False)
    items_before = df[item_col].nunique(dropna=False)

    for round_no in range(max_iter):
        kept_users = ids_seen_at_least(df, user_col, user_k)
        df = df[df[user_col].isin(kept_users)]

        # Item counts are taken after the user filter, so removing users can push items below the threshold.
        kept_items = ids_seen_at_least(df, item_col, item_k)
        df = df[df[item_col].isin(kept_items)]

        users_now, items_now = len(kept_users), len(kept_items)
        changed = (users_now, items_now) != (users_before, items_before)
        print('Iter: {0}, users: {1} / {2}, items: {3} / {4}'.format(round_no, users_now, users_before, items_now, items_before))

        users_before, items_before = users_now, items_now
        if not changed:
            break
    print(f"After: {df.shape}")
    return df


def lowRatingFilter(df: pd.DataFrame, rating_thres: float=3.0, rating_col: str='rating') -> pd.DataFrame:
    """Keep the rows whose ``rating_col`` is at least ``rating_thres``.

    The shape before and after is printed; the result has a fresh 0..n-1 index.
    """
    print(f"*** Rating < {rating_thres} would be filtered out ***")
    print(f"Begin: {df.shape}")
    high_enough = df[rating_col].ge(rating_thres)
    kept = df.loc[high_enough].reset_index(drop=True)
    print(f"After: {kept.shape}")
    return kept

In [62]:
# Keep only the reviews whose item_id also appears in meta_df, then renumber the rows.
review_df = review_df[review_df['item_id'].isin(meta_df['item_id'])].reset_index(drop=True)

In [63]:
# Filtering pipeline: first review per (user, item) -> ratings >= 3.0 -> 5-core on users and items.
review_df1 = (
    review_df
    .pipe(keepFirstFilter)
    .pipe(lowRatingFilter, 3.0)
    .pipe(kCoreFilter, 5, 5)
)
data_df = review_df1.reset_index(drop=True)

*** Only keep the first interaction for duplicated review ***
Begin: (2020642, 4)
After: (2020642, 4)
*** Rating < 3.0 would be filtered out ***
Begin: (2020642, 4)
After: (1724132, 4)
*** Users whose interaction < 5 and items whose interaction < 5  would be filtered out ***
Begin: (1724132, 4)
Iter: 0, users: 42949 / 1047744, items: 15791 / 231472
Iter: 1, users: 20766 / 42949, items: 10555 / 15791
Iter: 2, users: 17269 / 20766, items: 9340 / 10555
Iter: 3, users: 16223 / 17269, items: 8947 / 9340
Iter: 4, users: 15852 / 16223, items: 8785 / 8947
Iter: 5, users: 15675 / 15852, items: 8718 / 8785
Iter: 6, users: 15625 / 15675, items: 8689 / 8718
Iter: 7, users: 15587 / 15625, items: 8680 / 8689
Iter: 8, users: 15577 / 15587, items: 8678 / 8680
Iter: 9, users: 15576 / 15577, items: 8678 / 8678
Iter: 10, users: 15576 / 15576, items: 8678 / 8678
After: (139318, 4)


In [64]:
# Print the filtered interaction table (pandas abbreviates the middle rows).
print(data_df)

                      user_id     item_id  rating   timestamp
0       A00700212KB3K0MVESPIY  B001RMP7M6     5.0  1385337600
1       A00700212KB3K0MVESPIY  B003TMO3EU     5.0  1385337600
2       A00700212KB3K0MVESPIY  B00028M3N2     5.0  1386028800
3       A00700212KB3K0MVESPIY  B0035RF85C     5.0  1386028800
4       A00700212KB3K0MVESPIY  B006GK5NNW     5.0  1386028800
...                       ...         ...     ...         ...
139313          AZZT1ERHBSNQ8  B000WYZ9Q4     4.0  1405123200
139314          AZZT1ERHBSNQ8  B003BMJGL8     5.0  1405123200
139315          AZZT1ERHBSNQ8  B004TSFE6Y     3.0  1405123200
139316          AZZT1ERHBSNQ8  B006ZBP8NM     5.0  1405123200
139317          AZZT1ERHBSNQ8  B007O7AZBG     5.0  1405123200

[139318 rows x 4 columns]


In [65]:
def _remap_side_table(table, user_colname, item_colname, users, items, user_map, item_map):
    """Restrict ``table`` to the known users/items and replace their raw ids with mapped integers."""
    # Work on a copy so the caller's frame is never modified.
    table = table.copy()
    if user_colname in table:
        table = table[table[user_colname].isin(users)].reset_index(drop=True)
        # Translate the id column itself. Applying the lookup to the whole frame
        # would pass entire columns to the dict and raise instead of mapping ids.
        table[user_colname] = table[user_colname].map(user_map)
    if item_colname in table:
        table = table[table[item_colname].isin(items)].reset_index(drop=True)
        table[item_colname] = table[item_colname].map(item_map)
    return table


def map_id(df: pd.DataFrame, user_colname: str='user_id', item_colname: str='item_id', group_colname: str=None, return_map: bool=False, price_df: pd.DataFrame=None, group_df: pd.DataFrame=None) -> tuple:
    '''Replace raw user / item (and optionally group) ids with consecutive integers starting at 1.

    Ids are numbered in order of first appearance in ``df``. The input frames
    are not modified; remapped copies are returned.

    Args:
        df: Interaction table containing ``user_colname`` and ``item_colname``.
        user_colname: Name of the user id column.
        item_colname: Name of the item id column.
        group_colname: Optional name of a group column to remap as well.
        return_map: When True, also return the id dictionaries.
        price_df: Optional side table. Rows whose user / item is unknown to
            ``df`` are dropped, and the remaining ids are remapped.
        group_df: Optional side table, treated like ``price_df``; its
            ``group_colname`` column is remapped too.

    Returns:
        ``(df, price_df, group_df)`` when ``return_map`` is False, otherwise
        ``((df, price_df, group_df), (user_map, item_map, group_map))``.
        Side tables that were passed as ``None`` come back as ``None``.
    '''
    users = df[user_colname].unique()
    items = df[item_colname].unique()
    print("**Map users and items**")
    user_map = {u: k+1 for k, u in enumerate(users)}
    item_map = {i: k+1 for k, i in enumerate(items)}

    # Copy before assigning: mutating the caller's frame would make a second
    # call on the same object map the already-mapped integers onto themselves.
    df = df.copy()
    df[user_colname] = df[user_colname].map(user_map)
    df[item_colname] = df[item_colname].map(item_map)

    group_map = {}
    if (group_colname is not None) and (group_colname in df):
        group_map = {g: k+1 for k, g in enumerate(df[group_colname].unique())}
        df[group_colname] = df[group_colname].map(group_map)

    if price_df is not None:
        price_df = _remap_side_table(price_df, user_colname, item_colname, users, items, user_map, item_map)

    if group_df is not None:
        group_df = _remap_side_table(group_df, user_colname, item_colname, users, items, user_map, item_map)
        if group_colname is not None and group_colname in group_df:
            # Extend the existing mapping rather than rebuilding it, so a group
            # receives the same integer in `df` and in `group_df`.
            for g in group_df[group_colname].unique():
                group_map.setdefault(g, len(group_map) + 1)
            group_df[group_colname] = group_df[group_colname].map(group_map)

    if return_map:
        return (df, price_df, group_df), (user_map, item_map, group_map)
    return df, price_df, group_df

In [66]:
# Remap the filtered interactions to integer ids; no side tables are passed, so
# price_df / group_df come back as None and group_map as an empty dict.
(df, price_df, group_df),(user_map, item_map, group_map) = map_id(data_df, return_map=True)

**Map users and items**


In [67]:
# Persist the raw-id -> integer-id dictionaries next to the dataset.
all_map = dict(item=item_map, user=user_map)

map_path = os.path.join(DATA_DIR, "map.json")
with open(map_path, 'w') as map_file:
    json.dump(all_map, map_file)

In [68]:
# split
def split_train_test_set_leave_one_out_seq(data: pd.DataFrame, col_name: str, time_colname:str, col_names_2_return: list, seed: int=42):
    '''Hold out the last row of every ``col_name`` group as the test set.

    When ``time_colname`` is a column of ``data``, "last" means latest by that
    column. Otherwise the incoming row order within each group decides, so the
    grouping sort must be stable.

    Args:
        data: Interaction table.
        col_name: Column identifying the group (e.g. the user).
        time_colname: Column used to order rows inside a group, if present.
        col_names_2_return: Columns kept in both returned frames.
        seed: Unused; the split is deterministic.

    Returns:
        ``(train, test)`` frames with fresh 0..n-1 indexes.
    '''
    if time_colname in data:
        df_sorted = data.sort_values(by=[col_name, time_colname]).reset_index(drop=True)
    else:
        # Without a time column the existing row order is the only sequence
        # information. The default single-key sort is not stable and may shuffle
        # rows of the same group, so a stable algorithm is requested.
        df_sorted = data.sort_values(by=col_name, kind='mergesort').reset_index(drop=True)

    df_test = df_sorted.groupby(by=col_name, as_index=False).nth(-1)
    # Everything that was not held out, in sorted order.
    df_train = df_sorted.drop(index=df_test.index)
    return df_train.reset_index(drop=True)[col_names_2_return], df_test.reset_index(drop=True)[col_names_2_return]

def split_train_test_set_leave_one_out(data: pd.DataFrame, col_name: str, col_names_2_return: list, seed: int = 0):
    '''Hold out one randomly chosen row of every ``col_name`` group as the test set.

    Args:
        data: Interaction table; its index labels are expected to be unique.
        col_name: Column identifying the group.
        col_names_2_return: Columns kept in both returned frames
            (``None`` keeps all columns).
        seed: Random state used to pick the held-out row of each group.

    Returns:
        ``(train, test)`` frames with fresh 0..n-1 indexes.
    '''
    if col_names_2_return is None:
        col_names_2_return = data.columns #.to_list()
    df_test = data.groupby(by=col_name, as_index=False).sample(n=1, random_state=seed)
    # Remove the sampled rows by label. Positional lookup with index labels is
    # only correct when `data` happens to carry a default 0..n-1 index.
    df_train = data.drop(index=df_test.index)
    return df_train[col_names_2_return].reset_index(drop=True), df_test[col_names_2_return].reset_index(drop=True)

def split_train_test_set_by_ratio(data: pd.DataFrame, ratio: list, col_name: str, col_names_2_return: list, seed: int = 0):
    '''Randomly hold out a fixed fraction of every ``col_name`` group as the test set.

    Args:
        data: Interaction table; its index labels are expected to be unique.
        ratio: Two numbers ``[train, test]``; the test fraction is
            ``test / (train + test)``.
        col_name: Column identifying the group.
        col_names_2_return: Columns kept in the test frame (``None`` keeps all).
        seed: Random state used for sampling.

    Returns:
        ``(train, test)`` frames with fresh 0..n-1 indexes.

    Raises:
        AssertionError: If ``ratio`` does not have exactly two entries.
    '''
    if col_names_2_return is None:
        col_names_2_return = data.columns #.to_list()
    assert len(ratio) == 2, 'ratio is for train/test.'
    frac = ratio[1] / sum(ratio)
    df_test = data.groupby(by=col_name, as_index=False).sample(frac=frac, random_state=seed)
    # Remove the sampled rows by label. Positional lookup with index labels is
    # only correct when `data` happens to carry a default 0..n-1 index.
    # NOTE(review): the train frame keeps all columns while the test frame is
    # restricted to `col_names_2_return`; left unchanged in case callers rely on it.
    df_train = data.drop(index=df_test.index)
    return df_train.reset_index(drop=True), df_test[col_names_2_return].reset_index(drop=True)

In [3]:
# Output folder for every file written by the following cells.
path = os.path.join(DATA_DIR, 'chatbot')
# `exist_ok=True` makes the call safe to re-run and avoids the separate existence check.
os.makedirs(path, exist_ok=True)

In [71]:
# Leave-one-out split applied twice: the latest interaction of every user goes to
# the test set, the latest of the remainder goes to the validation set.
saved_cols = ['user_id', 'item_id']

# Keep `timestamp` in the intermediate frame. Without it the second split cannot
# order interactions by time, and the validation item is not guaranteed to be
# the most recent one left.
df_train_0, df_test = split_train_test_set_leave_one_out_seq(df, 'user_id', 'timestamp', saved_cols + ['timestamp'])
df_train, df_valid = split_train_test_set_leave_one_out_seq(df_train_0, 'user_id', 'timestamp', saved_cols)

# Only the id columns are stored, as before.
df_train_0 = df_train_0[saved_cols]
df_test = df_test[saved_cols]

# NOTE: the files carry a `.tsv` suffix but are written with the default comma separator.
df_train.to_csv(os.path.join(path, "train.tsv"), index=None)
df_valid.to_csv(os.path.join(path, "valid.tsv"), index=None)
df_test.to_csv(os.path.join(path, "test.tsv"), index=None)
df_train_0.to_csv(os.path.join(path, "user_history.tsv"), index=None)

In [72]:
# Metadata of the items that survived filtering: one row per raw item id, rows renumbered.
saved_meta_df = (
    meta_df[meta_df['item_id'].isin(item_map.keys())]
    .drop_duplicates(subset=['item_id'], keep='first')
    .reset_index(drop=True)
)

In [73]:
# Display the (rows, columns) shape of the retained metadata.
saved_meta_df.shape

(8678, 6)

In [79]:
# Swap the raw item ids for their integer ids. Not idempotent: a second run
# looks up the already-mapped integers in item_map.
saved_meta_df['item_id'] = saved_meta_df['item_id'].apply(item_map.__getitem__)

In [80]:
# Display the retained metadata after the item ids were remapped.
saved_meta_df

Unnamed: 0,item_id,title,category,price,description,brand
0,5660,Xtreme Brite Brightening Gel 1oz.,"[Beauty, Hair Care, Styling Products, Creams, ...",19.99,Xtreme Brite Brightening gel is a highly conc...,Xtreme Brite
1,8388,Versace Bright Crystal Eau de Toilette Spray f...,"[Beauty, Fragrance, Women's, Eau de Toilette]",52.33,Versace Bright Crystal Perfume for Women 3 oz ...,Versace
2,8598,Avalon Biotin B-Complex Thickening Conditioner...,"[Beauty, Hair Care, Conditioners]",9.49,It's in our name: Avalon Organics. Our commitm...,Avalon Organics
3,5325,"Better Living Classic Two Chamber Dispenser, W...","[Beauty, Bath & Body, Bathing Accessories, Bat...",25.99,Qality designed two chamber that utilizes a 'p...,Classic
4,8532,Better Living The Ulti-Mate Dispenser,"[Beauty, Bath & Body, Bathing Accessories, Bat...",39.95,"The Ulti-Mate Dispenser III, provides a perfec...",Better Living
...,...,...,...,...,...,...
8673,7499,Perfect Sleep Mask with Ear Plugs - Ideal Eye ...,"[Beauty, Skin Care, Eyes, Masks]",10.75,Do You Want To Improve Your Sleep? Then get th...,
8674,2590,Phytoceramides Anti Aging Supplement Reviews -...,"[Beauty, Skin Care]",18.36,Phytoceramides:Have you ever wanted to look in...,
8675,5016,Dr Song Rosehip Oil 4oz (4 oz),"[Beauty, Skin Care, Face, Oils & Serums]",19.99,,
8676,2992,VITAMIN C SERUM 20% with Hyaluronic Acid For Y...,"[Beauty, Skin Care, Face, Creams & Moisturizer...",36.00,The Secret to Younger Looking Skin Vitamin C S...,


In [5]:
# Per-user lists of the interactions in df_train_0.
user_history = df_train_0.groupby('user_id').agg(list)
# Number of times each item occurs in df_train_0. The count is taken on the flat
# item column: the cells of user_history['item_id'] are whole lists, which cannot
# be counted per item.
item_count = df_train_0['item_id'].value_counts()
# The id column is named 'item_id' until the export cell renames it to 'id';
# accept either so this cell works regardless of which one ran first.
id_col = 'id' if 'id' in saved_meta_df.columns else 'item_id'
# Items that never occur in df_train_0 get 0.
saved_meta_df['visited_num'] = saved_meta_df[id_col].map(item_count).fillna(0).astype(int)

In [81]:
# Export the product table under the column name `id`, once as Feather and once as '|'-separated text.
saved_meta_df = saved_meta_df.rename(columns={'item_id': 'id'})
products_stem = os.path.join(path, 'products')
saved_meta_df.to_feather(products_stem + '.ftr')
saved_meta_df.to_csv(products_stem + '.csv', index=None, sep='|')

In [6]:
# Reload the tables written earlier so the cells below can run without redoing the preprocessing.
df_train, df_valid, df_test, df_train_0 = (
    pd.read_csv(os.path.join(path, file_name))
    for file_name in ("train.tsv", "valid.tsv", "test.tsv", "user_history.tsv")
)
saved_meta_df = pd.read_feather(os.path.join(path, 'products.ftr'))

In [7]:
# Data for Simulator

# One row per user, each remaining column collected into a list in row order.
user_history = df_train_0.groupby(by='user_id').aggregate(list)
# Look products up by their integer id. Not idempotent: once `id` is the index
# it is no longer a column, so a second run raises.
saved_meta_df = saved_meta_df.set_index(keys='id')

In [18]:
# Titles are cut to this many characters when they are listed in a history string.
max_title_len = 50
# product id -> truncated title
id2title = {
    product_id: full_title[: max_title_len]
    for product_id, full_title in saved_meta_df['title'].items()
}

In [19]:
# Number of test users sampled for the simulator, and the maximum history length shown per user.
N = 900
max_len = 10

test_data = df_test.sample(N, random_state=2024)
# The last `max_len` history items of each sampled user ...
recent_items = test_data['user_id'].apply(lambda x: user_history.loc[x]['item_id'][-max_len:])
# ... rendered as '; '-separated truncated titles.
test_data['history'] = recent_items.apply(lambda ids: '; '.join(id2title[_] for _ in ids))
# The held-out item is given with its full title.
test_data['target'] = test_data['item_id'].apply(lambda x: saved_meta_df.loc[x, 'title'])
test_data = test_data.reset_index(drop=True)

In [20]:
# Number of '; '-separated entries in each history string (a title that itself
# contains '; ' would be counted more than once).
test_data['history'].apply(lambda x: len(x.split("; ")))

0       5
1      10
2       9
3       4
4       9
       ..
895     6
896    10
897    10
898     5
899     4
Name: history, Length: 900, dtype: int64

In [21]:
# Length in characters of the longest history string.
max(test_data['history'].apply(len))

518

In [22]:
from typing import *
import json, pickle

def write_jsonl(obj: List[Dict], fpath: str) -> None:
    """Write ``obj`` to ``fpath`` as JSON Lines (one JSON document per line).

    The parent directory of ``fpath`` is created when it does not exist yet.
    If writing fails for any reason, the error is printed and the whole object
    is pickled to ``<fpath>.pkl`` instead, so the data is not lost.

    Args:
        obj: Records to serialise; each entry must be JSON-serialisable.
        fpath: Destination file path.

    Raises:
        OSError: If the pickle fallback cannot be written either.
    """
    try:
        parent_dir = os.path.dirname(fpath)
        if parent_dir:
            # A missing output folder would otherwise make both the JSONL write
            # and the pickle fallback fail.
            os.makedirs(parent_dir, exist_ok=True)
        with open(fpath, 'w', encoding='utf-8') as outfile:
            for entry in obj:
                json.dump(entry, outfile)
                outfile.write('\n')
        print("Sucessfully saved into {}.".format(fpath))
    except Exception as e:
        # Deliberate best effort: keep the data even when JSON serialisation or the write fails.
        print(f"Error {e} raised. The temp file would be saved in {fpath}.pkl")
        with open(f"{fpath}.pkl", 'wb') as fallback_file:
            pickle.dump(obj, fallback_file)

# Export the (history, target) pairs of the sampled test users as JSON Lines.
simulator_out_path = os.path.expanduser("~/data/beauty/simulator_test_data_900.jsonl")
test_data_jsonl = test_data.loc[:, ['history', 'target']].to_dict(orient="records")
write_jsonl(test_data_jsonl, simulator_out_path)

Sucessfully saved into /home/v-huangxu/work/LLM4CRS/eval/data/beauty/simulator_test_data_900_230816.jsonl.


Bad pipe message: %s [b"J\xb5`sB\x1b\xe4\x12\xbaA\x11\xda2\x87'K\x98\x7f \xad\x86\x05%\xe6\xfb\xc3\x1d\xf6\x89}\xac\xc0\xa08\x1d\x02e\xd6\xa8\x8c\xd5p\xd46\n\xed.Y\x1d8\xf8\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00"]
Bad pipe message: %s [b'\x95\x98p\xe0EL\xc4r\xa8\x8a\xaa\xd0G\xae\x0f\x10Y\x97 1\xba{\xe3\xfc\xf2\xfa\x0c\xcdU\xb5\xd0o\x93\x97\x9a\xf99\xf8G\x89~\x80\xce.\xd7\xfa|k\xc3H\xa6\x00\x08\x13\x02\x13\x03\x13\x01']
Bad pipe message: %s [b"\xe4:\xb7d\xae\x1cq+\xf7\xbf\xfd\x99\xb4\x90i\x9d\xca\xb5\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x0