In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import sys
import os
import collections
from tqdm import tqdm
# Capture the notebook's working directory via the IPython `%pwd` magic and
# append its parent directory to the import path.
# NOTE(review): presumably the parent is the project root that holds local
# modules — confirm against the repository layout.
HERE = %pwd
sys.path.append(os.path.dirname(HERE))

In [2]:
# token counter
import tiktoken
from tiktoken.core import Encoding
encoding = tiktoken.get_encoding("o200k_base")


def compute_token(text):
    """Return the number of tokens that `encoding` produces for `text`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

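1. Download the dataset from https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions
1. Unzip the downloaded archive.
1. Place the extracted files (including `RAW_recipes.csv` and `RAW_interactions.csv`) in the directory set as `dir_save` in the next cell.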

In [3]:
# Directory that holds the raw, unzipped dataset files; the CSVs loaded in the
# following cells are read from here.
dir_save = "../data/raw_data/food"

In [4]:
# Item master: load the raw recipe table and index it by recipe id ("itemID").
df_ = (
    pd.read_csv(f'{dir_save}/RAW_recipes.csv')
    .rename(columns={"id": "itemID"})
    .set_index("itemID")
)
# Keep only the four text fields whose token counts are measured below.
df_items = df_.loc[:, ["name", "tags", "description", "ingredients"]]
print(f"original items : {len(df_items)}")

# remove items whose combined text is too short or too long (measured in tokens)
def _fn(s):
    """Count the tokens in `s`, falling back to 0 when it cannot be tokenized.

    Args:
        s: A cell value taken from a text column. Presumably a string, or NaN
            for a missing value — verify against the loaded CSV.

    Returns:
        The token count from `compute_token`, or 0 if tokenization raised.
    """
    try:
        return compute_token(s)
    except Exception:
        # Deliberately best-effort: an untokenizable cell counts as 0 tokens.
        # Catching `Exception` instead of using a bare `except:` lets
        # KeyboardInterrupt / SystemExit propagate, so a long scan can be stopped.
        return 0

# Token count of every text field, one array per column.
d = dict(
    (a, np.array([_fn(text) for text in tqdm(df_items[a].values)]))
    for a in ["name", "tags", "description", "ingredients"]
)
df = pd.DataFrame(d, index=df_items.index)

# Total token count per item (row-wise sum over the four columns).
s = df.sum(axis=1)

# Keep items whose total is strictly between 50 and 300 tokens.
df_items = df_items[(s > 50) & (s < 300)]
print(f"remained items : {len(df_items)}")
df_items

original items : 231637


100%|████████████████████████████████████████████████████████| 231637/231637 [00:01<00:00, 128846.88it/s]
100%|█████████████████████████████████████████████████████████| 231637/231637 [00:15<00:00, 14801.09it/s]
100%|█████████████████████████████████████████████████████████| 231637/231637 [00:09<00:00, 24283.28it/s]
100%|█████████████████████████████████████████████████████████| 231637/231637 [00:08<00:00, 27191.42it/s]


remained items : 224814


Unnamed: 0_level_0,name,tags,description,ingredients
itemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
137739,arriba baked winter squash mexican style,"['60-minutes-or-less', 'time-to-make', 'course...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ..."
31490,a bit different breakfast pizza,"['30-minutes-or-less', 'time-to-make', 'course...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg..."
112140,all in the kitchen chili,"['time-to-make', 'course', 'preparation', 'mai...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato..."
59389,alouette potatoes,"['60-minutes-or-less', 'time-to-make', 'course...","this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n..."
44061,amish tomato ketchup for canning,"['weeknight', 'time-to-make', 'course', 'main-...",my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar..."
...,...,...,...,...
188810,zydeco shrimp wrap,"['60-minutes-or-less', 'time-to-make', 'course...",a wrap inspired by great cajun flavors,"['white rice', 'vegetable oil', 'onion', 'gree..."
493372,zydeco spice mix,"['15-minutes-or-less', 'time-to-make', 'course...",this spice mix will make your taste buds dance!,"['paprika', 'salt', 'garlic powder', 'onion po..."
308080,zydeco ya ya deviled eggs,"['60-minutes-or-less', 'time-to-make', 'course...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must..."
298512,cookies by design cookies on a stick,"['30-minutes-or-less', 'time-to-make', 'course...","i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh..."


In [5]:
# Transactions: load the interaction log and align its id column names with the
# item master ("userID" / "itemID").
df_records = pd.read_csv(f'{dir_save}/RAW_interactions.csv')
df_records = df_records.rename(columns={"user_id": "userID", "recipe_id": "itemID"})
print(f"original records : {len(df_records)}")

# Drop records whose review is 300 tokens or longer.
review_tokens = [_fn(review) for review in tqdm(df_records["review"].values)]
s = pd.Series(review_tokens, index=df_records.index)
df_records = df_records[s < 300]
print(f"remained records : {len(df_records)}")
df_records

original records : 1132367


100%|███████████████████████████████████████████████████████| 1132367/1132367 [01:07<00:00, 16676.59it/s]


remained records : 1128945


Unnamed: 0,userID,itemID,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...,...,...
1132362,116593,72730,2003-12-09,0,Another approach is to start making sauce with...
1132363,583662,386618,2009-09-29,5,These were so delicious! My husband and I tru...
1132364,157126,78003,2008-06-23,5,WOW! Sometimes I don't take the time to rate ...
1132365,53932,78003,2009-01-11,4,Very good! I used regular port as well. The ...


In [6]:
# Items registered in the item master.
items_master = set(df_items.index.values)

# Keep only the transaction records whose item is registered in the item master.
# `Series.isin` is the vectorized form of a per-row `in items_master` test and
# yields the same boolean mask without calling a Python lambda for every row.
s = df_records['itemID'].isin(items_master)
df_records = df_records[s]
print(f"remained records : {len(df_records)}")

# Restrict the item master to the items that appear in the remaining records.
df_items = df_items.loc[df_records['itemID'].unique()]
print(f"remained items : {len(df_items)}")

remained records : 1067071
remained items : 224505


In [7]:
# Directory where the preprocessed data used by the experiments is written.
version_prep = "20250403_prep"
dir_save_data = f"../data/preprocessed_data/{version_prep}/Food"
os.makedirs(dir_save_data, exist_ok=True)

# Per-user record groups and per-user record counts (computed once).
gb = df_records.groupby("userID")
s = df_records["userID"].value_counts()

# Split users into "light" and "heavy" by their number of records.
# NOTE(review): every bound carries a +1, presumably because one record per
# user is held out later as the test item — confirm.
du = dict()
du["light"] = s[(s > 5 + 1) & (s <= 10 + 1)].index.values
du["heavy"] = s[(s > 30 + 1) & (s <= 50 + 1)].index.values

In [8]:
import pickle

# Number of users kept per user type; the +5 is for supplementary users.
n_users_per_type = 200 + 5
# Number of non-interacted items sampled next to the test item.
n_negatives = 9

# Dedicated, seeded generator: candidate sampling (and therefore the saved
# files) is identical on every run, and the global NumPy RNG state is untouched.
rng = np.random.default_rng(0)

# The item catalogue does not change inside the loops, so read it once instead
# of rebuilding a set of every item for each user.
all_items = df_items.index.values

di = dict()
for type_user, users_ in du.items():
    idx = 0
    dr = dict()
    d_ = dict()
    for user in tqdm(users_):
        df_user = gb.get_group(user).sort_values(by="date", ascending=True)
        # items except for the latest item
        items_train = df_user["itemID"].values[:-1]

        # latest item: the user is kept only when this item was rated above 3
        s_last = df_user.iloc[-1]
        if s_last["rating"] > 3:
            item_test = s_last["itemID"]

            # candidates: sampled items the user has no record for, plus the
            # test item. `np.setdiff1d` returns the sorted unique ids of
            # `all_items` that are absent from the user's records.
            items_others = np.setdiff1d(all_items, df_user["itemID"].values)
            items_others = rng.choice(items_others, size=n_negatives, replace=False)
            items_candi = sorted(set(items_others).union(set([item_test])))

            d_[user] = {
                "id_train": items_train,
                "id_candi": items_candi,
            }
            # A local name is used so the notebook-level `df_` is not clobbered.
            dr[user] = df_user[["userID", "itemID", "rating", "review"]]
            idx += 1

        # stop once enough users have been collected for this type
        if idx == n_users_per_type:
            break

    # items for experiments
    di.update(d_)

    # save the train/candidate ids and the matching records for this user type
    with open(f"{dir_save_data}/ids_{type_user}.pickle", 'wb') as f:
        pickle.dump(d_, f)

    df_r = pd.concat(list(dr.values()))
    df_r.to_csv(f"{dir_save_data}/records_{type_user}.csv")

# items that appeared for experiments
items_train = np.unique(np.concatenate([d["id_train"] for d in di.values()]))
items_candi = np.unique(np.concatenate([d["id_candi"] for d in di.values()]))
items = sorted(set(items_train).union(set(items_candi)))

# slim item master restricted to the items used by the experiments
df_items_valid = df_items.loc[items]
df_items_valid.to_csv(f"{dir_save_data}/items_slim.csv")
print("finished")

  4%|██▍                                                              | 234/6127 [00:12<05:05, 19.29it/s]
 16%|██████████▎                                                      | 235/1486 [00:11<01:02, 19.92it/s]


finished
