In [None]:
# ==== Cell 0: 常量、依赖与随机种子 ====
import os, json, gzip, urllib.request, random
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict

# Dataset name
DATASET = "Grocery_and_Gourmet_Food"

# --- Path resolution: start ---
# The notebook can be launched either from inside the dataset directory or from
# somewhere else (typically the project root); DATA_DIR is resolved accordingly.
current_path = Path.cwd()

if current_path.name == DATASET:
    # The working directory is already named after the dataset: use it directly.
    print("检测到脚本运行在数据目录下，使用当前目录作为 DATA_DIR。")
    DATA_DIR = current_path
    # ROOT is informational only: guess the project root two levels up
    # (<ROOT>/data/<DATASET>). A directory close to the filesystem root has
    # fewer than two ancestors, so fall back to the highest ancestor that
    # exists instead of failing with IndexError on parents[1].
    ancestors = current_path.parents
    if len(ancestors) > 1:
        ROOT = ancestors[1]
    elif len(ancestors) == 1:
        ROOT = ancestors[0]
    else:
        ROOT = current_path
else:
    # Anywhere else: treat the working directory as the root and keep the data
    # under <cwd>/data/<DATASET>.
    print("检测到脚本运行在项目根目录（或其他位置），将创建 data 子目录。")
    ROOT = current_path
    DATA_DIR = ROOT / "data" / DATASET

DATA_DIR.mkdir(parents=True, exist_ok=True)
# --- Path resolution: end ---

# One seed shared by every random number generator used below
RANDOM_SEED = 2025
NEG_ITEMS   = 100  # number of negative items sampled per interaction
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
try:
    import torch
    torch.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)
except Exception:
    # torch is optional for this preprocessing; seeding it is best-effort and is
    # deliberately kept broad so a missing or unusable install never blocks it.
    pass

print("-" * 30)
print(f"ROOT (项目根目录): {ROOT}")
print(f"DATA_DIR (数据目录): {DATA_DIR}")
print("-" * 30)


In [None]:
# ==== Cell 1: download the raw data (files already on disk are skipped) ====
DATA_FILE = f"reviews_{DATASET}_5.json.gz"
META_FILE = f"meta_{DATASET}.json.gz"

# (local file name, remote URL) pairs
sources = [
    (DATA_FILE, f"https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/{DATA_FILE}"),
    (META_FILE, f"https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/{META_FILE}"),
]

for fname, url in sources:
    out = DATA_DIR / fname
    if out.exists():
        print("[exist]", out.name)
        continue
    print("[download]", url, "->", out)
    # Download to a temporary sibling and rename it only once the transfer has
    # finished. Writing straight to `out` would leave a truncated archive behind
    # after an interrupted download, and the next run would report it as
    # "[exist]" and then fail while decompressing it.
    partial = out.with_name(out.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(out)
    finally:
        # Only still present when the download or the rename failed.
        if partial.exists():
            partial.unlink()

print("Done. Files under:", DATA_DIR)


In [None]:
# ==== Cell 2: 解析 gz → DataFrame ====
def parse_gz_json(path: Path):
    """Yield one parsed record per non-empty line of a gzip-compressed dump.

    Every line is decoded as UTF-8 and parsed as strict JSON first. A line that
    is not valid JSON is parsed as a Python literal instead, which covers dumps
    written as Python dict reprs (single-quoted keys and strings). Unlike
    ``eval``, neither parser can execute code embedded in a downloaded file, and
    JSON ``true``/``false``/``null`` are handled.

    Args:
        path: location of the ``.json.gz`` file, one record per line.

    Yields:
        The parsed object of each line, in file order; blank lines are skipped.

    Raises:
        ValueError, SyntaxError: a line is neither JSON nor a Python literal.
    """
    import ast  # local import: only needed by this parser

    with gzip.open(path, 'rb') as g:
        for raw in g:
            line = raw.decode('utf-8').strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Not strict JSON: accept a plain Python literal, nothing else.
                record = ast.literal_eval(line)
            yield record

def load_df(path: Path) -> pd.DataFrame:
    """Load a gzipped record dump into a DataFrame with one row per record.

    Rows are labelled with the 0-based position of the record in the file.
    """
    records = dict(enumerate(parse_gz_json(path)))
    return pd.DataFrame.from_dict(records, orient="index")

# Locations of the two raw dumps inside DATA_DIR
review_fp = DATA_DIR / f"reviews_{DATASET}_5.json.gz"
meta_fp   = DATA_DIR / f"meta_{DATASET}.json.gz"

# Reviews are loaded first, then the item metadata
reviews, meta = (load_df(fp) for fp in (review_fp, meta_fp))

for tag, frame in (("reviews:", reviews), ("meta   :", meta)):
    print(tag, frame.shape, "columns:", list(frame.columns)[:8])

# Keep only the items that occur in at least one interaction
reviewed_mask = meta['asin'].isin(reviews['asin'])
meta_use = meta.loc[reviewed_mask].reset_index(drop=True)
all_asins = set(meta_use['asin'])

def related_filter(related_dict):
    """Restrict every relation list to the ASINs present in ``all_asins``.

    Anything that is not a dict (for example a missing value) yields ``{}``.
    """
    if not isinstance(related_dict, dict):
        return {}
    return {
        relation: list(all_asins & set(asins))
        for relation, asins in related_dict.items()
    }

# Make sure meta_use always has a 'related' column holding one dict per item
if 'related' not in meta_use.columns:
    meta_use['related'] = [{} for _ in range(len(meta_use))]
else:
    meta_use['related'] = meta_use['related'].apply(related_filter)

# Basic statistics of the raw interaction data
print(
    "#Users:", reviews['reviewerID'].nunique(),
    "#Items:", reviews['asin'].nunique(),
    "#Interactions:", len(reviews),
)


In [None]:
# ==== Cell 3: build the interaction table (comma CSV: user_id,item_id,time) + item_meta ====
# 1) Rename -> keep the three columns -> de-duplicate -> stable sort by time, then user
column_names = {'reviewerID': 'user_id', 'asin': 'item_id', 'unixReviewTime': 'time'}
inter = reviews.rename(columns=column_names)[['user_id', 'item_id', 'time']]
inter = inter.drop_duplicates()
inter = inter.sort_values(by=['time', 'user_id'], kind='mergesort').reset_index(drop=True)

# 2) Re-index users and items with consecutive integers starting at 1
uid_list = sorted(inter['user_id'].unique())
iid_list = sorted(inter['item_id'].unique())
user2id = {raw: new for new, raw in enumerate(uid_list, start=1)}
item2id = {raw: new for new, raw in enumerate(iid_list, start=1)}

for col, mapping in (('user_id', user2id), ('item_id', item2id)):
    inter[col] = inter[col].map(mapping).astype(int)
inter['time'] = inter['time'].astype(int)

# 3) Leave-one-out: the last two interactions of every user become test/dev and
#    the rest become train (each user's earliest interaction is put back into
#    train first, to keep the split stable).
# Set of items every (re-indexed) user has interacted with
clicked_item_set = {
    uid: set(items.tolist())
    for uid, items in inter.groupby('user_id')['item_id']
}

def make_dev_test(df: pd.DataFrame):
    """Split off each user's last two rows as the test and dev samples.

    The rows of ``df`` are taken in their existing order, so "last" means last
    in ``df`` for a given ``user_id``. Rows are removed by index label, which
    relies on ``df`` having unique index labels. ``df`` itself is not modified.

    Args:
        df: interactions containing at least a ``user_id`` column.

    Returns:
        ``(test, dev, remain)``: the last row of every user, the second-to-last
        row of every user that still has one, and all remaining rows.
    """
    remaining = df
    held_out = []
    for _ in range(2):  # first pass -> test, second pass -> dev
        last = remaining.groupby('user_id').tail(1).copy()
        # drop() returns a new frame, so the caller's DataFrame stays untouched
        remaining = remaining.drop(last.index)
        held_out.append(last)
    test, dev = held_out
    return (test, dev, remaining)

# Each user's first interaction is set aside so that it always ends up in train
head1 = inter.groupby('user_id').head(1)
rest  = inter.drop(head1.index)
test_df, dev_df, remain_df = make_dev_test(rest)
train_df = pd.concat([head1, remain_df]).sort_index()

# 4) Save the normalised comma-separated CSVs (an intermediate product; the
#    final training data are the sub-directories generated further below)
train_csv, dev_csv, test_csv = (
    DATA_DIR / f"{name}.csv" for name in ("train", "dev", "test")
)
for frame, target in ((train_df, train_csv), (dev_df, dev_csv), (test_df, test_csv)):
    frame[['user_id', 'item_id', 'time']].to_csv(target, index=False)  # comma-separated

print("[base csv] saved ->", train_csv, dev_csv, test_csv)

# 5) Item side information (comma CSV)
#   Category feature: the entry at position 2 of an item's first category path
if 'categories' in meta_use.columns:
    cats = meta_use['categories']
else:
    # No category column at all: treat every item as "unknown". An empty
    # fallback list would not match the number of rows and make the column
    # assignment below fail with a length mismatch.
    cats = [np.nan] * len(meta_use)

# NaN marks items whose first category path is missing or shorter than 3 entries
l2 = [
    row[0][2] if isinstance(row, list) and len(row) > 0 and len(row[0]) > 2 else np.nan
    for row in cats
]
meta_use['l2_category'] = l2

# Encode the category names as integers starting at 1; 0 is reserved for "unknown"
l2_vals = sorted(meta_use['l2_category'].dropna().unique())
l2_map  = dict(zip(l2_vals, range(1, len(l2_vals)+1)))
meta_use['l2_category'] = meta_use['l2_category'].apply(lambda x: 0 if pd.isna(x) else l2_map[x])

#   Translate the related-item lists into the new item_id space
def _to_item_ids(asins):
    """Map raw ASINs to re-indexed item ids, skipping ASINs without an id."""
    return [item2id[a] for a in asins if a in item2id]

item_meta_rows = []
for asin, related, category in zip(meta_use['asin'], meta_use['related'], meta_use['l2_category']):
    if asin not in item2id:
        continue
    info = related if isinstance(related, dict) else {}
    item_meta_rows.append({
        'item_id'     : item2id[asin],
        'i_category'  : category,
        'r_complement': _to_item_ids(info.get('also_bought', [])),
        'r_substitute': _to_item_ids(info.get('also_viewed', [])),
    })

item_meta = pd.DataFrame(
    item_meta_rows,
    columns=['item_id', 'i_category', 'r_complement', 'r_substitute'],
)
item_meta_csv = DATA_DIR / "item_meta.csv"
item_meta.to_csv(item_meta_csv, index=False)
print("[item meta] saved ->", item_meta_csv)


In [None]:
# ==== Cell 4: build GGFTOPK and GGFCTR (TSV) ====
TOPK_DIR = DATA_DIR / "GGFTOPK"
CTR_DIR  = DATA_DIR / "GGFCTR"
for target_dir in (TOPK_DIR, CTR_DIR):
    target_dir.mkdir(parents=True, exist_ok=True)

# Read the normalised comma-separated base CSVs back, keeping three integer columns
int_columns = {"user_id": int, "item_id": int, "time": int}
base = {
    name: pd.read_csv(DATA_DIR / f"{name}.csv")[["user_id", "item_id", "time"]].astype(int_columns)
    for name in ("train", "dev", "test")
}

# Global candidate items and per-user history (both used for negative sampling)
all_df    = pd.concat(base.values(), ignore_index=True)
all_items = np.asarray(sorted(all_df["item_id"].unique()), dtype=int)
user_hist = defaultdict(set)
for pair in all_df[["user_id", "item_id"]].itertuples(index=False):
    user_hist[int(pair.user_id)].add(int(pair.item_id))

# Dedicated generator so that the sampling below is reproducible
rng = np.random.default_rng(RANDOM_SEED)

def candidate_pool(u: int) -> np.ndarray:
    """Return the items user ``u`` has not interacted with.

    Falls back to the full item array when the user has no recorded history or
    has interacted with every item.
    """
    history = user_hist[u]
    if len(history) == 0:
        return all_items
    seen_items = np.fromiter(history, dtype=int)
    unseen = np.setdiff1d(all_items, seen_items, assume_unique=False)
    if unseen.size == 0:
        return all_items
    return unseen

# ====== A) GGFTOPK: for every train/dev/test row sample NEG_ITEMS items the user
#           has not interacted with (stored in the neg_items column) ======
for split in ["train","dev","test"]:
    df = base[split].copy()
    neg_lists = []
    for u in df["user_id"].values:
        cand = candidate_pool(int(u))
        # Sampling is with replacement, so one list may contain repeated items.
        # .tolist() turns the draw into plain Python ints: a list of NumPy
        # scalars is written out as "np.int64(...)" entries under NumPy >= 2,
        # which is not a plain integer list once the file is read back.
        neg_lists.append(rng.choice(cand, size=NEG_ITEMS, replace=True).astype(int).tolist())
    df["neg_items"] = neg_lists
    # Key point: written tab-separated, the format these ReChorus scripts
    # commonly use for Amazon data
    df.to_csv(TOPK_DIR / f"{split}.csv", index=False, sep="\t")
    print(f"[GGFTOPK] {split}.csv ->", df.shape, list(df.columns))

# ====== B) GGFCTR: 1:1 negative sampling (with a label column), time is kept; also TSV ======
def build_ctr(df: pd.DataFrame) -> pd.DataFrame:
    """Pair every interaction with one sampled negative and label both.

    The positives (label 1) come first, followed by one negative (label 0) per
    positive row; a negative reuses the user and time of its positive and takes
    an item drawn from that user's candidate pool.
    """
    positives = df[["user_id", "item_id", "time"]].copy().astype(int)
    positives["label"] = 1
    sampled = [
        (int(uid), int(rng.choice(candidate_pool(int(uid)))), int(ts), 0)
        for uid, ts in zip(positives["user_id"].values, positives["time"].values)
    ]
    negatives = pd.DataFrame(sampled, columns=["user_id", "item_id", "time", "label"])
    return pd.concat([positives, negatives], ignore_index=True)

# Write one tab-separated CTR file per split
for split in ("train", "dev", "test"):
    out = build_ctr(base[split])
    ctr_path = CTR_DIR / f"{split}.csv"
    out.to_csv(ctr_path, sep="\t", index=False)
    print(f"[GGFCTR ] {split}.csv ->", out.shape, list(out.columns))


In [None]:
# ==== Cell 5: 快速自检 ====
def peek(path: Path, n=2):
    """Print the name, shape and columns of a delimited file and show its first ``n`` rows.

    The separator is auto-detected by pandas' Python parsing engine.
    """
    frame = pd.read_csv(path, engine="python", sep=None)
    summary = (path.name, frame.shape, list(frame.columns))
    print(*summary)
    display(frame.head(n))

# Show the head of every generated split, first GGFTOPK and then GGFCTR
for banner, subdir in (("== GGFTOPK ==", "GGFTOPK"), ("== GGFCTR ==", "GGFCTR")):
    print(banner)
    for sp in ("train", "dev", "test"):
        peek(DATA_DIR / subdir / f"{sp}.csv")

print("\n✅ 准备就绪：接下来直接用 --dataset \"Grocery_and_Gourmet_Food/GGFTOPK\" 或 \"Grocery_and_Gourmet_Food/GGFCTR\" 训练即可。")
