In [38]:
import json, gzip, itertools, pathlib, time
import pandas as pd
import pyarrow as pa, pyarrow.parquet as pq

ROOT = pathlib.Path().resolve().parents[0]
DATA = ROOT/"data"
# RAW = pathlib.Path("data/raw/Electronics_5.json.gz")
RAW_5CORE  = DATA / "raw" / "Electronics_5.json.gz"
RAW_META = DATA / "raw" / "meta_Electronics.json.gz"
CLICK_OUT = DATA / "clicks.parquet"
TAXO_OUT  = DATA / "taxonomy_edges.csv"

In [39]:
def deepest_list(cat_field):
    """
    Accepts:
      • list-of-strings  -> returns the list itself
      • list-of-lists    -> returns the last inner list
      • None / anything else -> returns []
    """
    if not cat_field:
        return []
    if all(isinstance(x, list) for x in cat_field):
        return cat_field[-1]
    elif all(isinstance(x, str) for x in cat_field):
        return cat_field
    else:
        return []

In [40]:
asin_to_cat = {}
t = time.time();

with gzip.open(RAW_META, "rt") as f:
    for line in f:
        m = json.loads(line)
        path_lists = m.get("categories") or m.get("category")
        path = deepest_list(path_lists)
        if path:
            asin_to_cat[m["asin"]] = path
print(f"Metadata dict size: {len(asin_to_cat):,}  |  {time.time()-t:.1f}s")

Metadata dict size: 756,077  |  33.1s


In [41]:
users, items, stamps, cat_paths = [], [], [], []
t0 = time.time()

with gzip.open(RAW_5CORE, "rt") as f:
    for n, line in enumerate(f, 1):
        r = json.loads(line)
        users.append(r["reviewerID"])
        items.append(r["asin"])
        stamps.append(r["unixReviewTime"])

        raw_cat = r.get("category")
        path = deepest_list(raw_cat) if raw_cat else asin_to_cat.get(r["asin"], [])
        cat_paths.append(path)

        if n % 500_000 == 0:
            print(f"{n:,} review lines parsed…")
print(f"Reviews parsed: {len(users):,}  |  {time.time()-t0:.1f}s")

500,000 review lines parsed…
1,000,000 review lines parsed…
1,500,000 review lines parsed…
2,000,000 review lines parsed…
2,500,000 review lines parsed…
3,000,000 review lines parsed…
3,500,000 review lines parsed…
4,000,000 review lines parsed…
4,500,000 review lines parsed…
5,000,000 review lines parsed…
5,500,000 review lines parsed…
6,000,000 review lines parsed…
6,500,000 review lines parsed…
Reviews parsed: 6,739,590  |  30.0s


In [42]:
CLICK_OUT.parent.mkdir(parents=True, exist_ok=True)
pq.write_table(
    pa.Table.from_pydict({
        "user_id": users,
        "item_id": items,
        "ts": stamps
    }),
    CLICK_OUT,
    compression="zstd"
)
print("Wrote", CLICK_OUT, "| size =", round(CLICK_OUT.stat().st_size / 1e6, 1), "MB")

Wrote /Users/vedantajain/hrec/data/clicks.parquet | size = 78.8 MB


In [43]:
edge_set = set()
for path in cat_paths:
    edge_set.update(itertools.pairwise(path))   # handles empty list safely

taxo_df = pd.DataFrame(list(edge_set), columns=["parent_id", "child_id"])
TAXO_OUT.parent.mkdir(parents=True, exist_ok=True)
taxo_df.to_csv(TAXO_OUT, index=False)
print("Taxonomy edges:", len(taxo_df))
print("Wrote", TAXO_OUT)
taxo_df.head()

Taxonomy edges: 4256
Wrote /Users/vedantajain/hrec/data/taxonomy_edges.csv


Unnamed: 0,parent_id,child_id
0,Cooling Pads & External Fans,Aluminum
1,Messenger Bags,Colombian Leather
2,Imported,38 centimeters high
3,Briefcases,"Padded compartment protects laptops up to 16"""
4,Posing Props,100% Polyester
