# RecSys EDA (Dask on GPU)

Explore the raw RecSys inputs with `dask_cudf` to guide preprocessing and later parquet writes.

In [None]:
import pathlib
import zipfile

import dask_cudf as dc
import cudf
from dask.distributed import Client

# Data is mounted to /tmp in the RAPIDS container
DATA_DIR = pathlib.Path("/tmp")
UNZIP_DIR = DATA_DIR / "unzipped"
UNZIP_DIR.mkdir(exist_ok=True)

client = Client(processes=False)  # single-GPU scheduler



In [2]:
def ensure_unzipped(zip_path: pathlib.Path) -> pathlib.Path:
    """Extract a single-file zip to UNZIP_DIR if not already present."""
    target = UNZIP_DIR / zip_path.with_suffix("").name
    if not target.exists():
        with zipfile.ZipFile(zip_path) as zf:
            info = zf.infolist()[0]
            target.parent.mkdir(parents=True, exist_ok=True)
            with zf.open(info) as src, target.open("wb") as dst:
                dst.write(src.read())
    return target


EVENTS_CSV = ensure_unzipped(DATA_DIR / "events.csv.zip")
ITEMS1_CSV = ensure_unzipped(DATA_DIR / "item_properties_part1.csv.zip")
ITEMS2_CSV = ensure_unzipped(DATA_DIR / "item_properties_part2.csv.zip")
TREE_CSV = DATA_DIR / "category_tree.csv"

EVENTS_DF = dc.read_csv(EVENTS_CSV, dtype={"timestamp": "int64", "visitorid": "int64"})
ITEM_PROPS_DF = dc.read_csv([ITEMS1_CSV, ITEMS2_CSV], dtype={"timestamp": "int64", "itemid": "int64"})
TREE_DF = dc.read_csv(TREE_CSV)



In [None]:
def quick_stats(name: str, df: dc.DataFrame):
    rows = df.shape[0].compute()
    cols = list(df.columns)
    dtypes = {k: str(v) for k, v in df.dtypes.to_dict().items()}
    print(f"\n{name}")
    print(f"rows={rows:,} cols={len(cols)}")
    print("dtypes:", dtypes)
    print("head:")
    print(df.head())

quick_stats("events", EVENTS_DF)
quick_stats("item_properties", ITEM_PROPS_DF)
quick_stats("category_tree", TREE_DF)



events
rows=2,756,101 cols=5
dtypes: {'timestamp': 'int64', 'visitorid': 'int64', 'event': 'object', 'itemid': 'int64', 'transactionid': 'int8'}
head:


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,



item_properties
rows=20,275,902 cols=4
dtypes: {'timestamp': 'int64', 'itemid': 'int64', 'property': 'object', 'value': 'object'}
head:


Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513



category_tree
rows=1,669 cols=2
dtypes: {'categoryid': 'int64', 'parentid': 'int64'}
head:


Unnamed: 0,categoryid,parentid
0,1016,213
1,809,169
2,570,9
3,1691,885
4,536,1691


In [4]:
# Event-level distributions
event_counts = EVENTS_DF["event"].value_counts().compute()
unique_visitors = EVENTS_DF["visitorid"].nunique().compute()
unique_items = EVENTS_DF["itemid"].nunique().compute()
ts_min, ts_max = EVENTS_DF["timestamp"].min().compute(), EVENTS_DF["timestamp"].max().compute()
print("Event counts:\n", event_counts)
print(f"Unique visitors: {unique_visitors:,}")
print(f"Unique items (events): {unique_items:,}")
print(f"Timestamp range: {ts_min} -> {ts_max}")


Event counts:
 event
transaction      22457
view           2664312
addtocart        69332
Name: count, dtype: int64
Unique visitors: 1,407,580
Unique items (events): 235,061
Timestamp range: 1430622004384 -> 1442545187788


In [5]:
# Item properties overview
# Compute value counts first (dask Series), then take top 10 locally
prop_counts = ITEM_PROPS_DF["property"].value_counts().compute().head(10)
items_with_props = ITEM_PROPS_DF["itemid"].nunique().compute()
print("Top properties:\n", prop_counts)
print(f"Items with properties: {items_with_props:,}")


Top properties:
 property
601       14
1026     205
342      314
338      990
812       20
310       17
562     1348
362     1699
766      492
476      128
Name: count, dtype: int64
Items with properties: 417,053


In [6]:
# Category tree sanity check
print(TREE_DF.describe().compute())
print("Unique category IDs:", TREE_DF["categoryid"].nunique().compute())

        categoryid     parentid
count  1669.000000  1644.000000
mean    849.285201   847.571168
std     490.195116   505.058485
min       0.000000     8.000000
25%     427.000000   381.000000
50%     848.000000   866.000000
75%    1273.000000  1291.000000
max    1698.000000  1698.000000
Unique category IDs: 1669


### Notes for downstream preprocessing

- Keep using `dask_cudf` for parsing so you can write partitioned parquet easily (local `/tmp/parquet` or future S3 bucket). Use `compression="snappy"` and partition by `event` or date for training efficiency.
- Narrow schemas before writing: cast IDs to `int32` where safe, avoid wide string columns, and normalize item/visitor features into separate parquet datasets.
- For a two-tower recsys later, plan to emit parquet tables like `events`, `item_features`, and `visitor_history` that can be loaded with `cudf` or `dask_cudf`.