# Data volume check for GPU preprocessing

Quickly inspect the RecSys CSV inputs to decide whether a single-GPU `cudf` flow is fine or if we need to scale out with `dask_cudf`.

In [None]:
import pathlib
import zipfile
from dataclasses import dataclass
from typing import Iterable
import pynvml

# Root folder holding the RecSys input files; every path inspected below is
# built relative to it. Data is mounted to /tmp in the rapids container.
DATA_DIR = pathlib.Path("/tmp")

@dataclass
class FileStats:
    """Row count and byte size of one input file, with unit-converted views."""

    # File name used as the label in the summary table.
    name: str
    # Number of rows counted in the file.
    rows: int
    # File size in bytes.
    size_bytes: int

    @property
    def size_mb(self) -> float:
        """Size expressed in mebibytes (bytes / 2**20)."""
        bytes_per_mb = 1 << 20
        return self.size_bytes / bytes_per_mb

    @property
    def size_gb(self) -> float:
        """Size expressed in gibibytes (bytes / 2**30)."""
        bytes_per_gb = 1 << 30
        return self.size_bytes / bytes_per_gb


def count_rows_in_zip(zip_path: pathlib.Path) -> tuple[int, int]:
    """Return (rows, uncompressed_size_bytes) for a single-file zip.

    The first regular file in the archive is streamed line by line rather
    than extracted. Directory entries (for example a leading ``data/``
    member) are skipped so they are not mistaken for the CSV payload.

    Args:
        zip_path: Path to a zip archive holding one CSV file.

    Returns:
        A tuple of (number of lines after the header line, uncompressed size
        in bytes of the member as recorded in the archive).

    Raises:
        IndexError: If the archive contains no regular file. The exception
            type matches what indexing an empty member list raised before.
    """
    with zipfile.ZipFile(zip_path) as zf:
        members = [member for member in zf.infolist() if not member.is_dir()]
        if not members:
            raise IndexError(f"{zip_path} contains no files")
        info = members[0]
        with zf.open(info) as fh:
            # Discard the header line so only data rows are counted.
            fh.readline()
            rows = sum(1 for _ in fh)
    return rows, info.file_size


def count_rows_in_plain(csv_path: pathlib.Path) -> tuple[int, int]:
    """Return (rows, size_bytes) for an uncompressed CSV file.

    Args:
        csv_path: Path to the CSV file on disk.

    Returns:
        A tuple of (number of lines after the header line, on-disk size in
        bytes as reported by ``stat``).
    """
    # Only line boundaries matter here, so undecodable bytes are replaced
    # instead of aborting the count with UnicodeDecodeError.
    with csv_path.open("r", encoding="utf-8", errors="replace") as fh:
        # Discard the header line so only data rows are counted.
        fh.readline()
        rows = sum(1 for _ in fh)
    return rows, csv_path.stat().st_size


def humanize_rows(n: int) -> str:
    """Format a row count compactly, e.g. ``1.7k`` or ``2.76M``.

    Args:
        n: Row count to format.

    Returns:
        ``n`` as a plain integer string below 1,000, with one decimal and a
        ``k`` suffix below one million, and with two decimals and an ``M``
        suffix otherwise.
    """
    if n < 1_000:
        return str(n)
    if n < 1_000_000:
        in_thousands = f"{n / 1_000:.1f}"
        # Values just under one million round up to "1000.0"; promote those
        # to the M unit instead of printing the awkward "1000.0k".
        if in_thousands != "1000.0":
            return f"{in_thousands}k"
    return f"{n / 1_000_000:.2f}M"


def summarize(stats: Iterable[FileStats]):
    """Print one line per file (rows and size in MB), then a totals line.

    The iterable is consumed in a single pass; totals are accumulated while
    the per-file lines are printed.
    """
    bytes_per_mb = 1024 ** 2
    bytes_per_gb = 1024 ** 3
    row_sum = 0
    byte_sum = 0

    print(f"Analyzed folder: {DATA_DIR}")
    for entry in stats:
        row_sum += entry.rows
        byte_sum += entry.size_bytes
        pretty_rows = humanize_rows(entry.rows)
        print(f"{entry.name:28} rows={pretty_rows:>8} | size={entry.size_mb:8.2f} MB")

    # Separator between the per-file lines and the grand total.
    print("-" * 72)
    pretty_total = humanize_rows(row_sum)
    total_mb = byte_sum / bytes_per_mb
    total_gb = byte_sum / bytes_per_gb
    print(f"TOTAL rows={pretty_total:>8} | size={total_mb:8.2f} MB ({total_gb:.2f} GB)")


# Zipped CSV inputs, each expected to hold a single CSV member.
zip_files = [
    DATA_DIR / filename
    for filename in (
        "events.csv.zip",
        "item_properties_part1.csv.zip",
        "item_properties_part2.csv.zip",
    )
]
# Inputs stored as plain, uncompressed CSV.
plain_files = [DATA_DIR / "category_tree.csv"]

# Gather per-file statistics: zipped inputs first, then plain ones.
stats: list[FileStats] = []

for path in zip_files:
    rows, size_bytes = count_rows_in_zip(path)
    stats.append(FileStats(name=path.name, rows=rows, size_bytes=size_bytes))

for path in plain_files:
    rows, size_bytes = count_rows_in_plain(path)
    stats.append(FileStats(name=path.name, rows=rows, size_bytes=size_bytes))

summarize(stats)

Analyzed folder: /tmp
events.csv.zip               rows=   2.76M | size=   89.87 MB
item_properties_part1.csv.zip rows=  11.00M | size=  461.88 MB
item_properties_part2.csv.zip rows=   9.28M | size=  389.99 MB
category_tree.csv            rows=    1.7k | size=    0.01 MB
------------------------------------------------------------------------
TOTAL rows=  23.03M | size=  941.75 MB (0.92 GB)


### CuDF vs Dask guidance

- If the GPU has ≥ ~8 GB free, the ~1 GB of uncompressed data above fits comfortably in single-GPU `cudf`.
- If you expect to duplicate DataFrames (joins, wide feature sets), or GPU memory is tight, start with `dask_cudf.read_csv` on the zip files to stream in partitions.
- For CPU fallback, replace `cudf` with `pandas` in your preprocessing cells.

In [None]:
# Minimum free GPU memory (in GB) for choosing single-GPU `cudf`; below this
# the check recommends `dask_cudf` instead.
MIN_FREE_GB_FOR_CUDF = 8

# Best-effort probe of GPU 0: any failure is reported and a fallback
# recommendation is printed rather than aborting the notebook.
try:
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        free_gb = mem.free / 1024**3
        total_gb = mem.total / 1024**3
        decision = "cudf" if free_gb >= MIN_FREE_GB_FOR_CUDF else "dask_cudf"
        print(f"GPU 0 memory: total={total_gb:.2f} GB, free={free_gb:.2f} GB.")
        print(f"Decision: use `{decision}` for preprocessing.")
    finally:
        # Release NVML whenever it was initialised, even if a query above failed.
        pynvml.nvmlShutdown()
except Exception as e:
    print(f"GPU check failed: `{e}`. Default to `cudf` if GPU is available; otherwise use `pandas`.")

**GPU 0 memory:** total=4.00 GB, free=3.62 GB

**Decision:** use `dask_cudf` for preprocessing.