In [29]:
# =========================
# Cell 1 — Auth + config
# =========================
import os, math, random, json
import requests

from huggingface_hub import HfApi

# Hub dataset repository id that every cell below operates on.
DATASET_ID = "tomg-group-umd/cinepile"   # change if needed
# Seed handed to the streaming shuffle so repeated runs draw the same sample.
SEED = 42
# Passed as `buffer_size` to the streaming shuffle in the sampling cell.
SHUFFLE_BUFFER = 10_000                 # streaming shuffle buffer
# Passed as `max_viz` to visualize_samples.
MAX_VIZ = 24                            # max items to show in a grid

# Base URL of the datasets-server REST API (queried at /splits and /size).
DATASETS_SERVER = "https://datasets-server.huggingface.co"

def hf_token_from_env():
    """Return the first usable Hugging Face token found in the environment.

    The variables ``HF_TOKEN``, ``HUGGINGFACE_HUB_TOKEN`` and
    ``HF_ACCESS_TOKEN`` are checked in that order. Each value is stripped of
    surrounding whitespace *before* it is tested, so a blank or
    whitespace-only variable is skipped rather than returned as an empty
    string (which would hide a valid token stored in a later variable).

    Returns:
        str | None: the stripped token, or ``None`` when no variable holds a
        non-blank value.
    """
    for var_name in ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "HF_ACCESS_TOKEN"):
        token = (os.getenv(var_name) or "").strip()
        if token:
            return token
    return None

HF_TOKEN = hf_token_from_env()
# Validate with an explicit raise instead of `assert`: assert statements are
# removed when Python runs with -O, which would let a missing token slip through.
if not HF_TOKEN:
    raise RuntimeError("No HF token found in env vars (HF_TOKEN / HUGGINGFACE_HUB_TOKEN / HF_ACCESS_TOKEN).")

# Bearer header sent with the datasets-server requests (get_splits / get_num_rows).
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

# Sanity check: confirm the token authenticates and the dataset id resolves.
api = HfApi(token=HF_TOKEN)
print("whoami:", api.whoami()["name"])
print("dataset:", api.dataset_info(DATASET_ID).id)


whoami: Tarnimat
dataset: tomg-group-umd/cinepile


In [30]:
# =========================
# Cell 2 — Sampling scheme + dataset-server helpers
# =========================
def sampling_fraction(n: int) -> float:
    """Map a row count to the fraction of rows that should be sampled.

    Tiers (upper bounds are inclusive):
        n <= 10,000       -> 1%
        n <= 100,000      -> 0.1%
        n <= 1,000,000    -> 0.01%
        n <= 10,000,000   -> 0.001%
        anything larger   -> 0.0001%
    """
    tiers = (
        (10_000, 0.01),
        (100_000, 0.001),
        (1_000_000, 0.0001),
        (10_000_000, 0.00001),
    )
    # First tier whose upper bound covers n wins.
    for upper_bound, fraction in tiers:
        if n <= upper_bound:
            return fraction
    return 0.000001  # >10M

def get_splits(dataset_id: str):
    """Fetch the split listing for `dataset_id` from the datasets-server.

    Issues an authenticated GET to ``/splits`` and raises
    ``requests.HTTPError`` on a non-success status.

    Returns:
        The ``"splits"`` value of the JSON response. This notebook reads the
        ``"config"`` and ``"split"`` keys of each entry.
    """
    response = requests.get(
        DATASETS_SERVER + "/splits",
        headers=HEADERS,
        params={"dataset": dataset_id},
        timeout=60,
    )
    response.raise_for_status()
    payload = response.json()
    return payload["splits"]

def get_num_rows(dataset_id: str, config: str, split: str) -> int:
    """Return the row count of one split using the datasets-server ``/size`` endpoint.

    The request is made per config (no ``split`` parameter), so the response
    may describe several splits. Lookup order:

    1. The entry in ``size["splits"]`` whose ``"split"`` equals `split`.
    2. Fallback: a top-level ``size["num_rows"]`` if the response has one.

    The per-split entry is preferred because a top-level count on a
    config-wide response is not guaranteed to describe the requested split.

    Raises:
        requests.HTTPError: on a non-success HTTP status.
        KeyError: when neither location yields a row count.
    """
    r = requests.get(
        f"{DATASETS_SERVER}/size",
        params={"dataset": dataset_id, "config": config},  # NOTE: no split here
        headers=HEADERS,
        timeout=60,
    )
    r.raise_for_status()
    # `or {}` also covers an explicit JSON null for "size".
    size = r.json().get("size") or {}

    # Most specific source first: the entry for the requested split.
    for entry in size.get("splits") or []:
        if entry.get("split") == split and entry.get("num_rows") is not None:
            return int(entry["num_rows"])

    # Fallback for response shapes that only carry a single top-level count.
    if size.get("num_rows") is not None:
        return int(size["num_rows"])

    raise KeyError(f"Could not find num_rows for config='{config}', split='{split}'. size keys={list(size.keys())}")


In [32]:
# =========================
# Cell 3 — Build sampling plans (ALL configs)
# =========================
# Discover every (config, split) pair exposed for the dataset.
splits = get_splits(DATASET_ID)

print("Available configs/splits:")
for s in splits:
    print(f"- config={s['config']} split={s['split']} num_examples={s.get('num_examples')}")

# unique configs, in a stable (sorted) order
configs = sorted({s["config"] for s in splits})

# plans: config -> list[(split, N, frac, K)]
plans = {name: [] for name in configs}
for cfg, targets in plans.items():
    for s in splits:
        if s["config"] == cfg:
            split = s["split"]
            n = get_num_rows(DATASET_ID, cfg, split)
            frac = sampling_fraction(n)
            # Round up and never plan fewer than one sample per split.
            k = max(1, math.ceil(n * frac))
            targets.append((split, n, frac, k))

print("\nSampling plan per config:")
for cfg, targets in plans.items():
    print(f"\nCONFIG={cfg}")
    for split, n, frac, k in targets:
        print(f"  - {split}: N={n:,} -> frac={frac:g} -> K={k:,}")


Available configs/splits:
- config=v2 split=train num_examples=None
- config=v2 split=test num_examples=None
- config=v1 split=train num_examples=None
- config=v1 split=test num_examples=None

Sampling plan per config:

CONFIG=v1
  - train: N=298,888 -> frac=0.0001 -> K=30
  - test: N=4,941 -> frac=0.01 -> K=50

CONFIG=v2
  - train: N=298,888 -> frac=0.0001 -> K=30
  - test: N=4,941 -> frac=0.01 -> K=50


In [33]:
# =========================
# Cell 4 — Streaming sampler (streams instead of downloading the full dataset; the shuffle buffer is filled before rows are yielded)
# =========================
from datasets import load_dataset

def take_k_streaming(dataset_id: str, config: str, split: str, k: int, seed: int = 42, buffer_size: int = 10_000):
    """Collect `k` shuffled examples from a split opened in streaming mode.

    The split is opened with ``streaming=True``, shuffled with the given
    `seed` and `buffer_size`, and the first `k` examples of the shuffled
    stream are materialised.

    Returns:
        list: the examples yielded by ``take(k)``.
    """
    stream = load_dataset(dataset_id, config, split=split, streaming=True)
    shuffled = stream.shuffle(seed=seed, buffer_size=buffer_size)
    return [example for example in shuffled.take(k)]


In [34]:
# =========================
# Cell 5 — Visualization (image grid if possible, else text preview)
# =========================
import io
import matplotlib.pyplot as plt
from PIL import Image

def find_image_columns(example: dict):
    """Return the keys of `example` whose values look like image payloads.

    A value counts as image-like when it is a dict containing a ``"bytes"``
    or ``"path"`` key, or when it is a ``PIL.Image.Image`` instance.
    ``None`` values are ignored. Keys are returned in the dict's order.
    """
    def _looks_like_image(value):
        if value is None:
            return False
        if isinstance(value, dict) and ("bytes" in value or "path" in value):
            return True
        return isinstance(value, Image.Image)

    return [key for key, value in example.items() if _looks_like_image(value)]

def to_pil(v):
    """Convert an image-like value to an RGB ``PIL.Image.Image``.

    Accepted inputs:
      * a dict with non-None ``"bytes"``: decoded from those bytes;
      * a dict with a truthy ``"path"``: opened from that path;
      * a ``PIL.Image.Image``: converted to RGB.

    Returns ``None`` for ``None`` and for anything else.
    """
    if v is None:
        return None
    if isinstance(v, dict):
        raw = v.get("bytes")
        if raw is not None:
            return Image.open(io.BytesIO(raw)).convert("RGB")
        location = v.get("path")
        if location:
            return Image.open(location).convert("RGB")
        return None
    return v.convert("RGB") if isinstance(v, Image.Image) else None

def _print_text_preview(samples, max_samples: int = 10, max_fields: int = 12, max_chars: int = 250):
    """Print a truncated key/value dump of the first few samples."""
    for i, ex in enumerate(samples[:max_samples]):
        print(f"\n--- sample {i} ---")
        for k in list(ex.keys())[:max_fields]:
            s = str(ex[k])
            print(f"{k}: {s[:max_chars]}{'...' if len(s) > max_chars else ''}")


def visualize_samples(samples, max_viz: int = 24):
    """Show `samples` as an image grid when possible, else as a text preview.

    Image columns are detected on the first sample only, and the first
    detected column is used. Up to `max_viz` samples are converted with
    ``to_pil`` and drawn in a grid six columns wide.

    A text preview is printed when no image-like column is detected, and
    also when a column is detected but none of its values can be turned
    into an image. The second case previously printed only a message that
    promised a preview.

    Args:
        samples: sequence of example dicts.
        max_viz: maximum number of samples considered for the image grid.
    """
    if not samples:
        print("No samples to visualize.")
        return

    img_cols = find_image_columns(samples[0])
    if not img_cols:
        print("No image-like columns detected. Showing text preview:")
        _print_text_preview(samples)
        return

    col = img_cols[0]
    converted = (to_pil(ex.get(col)) for ex in samples[:max_viz])
    imgs = [img for img in converted if img is not None]

    if not imgs:
        print(f"Detected '{col}' but couldn't decode images. Showing text preview instead.")
        # Actually deliver the fallback that the message announces.
        _print_text_preview(samples)
        return

    cols = 6
    rows = math.ceil(len(imgs) / cols)
    fig = plt.figure(figsize=(cols * 3, rows * 3))
    for i, img in enumerate(imgs):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.imshow(img)
        ax.axis("off")
    fig.tight_layout()
    plt.show()


In [35]:
# =========================
# Cell 6 — Run sampling + visualization (per config/split)
# =========================
# Example: sample both configs and both splits (train/test) and visualize each

# Walk the sampling plan: for each config and each of its splits, stream K
# shuffled examples and hand them to the visualizer.
for cfg, targets in plans.items():
    print(f"\n====================\nCONFIG={cfg}\n====================")
    for split, n, frac, k in targets:
        print(f"\n-> Sampling split='{split}' with K={k} (N={n:,}, frac={frac:g})")
        samples = take_k_streaming(
            dataset_id=DATASET_ID,
            config=cfg,
            split=split,
            k=k,
            seed=SEED,
            buffer_size=SHUFFLE_BUFFER,
        )
        print("Collected:", len(samples))
        visualize_samples(samples, max_viz=MAX_VIZ)



CONFIG=v1

-> Sampling split='train' with K=30 (N=298,888, frac=0.0001)


README.md:   0%|          | 0.00/6.68k [00:00<?, ?B/s]

Collected: 30
No image-like columns detected. Showing text preview:

--- sample 0 ---
movie_name: Small Soldiers
year: 1998
genre: ['Action', 'Adventure', 'Comedy', 'Family', 'Sci-Fi']
yt_clip_title: Small Soldiers (7/10) Movie CLIP - I Always Hated These Things (1998) HD
yt_clip_link: https://youtube.com/watch?v=n7KKfjFRw8w
movie_scene: <subtitle> You all right?
<subtitle> All my makeup is cruelty free.
<subtitle> You come here often?
<subtitle> Will you take me to the prom?
<visual descriptions> The dolls attack him.
<subtitle> If you can't accessorize, pulverize.
<subtitle> You've...
subtitles: <subtitle> You all right?
<subtitle> All my makeup is cruelty free.
<subtitle> You come here often?
<subtitle> Will you take me to the prom?
<subtitle> If you can't accessorize, pulverize.
<subtitle> You've been a bad boy.
<subtitle> And now, you mus...
question: Where does Archer hide?
choices: ['Under the bed', 'Behind the couch', 'In the closet', "In Ellen's backpack", 'In the attic']
answ

In [36]:
from datasets import load_dataset

cfg, split = "v2", "train"
# Use the shared DATASET_ID constant (not a hard-coded repo id) so this probe
# keeps following the dataset configured in the first cell.
ds = load_dataset(DATASET_ID, cfg, split=split, streaming=True)

# Only the first streamed example is pulled; it is enough to inspect the keys.
ex = next(iter(ds))
print("keys:", list(ex.keys()))

# look for anything video-ish (by key name)
videoish = [k for k in ex.keys() if any(tag in k.lower() for tag in ("video", "mp4", "path"))]
print("video-ish keys:", videoish)

# also check if any value is a dict with bytes/path (typical HF media)
media_like = [k for k, v in ex.items() if isinstance(v, dict) and ("bytes" in v or "path" in v)]
print("media-like dict fields:", media_like)


keys: ['movie_name', 'year', 'genre', 'yt_clip_title', 'yt_clip_link', 'movie_scene', 'subtitles', 'question', 'choices', 'answer_key', 'answer_key_position', 'question_category', 'hard_split', 'visual_reliance', 'videoID']
video-ish keys: ['videoID']
media-like dict fields: []
