In [1]:

import sys
# Install/upgrade ipywidgets using the interpreter that runs this kernel
# (sys.executable), so the package is installed for that same interpreter.
!{sys.executable} -m pip install -U ipywidgets




In [2]:
from datasets import get_dataset_config_names

# Hugging Face dataset repository inspected by this notebook.
DATASET_ID = "lmms-lab/LLaVA-Video-178K"

# Fetch the repo's config names, then print how many there are followed by
# one config name per line.
configs = get_dataset_config_names(DATASET_ID)
print("num configs:", len(configs))
for c in configs:
    print(c)

README.md: 0.00B [00:00, ?B/s]

num configs: 19
0_30_s_academic_v0_1
0_30_s_youtube_v0_1
0_30_s_activitynet
0_30_s_perceptiontest
0_30_s_nextqa
30_60_s_academic_v0_1
30_60_s_youtube_v0_1
30_60_s_activitynet
30_60_s_perceptiontest
30_60_s_nextqa
1_2_m_youtube_v0_1
1_2_m_academic_v0_1
1_2_m_activitynet
1_2_m_nextqa
2_3_m_youtube_v0_1
2_3_m_academic_v0_1
2_3_m_activitynet
2_3_m_nextqa
llava_hound


In [7]:
from __future__ import annotations

import os
from collections import Counter, defaultdict
from pprint import pformat
from typing import Any, Dict, List, Tuple

import pandas as pd
from datasets import load_dataset_builder, load_dataset, get_dataset_config_names

# Hugging Face dataset repository inspected by this cell.
DATASET_ID = "lmms-lab/LLaVA-Video-178K"
# Config names of that dataset, as returned by the datasets library.
CONFIGS = get_dataset_config_names(DATASET_ID)

# Lower-case file extensions used to classify a string value as a video or
# an image reference (compared against os.path.splitext(...)[1].lower()).
VIDEO_EXTS = {".mp4", ".webm", ".mkv", ".mov", ".avi", ".m4v"}
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}


def shallow_preview(x: Any, max_str=160) -> Any:
    """
    Build a compact, display-friendly summary of an arbitrary value.

    - str: newlines are shown as the two characters ``\\n`` and the text is cut
      to `max_str` characters (an ellipsis is appended when it was cut).
    - int / float / bool / None: returned unchanged.
    - dict: only the first 3 keys are kept (values summarised recursively); an
      extra "…" entry reports how many keys were left out.
    - list: only the first 2 items are kept (summarised recursively); a trailing
      string reports how many items were left out.
    - anything else: the placeholder ``<TypeName>``.
    """
    if isinstance(x, str):
        flat = x.replace("\n", "\\n")
        if len(flat) > max_str:
            return flat[:max_str] + "…"
        return flat

    if x is None or isinstance(x, (int, float, bool)):
        return x

    if isinstance(x, dict):
        all_keys = list(x.keys())
        summary = {
            key: shallow_preview(x[key], max_str=max_str)
            for key in all_keys[:3]
        }
        hidden = len(all_keys) - 3
        if hidden > 0:
            summary["…"] = f"+{hidden} more keys"
        return summary

    if isinstance(x, list):
        summary = [shallow_preview(item, max_str=max_str) for item in x[:2]]
        hidden = len(x) - 2
        if hidden > 0:
            summary.append(f"… +{hidden} more items")
        return summary

    return f"<{type(x).__name__}>"


def iter_leaf_paths(obj: Any, prefix=""):
    """
    Walk nested dicts/lists depth-first and yield ``(path, value)`` per leaf.

    Dict keys are joined with "." (the bare key is used when `prefix` is
    empty) and list positions are written as "[i]". Any value that is neither
    a dict nor a list is a leaf; empty dicts and lists yield nothing.
    """
    if isinstance(obj, dict):
        children = (
            (f"{prefix}.{key}" if prefix else key, child)
            for key, child in obj.items()
        )
    elif isinstance(obj, list):
        children = (
            (f"{prefix}[{pos}]", child)
            for pos, child in enumerate(obj)
        )
    else:
        # Leaf: report it under the path accumulated so far.
        yield prefix, obj
        return

    for child_path, child in children:
        yield from iter_leaf_paths(child, child_path)


def detect_media_from_example(ex: Dict[str, Any]) -> Tuple[List[Tuple[str, str]], Dict[str, Counter]]:
    """
    Find leaf values of one example that look like video or image file paths.

    A leaf counts as media when it is a non-empty string whose extension
    (via os.path.splitext, lower-cased) is in VIDEO_EXTS or IMAGE_EXTS.

    Returns:
        (media_fields, formats) where
        - media_fields is a list of (leaf_path, "video" | "image") in first-seen
          order, one entry per leaf path;
        - formats maps leaf_path -> Counter of the extensions seen at that path.
    """
    kind_by_path: Dict[str, str] = {}
    formats = defaultdict(Counter)

    for path, val in iter_leaf_paths(ex):
        # iter_leaf_paths recurses into dicts and lists, so a dict can never
        # arrive here as a leaf; a {"path": "..."} value is already reported
        # as the string leaf "<field>.path". Only strings need checking.
        if not isinstance(val, str) or not val:
            continue

        ext = os.path.splitext(val)[1].lower()
        if ext in VIDEO_EXTS:
            kind = "video"
        elif ext in IMAGE_EXTS:
            kind = "image"
        else:
            continue

        formats[path][ext] += 1
        # Keep the first kind seen for a path (dicts preserve insertion order).
        kind_by_path.setdefault(path, kind)

    return list(kind_by_path.items()), formats


def get_available_splits(cfg: str) -> List[str]:
    """
    Return the split names available for config `cfg` of DATASET_ID.

    First tries the split names recorded in the dataset builder's info. If
    that lookup raises or records no splits, falls back to probing a fixed
    list of split names by opening each one in streaming mode and keeping the
    ones that load without raising. All errors are swallowed on purpose
    (best-effort discovery), so the result may be an empty list.
    """
    try:
        builder = load_dataset_builder(DATASET_ID, name=cfg)
        declared = builder.info.splits if builder.info else None
        if declared:
            return list(declared.keys())
    except Exception:
        # Metadata lookup is best-effort; fall through to probing.
        pass

    def _opens_in_streaming_mode(split_name: str) -> bool:
        # True when the split can be opened in streaming mode without raising.
        try:
            load_dataset(DATASET_ID, name=cfg, split=split_name, streaming=True)
        except Exception:
            return False
        return True

    # Fixed candidate split names to probe when the metadata gives nothing.
    probe_order = ("caption", "open_ended", "multi_choice")
    return [name for name in probe_order if _opens_in_streaming_mode(name)]


def try_get_num_examples(cfg: str, split: str):
    """
    Best-effort lookup of the example count recorded for (`cfg`, `split`).

    Reads `num_examples` from the dataset builder's split info. Returns None
    when the builder has no info/splits, the split is not listed, the
    attribute is missing, or anything raises along the way.
    """
    try:
        info = load_dataset_builder(DATASET_ID, name=cfg).info
        if not (info and info.splits):
            return None
        split_info = info.splits.get(split)
        if not split_info:
            return None
        return getattr(split_info, "num_examples", None)
    except Exception:
        return None


def get_one_example(cfg: str, split: str) -> Tuple[Dict[str, Any] | None, str | None]:
    """
    Fetch the first example of (`cfg`, `split`).

    Returns (example, error_str): exactly one of the two is None.
    Streaming is tried first; if it raises or yields nothing, a non-streaming
    load of the slice ``split[:1]`` is tried instead.
    """
    # Message used when streaming raises nothing but also yields no row;
    # overwritten below if streaming raises.
    stream_err = "StopIteration (empty stream)"

    # 1) streaming
    try:
        stream = load_dataset(DATASET_ID, name=cfg, split=split, streaming=True)
        first = next(iter(stream), None)
        if first is not None:
            return first, None
    except Exception as exc:
        # Remember why streaming failed, but still try the fallback.
        stream_err = repr(exc)

    # 2) fallback: non-streaming minimal slice
    try:
        head = load_dataset(DATASET_ID, name=cfg, split=f"{split}[:1]")
        if len(head) == 0:
            return None, "Non-streaming returned 0 rows"
        return head[0], None
    except Exception as exc:
        return None, f"Streaming failed: {stream_err} | Fallback failed: {repr(exc)}"


# One summary record per (config, split); turned into a DataFrame at the end.
rows = []

for cfg in CONFIGS:
    splits = get_available_splits(cfg)

    for split in splits:
        # Example count from builder metadata; None when it is not available.
        n_examples = try_get_num_examples(cfg, split)

        ex, err = get_one_example(cfg, split)
        if ex is None:
            # No example could be fetched: record the failure and move on.
            rows.append({
                "config": cfg,
                "split": split,
                "num_examples_meta": n_examples,
                "top_level_keys": [],
                "media_fields_detected": "<none>",
                "formats_inferred": {},
                "sample_preview": {},
                "status": "FAILED",
                "error": err,
            })
            continue

        # Media detection is based on this single example only.
        media_fields, formats = detect_media_from_example(ex)

        keys = list(ex.keys()) if isinstance(ex, dict) else []
        # Preview at most the first 12 top-level fields to keep the table readable.
        preview = {k: shallow_preview(ex[k]) for k in keys[:12]}
        if len(keys) > 12:
            preview["…"] = f"+{len(keys)-12} more keys"

        rows.append({
            "config": cfg,
            "split": split,
            "num_examples_meta": n_examples,
            "top_level_keys": keys,
            "media_fields_detected": ", ".join([f"{p}({k})" for p, k in media_fields]) or "<none_detected>",
            "formats_inferred": {p: dict(c.most_common(5)) for p, c in formats.items()},
            "sample_preview": preview,
            "status": "OK",
            "error": None,
        })

# Bare last expression so the notebook renders the table.
df = pd.DataFrame(rows)
df


Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]

Unnamed: 0,config,split,num_examples_meta,top_level_keys,media_fields_detected,formats_inferred,sample_preview,status,error
0,0_30_s_academic_v0_1,caption,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': '028CE', 'conversations': [{'from': 'hu...",OK,
1,0_30_s_academic_v0_1,open_ended,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': '028CE', 'conversations': [{'from': 'hu...",OK,
2,0_30_s_academic_v0_1,multi_choice,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': 'RW587', 'conversations': [{'from': 'hu...",OK,
3,0_30_s_youtube_v0_1,caption,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': '8bqHXtRirm4', 'conversations': [{'from...",OK,
4,0_30_s_youtube_v0_1,open_ended,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': '8bqHXtRirm4', 'conversations': [{'from...",OK,
5,0_30_s_youtube_v0_1,multi_choice,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': 'YVQwAEKZpaU', 'conversations': [{'from...",OK,
6,0_30_s_activitynet,open_ended,,[],<none>,{},{},FAILED,Streaming failed: StopIteration (empty stream)...
7,0_30_s_perceptiontest,multi_choice,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': 'perceptiontest_video_2779', 'conversat...",OK,
8,0_30_s_nextqa,open_ended,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': '1102-8903248754', 'conversations': [{'...",OK,
9,0_30_s_nextqa,multi_choice,,"[id, conversations, data_source, video]",video(video),{'video': {'.mp4': 1}},"{'id': '1006-8968804598', 'conversations': [{'...",OK,


In [9]:
from __future__ import annotations

import os
from pathlib import Path
from typing import Optional, Dict, Any, List

from huggingface_hub import hf_hub_download
from PIL import Image as PILImage
from IPython.display import display

# Hugging Face dataset repository that video files are downloaded from.
DATASET_REPO_ID = "lmms-lab/LLaVA-Video-178K"
# Local directory that mirrors the repo's relative paths for downloaded videos.
LOCAL_VIDEO_DIR = Path("./hf_videos_cache")   # change to your scratch dir on IBEX
# Created at cell execution time; a no-op when the directory already exists.
LOCAL_VIDEO_DIR.mkdir(parents=True, exist_ok=True)

def resolve_video_path_from_hf(video_relpath: str) -> Optional[str]:
    """
    Resolve a dataset-relative video path to a path on the local filesystem.

    Tried in order:
      1. `video_relpath` itself, if it already exists locally;
      2. the same relative path under LOCAL_VIDEO_DIR;
      3. downloading that file from the DATASET_REPO_ID dataset repo into
         LOCAL_VIDEO_DIR (keeping the relative structure).

    Returns the local path, or None when the download fails. Download errors
    are printed rather than raised.
    """
    # 1) already local?
    if os.path.exists(video_relpath):
        return video_relpath

    # 2) present under our cache dir with the same relative structure?
    mirrored = LOCAL_VIDEO_DIR / video_relpath
    if mirrored.exists():
        return str(mirrored)

    # 3) download from the dataset repo.
    # `local_dir_use_symlinks` is intentionally not passed: it is deprecated in
    # recent huggingface_hub releases, and a rejected keyword would be hidden
    # by the broad except below as a download failure.
    try:
        return hf_hub_download(
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            filename=video_relpath,          # must be the exact path inside the repo
            local_dir=str(LOCAL_VIDEO_DIR),  # keep mirrored structure
        )
    except Exception as e:
        print("hf_hub_download failed for:", video_relpath)
        print("  ", repr(e))
        return None

def decode_random_frames(video_path: str, num_frames=3, seed=123) -> List[PILImage.Image]:
    """
    Decode up to `num_frames` randomly chosen frames of a video as PIL images.

    The frame choice is deterministic for a given (`video_path`, `seed`) pair,
    including across interpreter restarts. Frames are returned in ascending
    frame-index order. When the video has no more than `num_frames` frames,
    all of them are returned; an empty video gives an empty list.
    """
    import random
    import zlib
    from decord import VideoReader, cpu

    vr = VideoReader(video_path, ctx=cpu(0))
    total = len(vr)
    if total == 0:
        return []

    # Built-in hash() of a str is salted per process (PYTHONHASHSEED), which
    # would make the "seeded" selection differ between kernel restarts. A
    # CRC32 of the path bytes is stable, so it is used as the per-video offset.
    path_offset = zlib.crc32(os.fsencode(video_path)) % 10_000_000
    rng = random.Random(seed + path_offset)

    if total <= num_frames:
        idxs = list(range(total))
    else:
        idxs = sorted(rng.sample(range(total), num_frames))

    return [PILImage.fromarray(vr[idx].asnumpy()) for idx in idxs]

def visualize_row(ex: Dict[str, Any], frames_per_video=3):
    """
    Print a row's id, data source and video path, then display sampled frames.

    The "video" field may be a string path or a dict with a string "path"
    entry. Nothing is displayed (a message is printed instead) when the row
    has no usable video path, when the file cannot be resolved locally, or
    when frame decoding raises.
    """
    raw_video = ex.get("video")
    if isinstance(raw_video, str):
        video_rel = raw_video
    elif isinstance(raw_video, dict) and isinstance(raw_video.get("path"), str):
        video_rel = raw_video["path"]
    else:
        video_rel = None

    print("id:", ex.get("id"))
    print("data_source:", ex.get("data_source"))
    print("video (rel):", video_rel)

    if not video_rel:
        print("No video field.")
        return

    local_video = resolve_video_path_from_hf(video_rel)
    print("video (local):", local_video)

    if not local_video:
        print("Could not resolve video file; skipping.")
        return

    try:
        for image in decode_random_frames(local_video, num_frames=frames_per_video):
            display(image)
    except Exception as exc:
        # Decoding problems are reported, not raised, so a batch of rows can continue.
        print("Frame decode failed:", repr(exc))


In [13]:
from __future__ import annotations

import math
import os
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from datasets import load_dataset, get_dataset_config_names
from huggingface_hub import hf_hub_download, list_repo_files
from PIL import Image as PILImage
from IPython.display import display

# =========================
# Config
# =========================
# Hugging Face dataset repository sampled by this cell.
DATASET_ID = "lmms-lab/LLaVA-Video-178K"
# Split names attempted for every config; splits that fail to load are
# recorded as SKIPPED by the main loop.
TASK_SPLITS = ["caption", "open_ended", "multi_choice"]

# Seed used for row sampling and as the base seed for frame sampling.
SEED = 123
# Upper bound on how many sampled rows are visualized per (config, split).
MAX_VIZ_PER_SPLIT = 12
# Number of frames decoded and displayed per video.
FRAMES_PER_VIDEO = 3

# Put on fast storage (IBEX scratch recommended)
LOCAL_VIDEO_DIR = Path("./hf_videos_cache")
# Created at cell execution time; a no-op when the directory already exists.
LOCAL_VIDEO_DIR.mkdir(parents=True, exist_ok=True)

# Lower-case extensions that mark a repo file as a video when indexing.
VIDEO_EXTS = {".mp4", ".webm", ".mkv", ".mov", ".avi", ".m4v"}

# =========================
# Sampling scheme
# =========================
def sample_fraction(n: int) -> float:
    """
    Return the fraction of a split to sample, given its size `n`.

    The fraction shrinks by a factor of ten for each tenfold increase in size:
    1% up to 10k rows, 0.1% up to 100k, 0.01% up to 1M, 0.001% up to 10M, and
    0.0001% beyond that. Upper bounds are inclusive.
    """
    tiers = (
        (10_000, 0.01),
        (100_000, 0.001),
        (1_000_000, 0.0001),
        (10_000_000, 0.00001),
    )
    for upper_bound, fraction in tiers:
        if n <= upper_bound:
            return fraction
    return 0.000001

def choose_sample_indices(n: int, k: int, seed=SEED) -> List[int]:
    """
    Pick `k` distinct row indices from ``range(n)``, reproducibly for `seed`.

    When `k` is at least `n`, every index is returned in ascending order;
    otherwise the indices come back in the order the seeded sampler drew them.
    """
    population = range(n)
    if k < n:
        return random.Random(seed).sample(population, k)
    return list(population)

# =========================
# Video helpers
# =========================
def get_video_relpath(ex: Dict[str, Any], field="video") -> Optional[str]:
    """
    Extract the video path stored under `field` of a dataset row.

    Accepts either a plain string or a dict carrying a string "path" entry.
    Returns None for a missing field or any other shape.
    """
    value = ex.get(field)
    if isinstance(value, dict):
        # Dict-shaped field: the path, if any, lives under "path".
        value = value.get("path")
    return value if isinstance(value, str) else None

def decode_random_frames(video_path: str, num_frames=FRAMES_PER_VIDEO, seed=SEED) -> List[PILImage.Image]:
    """
    Decode up to `num_frames` randomly chosen frames of a video as PIL images.

    The frame choice is deterministic for a given (`video_path`, `seed`) pair,
    including across interpreter restarts. Frames are returned in ascending
    frame-index order. When the video has no more than `num_frames` frames,
    all of them are returned; an empty video gives an empty list.
    """
    import zlib
    from decord import VideoReader, cpu

    vr = VideoReader(video_path, ctx=cpu(0))
    total = len(vr)
    if total <= 0:
        return []

    # Built-in hash() of a str is salted per process (PYTHONHASHSEED), which
    # would make the "seeded" selection differ between kernel restarts. A
    # CRC32 of the path bytes is stable, so it is used as the per-video offset.
    path_offset = zlib.crc32(os.fsencode(video_path)) % 10_000_000
    rng = random.Random(seed + path_offset)

    if total <= num_frames:
        idxs = list(range(total))
    else:
        idxs = sorted(rng.sample(range(total), num_frames))

    return [PILImage.fromarray(vr[idx].asnumpy()) for idx in idxs]

# =========================
# 1-time HF repo index
# =========================
print("Indexing repo files from HF (one-time)...")
repo_files = list_repo_files(repo_id=DATASET_ID, repo_type="dataset")

# Keep only repo paths whose extension marks them as a video file; this set is
# what the resolver consults before attempting any download.
# NOTE(review): the recorded run of this cell indexed only 1 video file out of
# 302 repo files, and every sampled row then resolved to MISSING_ON_HF.
# Presumably the videos are shipped inside archives rather than as individual
# files - TODO confirm against the repo listing.
hf_video_files = {
    repo_path
    for repo_path in repo_files
    if os.path.splitext(repo_path)[1].lower() in VIDEO_EXTS
}

print(f"HF repo files: {len(repo_files):,} | video files indexed: {len(hf_video_files):,}")

def resolve_video_to_local(video_relpath: str) -> Tuple[Optional[str], str]:
    """
    Resolve a dataset-relative video path to a local file, with a status tag.

    Returns (local_path_or_None, status) where status is one of:
      - "OK_LOCAL":       `video_relpath` already exists as given;
      - "OK_CACHED":      found under LOCAL_VIDEO_DIR at the same relative path;
      - "MISSING_ON_HF":  not in the pre-built HF file index (no download tried);
      - "OK_DOWNLOADED":  fetched from the dataset repo into LOCAL_VIDEO_DIR;
      - "DOWNLOAD_ERROR: <ExceptionName>": the download raised.
    """
    if os.path.exists(video_relpath):
        return video_relpath, "OK_LOCAL"

    mirrored = LOCAL_VIDEO_DIR / video_relpath
    if mirrored.exists():
        return str(mirrored), "OK_CACHED"

    # Consult the index first so files the repo does not list cost no network call.
    if video_relpath not in hf_video_files:
        return None, "MISSING_ON_HF"

    try:
        downloaded = hf_hub_download(
            repo_id=DATASET_ID,
            repo_type="dataset",
            filename=video_relpath,
            local_dir=str(LOCAL_VIDEO_DIR),
            etag_timeout=10,  # avoid hanging too long on metadata
        )
    except Exception as exc:
        return None, f"DOWNLOAD_ERROR: {type(exc).__name__}"
    return downloaded, "OK_DOWNLOADED"

# =========================
# Main loop (no transcript printing)
# =========================
configs = get_dataset_config_names(DATASET_ID)

# One record per (config, split); turned into a DataFrame at the end.
summary_rows = []

for cfg in configs:
    for split in TASK_SPLITS:
        # Load the split (non-streaming) to get the exact row count and to
        # allow random access by index. Splits that fail to load are recorded
        # as SKIPPED with a truncated error message.
        try:
            ds = load_dataset(DATASET_ID, name=cfg, split=split)
        except Exception as e:
            summary_rows.append({
                "config": cfg, "split": split, "N": None,
                "status": "SKIPPED", "error": repr(e)[:250],
            })
            continue

        N = ds.num_rows
        if N == 0:
            summary_rows.append({
                "config": cfg, "split": split, "N": 0,
                "status": "EMPTY", "error": None,
            })
            continue

        # sample_k is the size-dependent target (at least 1); only the first
        # MAX_VIZ_PER_SPLIT of them are actually fetched and displayed.
        frac = sample_fraction(N)
        k = max(1, int(math.ceil(N * frac)))
        k_viz = min(k, MAX_VIZ_PER_SPLIT)
        idxs = choose_sample_indices(N, k_viz, seed=SEED)

        print("\n" + "=" * 110)
        print(f"CONFIG={cfg} | SPLIT={split} | N={N} | frac={frac} | sample_k={k} | viz={k_viz}")
        print("=" * 110)

        # Per-split outcome counters reported in the summary table.
        ok_viz = 0
        missing = 0
        dl_err = 0

        for j, ix in enumerate(idxs):
            ex = ds[ix]
            vid_rel = get_video_relpath(ex, field="video")

            print(f"\n--- sample {j+1}/{k_viz} (row={ix}) ---")
            print("id:", ex.get("id"))
            print("data_source:", ex.get("data_source"))
            print("video_rel:", vid_rel)

            if not vid_rel:
                print("No video relpath; skipping.")
                continue

            local_vid, status = resolve_video_to_local(vid_rel)
            print("resolve_status:", status)

            if status == "MISSING_ON_HF":
                missing += 1
                continue
            if local_vid is None:
                dl_err += 1
                continue

            try:
                frames = decode_random_frames(local_vid, num_frames=FRAMES_PER_VIDEO)
                if not frames:
                    # Nothing was shown, so this sample must not count as a
                    # successful visualization.
                    print("No frames decoded.")
                    continue
                for im in frames:
                    display(im)
                ok_viz += 1
            except Exception as e:
                print("Frame decode failed:", repr(e))

        summary_rows.append({
            "config": cfg,
            "split": split,
            "N": N,
            "fraction": frac,
            "sample_k": k,
            "viz": k_viz,
            "viz_ok": ok_viz,
            "missing_on_hf": missing,
            "download_errors": dl_err,
            "status": "OK",
            "error": None,
        })

# Bare last expression so the notebook renders the table.
summary_df = pd.DataFrame(summary_rows).sort_values(["config", "split"]).reset_index(drop=True)
summary_df


Indexing repo files from HF (one-time)...
HF repo files: 302 | video files indexed: 1

CONFIG=0_30_s_academic_v0_1 | SPLIT=caption | N=11985 | frac=0.001 | sample_k=12 | viz=12

--- sample 1/12 (row=857) ---
id: 323--sQXBqu-_1w-split_3
data_source: 0_30_s_academic_v0_1
video_rel: academic_source/youcook2/323/-sQXBqu-_1w/split_3.mp4
resolve_status: MISSING_ON_HF

--- sample 2/12 (row=4385) ---
id: MVO1W
data_source: 0_30_s_academic_v0_1
video_rel: academic_source/Charades/MVO1W.mp4
resolve_status: MISSING_ON_HF

--- sample 3/12 (row=1428) ---
id: 1001-3013532781
data_source: 0_30_s_academic_v0_1
video_rel: academic_source/NextQA/1001/3013532781.mp4
resolve_status: MISSING_ON_HF

--- sample 4/12 (row=6672) ---
id: ZS2WD
data_source: 0_30_s_academic_v0_1
video_rel: academic_source/Charades/ZS2WD.mp4
resolve_status: MISSING_ON_HF

--- sample 5/12 (row=4367) ---
id: DTNFC
data_source: 0_30_s_academic_v0_1
video_rel: academic_source/Charades/DTNFC.mp4
resolve_status: MISSING_ON_HF

--- sampl

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]


CONFIG=0_30_s_perceptiontest | SPLIT=multi_choice | N=1785 | frac=0.01 | sample_k=18 | viz=12

--- sample 1/12 (row=107) ---
id: perceptiontest_video_7656
data_source: 0_30_s_perceptiontest
video_rel: perception_test/videos/video_7656.mp4
resolve_status: MISSING_ON_HF

--- sample 2/12 (row=548) ---
id: perceptiontest_video_10216
data_source: 0_30_s_perceptiontest
video_rel: perception_test/videos/video_10216.mp4
resolve_status: MISSING_ON_HF

--- sample 3/12 (row=178) ---
id: perceptiontest_video_6080
data_source: 0_30_s_perceptiontest
video_rel: perception_test/videos/video_6080.mp4
resolve_status: MISSING_ON_HF

--- sample 4/12 (row=1574) ---
id: perceptiontest_video_3275
data_source: 0_30_s_perceptiontest
video_rel: perception_test/videos/video_3275.mp4
resolve_status: MISSING_ON_HF

--- sample 5/12 (row=834) ---
id: perceptiontest_video_11039
data_source: 0_30_s_perceptiontest
video_rel: perception_test/videos/video_11039.mp4
resolve_status: MISSING_ON_HF

--- sample 6/12 (row=545

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]


CONFIG=30_60_s_perceptiontest | SPLIT=multi_choice | N=618 | frac=0.01 | sample_k=7 | viz=7

--- sample 1/7 (row=53) ---
id: perceptiontest_video_3215
data_source: 30_60_s_perceptiontest
video_rel: perception_test/videos/video_3215.mp4
resolve_status: MISSING_ON_HF

--- sample 2/7 (row=274) ---
id: perceptiontest_video_1614
data_source: 30_60_s_perceptiontest
video_rel: perception_test/videos/video_1614.mp4
resolve_status: MISSING_ON_HF

--- sample 3/7 (row=89) ---
id: perceptiontest_video_2867
data_source: 30_60_s_perceptiontest
video_rel: perception_test/videos/video_2867.mp4
resolve_status: MISSING_ON_HF

--- sample 4/7 (row=417) ---
id: perceptiontest_video_8989
data_source: 30_60_s_perceptiontest
video_rel: perception_test/videos/video_8989.mp4
resolve_status: MISSING_ON_HF

--- sample 5/7 (row=272) ---
id: perceptiontest_video_10212
data_source: 30_60_s_perceptiontest
video_rel: perception_test/videos/video_10212.mp4
resolve_status: MISSING_ON_HF

--- sample 6/7 (row=110) ---
id

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]


CONFIG=1_2_m_nextqa | SPLIT=open_ended | N=4694 | frac=0.01 | sample_k=47 | viz=12

--- sample 1/12 (row=428) ---
id: 1164-7748781694
data_source: 1_2_m_nextqa
video_rel: NextQA/NExTVideo/1164/7748781694.mp4
resolve_status: MISSING_ON_HF

--- sample 2/12 (row=2192) ---
id: 1122-6263161839
data_source: 1_2_m_nextqa
video_rel: NextQA/NExTVideo/1122/6263161839.mp4
resolve_status: MISSING_ON_HF

--- sample 3/12 (row=714) ---
id: 0043-5737998057
data_source: 1_2_m_nextqa
video_rel: NextQA/NExTVideo/0043/5737998057.mp4
resolve_status: MISSING_ON_HF

--- sample 4/12 (row=3336) ---
id: 0054-3737244927
data_source: 1_2_m_nextqa
video_rel: NextQA/NExTVideo/0054/3737244927.mp4
resolve_status: MISSING_ON_HF

--- sample 5/12 (row=2183) ---
id: 1164-8787162041
data_source: 1_2_m_nextqa
video_rel: NextQA/NExTVideo/1164/8787162041.mp4
resolve_status: MISSING_ON_HF

--- sample 6/12 (row=882) ---
id: 1120-4003814112
data_source: 1_2_m_nextqa
video_rel: NextQA/NExTVideo/1120/4003814112.mp4
resolve_statu

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]

Generating open_ended split: 0 examples [00:00, ? examples/s]


CONFIG=2_3_m_nextqa | SPLIT=open_ended | N=61 | frac=0.01 | sample_k=1 | viz=1

--- sample 1/1 (row=3) ---
id: 1016-11049178966
data_source: 2_3_m_nextqa
video_rel: NextQA/NExTVideo/1016/11049178966.mp4
resolve_status: MISSING_ON_HF

CONFIG=2_3_m_nextqa | SPLIT=multi_choice | N=52 | frac=0.01 | sample_k=1 | viz=1

--- sample 1/1 (row=3) ---
id: 1013-10566292134
data_source: 2_3_m_nextqa
video_rel: NextQA/NExTVideo/1013/10566292134.mp4
resolve_status: MISSING_ON_HF

CONFIG=llava_hound | SPLIT=open_ended | N=255000 | frac=0.0001 | sample_k=26 | viz=12

--- sample 1/12 (row=13726) ---
id: v_3dR4MEUDHa0-Scene-011_0
data_source: llava_hound
video_rel: shareVideoGPTV/frames/all_frames/v_3dR4MEUDHa0-Scene-011
resolve_status: MISSING_ON_HF

--- sample 2/12 (row=70169) ---
id: v_lQP65cm11FA-Scene-003_2
data_source: llava_hound
video_rel: shareVideoGPTV/frames/all_frames/v_lQP65cm11FA-Scene-003
resolve_status: MISSING_ON_HF

--- sample 3/12 (row=22855) ---
id: v_zB8kwWJqQ8Q-Scene-008_1
data_sou

Unnamed: 0,config,split,N,fraction,sample_k,viz,viz_ok,missing_on_hf,download_errors,status,error
0,0_30_s_academic_v0_1,caption,11985.0,0.001,12.0,12.0,0.0,12.0,0.0,OK,
1,0_30_s_academic_v0_1,multi_choice,5753.0,0.01,58.0,12.0,0.0,12.0,0.0,OK,
2,0_30_s_academic_v0_1,open_ended,48468.0,0.001,49.0,12.0,0.0,12.0,0.0,OK,
3,0_30_s_activitynet,caption,,,,,,,,SKIPPED,DatasetGenerationError('An error occurred whil...
4,0_30_s_activitynet,multi_choice,,,,,,,,SKIPPED,DatasetGenerationError('An error occurred whil...
5,0_30_s_activitynet,open_ended,,,,,,,,SKIPPED,DatasetGenerationError('An error occurred whil...
6,0_30_s_nextqa,caption,,,,,,,,SKIPPED,"ValueError('Unknown split ""caption"". Should be..."
7,0_30_s_nextqa,multi_choice,5496.0,0.01,55.0,12.0,0.0,12.0,0.0,OK,
8,0_30_s_nextqa,open_ended,5492.0,0.01,55.0,12.0,0.0,12.0,0.0,OK,
9,0_30_s_perceptiontest,caption,,,,,,,,SKIPPED,"ValueError('Unknown split ""caption"". Should be..."
