In [1]:
!pip -q install datasets huggingface_hub fsspec opencv-python
!pip -q install imageio[ffmpeg] requests

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
import os, json, shutil, fsspec, cv2
from pathlib import Path
from collections import Counter
import textwrap

DATASET_DIR = "/content/video_dataset"
NUM_VAL = 25
NUM_TEST = 25
SEED = 42
REPO_ID = "JerrySprite/Video-Dataset-FP"
PRIVATE = False

if os.path.exists(DATASET_DIR):
    shutil.rmtree(DATASET_DIR)
os.makedirs(f"{DATASET_DIR}/videos", exist_ok=True)
META_PATH = Path(DATASET_DIR) / "metadata.jsonl"


In [5]:
def _hfuri_from_rel(rel: str) -> str:
    # HuggingFace Uniform Resource Identifier
    rel = rel.lstrip("./")
    return f"hf://datasets/IntelligenceLab/VideoHallu/{rel}"

def _copy_hfuri_to_local(hfuri: str, dst_abs: str):
    # Copy HuggingFace Video to Local
    os.makedirs(os.path.dirname(dst_abs), exist_ok=True)
    with fsspec.open(hfuri, "rb") as fin, open(dst_abs, "wb") as fout:
        shutil.copyfileobj(fin, fout)

def _duration_sec(path: str) -> float:
    # Count video duration
    cap = cv2.VideoCapture(path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 0
    frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0
    cap.release()
    return (frames / fps) if fps > 0 else 0.0

def _next_id() -> str:
    # geneate id
    if META_PATH.exists():
        with open(META_PATH, "r", encoding="utf-8") as f:
            n = sum(1 for ln in f if ln.strip())
    else:
        n = 0
    return f"{n+1:04d}"

def _append_meta(rec: dict):
    # Write as metadata.jsonl
    with open(META_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")


In [6]:
def collect_split(split_name: str, k: int):
    # Collect dataset
    ds = load_dataset("IntelligenceLab/VideoHallu", split=split_name)
    ds = ds.shuffle(seed=SEED)
    k = min(k, len(ds))
    subset = ds.select(range(k))

    for ex in subset:
        rel = ex.get("video") or ex.get("video_path")
        if not isinstance(rel, str):
            continue
        hfuri = rel if rel.startswith(("hf://", "http://", "https://", "zip::")) else _hfuri_from_rel(rel)

        # Extract Question and Answer
        q = ex.get("Question") or ex.get("question")
        a = ex.get("Answer") or ex.get("answer")
        if not (isinstance(q, str) and isinstance(a, str)):
            continue

        sid = _next_id()
        dst_rel = f"videos/{sid}.mp4"
        dst_abs = f"{DATASET_DIR}/{dst_rel}"

        # Copy video to local and count duration
        _copy_hfuri_to_local(hfuri, dst_abs)
        dur = round(_duration_sec(dst_abs), 3)

        # Record metadata
        rec = {
            "id": sid,
            "split": split_name,
            "video_path": dst_rel,
            "question": q.strip(),
            "answer": a.strip(),
            "duration_sec": dur,
            "source": "IntelligenceLab/VideoHallu",
        }
        _append_meta(rec)
        print(f"✅ {split_name:10s} {sid}  {dur:.2f}s")


In [7]:
collect_split("validation", NUM_VAL)
collect_split("test",       NUM_TEST)

print("\nDone →", DATASET_DIR)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/60.8k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/51.3k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/908 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1525 [00:00<?, ? examples/s]

✅ validation 0001  5.00s
✅ validation 0002  8.00s
✅ validation 0003  4.00s
✅ validation 0004  5.00s
✅ validation 0005  5.00s
✅ validation 0006  5.00s
✅ validation 0007  4.00s
✅ validation 0008  4.00s
✅ validation 0009  8.00s
✅ validation 0010  8.57s
✅ validation 0011  10.00s
✅ validation 0012  8.00s
✅ validation 0013  5.04s
✅ validation 0014  4.00s
✅ validation 0015  5.04s
✅ validation 0016  5.37s
✅ validation 0017  5.04s
✅ validation 0018  5.00s
✅ validation 0019  5.00s
✅ validation 0020  8.00s
✅ validation 0021  4.00s
✅ validation 0022  5.00s
✅ validation 0023  8.57s
✅ validation 0024  4.00s
✅ validation 0025  4.00s
✅ test       0026  5.04s
✅ test       0027  4.00s
✅ test       0028  5.00s
✅ test       0029  8.57s
✅ test       0030  5.00s
✅ test       0031  5.37s
✅ test       0032  8.00s
✅ test       0033  4.00s
✅ test       0034  8.00s
✅ test       0035  4.00s
✅ test       0036  5.00s
✅ test       0037  4.00s
✅ test       0038  5.00s
✅ test       0039  5.06s
✅ test       0040  5.00s

In [8]:
from IPython.display import display, HTML
from base64 import b64encode
import os, json
from pathlib import Path

#Preview video and QA
DATASET_DIR = "/content/video_dataset"
META_PATH = Path(DATASET_DIR) / "metadata.jsonl"

def _show_video_local(path, width=480):
    with open(path, "rb") as f:
        data_url = "data:video/mp4;base64," + b64encode(f.read()).decode()
    return HTML(f'<video width="{width}" controls src="{data_url}"></video>')

def preview_saved(n=6):
    rows = []
    with open(META_PATH, "r", encoding="utf-8") as f:
        for ln in f:
            if ln.strip():
                rows.append(json.loads(ln))
    for rec in rows[:n]:
        print(f"🎬 [{rec['id']}] {rec.get('split','')}, {rec.get('duration_sec','?')}s")
        print(f"Q: {rec['question']}\nA: {rec['answer']}\n")
        display(_show_video_local(os.path.join(DATASET_DIR, rec["video_path"])))

preview_saved(50)


Output hidden; open in https://colab.research.google.com to view.

In [9]:
from google.colab import drive
drive.mount('/content/drive')

# Upload to Drive
DRIVE_DIR = "/content/drive/MyDrive/colab_datasets/video_dataset_vh50"
if os.path.exists(DRIVE_DIR):
    shutil.rmtree(DRIVE_DIR)
shutil.copytree(DATASET_DIR, DRIVE_DIR)
print("✅ Synced to Drive →", DRIVE_DIR)


Mounted at /content/drive
✅ Synced to Drive → /content/drive/MyDrive/colab_datasets/video_dataset_vh50


In [10]:
# Upload to Hugging Face
create_repo(REPO_ID, repo_type="dataset", private=False, exist_ok=True)
upload_folder(
    folder_path=DATASET_DIR,
    repo_id=REPO_ID,
    repo_type="dataset",
    commit_message="Upload 25 (val) + 25 (test) random samples from VideoHallu",
    ignore_patterns=["**/.ipynb_checkpoints/**"]
)
print(f"✅ Uploaded → https://huggingface.co/datasets/{REPO_ID}")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...o_dataset/videos/0005.mp4:  79%|#######9  | 1.28MB / 1.62MB            

  ...o_dataset/videos/0008.mp4:  71%|#######   | 1.01MB / 1.43MB            

  ...o_dataset/videos/0004.mp4:  85%|########5 | 1.55MB / 1.81MB            

  ...o_dataset/videos/0009.mp4:  88%|########7 | 1.61MB / 1.84MB            

  ...o_dataset/videos/0003.mp4:  74%|#######4  | 1.24MB / 1.66MB            

  ...o_dataset/videos/0002.mp4:  94%|#########4|  964kB / 1.02MB            

  ...o_dataset/videos/0010.mp4: 100%|##########|  654kB /  654kB            

  ...o_dataset/videos/0027.mp4:  96%|#########5| 7.05MB / 7.36MB            

  ...o_dataset/videos/0001.mp4:  96%|#########5| 2.04MB / 2.13MB            

  ...o_dataset/videos/0013.mp4:  92%|#########1| 1.50MB / 1.64MB            

✅ Uploaded → https://huggingface.co/datasets/JerrySprite/Video-Dataset-FP
