In [6]:
import os
from PIL import Image
from tqdm import tqdm
import tensorflow_datasets as tfds
import tensorflow as tf
from pathlib import Path
import numpy as np
import re

# ==== Settings ====
# TFDS dataset directory (contains dataset_info.json / features.json / tfrecord-*).
# The location is machine-specific, so it can be overridden through the
# BRIDGE_TFDS_DIR environment variable without editing the notebook; the
# original path remains the default.
builder_dir = os.environ.get("BRIDGE_TFDS_DIR", r"F:\PangYe\bridgev2\0.1.0")
# split       = "train"
# frame_indices = [0]                 # [0]=first frame; [-1]=last frame; [0, -1]=first+last; None=all frames



In [7]:
# Open the already-prepared TFDS dataset straight from its on-disk directory.
builder = tfds.builder_from_directory(builder_dir)

# Metadata object of the dataset.
info = builder.info

# Headline facts, printed as one "label value" pair per line.
for label, value in (
    ("Dataset Name:", builder.name),
    ("Dataset Version:", info.version),
    ("Dataset Homepage:", info.homepage),
    ("Dataset Description:", info.description),
):
    print(label, value)

# Example count and byte size of every split.
print("\nSplits Information:")
# NOTE: `split` and `split_info` stay bound at notebook scope after this loop
# (to the last split listed); the names are kept unchanged for that reason.
for split, split_info in info.splits.items():
    print(f"{split}: {split_info.num_examples} examples, {split_info.num_bytes:,} bytes")

# Full DatasetInfo repr.
print("\nInfo:", info, sep="\n")

# Supervised keys as recorded in the dataset info.
print("\nSupervised Keys:", info.supervised_keys, sep="\n")


Dataset Name: bridge
Dataset Version: 0.1.0
Dataset Homepage: https://rail-berkeley.github.io/bridgedata/
Dataset Description: WidowX interacting with toy kitchens

Splits Information:
train: 25460 examples, 365,981,955,835 bytes
test: 3475 examples, 50,084,717,377 bytes

Info:
tfds.core.DatasetInfo(
    name='bridge',
    full_name='bridge/0.1.0',
    description="""
    WidowX interacting with toy kitchens
    """,
    homepage='https://rail-berkeley.github.io/bridgedata/',
    data_dir='F:\\PangYe\\bridgev2\\0.1.0',
    file_format=tfrecord,
    download_size=Unknown size,
    dataset_size=387.49 GiB,
    features=FeaturesDict({
        'steps': Dataset({
            'action': FeaturesDict({
                'open_gripper': bool,
                'rotation_delta': Tensor(shape=(3,), dtype=float32),
                'terminate_episode': float32,
                'world_vector': Tensor(shape=(3,), dtype=float32),
            }),
            'is_first': bool,
            'is_last': bool,
 

In [8]:
# Bind the split name explicitly so that `split` always names the split `ds`
# was built from, instead of whatever value an earlier loop variable left behind.
split = "test"
ds = builder.as_dataset(split=split)

In [9]:
# Upper bound on how many examples this check may pull from the dataset.
max_to_check = 100000
count = 0

# `ds` is already a single-split Dataset, so it is iterated directly (no ds[split] lookup).
# NOTE(review): the unconditional `break` ends the loop after the first example,
# so `count` never exceeds 1 and this cell only verifies that one example can be
# read. Confirm whether the `break` is intentional before relying on `count`.
samples_bar = tqdm(ds.take(max_to_check), desc="Counting samples")
for _ in samples_bar:
    count += 1
    break

Counting samples:   0%|          | 0/3475 [00:01<?, ?it/s]


## 分别提取（逐帧保存每个 episode 的全部图像与语言指令）

In [None]:
# ==== Output setup ====
output_path  = r"F:\PangYe\bridgev2_DATA\extracted"  # output directory
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
max_samples = 10  # None = every episode in `ds`; an integer caps how many are processed

# Only cap the dataset when a limit was given, so that max_samples=None
# really means "all episodes".
episodes = ds if max_samples is None else ds.take(max_samples)

# ==== Iterate over episodes ====
pbar = tqdm(enumerate(episodes, start=1), total=max_samples, desc="Extracting samples")
num_saved = 0  # episodes actually processed, reported at the end

for sample_idx, example in pbar:
    steps = example["steps"]

    # One folder per episode: id00001, id00002, ... with frames under images/.
    ep_dir = output_dir / f"id{sample_idx:05d}"
    images_dir = ep_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    lang = None

    # Save every step's image; step numbering in file names is 1-based.
    for step_idx, step in enumerate(steps.as_numpy_iterator(), start=1):
        observation = step["observation"]

        # Only the first step's instruction is kept, so only that one is decoded.
        if step_idx == 1:
            lang = observation["natural_language_instruction"].decode("utf-8")

        Image.fromarray(observation["image"]).save(images_dir / f"step{step_idx:03d}.png")

    # instruction.txt sits next to images/; it is skipped when the episode had no steps.
    if lang is not None:
        with open(ep_dir / "instruction.txt", "w", encoding="utf-8") as f:
            f.write(lang.strip())

    num_saved += 1

pbar.close()
# Report the number of episodes actually iterated instead of deriving it from
# split metadata via a `split` variable this cell never defines.
print(f"\n✅ 提取完成，共保存 {num_saved} 个 episode。")

Extracting samples: 100%|██████████| 10/10 [00:10<00:00,  1.10s/it]


✅ 提取完成，共保存 10 个 episode。





In [5]:
# ==== Output setup ====
output_path  = r"F:\PangYe\bridgev2_DATA\extracted_1"  # output directory
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
max_samples = 10  # None = every episode in `ds`; an integer caps how many are processed

# Only cap the dataset when a limit was given, so that max_samples=None
# really means "all episodes".
episodes = ds if max_samples is None else ds.take(max_samples)

# ==== Iterate over episodes ====
pbar = tqdm(enumerate(episodes, start=1), total=max_samples, desc="Extracting samples")
num_saved = 0  # episodes actually processed, reported at the end

for sample_idx, example in pbar:
    steps = example["steps"]

    # One folder per episode: id00001, id00002, ...; frames go directly inside it.
    ep_dir = output_dir / f"id{sample_idx:05d}"
    ep_dir.mkdir(parents=True, exist_ok=True)

    # === 1. Instruction of the first step (filled in below) ===
    lang = None

    # === 2. Save every step's image; file names are 1-based ===
    for step_idx, step in enumerate(steps.as_numpy_iterator(), start=1):
        observation = step["observation"]

        # Only the first step's instruction is kept, so only that one is decoded.
        if step_idx == 1:
            lang = observation["natural_language_instruction"].decode("utf-8")

        Image.fromarray(observation["image"]).save(ep_dir / f"step{step_idx:03d}.png")

    # === 3. Write the instruction; skipped when the episode had no steps ===
    if lang is not None:
        with open(ep_dir / "instruction.txt", "w", encoding="utf-8") as f:
            f.write(lang.strip())

    num_saved += 1

pbar.close()
# Report the number of episodes actually iterated instead of deriving it from
# split metadata via a `split` variable this cell never defines.
print(f"\n✅ 提取完成，共保存 {num_saved} 个 episode。")

Extracting samples: 100%|██████████| 10/10 [00:09<00:00,  1.00it/s]


✅ 提取完成，共保存 10 个 episode。





## 取九帧（每个 episode 等间隔采样 9 帧，含首尾帧）

In [None]:
# ==== Iterate over episodes (sample 9 frames with np.linspace, first and last included) ====
output_path  = r"F:\PangYe\bridgev2_DATA\extracted_nine_test"  # output directory
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
max_samples = 3475  # None = every episode in `ds`; an integer caps how many are processed

# Only cap the dataset when a limit was given, so that max_samples=None
# really means "all episodes".
episodes = ds if max_samples is None else ds.take(max_samples)

pbar = tqdm(enumerate(episodes, start=1), total=max_samples, desc="Extracting 9 frames (linspace)")
num_saved = 0  # episodes actually written, reported at the end

for sample_idx, example in pbar:
    # All steps are materialised because frames are picked by index below.
    steps = list(example["steps"].as_numpy_iterator())
    num_steps = len(steps)

    # An episode without steps has neither an instruction nor frames; skip it
    # instead of failing on steps[0].
    if num_steps == 0:
        pbar.set_postfix_str(f"id{sample_idx:05d}: empty episode, skipped")
        continue

    ep_dir = output_dir / f"id{sample_idx:05d}"
    images_dir = ep_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    # === 1. Instruction of the first step ===
    lang = steps[0]["observation"]["natural_language_instruction"].decode("utf-8")

    # === 2. Nine evenly spaced indices (first and last included) ===
    # Episodes with at most 9 steps keep every frame.
    if num_steps <= 9:
        frame_indices = np.arange(num_steps)
    else:
        frame_indices = np.linspace(0, num_steps - 1, num=9, dtype=int)

    # === 3. Save the selected frames ===
    # File names carry the 0-based index of the step inside the episode.
    for step_idx in frame_indices.tolist():
        img = steps[step_idx]["observation"]["image"]
        Image.fromarray(img).save(images_dir / f"step{step_idx:03d}.png")

    # === 4. Save the instruction (written last, after all frames) ===
    with open(ep_dir / "instruction.txt", "w", encoding="utf-8") as f:
        f.write(lang.strip())

    num_saved += 1

pbar.close()
# Report the number of episodes actually written instead of deriving it from
# split metadata via a `split` variable this cell never defines.
print(f"\n✅ 提取完成（每个 episode 使用 np.linspace 等间隔选取 9 帧，含首尾），共保存 {num_saved} 个 episode。")


Extracting 9 frames (linspace):  10%|█         | 2593/25460 [15:59<2:21:00,  2.70it/s]


OSError: [Errno 22] Invalid argument: 'F:\\PangYe\\bridgev2_DATA\\extracted_nine_train\\id02594\\images\\step028.png'

In [10]:
# ==== Configuration ====
output_path  = r"F:\PangYe\bridgev2_DATA\extracted_nine_test"
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
MAX_SAMPLES = 3475   # None = no limit

# Episode folders are named id00001, id00002, ...
ID_RE = re.compile(r"^id(\d{5})$")

def is_episode_complete(ep_dir: Path) -> bool:
    """Return True when ``ep_dir`` holds a finished extraction.

    ``instruction.txt`` is written only after all frames of an episode have
    been saved, so its presence marks a finished episode; at least one PNG
    must also exist under ``images/``. No fixed frame count is required,
    because episodes with 9 or fewer steps legitimately yield fewer than
    9 images and would otherwise never count as complete.

    Args:
        ep_dir: Episode folder (``<output_dir>/idNNNNN``).

    Returns:
        True if the folder contains ``instruction.txt`` and at least one
        ``images/*.png``; False otherwise.
    """
    if not (ep_dir / "instruction.txt").exists():
        return False
    images_dir = ep_dir / "images"
    return images_dir.is_dir() and any(images_dir.glob("*.png"))

# ==== Find the resume point ====
# IDs of every episode folder that is really complete.
completed_ids = sorted(
    int(d.name[-5:])
    for d in output_dir.iterdir()
    if d.is_dir() and ID_RE.match(d.name) and is_episode_complete(d)
)

# Resume after the longest gap-free run id00001..idNNNNN. Using the plain count
# of complete folders would make skip() jump past an incomplete episode that
# sits in the middle of the sequence.
completed_lookup = set(completed_ids)
resume_from = 0
while (resume_from + 1) in completed_lookup:
    resume_from += 1
print(f"[INFO] 已完成 {resume_from} 个 episode，将从第 {resume_from+1} 个继续。")

# ==== How many episodes this run may still cover ====
if MAX_SAMPLES is None:
    to_take = None
else:
    to_take = max(0, MAX_SAMPLES - resume_from)

num_written = 0  # episodes actually written by this run

if to_take == 0:
    # Nothing left to do; a plain branch replaces `raise SystemExit`, which is
    # not a clean way to end a notebook cell.
    print("[INFO] 已达到 MAX_SAMPLES，无需继续。")
else:
    # ==== Resume via skip() ====
    ds_resumed = ds.skip(resume_from)
    if to_take is not None:
        ds_resumed = ds_resumed.take(to_take)

    # Enumeration starts at resume_from + 1 so folder IDs stay aligned with
    # the episode's position in the dataset.
    pbar = tqdm(enumerate(ds_resumed, start=resume_from + 1),
                total=to_take, desc="Extracting 9 frames (linspace)")

    for sample_idx, example in pbar:
        ep_dir = output_dir / f"id{sample_idx:05d}"

        # Check before decoding any step so finished episodes cost no decoding work.
        if is_episode_complete(ep_dir):
            pbar.set_postfix_str("skip (already complete)")
            continue

        steps = list(example["steps"].as_numpy_iterator())
        num_steps = len(steps)

        # An episode without steps has neither an instruction nor frames; skip
        # it instead of failing on steps[0].
        if num_steps == 0:
            pbar.set_postfix_str(f"id{sample_idx:05d}: empty episode, skipped")
            continue

        images_dir = ep_dir / "images"
        images_dir.mkdir(parents=True, exist_ok=True)

        # === 1. Instruction of the first step ===
        lang = steps[0]["observation"]["natural_language_instruction"].decode("utf-8")

        # === 2. Nine evenly spaced indices (first and last included) ===
        if num_steps <= 9:
            frame_indices = np.arange(num_steps)
        else:
            # Round before casting for a more even spread; np.unique guards
            # against two positions rounding to the same index.
            cand = np.round(np.linspace(0, num_steps - 1, num=9)).astype(int)
            frame_indices = np.unique(cand)

        # === 3. Save the selected frames (file names use the 0-based step index) ===
        for step_idx in frame_indices.tolist():
            img = steps[step_idx]["observation"]["image"]
            Image.fromarray(img).save(images_dir / f"step{step_idx:03d}.png")

        # === 4. Save the instruction last: it doubles as the completion marker ===
        with open(ep_dir / "instruction.txt", "w", encoding="utf-8") as f:
            f.write(lang.strip())

        num_written += 1

    pbar.close()
    # Report what was really written; skipped episodes are not counted.
    print(f"\n✅ 续跑完成：从 id{resume_from+1:05d} 开始处理，共新增 {num_written} 个 episode。")

[INFO] 已完成 0 个 episode，将从第 1 个继续。


Extracting 9 frames (linspace): 100%|██████████| 3475/3475 [25:47<00:00,  2.24it/s]


✅ 续跑完成：从 id00001 开始处理，共新增 3475 个 episode。





In [5]:
# ==== Iterate over episodes (first frame only) ====
output_path  = r"F:\PangYe\bridgev2_DATA\extracted_first_100"  # output directory
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
max_samples = 100  # None = every episode in `ds`; an integer caps how many are processed

# Only cap the dataset when a limit was given, so that max_samples=None
# really means "all episodes".
episodes = ds if max_samples is None else ds.take(max_samples)

pbar = tqdm(enumerate(episodes, start=1), total=max_samples, desc="Extracting first frame only")
num_saved = 0  # episodes actually written, reported at the end

for sample_idx, example in pbar:
    # === 1. Read only the first step ===
    # Pulling a single element avoids materialising every step of the episode
    # when just step 0 is needed.
    first_step = next(example["steps"].take(1).as_numpy_iterator(), None)
    if first_step is None:
        # Episode without steps: nothing to save.
        pbar.set_postfix_str(f"id{sample_idx:05d}: empty episode, skipped")
        continue

    ep_dir = output_dir / f"id{sample_idx:05d}"
    images_dir = ep_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    img = first_step["observation"]["image"]
    lang = first_step["observation"]["natural_language_instruction"].decode("utf-8")

    # === 2. Save the image ===
    Image.fromarray(img).save(images_dir / "step001.png")

    # === 3. Save the instruction ===
    with open(ep_dir / "instruction.txt", "w", encoding="utf-8") as f:
        f.write(lang.strip())

    num_saved += 1

pbar.close()
# Report the number of episodes actually written instead of deriving it from
# split metadata via a `split` variable this cell never defines.
print(f"\n✅ 提取完成（每个 episode 仅保存第一帧），共保存 {num_saved} 个 episode。")


Extracting first frame only: 100%|██████████| 100/100 [00:13<00:00,  7.62it/s]


✅ 提取完成（每个 episode 仅保存第一帧），共保存 100 个 episode。



