In [14]:
import sys, os, site
print("sys.executable:", sys.executable)
print("CONDA_PREFIX:", os.environ.get("CONDA_PREFIX"))
print("site-packages:", site.getsitepackages()[:2])


sys.executable: /ibex/project/c2320/dataset-check/x-embodiment/envs/oxe/bin/python
CONDA_PREFIX: /sw/rl9g/machine_learning/2024.01/rl9_cudnn8_cuda11.8_py3.9_env/machine_learning-module/env
site-packages: ['/ibex/project/c2320/dataset-check/x-embodiment/envs/oxe/lib/python3.10/site-packages']


In [15]:
import os, sys
from pathlib import Path

ENV_BIN = str(Path(sys.executable).resolve().parent)  # .../envs/oxe/bin
os.environ["PATH"] = ENV_BIN + ":" + os.environ.get("PATH", "")

print("ENV_BIN:", ENV_BIN)
print("gsutil which:", os.popen("which gsutil").read().strip())


ENV_BIN: /ibex/project/c2320/dataset-check/x-embodiment/envs/oxe/bin
gsutil which: /ibex/project/c2320/dataset-check/x-embodiment/envs/oxe/bin/gsutil


In [16]:
import shutil, subprocess, os

print("which gsutil:", shutil.which("gsutil"))
print("PATH:", os.environ.get("PATH","")[:200], "...")
subprocess.run("gsutil version", shell=True)


which gsutil: /ibex/project/c2320/dataset-check/x-embodiment/envs/oxe/bin/gsutil
PATH: /ibex/project/c2320/dataset-check/x-embodiment/envs/oxe/bin:/sw/rl9g/machine_learning/2024.01/rl9_cudnn8_cuda11.8_py3.9_env/machine_learning-module/env/bin:/sw/rl9g/cuda/11.8/rl9_binary/bin:/sw/rl9g/g ...
gsutil version: 5.35


CompletedProcess(args='gsutil version', returncode=0)

In [17]:
import json, re, subprocess
from pathlib import Path
from datetime import datetime

import pandas as pd

# === Where to save outputs ===
OUT_DIR = Path.cwd() / "oxe_peek"
SCAN_DIR = OUT_DIR / "features_scan"
OUT_DIR.mkdir(parents=True, exist_ok=True)
SCAN_DIR.mkdir(parents=True, exist_ok=True)

# === Buckets to try (order matters) ===
BUCKETS = [
    "gs://gdm-robotics-open-x-embodiment",
    "gs://gresearch/robotics",
]

print("OUT_DIR:", OUT_DIR)
print("SCAN_DIR:", SCAN_DIR)
print("Buckets:", BUCKETS)


OUT_DIR: /ibex/project/c2320/dataset-check/x-embodiment/oxe_peek
SCAN_DIR: /ibex/project/c2320/dataset-check/x-embodiment/oxe_peek/features_scan
Buckets: ['gs://gdm-robotics-open-x-embodiment', 'gs://gresearch/robotics']


In [22]:
import subprocess, time, shutil

def gsutil(cmd: str, timeout_s: int = 60) -> str:
    """
    Run gsutil with a timeout so one dataset can't hang forever.
    """
    if shutil.which("gsutil") is None:
        raise RuntimeError("gsutil not found in PATH inside this kernel")

    p = subprocess.run(
        cmd,
        shell=True,
        text=True,
        capture_output=True,
        timeout=timeout_s,
    )
    if p.returncode != 0:
        raise RuntimeError(
            f"gsutil failed ({p.returncode})\n"
            f"CMD: {cmd}\nSTDOUT:\n{p.stdout}\nSTDERR:\n{p.stderr}"
        )
    return p.stdout



In [19]:
def list_versions(ds: str):
    """
    List semver-like version folders for a dataset, trying both buckets.
    Returns (versions, bucket_used).
    """
    last_err = None

    for base in BUCKETS:
        try:
            out = gsutil(f'gsutil ls "{base}/{ds}/"')
            lines = [ln.strip() for ln in out.splitlines() if ln.strip().startswith("gs://")]

            vers = []
            for ln in lines:
                name = ln.rstrip("/").split("/")[-1]
                name = name.replace("_$folder$", "").replace(":", "")
                if re.match(r"^\d+\.\d+\.\d+$", name):
                    vers.append(name)

            def semver_key(s): 
                return tuple(int(x) for x in s.split("."))

            vers = sorted(set(vers), key=semver_key)
            return vers, base

        except Exception as e:
            last_err = e

    # nothing worked
    raise last_err


def load_features_json(ds: str, version: str, out_dir: Path):
    """
    Download features.json, trying both buckets.
    Returns (features_dict, local_path, gcs_path_used).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    local = out_dir / f"{ds}__{version}__features.json"

    last_err = None
    for base in BUCKETS:
        gcs = f'{base}/{ds}/{version}/features.json'
        try:
            # existence check first (gives clearer errors)
            gsutil(f'gsutil ls "{gcs}"')
            gsutil(f'gsutil cp "{gcs}" "{local}"')
            return json.loads(local.read_text()), local, gcs
        except Exception as e:
            last_err = e

    raise last_err


In [10]:
def find_image_descriptions(feats):
    # find any "description" fields near "image" features
    descs = []
    def rec(o, path=""):
        if isinstance(o, dict):
            for k, v in o.items():
                rec(v, f"{path}/{k}")
        elif isinstance(o, list):
            for i, v in enumerate(o):
                rec(v, f"{path}[{i}]")
        else:
            if isinstance(o, str) and "description" in path and re.search(r"(image|rgb|camera|wrist|front|left|right)", path, re.I):
                descs.append((path, o))
    rec(feats)
    return descs

def has_real_rgb(desc_text: str):
    # heuristic: placeholder datasets explicitly say not available / zeros
    bad = ["not available", "np.zeros", "zeros"]
    return not any(b in desc_text.lower() for b in bad)


In [20]:
# Paste your dataset names here (one per line)
CANDIDATES_TEXT = """
language_table
stanford_hydra_dataset_converted_externally_to_rlds
austin_buds_dataset_converted_externally_to_rlds
furniture_bench_dataset_converted_externally_to_rlds
cmu_franka_exploration_dataset_converted_externally_to_rlds
ucsd_kitchen_dataset_converted_externally_to_rlds
austin_sirius_dataset_converted_externally_to_rlds
utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds
utokyo_saytap_converted_externally_to_rlds
berkeley_mvp_converted_externally_to_rlds
kaist_nonprehensile_converted_externally_to_rlds
dlr_sara_pour_converted_externally_to_rlds
asu_table_top_converted_externally_to_rlds
stanford_robocook_converted_externally_to_rlds
iamlab_cmu_pickup_insert_converted_externally_to_rlds
utaustin_mutex
berkeley_fanuc_manipulation
cmu_playing_with_food
cmu_stretch
berkeley_gnm_cory_hall
berkeley_gnm_sac_son
eth_agent_affordances
kuka
columbia_cairlab_pusht_real
robot_vqa
fractal20220817_data
bridge
taco_play
jaco_play
berkeley_cable_routing
roboturk
nyu_door_opening_surprising_effectiveness
viola
berkeley_autolab_ur5
toto
stanford_kuka_multimodal_dataset_converted_externally_to_rlds
nyu_rot_dataset_converted_externally_to_rlds
nyu_franka_play_dataset_converted_externally_to_rlds
maniskill_dataset_converted_externally_to_rlds
ucsd_pick_and_place_dataset_converted_externally_to_rlds
austin_sailor_dataset_converted_externally_to_rlds
bc_z
usc_cloth_sim_converted_externally_to_rlds
utokyo_pr2_opening_fridge_converted_externally_to_rlds
utokyo_xarm_pick_and_place_converted_externally_to_rlds
utokyo_xarm_bimanual_converted_externally_to_rlds
robo_net
berkeley_rpt_converted_externally_to_rlds
stanford_mask_vit_converted_externally_to_rlds
tokyo_u_lsmo_converted_externally_to_rlds
dlr_sara_grid_clamp_converted_externally_to_rlds
dlr_edan_shared_control_converted_externally_to_rlds
imperialcollege_sawyer_wrist_cam
qut_dexterous_manipulation
uiuc_d3field
cmu_play_fusion
berkeley_gnm_recon
droid
conq_hose_manipulation
dobbe
fmb
io_ai_tech
mimic_play
aloha_mobile
robo_set
tidybot
vima_converted_externally_to_rlds
spoc
plex_robosuite
""".strip()

CANDIDATES = [x.strip() for x in CANDIDATES_TEXT.splitlines() if x.strip()]
print("Candidates:", len(CANDIDATES))
print(CANDIDATES[:10])


Candidates: 69
['language_table', 'stanford_hydra_dataset_converted_externally_to_rlds', 'austin_buds_dataset_converted_externally_to_rlds', 'furniture_bench_dataset_converted_externally_to_rlds', 'cmu_franka_exploration_dataset_converted_externally_to_rlds', 'ucsd_kitchen_dataset_converted_externally_to_rlds', 'austin_sirius_dataset_converted_externally_to_rlds', 'utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds', 'utokyo_saytap_converted_externally_to_rlds', 'berkeley_mvp_converted_externally_to_rlds']


In [21]:
results = []

for ds in CANDIDATES:
    try:
        vers, bucket_for_versions = list_versions(ds)
        if not vers:
            results.append({
                "dataset": ds,
                "version": None,
                "rgb_real": False,
                "note": "no version folders found",
                "bucket_versions": bucket_for_versions,
                "bucket_features": None,
            })
            continue

        v = vers[-1]  # latest semver

        feats, local_path, gcs_path = load_features_json(ds, v, SCAN_DIR)
        descs = find_image_descriptions(feats)

        if not descs:
            results.append({
                "dataset": ds,
                "version": v,
                "rgb_real": None,
                "note": f"features.json downloaded ({local_path.name}) from {gcs_path} but no image description hits",
                "bucket_versions": bucket_for_versions,
                "bucket_features": gcs_path.split("/", 3)[0] + "//" + gcs_path.split("/", 3)[2],  # rough bucket id
            })
            continue

        real_any = any(has_real_rgb(desc) for _, desc in descs)
        rep = descs[0][1].replace("\n", " ")[:200]

        results.append({
            "dataset": ds,
            "version": v,
            "rgb_real": bool(real_any),
            "note": rep,
            "bucket_versions": bucket_for_versions,
            "bucket_features": gcs_path.split("/", 3)[0] + "//" + gcs_path.split("/", 3)[2],
        })

    except Exception as e:
        results.append({
            "dataset": ds,
            "version": None,
            "rgb_real": False,
            "note": f"error: {e}",
            "bucket_versions": None,
            "bucket_features": None,
        })

df = pd.DataFrame(results)
df


Unnamed: 0,dataset,version,rgb_real,note,bucket_versions,bucket_features
0,language_table,0.1.0,True,An RGB image of the scene.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
1,stanford_hydra_dataset_converted_externally_to...,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
2,austin_buds_dataset_converted_externally_to_rlds,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
3,furniture_bench_dataset_converted_externally_t...,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
4,cmu_franka_exploration_dataset_converted_exter...,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
...,...,...,...,...,...,...
64,robo_set,0.0.1,,features.json downloaded (robo_set__0.0.1__fea...,gs://gresearch/robotics,gs://gresearch
65,tidybot,0.0.1,,features.json downloaded (tidybot__0.0.1__feat...,gs://gresearch/robotics,gs://gresearch
66,vima_converted_externally_to_rlds,0.0.1,,features.json downloaded (vima_converted_exter...,gs://gresearch/robotics,gs://gresearch
67,spoc,0.0.1,,features.json downloaded (spoc__0.0.1__feature...,gs://gresearch/robotics,gs://gresearch


In [23]:
import pandas as pd

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)   # show full text in "note"
df


Unnamed: 0,dataset,version,rgb_real,note,bucket_versions,bucket_features
0,language_table,0.1.0,True,An RGB image of the scene.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
1,stanford_hydra_dataset_converted_externally_to_rlds,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
2,austin_buds_dataset_converted_externally_to_rlds,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
3,furniture_bench_dataset_converted_externally_to_rlds,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
4,cmu_franka_exploration_dataset_converted_externally_to_rlds,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
5,ucsd_kitchen_dataset_converted_externally_to_rlds,0.1.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
6,austin_sirius_dataset_converted_externally_to_rlds,0.1.0,True,Wrist camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
7,utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds,1.0.0,True,Main camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
8,utokyo_saytap_converted_externally_to_rlds,0.1.0,True,Dummy wrist camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment
9,berkeley_mvp_converted_externally_to_rlds,0.1.0,True,Hand camera RGB observation.,gs://gdm-robotics-open-x-embodiment,gs://gdm-robotics-open-x-embodiment


In [26]:
def find_image_specs(feats):
    hits = []

    def rec(o, path=""):
        if isinstance(o, dict):
            shape = o.get("shape")
            dtype = o.get("dtype")

            if isinstance(shape, (list, tuple)) and len(shape) in (3, 4):
                c = shape[-1]
                if c in (1, 3, 4):
                    hits.append({
                        "path": path,
                        "shape": shape,
                        "dtype": dtype,
                        "keys": list(o.keys())
                    })

            for k, v in o.items():
                rec(v, f"{path}/{k}")

        elif isinstance(o, list):
            for i, v in enumerate(o):
                rec(v, f"{path}[{i}]")

    rec(feats)
    return hits



In [27]:
from pathlib import Path
import json
import pandas as pd

unknown_df = df[df["rgb_real"].isna()].copy()
print("Datasets with unknown RGB:", len(unknown_df))


Datasets with unknown RGB: 23


In [29]:
spec_results = []

for _, row in unknown_df.iterrows():
    ds = row["dataset"]
    v  = row["version"]

    feats_path = SCAN_DIR / f"{ds}__{v}__features.json"

    if not feats_path.exists():
        spec_results.append({
            "dataset": ds,
            "version": v,
            "spec_hits": 0,
            "spec_paths": None,
            "note": "features.json missing"
        })
        continue

    feats = json.loads(feats_path.read_text())
    hits = find_image_specs(feats)

    spec_results.append({
        "dataset": ds,
        "version": v,
        "spec_hits": len(hits),
        "spec_paths": [h["path"] for h in hits[:5]] if hits else None
    })


In [30]:
spec_df = pd.DataFrame(spec_results)
display(spec_df)


Unnamed: 0,dataset,version,spec_hits,spec_paths
0,kuka,0.1.0,0,
1,columbia_cairlab_pusht_real,0.1.0,0,
2,robot_vqa,0.1.0,0,
3,fractal20220817_data,0.1.0,0,
4,bridge,0.1.0,0,
5,jaco_play,0.1.0,0,
6,berkeley_cable_routing,0.1.0,0,
7,roboturk,0.1.0,0,
8,nyu_door_opening_surprising_effectiveness,0.1.0,0,
9,berkeley_autolab_ur5,0.1.0,0,


In [31]:
def find_image_keys_only(feats):
    keys = []
    def rec(o, path=""):
        if isinstance(o, dict):
            for k, v in o.items():
                p = f"{path}/{k}"
                if re.search(r"(rgb|image|camera|wrist|front|left|right)", p, re.I):
                    keys.append(p)
                rec(v, p)
        elif isinstance(o, list):
            for i, v in enumerate(o):
                rec(v, f"{path}[{i}]")
    rec(feats)
    return keys


In [32]:
ds = "plex_robosuite"
v = "0.0.1"
feats = json.loads((SCAN_DIR / f"{ds}__{v}__features.json").read_text())
find_image_keys_only(feats)


['/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image/dtype',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image/encodingFormat',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image/shape',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image/shape/dimensions',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/pythonClassName',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/wrist_image',
 '/featu

In [33]:
def find_tfds_image_features(feats):
    hits = []

    def rec(o, path=""):
        if isinstance(o, dict):
            # TFDS Image pattern
            if (
                "image" in o
                and isinstance(o.get("image"), dict)
                and "shape" in o["image"]
                and "encodingFormat" in o["image"]
            ):
                hits.append(path)

            for k, v in o.items():
                rec(v, f"{path}/{k}")

        elif isinstance(o, list):
            for i, v in enumerate(o):
                rec(v, f"{path}[{i}]")

    rec(feats)
    return hits


In [34]:
tfds_hits = []

for _, row in unknown_df.iterrows():
    ds = row["dataset"]
    v  = row["version"]
    feats = json.loads((SCAN_DIR / f"{ds}__{v}__features.json").read_text())

    hits = find_tfds_image_features(feats)
    tfds_hits.append({
        "dataset": ds,
        "version": v,
        "tfds_image_hits": len(hits),
        "paths": hits[:3] if hits else None
    })

tfds_df = pd.DataFrame(tfds_hits)
display(tfds_df)


Unnamed: 0,dataset,version,tfds_image_hits,paths
0,kuka,0.1.0,1,[/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image]
1,columbia_cairlab_pusht_real,0.1.0,0,
2,robot_vqa,0.1.0,1,[/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/images/sequence/feature]
3,fractal20220817_data,0.1.0,1,[/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image]
4,bridge,0.1.0,0,
5,jaco_play,0.1.0,0,
6,berkeley_cable_routing,0.1.0,0,
7,roboturk,0.1.0,0,
8,nyu_door_opening_surprising_effectiveness,0.1.0,0,
9,berkeley_autolab_ur5,0.1.0,0,


In [35]:
ds = "vima_converted_externally_to_rlds"
v = "0.0.1"
feats = json.loads((SCAN_DIR / f"{ds}__{v}__features.json").read_text())
find_image_keys_only(feats)

['/featuresDict/features/steps/sequence/feature/featuresDict/features/multimodal_instruction_assets/featuresDict/features/frontal_image',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/multimodal_instruction_assets/featuresDict/features/frontal_image/pythonClassName',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/multimodal_instruction_assets/featuresDict/features/frontal_image/sequence',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/multimodal_instruction_assets/featuresDict/features/frontal_image/sequence/feature',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/multimodal_instruction_assets/featuresDict/features/frontal_image/sequence/feature/pythonClassName',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/multimodal_instruction_assets/featuresDict/features/frontal_image/sequence/feature/tensor',
 '/featuresDict/features/steps/sequence/feature/featuresDict/features/mul

In [36]:
def find_tensor_image_features(feats):
    hits = []

    def rec(o, path=""):
        if isinstance(o, dict):
            # TensorSpec-like pattern
            if "tensor" in o and isinstance(o["tensor"], dict):
                t = o["tensor"]
                shp = t.get("shape", {})
                dims = shp.get("dimensions") if isinstance(shp, dict) else None
                if isinstance(dims, list) and len(dims) in (3,4) and dims[-1] in (1,3,4):
                    hits.append(path)

            for k, v in o.items():
                rec(v, f"{path}/{k}")

        elif isinstance(o, list):
            for i, v in enumerate(o):
                rec(v, f"{path}[{i}]")

    rec(feats)
    return hits


In [37]:
still_zero = tfds_df[tfds_df["tfds_image_hits"] == 0].copy()
tensor_results = []

for _, r in still_zero.iterrows():
    ds, v = r["dataset"], r["version"]
    feats = json.loads((SCAN_DIR / f"{ds}__{v}__features.json").read_text())

    th = find_tensor_image_features(feats)
    tensor_results.append({
        "dataset": ds,
        "version": v,
        "tensor_image_hits": len(th),
        "tensor_paths": th[:3] if th else None,
    })

tensor_df = pd.DataFrame(tensor_results)
display(tensor_df)


Unnamed: 0,dataset,version,tensor_image_hits,tensor_paths
0,columbia_cairlab_pusht_real,0.1.0,0,
1,bridge,0.1.0,0,
2,jaco_play,0.1.0,0,
3,berkeley_cable_routing,0.1.0,0,
4,roboturk,0.1.0,0,
5,nyu_door_opening_surprising_effectiveness,0.1.0,0,
6,berkeley_autolab_ur5,0.1.0,0,
7,toto,0.1.0,0,
8,vima_converted_externally_to_rlds,0.0.1,0,


In [38]:
zero_list = tensor_df["dataset"].tolist()
for ds in zero_list:
    v = df.loc[df["dataset"] == ds, "version"].iloc[0]
    feats = json.loads((SCAN_DIR / f"{ds}__{v}__features.json").read_text())
    keys = find_image_keys_only(feats)
    print(ds, "keys:", len(keys))
    if keys:
        print("  ", keys[:10])


columbia_cairlab_pusht_real keys: 12
   ['/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/wrist_image', '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/wrist_image/pythonClassName', '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/wrist_image/image', '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/wrist_image/image/shape', '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/wrist_image/image/shape/dimensions', '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/wrist_image/image/dtype', '/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image', '/featuresDict/features/steps/sequence/feature/featuresDict/features/observat

In [39]:
def find_any_image_schema(feats):
    hits = []

    def rec(o, path=""):
        if isinstance(o, dict):
            # Case A: TFDS Image-like node: image -> shape -> dimensions
            img = o.get("image")
            if isinstance(img, dict):
                shp = img.get("shape")
                dims = shp.get("dimensions") if isinstance(shp, dict) else None
                if isinstance(dims, list) and len(dims) in (3, 4) and dims[-1] in (1, 3, 4):
                    hits.append((path, "image", dims, img.get("dtype"), img.get("encodingFormat")))

            # Case B: TensorSpec node: tensor -> shape -> dimensions
            t = o.get("tensor")
            if isinstance(t, dict):
                shp = t.get("shape")
                dims = shp.get("dimensions") if isinstance(shp, dict) else None
                if isinstance(dims, list) and len(dims) in (3, 4) and dims[-1] in (1, 3, 4):
                    hits.append((path, "tensor", dims, t.get("dtype"), t.get("encoding")))

            for k, v in o.items():
                rec(v, f"{path}/{k}")

        elif isinstance(o, list):
            for i, v in enumerate(o):
                rec(v, f"{path}[{i}]")

    rec(feats)
    return hits


In [40]:
schema_rows = []

for _, row in df.iterrows():
    ds = row["dataset"]
    v  = row["version"]
    feats_path = SCAN_DIR / f"{ds}__{v}__features.json"

    if not feats_path.exists() or pd.isna(v):
        schema_rows.append({"dataset": ds, "schema_hits": 0, "schema_paths": None})
        continue

    feats = json.loads(feats_path.read_text())
    hits = find_any_image_schema(feats)

    schema_rows.append({
        "dataset": ds,
        "schema_hits": len(hits),
        "schema_paths": [h[0] for h in hits[:3]] if hits else None
    })

schema_df = pd.DataFrame(schema_rows)
df2 = df.merge(schema_df, on="dataset", how="left")
display(df2[["dataset","version","rgb_real","schema_hits","schema_paths","note"]])


Unnamed: 0,dataset,version,rgb_real,schema_hits,schema_paths,note
0,language_table,0.1.0,True,0,,An RGB image of the scene.
1,stanford_hydra_dataset_converted_externally_to_rlds,0.1.0,True,0,,Main camera RGB observation.
2,austin_buds_dataset_converted_externally_to_rlds,0.1.0,True,0,,Main camera RGB observation.
3,furniture_bench_dataset_converted_externally_to_rlds,0.1.0,True,0,,Main camera RGB observation.
4,cmu_franka_exploration_dataset_converted_externally_to_rlds,0.1.0,True,0,,Main camera RGB observation.
5,ucsd_kitchen_dataset_converted_externally_to_rlds,0.1.0,True,0,,Main camera RGB observation.
6,austin_sirius_dataset_converted_externally_to_rlds,0.1.0,True,0,,Wrist camera RGB observation.
7,utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds,1.0.0,True,0,,Main camera RGB observation.
8,utokyo_saytap_converted_externally_to_rlds,0.1.0,True,0,,Dummy wrist camera RGB observation.
9,berkeley_mvp_converted_externally_to_rlds,0.1.0,True,0,,Hand camera RGB observation.


In [43]:
def rgb_status(row):
    if row["rgb_real"] is True:
        return "CONFIRMED_RGB (description)"
    if row["rgb_real"] is False:
        return "CONFIRMED_NO_RGB (explicit zeros/unavailable)"
    if (row.get("schema_hits") or 0) > 0:
        return "CONFIRMED_RGB (schema)"
    return "RGB_UNKNOWN (needs runtime check)"

df2["rgb_status"] = df2.apply(rgb_status, axis=1)
df2["rgb_status"].value_counts()


rgb_status
CONFIRMED_RGB (description)                      44
RGB_UNKNOWN (needs runtime check)                23
CONFIRMED_NO_RGB (explicit zeros/unavailable)     2
Name: count, dtype: int64

In [51]:
import re

IMG_KEY_RE = re.compile(r"(image|rgb|camera|wrist|front|left|right|top|side|fisheye)", re.I)

def _parse_dims_from_any(shape_obj):
    """
    Returns dims as a list of ints if we can interpret them, else None.
    Handles:
      - {"dimensions":[150,200,3]}
      - {"dimensions":[{"size":150},{"size":200},{"size":3}]}
      - [150,200,3]
      - {"dims":[...]} / {"dim":[...]} / {"size":[...]} (fallback keys)
    """
    if shape_obj is None:
        return None

    # case: shape is directly a list
    if isinstance(shape_obj, list) and all(isinstance(x, int) for x in shape_obj):
        return shape_obj

    if isinstance(shape_obj, dict):
        # try common keys
        for key in ("dimensions", "dims", "dim", "sizes", "size"):
            if key in shape_obj:
                d = shape_obj[key]

                # dims are ints
                if isinstance(d, list) and all(isinstance(x, int) for x in d):
                    return d

                # dims are list of dicts like {"size":150}
                if isinstance(d, list) and all(isinstance(x, dict) for x in d):
                    out = []
                    for x in d:
                        # common fields inside a dimension object
                        for k2 in ("size", "value", "dim", "dimension"):
                            if k2 in x and isinstance(x[k2], int):
                                out.append(x[k2])
                                break
                    if out:
                        return out

    return None


def find_image_dims_flexible(feats):
    hits = []

    def rec(o, path=""):
        if isinstance(o, dict):
            # if this node has a "shape" key, try parse it
            if "shape" in o:
                dims = _parse_dims_from_any(o.get("shape"))
                if isinstance(dims, list) and len(dims) in (3, 4) and dims[-1] in (1, 3, 4):
                    if IMG_KEY_RE.search(path):
                        hits.append((path + "/shape", dims))

            for k, v in o.items():
                rec(v, f"{path}/{k}")

        elif isinstance(o, list):
            for i, v in enumerate(o):
                rec(v, f"{path}[{i}]")

    rec(feats)
    return hits


In [52]:
ds="bridge"; v="0.1.0"
feats=json.loads((SCAN_DIR/f"{ds}__{v}__features.json").read_text())
hits = find_image_dims_flexible(feats)
len(hits), hits[:10]



(0, [])

In [53]:
import re

def get_by_path(obj, path):
    cur = obj
    # split "/a/b[0]/c" into tokens
    tokens = [t for t in path.strip("/").split("/") if t]
    for tok in tokens:
        m = re.match(r"^(.*)\[(\d+)\]$", tok)
        if m:
            key, idx = m.group(1), int(m.group(2))
            cur = cur[key][idx]
        else:
            cur = cur[tok]
    return cur


In [54]:
shape_path = "/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image/shape"
dims_path  = shape_path + "/dimensions"

shape_obj = get_by_path(feats, shape_path)
dims_obj  = get_by_path(feats, dims_path)

print("shape_obj type:", type(shape_obj))
print("shape_obj:", shape_obj)
print("dims_obj type:", type(dims_obj))
print("dims_obj:", dims_obj)


shape_obj type: <class 'dict'>
shape_obj: {'dimensions': ['480', '640', '3']}
dims_obj type: <class 'list'>
dims_obj: ['480', '640', '3']


In [55]:
def dims_from_key_paths(feats, keys):
    dims_hits = []
    for k in keys:
        if k.endswith("/shape/dimensions"):
            try:
                dims_obj = get_by_path(feats, k)
                dims_hits.append((k, dims_obj))
            except Exception:
                pass
    return dims_hits

keys = find_image_keys_only(feats)
dims_hits = dims_from_key_paths(feats, keys)
print("dims_hits:", len(dims_hits))
dims_hits[:10]


dims_hits: 1


[('/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image/shape/dimensions',
  ['480', '640', '3'])]

In [56]:
def _to_int_list(x):
    """
    Convert dims that may be ints or numeric strings into a list[int].
    Returns None if conversion isn't clean.
    """
    if not isinstance(x, list):
        return None

    out = []
    for v in x:
        if isinstance(v, int):
            out.append(v)
        elif isinstance(v, str) and v.strip().isdigit():
            out.append(int(v.strip()))
        else:
            return None
    return out


In [57]:
def _parse_dims_from_any(shape_obj):
    if shape_obj is None:
        return None

    # shape is directly a list
    if isinstance(shape_obj, list):
        return _to_int_list(shape_obj)

    if isinstance(shape_obj, dict):
        for key in ("dimensions", "dims", "dim", "sizes", "size"):
            if key in shape_obj:
                d = shape_obj[key]

                # list of ints or numeric strings
                if isinstance(d, list):
                    ints = _to_int_list(d)
                    if ints is not None:
                        return ints

                # list of dicts like {"size":"480"}
                if isinstance(d, list) and all(isinstance(x, dict) for x in d):
                    out = []
                    for x in d:
                        for k2 in ("size", "value", "dim", "dimension"):
                            if k2 in x:
                                vv = x[k2]
                                if isinstance(vv, int):
                                    out.append(vv)
                                    break
                                if isinstance(vv, str) and vv.strip().isdigit():
                                    out.append(int(vv.strip()))
                                    break
                    if out:
                        return out

    return None


In [58]:
ds="bridge"; v="0.1.0"
feats=json.loads((SCAN_DIR/f"{ds}__{v}__features.json").read_text())
hits = find_image_dims_flexible(feats)
len(hits), hits[:10]


(1,
 [('/featuresDict/features/steps/sequence/feature/featuresDict/features/observation/featuresDict/features/image/image/shape',
   [480, 640, 3])])

In [59]:
schema_rows = []
for _, row in df.iterrows():
    ds = row["dataset"]
    v  = row["version"]
    feats_path = SCAN_DIR / f"{ds}__{v}__features.json"

    if not feats_path.exists() or pd.isna(v):
        schema_rows.append({"dataset": ds, "schema_hits": 0, "schema_paths": None})
        continue

    feats = json.loads(feats_path.read_text())
    hits = find_image_dims_flexible(feats)

    schema_rows.append({
        "dataset": ds,
        "schema_hits": len(hits),
        "schema_paths": [h[0] for h in hits[:3]] if hits else None
    })

schema_df = pd.DataFrame(schema_rows)
df4 = df.merge(schema_df, on="dataset", how="left")

df4["rgb_status"] = df4.apply(rgb_status, axis=1)
df4["rgb_status"].value_counts()


rgb_status
CONFIRMED_RGB (description)                      44
CONFIRMED_RGB (schema)                           23
CONFIRMED_NO_RGB (explicit zeros/unavailable)     2
Name: count, dtype: int64

In [60]:
final_df = df.merge(schema_df, on="dataset", how="left")

def classify_row(row):
    if row["rgb_real"] is True:
        return (
            "CONFIRMED_RGB",
            "description",
            None,
            row["note"]
        )

    if row["rgb_real"] is False:
        return (
            "CONFIRMED_NO_RGB",
            "explicit_zero",
            None,
            row["note"]
        )

    if (row.get("schema_hits") or 0) > 0:
        return (
            "CONFIRMED_RGB",
            "schema",
            row["schema_paths"],
            "RGB confirmed via TFDS/RLDS image schema"
        )

    return (
        "RGB_UNKNOWN",
        "runtime_required",
        None,
        "No image description or schema; requires runtime episode inspection"
    )

final_df[
    ["rgb_status", "rgb_evidence", "rgb_paths", "final_note"]
] = final_df.apply(
    lambda r: pd.Series(classify_row(r)),
    axis=1
)


In [61]:
final_df = final_df[[
    "dataset",
    "version",
    "rgb_status",
    "rgb_evidence",
    "rgb_paths",
    "final_note"
]]

final_df


Unnamed: 0,dataset,version,rgb_status,rgb_evidence,rgb_paths,final_note
0,language_table,0.1.0,CONFIRMED_RGB,description,,An RGB image of the scene.
1,stanford_hydra_dataset_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Main camera RGB observation.
2,austin_buds_dataset_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Main camera RGB observation.
3,furniture_bench_dataset_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Main camera RGB observation.
4,cmu_franka_exploration_dataset_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Main camera RGB observation.
5,ucsd_kitchen_dataset_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Main camera RGB observation.
6,austin_sirius_dataset_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Wrist camera RGB observation.
7,utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds,1.0.0,CONFIRMED_RGB,description,,Main camera RGB observation.
8,utokyo_saytap_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Dummy wrist camera RGB observation.
9,berkeley_mvp_converted_externally_to_rlds,0.1.0,CONFIRMED_RGB,description,,Hand camera RGB observation.
