In [38]:
import os
import re
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

"""
Bounding‑box pipeline **v5 – deux colonnes dédiées**
===================================================

Changements demandés :
1. **Suppression totale** de la colonne `grasp_points`.
2. Deux nouvelles colonnes fixes :
   * `box.pickup`   – les 4 floats du cercle orange (objet à saisir).
   * `box.target`   – les 4 floats de la boîte noire (zone cible).

Format fidèle à gr00t : chaque colonne est une *liste plate de longueur
4* (`[ymin,xmin,ymax,xmax]`, valeurs normalisées ∈ [0‑1]).  Gr00t peut
encore faire `np.asarray(col, dtype=np.float32)` sans surprise.
"""

# ─────────────────────────── Config & model ────────────────────────────
MODEL_ID = "google/paligemma-3b-mix-224"

DEVICE = (
    torch.device("cuda") if torch.cuda.is_available() else
    torch.device("mps") if torch.backends.mps.is_available() else
    torch.device("cpu")
)
DTYPE = torch.float16 if DEVICE.type in {"cuda", "mps"} else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = (
    PaliGemmaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype=DTYPE)
    .to(DEVICE)
    .eval()
)

ZERO_BOX = [0.0, 0.0, 0.0, 0.0]

VIDEO_DIR   = os.path.expanduser(
    "/home/pa-boss/Isaac-GR00T/bounding-box-test1/videos/chunk-000/observation.images.secondary_0"
)
PARQUET_DIR = os.path.expanduser(
    "/home/pa-boss/Isaac-GR00T/bounding-box-test1/data/chunk-000"
)
OUT_DIR = "/home/pa-boss/Isaac-GR00T/notebooks/outputs"; os.makedirs(OUT_DIR, exist_ok=True)

# ───────────────────── Helper : normalisation [0‑1] ─────────────────────

def _norm(v, dim):
    if 0.0 <= v <= 1.0:
        return float(v)
    if 0.0 <= v <= 1000.0:
        return float(v) / 1000.0
    return float(v) / dim


def _norm_box(box, h, w):
    y1,x1,y2,x2 = box
    return [_norm(y1,h), _norm(x1,w), _norm(y2,h), _norm(x2,w)]

# ─────────────── Parsing PaliGemma (no label info) ────────────────
_loc_re = re.compile(r"<loc(\d+)>")

def _parse_pg(txt):
    boxes = []
    for chunk in re.split(r"[;\n]", txt):
        locs = _loc_re.findall(chunk)
        if len(locs) == 4:
            boxes.append(list(map(float, locs)))
    return boxes  # order assumed: orange then black (prompt order)

# ─────────────── Classical fallback w/ explicit labels ───────────────
_warned = False

def _detect_orange(img):
    hsv  = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv,(5,100,100),(25,255,255))
    mask = cv2.GaussianBlur(mask,(9,9),2)
    circ = cv2.HoughCircles(mask,cv2.HOUGH_GRADIENT,1,50,param1=100,param2=15,minRadius=10,maxRadius=200)
    if circ is None: return None
    x,y,r = circ[0][0]
    return [y-r,x-r,y+r,x+r]

def _detect_black(img):
    g = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    _,th = cv2.threshold(g,50,255,cv2.THRESH_BINARY_INV)
    cnts,_ = cv2.findContours(th,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
    if not cnts: return None
    x,y,w,h = cv2.boundingRect(max(cnts,key=cv2.contourArea))
    return [y,x,y+h,x+w]

def _fallback(img, raw):
    global _warned
    if not _warned:
        print("⚠️  Fallback – raw PG sample:\n", raw[:200], "…")
        _warned = True
    return [_detect_orange(img), _detect_black(img)]

# ───────────────────── Detection wrapper (returns 2) ─────────────────────

def detect_two_boxes(frame_bgr):
    """Return (box.pickup, target_box) each length‑4 list normalised 0‑1."""
    pil = Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)).resize((224,224))
    print(f"Shape of pil: {pil}")
    inp = processor(text="<Image> detect orange circle ; black box", images=pil, return_tensors="pt")
    inp = {k:v.to(DEVICE) for k,v in inp.items()}
    with torch.no_grad():
        ids = model.generate(**inp, max_length=inp["input_ids"].shape[-1]+100)
    raw = processor.batch_decode(ids[:, inp["input_ids"].shape[-1]:], skip_special_tokens=True)[0]

    boxes = _parse_pg(raw)
    if len(boxes) < 2:
        boxes = _fallback(frame_bgr, raw)
    pickup  = boxes[0] if len(boxes) >= 1 and boxes[0] is not None else ZERO_BOX
    target  = boxes[1] if len(boxes) >= 2 and boxes[1] is not None else ZERO_BOX

    h,w = frame_bgr.shape[:2]
    return _norm_box(pickup,h,w), _norm_box(target,h,w)

# ───────────────────── DataFrame helpers ─────────────────────

def _ensure_vec(df, col):
    if col not in df.columns:
        df[col] = None
    for idx in df.index:
        v = df.at[idx,col]
        if not (isinstance(v,list) and len(v)==4 and all(isinstance(x,(int,float)) for x in v)):
            df.at[idx,col] = ZERO_BOX
    return df

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.92it/s]


In [42]:
# ───────────────────── Main loop ─────────────────────
clips = sorted(f for f in os.listdir(VIDEO_DIR) if f.startswith("episode_") and f.endswith(".mp4"))

for vid in tqdm(clips, desc="episodes"):
    cap = cv2.VideoCapture(os.path.join(VIDEO_DIR, vid))
    ok, frame = cap.read(); cap.release()
    if not ok:
        print("❌", vid, "frame0 fail"); continue

    pickup_box, target_box = detect_two_boxes(frame)
    

    # Write to parquet ---------------------------------------------------
    episode = vid[:-4]
    pq_path = os.path.join(PARQUET_DIR, f"{episode}.parquet")
    if not os.path.exists(pq_path):
        print("❌ parquet missing for", vid); continue

    df = pd.read_parquet(pq_path)
    df = _ensure_vec(df, "box.pickup")
    df = _ensure_vec(df, "box.target")

    first = df.index[0]
    df.at[first, "box.pickup"] = pickup_box
    df.at[first, "box.target"] = target_box

    # Option : répliquer sur toutes les lignes (stats > robustes)
    df.loc[:, "box.pickup"] = df.loc[:, "box.pickup"].apply(lambda _: pickup_box)
    df.loc[:, "box.target"] = df.loc[:, "box.target"].apply(lambda _: target_box)

    df.to_parquet(pq_path)

    # ─── debug image → OUT_DIR ----------------------------------------
    dbg = frame.copy()

    # rectangle rouge (pickup)
    x1 = int(pickup_box[1] * frame.shape[1])  # xmin * width
    y1 = int(pickup_box[0] * frame.shape[0])  # ymin * height
    x2 = int(pickup_box[3] * frame.shape[1])  # xmax * width
    y2 = int(pickup_box[2] * frame.shape[0])  # ymax * height
    cv2.rectangle(dbg, (x1, y1), (x2, y2), (0, 0, 255), 2)

    # rectangle bleu (target)
    x1 = int(target_box[1] * frame.shape[1])
    y1 = int(target_box[0] * frame.shape[0])
    x2 = int(target_box[3] * frame.shape[1])
    y2 = int(target_box[2] * frame.shape[0])
    cv2.rectangle(dbg, (x1, y1), (x2, y2), (255, 0, 0), 2)

    cv2.imwrite(os.path.join(OUT_DIR, f"{vid[:-4]}_dbg.png"), dbg)

    print(f"[{episode}] pickup={pickup_box} target={target_box}")

print("✅ All episodes updated with pickup_box / target_box (grasp_points supprimé).")


episodes:   0%|          | 0/20 [00:00<?, ?it/s]

Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75257424C220>


episodes:   5%|▌         | 1/20 [00:00<00:06,  2.86it/s]

[episode_000000] pickup=[0.311, 0.373, 0.552, 0.509] target=[0.649, 0.324, 0.715, 0.351]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7526B49A5720>


episodes:  10%|█         | 2/20 [00:00<00:06,  2.87it/s]

[episode_000001] pickup=[0.33, 0.358, 0.559, 0.49] target=[0.469, 0.29, 0.539, 0.32]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7526B49A5720>


episodes:  15%|█▌        | 3/20 [00:01<00:05,  2.90it/s]

[episode_000002] pickup=[0.359, 0.338, 0.591, 0.46] target=[0.692, 0.346, 0.759, 0.379]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7526B49A5720>


episodes:  20%|██        | 4/20 [00:01<00:05,  2.92it/s]

[episode_000003] pickup=[0.363, 0.33, 0.583, 0.463] target=[0.376, 0.523, 0.455, 0.557]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7526B49A5720>


episodes:  25%|██▌       | 5/20 [00:01<00:05,  2.91it/s]

[episode_000004] pickup=[0.361, 0.334, 0.586, 0.455] target=[0.282, 0.52, 0.359, 0.557]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7526B49A5720>


episodes:  30%|███       | 6/20 [00:02<00:05,  2.49it/s]

[episode_000005] pickup=[0.322, 0.498, 0.4, 0.534] target=[0.0, 0.0, 4.2625, 3.196875]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7525794B87F0>


episodes:  35%|███▌      | 7/20 [00:02<00:04,  2.62it/s]

[episode_000006] pickup=[0.568, 0.523, 0.634, 0.557] target=[0.907, 0.19, 4.2625, 0.35]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75256D2A4460>


episodes:  40%|████      | 8/20 [00:03<00:05,  2.37it/s]

[episode_000007] pickup=[0.269, 0.463, 0.347, 0.503] target=[0.0, 0.0, 4.2625, 3.196875]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x752574112860>


episodes:  45%|████▌     | 9/20 [00:03<00:04,  2.53it/s]

[episode_000008] pickup=[0.276, 0.203, 0.475, 0.27] target=[0.45, 0.121, 0.52, 0.158]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x752574112860>


episodes:  50%|█████     | 10/20 [00:03<00:03,  2.65it/s]

[episode_000009] pickup=[0.28, 0.208, 0.451, 0.286] target=[0.392, 0.121, 0.455, 0.158]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75256D2A5210>


episodes:  55%|█████▌    | 11/20 [00:04<00:03,  2.72it/s]

[episode_000010] pickup=[0.308, 0.261, 0.448, 0.344] target=[0.321, 0.485, 0.392, 0.52]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7525794B87F0>


episodes:  60%|██████    | 12/20 [00:04<00:02,  2.79it/s]

[episode_000011] pickup=[0.219, 0.39, 0.29, 0.422] target=[0.267, 0.208, 0.449, 0.258]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x752574112860>


episodes:  65%|██████▌   | 13/20 [00:04<00:02,  2.85it/s]

[episode_000012] pickup=[0.267, 0.344, 0.326, 0.377] target=[0.282, 0.215, 0.462, 0.286]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x7525794B87F0>


episodes:  70%|███████   | 14/20 [00:05<00:02,  2.50it/s]

[episode_000013] pickup=[0.338, 0.136, 0.411, 0.167] target=[0.0, 0.0, 4.2625, 3.1625]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75256D2B1E40>


episodes:  75%|███████▌  | 15/20 [00:05<00:02,  2.31it/s]

[episode_000014] pickup=[0.359, 0.118, 0.43, 0.155] target=[0.0, 0.0, 4.2625, 3.196875]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75257424C220>


episodes:  80%|████████  | 16/20 [00:07<00:03,  1.10it/s]

[episode_000015] pickup=[0.269, 0.152, 0.354, 0.182] target=[0.0, 0.0, 4.2625, 3.1625]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75257424C220>


episodes:  85%|████████▌ | 17/20 [00:08<00:02,  1.36it/s]

[episode_000016] pickup=[0.241, 0.242, 0.416, 0.305] target=[0.402, 0.579, 0.469, 0.612]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75257424C220>


episodes:  90%|█████████ | 18/20 [00:10<00:02,  1.12s/it]

[episode_000017] pickup=[0.288, 0.41, 0.363, 0.44] target=[0.0, 0.0, 4.2625, 3.196875]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x75257424C220>


episodes:  95%|█████████▌| 19/20 [00:10<00:00,  1.06it/s]

[episode_000018] pickup=[0.351, 0.107, 0.415, 0.138] target=[0.0, 0.0, 4.2625, 3.196875]
Shape of pil: <PIL.Image.Image image mode=RGB size=224x224 at 0x752574112860>


episodes: 100%|██████████| 20/20 [00:11<00:00,  1.79it/s]

[episode_000019] pickup=[0.291, 0.182, 0.367, 0.221] target=[0.0, 0.0, 4.2625, 3.196875]
✅ All episodes updated with pickup_box / target_box (grasp_points supprimé).



