In [None]:
#%pip install bitsandbytes

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
import torch
# Allow TF32 matmuls (original note: "speeds up a T4 a bit").
# NOTE(review): TF32 presumably needs Ampere-or-newer hardware — confirm this has any effect on a T4.
torch.backends.cuda.matmul.allow_tf32 = True

# Checkpoint to load and its 4-bit (NF4) quantization settings.
mname = "Qwen/Qwen2-VL-7B-Instruct"
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                         bnb_4bit_compute_dtype=torch.bfloat16)

# `proc` and `model` are module-level globals read by the helper functions defined in later cells.
proc = AutoProcessor.from_pretrained(mname)
model = AutoModelForVision2Seq.from_pretrained(
    mname,
    quantization_config=bnb,
    device_map="auto",              # spread across GPU/CPU if needed
    torch_dtype=torch.bfloat16
)

# Stable generation (no sampling) and short output.
# NOTE(review): gen_kwargs is not passed to any generate() call visible in this notebook.
gen_kwargs = dict(max_new_tokens=128, temperature=0.0)


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
from PIL import Image
import json

In [None]:
# English user prompt for single-image verification. It asks for a JSON-only answer
# of the form {"is_bus_stop": ..., "reason": ...} and tells the model to answer false when unsure.
# Read as a global by detect_bus_stops.
PROMPT = (
    "You are verifying annotations made by a computer vision model that detects bus stops in satellite imagery. "
    "Each image contains a marked area predicted as a possible bus stop. Your task is to confirm or reject the prediction "
    "based only on clear visual evidence.\n\n"

    "Instructions:\n"
    "- Confirm (true) only if you clearly see a bus stop, such as a shelter or a labeled stop sign.\n"
    "- Reject (false) if it shows a tree, pole, billboard, vehicle, house entrance, or any non-stop object.\n"
    "- If unsure or unclear, default to false.\n\n"

    "Respond ONLY in this JSON format:\n"
    '{ "is_bus_stop": true|false, "reason": "short one-line explanation" }\n\n'

    "Examples:\n"
    '{ "is_bus_stop": false, "reason": "billboard near road, not a bus stop" }\n'
    '{ "is_bus_stop": true, "reason": "clearly labeled bus stop sign" }'
)




In [None]:
def detect_bus_stops(img_path):
    """Ask the VLM whether the marked area of one image is a bus stop.

    Uses the module-level ``proc``, ``model`` and ``PROMPT``.

    Args:
        img_path: Path of the image file; it is opened and converted to RGB.

    Returns:
        str: Only the text generated by the model (the prompt asks for a JSON
        object). The echoed system/user prompt is not part of the result.
    """
    img = Image.open(img_path).convert("RGB")
    messages = [
        {
            "role": "system",
            "content": "Act as a strict annotation verifier assessing the correctness of bus stop predictions in satellite imagery."
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": PROMPT}
            ]
        }
    ]
    # 1. Render the chat as text.
    chat_text = proc.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    # 2. Turn text + image into tensors.
    inputs = proc(text=chat_text, images=img, return_tensors="pt").to(model.device)
    # 3. Generate (greedy decoding).
    out = model.generate(**inputs, max_new_tokens=512, do_sample=False)
    # 4. generate() returns prompt tokens followed by the new tokens; decode only
    #    the new ones so the prompt (which itself contains JSON examples) is not echoed.
    prompt_len = inputs["input_ids"].shape[1]
    txt = proc.decode(out[0][prompt_len:], skip_special_tokens=True)
    return txt



In [None]:
# Smoke test on a single image: print whatever text detect_bus_stops returns.
res = detect_bus_stops("/content/ruta8_pt_11602_z20.png")
print(res)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


system
Act as a strict annotation verifier assessing the correctness of bus stop predictions in satellite imagery.
user
You are verifying annotations made by a computer vision model that detects bus stops in satellite imagery. Each image contains a marked area predicted as a possible bus stop. Your task is to confirm or reject the prediction based only on clear visual evidence.

Instructions:
- Confirm (true) only if you clearly see a bus stop, such as a shelter or a labeled stop sign.
- Reject (false) if it shows a tree, pole, billboard, vehicle, house entrance, or any non-stop object.
- If unsure or unclear, default to false.

Respond ONLY in this JSON format:
{ "is_bus_stop": true|false, "reason": "short one-line explanation" }

Examples:
{ "is_bus_stop": false, "reason": "billboard near road, not a bus stop" }
{ "is_bus_stop": true, "reason": "clearly labeled bus stop sign" }
assistant
{
  "is_bus_stop": false,
  "reason": "no clear visual evidence of a bus stop"
}


In [None]:
from PIL import Image
import json
import re # Import the re module

# Spanish verifier prompt used for the few-shot/tiled flow below.
# NOTE: this rebinds the global PROMPT defined in an earlier cell, so every function
# that reads PROMPT at call time (including detect_bus_stops) uses this text from now on.
PROMPT = (
  "Eres un verificador. La imagen fue MARCADA por otro modelo como posible PARADA DE ÓMNIBUS. "
  "Puede estar mal. Si NO estás seguro, responde false.\n"
  "Responde SOLO JSON válido, sin texto extra:\n"
  "{\"is_bus_stop\": true|false, \"reason\": \"≤1 línea\"}\n"
  "Criterios: aceptar solo si se ve claramente un refugio o una señal de parada junto a una carretera; "
  "rechazar postes, árboles, carteles, vehículos, casetas/galpones, entradas de casas."
)

def safe_json(txt):
    """Extract the model's JSON answer from free text.

    The text may contain several brace-delimited fragments (an echoed prompt
    template such as ``{"is_bus_stop": true|false, ...}``, few-shot examples,
    and finally the real answer). A single greedy ``{...}`` match would glue
    them together and never parse, so every ``{`` is tried as the start of a
    JSON document and the LAST object that decodes is returned.

    Args:
        txt: Decoded model output.

    Returns:
        dict: The parsed object, or a rejection dict with ``is_bus_stop=False``
        and ``reason`` set to ``"no JSON"`` (no brace pair at all) or
        ``"invalid JSON"`` (braces present but nothing decodable).
    """
    if not re.search(r"\{.*\}", txt, re.S):
        return {"is_bus_stop": False, "reason": "no JSON"}

    decoder = json.JSONDecoder()
    found = None
    pos = txt.find("{")
    while pos != -1:
        try:
            obj, consumed = decoder.raw_decode(txt[pos:])
        except json.JSONDecodeError:
            # Not a JSON document starting here; try the next opening brace.
            pos = txt.find("{", pos + 1)
            continue
        if isinstance(obj, dict):
            found = obj
        # Skip past the decoded object so nested braces are not re-visited.
        pos = txt.find("{", pos + consumed)

    if found is None:
        return {"is_bus_stop": False, "reason": "invalid JSON"}
    return found

def tile_iter(img, size=896, overlap=128):
    """Yield ``(tile, bbox)`` pairs that cover the whole image.

    Tiles are ``size`` x ``size`` squares laid out with a stride of
    ``size - overlap``. When the stride does not land exactly on the right or
    bottom border, one extra tile flush with that border is added so no margin
    is left unvisited. Images smaller than ``size`` produce a single tile
    clipped to the image, so tiles never contain out-of-image padding.

    Args:
        img: Object exposing ``size`` as ``(width, height)`` and ``crop(box)``.
        size: Tile side length in pixels.
        overlap: Pixels shared by neighbouring tiles; must be smaller than ``size``.

    Yields:
        tuple: ``(img.crop(bbox), bbox)`` with ``bbox = (x0, y0, x1, y1)``.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap >= size``.
    """
    if size <= 0:
        raise ValueError("size must be positive")
    step = size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than size")

    W, H = img.size

    def _starts(length):
        # Tile origins along one axis, always ending flush with the far border.
        if length <= size:
            return [0]
        last = length - size
        starts = list(range(0, last + 1, step))
        if starts[-1] != last:
            starts.append(last)
        return starts

    xs = _starts(W)
    ys = _starts(H)
    for y in ys:
        for x in xs:
            box = (x, y, min(x + size, W), min(y + size, H))
            yield img.crop(box), box

def qwen_verify(img, fewshot_pos, fewshot_neg):
    """Build the visual few-shot chat prefix: one positive and one negative example.

    The returned list holds a system turn followed by two solved user/assistant
    exchanges. ``img`` is accepted but not used here; the caller appends the
    actual query turn to the returned list.
    """
    def _solved_example(example_img, answer):
        # One user turn (image + PROMPT) followed by the expected assistant answer.
        return [
            {"role": "user", "content": [
                {"type": "image", "image": example_img},
                {"type": "text", "text": PROMPT},
            ]},
            {"role": "assistant", "content": answer},
        ]

    system_turn = {"role": "system", "content": "Responde estrictamente en JSON válido."}
    positive_turns = _solved_example(
        fewshot_pos,
        '{"is_bus_stop": true, "reason": "refugio pequeño junto a la ruta"}',
    )
    negative_turns = _solved_example(
        fewshot_neg,
        '{"is_bus_stop": false, "reason": "caseta/cabina en campo, no parada"}',
    )
    return [system_turn, *positive_turns, *negative_turns]

def classify_image_with_tiles(path_img, path_pos_example, path_neg_example):
    """Classify an image tile by tile with a visual few-shot prompt.

    Uses the module-level ``proc``, ``model`` and ``PROMPT``.

    Args:
        path_img: Image to classify.
        path_pos_example: Image used as the positive few-shot example.
        path_neg_example: Image used as the negative few-shot example.

    Returns:
        dict: The parsed answer of the selected tile plus ``"tile"`` (its bbox).
        If any tile is accepted, the last accepted tile is returned; otherwise
        the last rejected one.
    """
    img = Image.open(path_img).convert("RGB")
    pos = Image.open(path_pos_example).convert("RGB")
    neg = Image.open(path_neg_example).convert("RGB")

    best = {"is_bus_stop": False, "reason": "no tile accepted", "tile": None}

    for tile, bbox in tile_iter(img, size=896, overlap=128):
        msgs = qwen_verify(tile, pos, neg)
        msgs.append({"role": "user", "content": [
            {"type": "image", "image": tile},
            {"type": "text", "text": PROMPT}
        ]})
        chat = proc.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
        # The rendered chat holds three image placeholders (positive example,
        # negative example, query tile); the processor needs the three images
        # in that order, otherwise placeholder expansion runs out of images.
        inputs = proc(text=chat, images=[pos, neg, tile], return_tensors="pt").to(model.device)
        out = model.generate(**inputs, max_new_tokens=256, do_sample=False)
        # Decode only the newly generated tokens: the prompt contains the
        # few-shot JSON answers, which must not be parsed as the result.
        new_tokens = out[0][inputs["input_ids"].shape[1]:]
        ans = safe_json(proc.decode(new_tokens, skip_special_tokens=True))

        # An accepted tile always wins; a rejected tile only replaces an earlier rejection.
        accepted = ans.get("is_bus_stop") is True
        if accepted or not best["is_bus_stop"]:
            # Store a real boolean so the next iteration can always read the key.
            best = {**ans, "is_bus_stop": accepted, "tile": bbox}

    return best

# Ejemplo:
# result = classify_image_with_tiles("data/img.png", "fewshot/pos.jpg", "fewshot/neg.jpg")
# print(result)

In [None]:
# Arguments, in order: image to classify, positive few-shot example, negative few-shot example.
result = classify_image_with_tiles("/content/ruta8_pt_11932_z20.png", "/content/ruta8_pt_11602_z20.png", "/content/ruta8_pt_11845_z20.png")
print(result)

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the CONFIG paths of the pipeline cell point into it.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
"""
Pipeline de verificación de paradas (versión 1)
================================================

Objetivo
--------
Dado un conjunto de **imágenes satelitales con la predicción de Roboflow dibujada** (rectángulo rojo
marcando la candidata) y/o (opcional) un archivo con metadatos (bbox, lat/lon del centro del tile),
este pipeline decide si cada candidata es realmente una **parada de ómnibus**.

Qué hace este pipeline
----------------------
1) **Detección del recorte (crop) de la candidata** a partir del rectángulo rojo pintado en la imagen de Roboflow.
   - Si también tenés un CSV/JSON con bbox numéricas, puede usarse directamente (más robusto).
2) **Verificación con un VLM** (por ejemplo Qwen/Qwen2-VL-7B-Instruct) usando **dos vistas**:
   - *FULL*: el mosaico completo con la caja dibujada (contexto).
   - *CROP*: un recorte ampliado (~30% de padding) sobre la caja (detalle).
   - Respuesta **JSON estricta**: `{is_bus_stop, reason, confidence}`.
3) **Robustez por consenso** con **rotaciones del recorte** (0/90/180/270):
   - Majority vote y promedio de confianza; si hay dudas, se marca para revisión.
4) **Gate geo-espacial (opcional)**:
   - Si se dispone de lat/lon del centro del tile y del zoom, se convierte el **pixel center del bbox a lat/lon**
     y se calcula distancia al eje de la carretera. Se rechaza si está lejos (p.ej. >8 m).
   - (Opcional) Gate por intersección con edificaciones para descartar techos/galpones.
5) **Clasificador ligero (opcional)** con embeddings **SigLIP/CLIP + LogisticRegression** sobre crops.
   - Se puede entrenar rápidamente con positivos/negativos duros.
6) **Salida consolidada** en CSV: decisión final, razón, confianzas parciales, flags de gates.

Modificaciones incluidas vs. versión anterior
---------------------------------------------
- **Nuevo prompt** específico con criterios visuales medibles y política "falso por defecto".
- **Doble entrada al VLM** (FULL + CROP) en lugar de solo una imagen.
- **Parser JSON estricto** con saneo de respuestas para evitar texto extra.
- **Consenso por rotaciones** del CROP con voto mayoritario y umbrales de confianza.
- **Gate geo-espacial** opcional: distancia a carretera y (opcional) edificios.
- **Soporte de dos fuentes de bbox**: (a) detectadas desde el rectángulo rojo pintado, (b) leídas desde CSV/JSON.
- **Módulo opcional SigLIP/CLIP** para verificación rápida basada en embeddings.
- **Registro a CSV** con trazabilidad (razones, confianzas, flags, errores).
- **Post-hoc recalibración** y **bucketing** (true/false/review) sin re-inferir el modelo.
- **Galería HTML** opcional para validación visual.
- **Debug de crops**: exporta pares FULL+BOX y CROP para inspección.

Cómo usar
---------
1) Ajustá las rutas en la sección CONFIG.
2) Si tenés metadatos (CSV/JSON) con columnas: `filename, x, y, width, height, center_lat, center_lon, zoom`,
   poné la ruta en `PATH_BBOX_META` y `USE_META_BBOX=True`.
3) Si **no** tenés bbox meta, el código intentará **detectar el rectángulo rojo** en la imagen.
4) (Opcional) Si disponés de **shapefile/GeoJSON de carreteras** para el gate, configurá `PATH_ROADS`.
5) (Opcional) Si querés **clasificador SigLIP**, prepará un CSV de entrenamiento con
   `filename,label` (1=parada,0=no) y apuntalo en `PATH_CLF_TRAIN`.

Notas
-----
- El gate geo-espacial requiere conocer la geolocalización: ya sea vía (a) CSV meta por imagen, o (b) parseo del
  nombre del archivo (ver regex `FILENAME_LATLON_REGEX`).
- Si no hay geo, el pipeline sigue funcionando (salta ese gate).
- Para Qwen2-VL, reutilizá tu `model` y `processor` existentes; este archivo sólo define las llamadas.

"""

import os
import re
import json
import math
import csv
from dataclasses import dataclass
from typing import Optional, Tuple, List, Dict, Any

import numpy as np
from PIL import Image, ImageDraw

# Dependencias opcionales (se usan si están instaladas)
try:
    import geopandas as gpd
    from shapely.geometry import Point
except Exception:
    gpd = None
    Point = None

try:
    from sklearn.linear_model import LogisticRegression
    import joblib
except Exception:
    LogisticRegression = None
    joblib = None

try:
    from transformers import AutoProcessor, AutoModel
    import torch
except Exception:
    AutoProcessor = None
    AutoModel = None
    torch = None

try:
    import pandas as pd
except Exception:
    pd = None


# ======================
# CONFIG
# ======================

IMAGES_DIR = "/content/drive/My Drive/Tesis/Datos/Ruta8/paradas_no_seguras"  # folder with the predicted images
OUTPUT_CSV = "/content/drive/My Drive/Tesis/Resultados/verificacion_paradas_v1.csv"

# Optional metadata with bbox and/or geo information
USE_META_BBOX = False
PATH_BBOX_META = "/content/drive/My Drive/Tesis/Datos/meta_bbox.csv"  # columns: filename,x,y,width,height[,center_lat,center_lon,zoom]

# Geo gate (optional)
USE_GEO_GATE = False
PATH_ROADS = "/content/drive/My Drive/Tesis/Datos/rutas_uy.geojson"  # centerline of route 8 (or the national network)
MAX_DIST_TO_ROAD_M = 8.0  # candidates farther than this from the nearest road are rejected by verify_image

# Buildings (optional)
USE_BUILDINGS_GATE = False
PATH_BUILDINGS = "/content/drive/My Drive/Tesis/Datos/buildings_uy.geojson"

# Attempt to parse lat/lon from the file name
FILENAME_LATLON_REGEX = re.compile(r"lat_(-?\d+\.\d+)_lon_(-?\d+\.\d+)")
DEFAULT_ZOOM = 20  # used by verify_image when the metadata carries no zoom
TILE_SIZE = (640, 640)  # width, height

# Consensus over rotations (CROP only)
ROTATIONS = [0, 90, 180, 270]
VOTE_THR = 0.6     # mean confidence required to accept
UNCERTAIN_RANGE = (0.35, 0.6)  # (low, high) VLM-confidence band that verify_image tags as "incertidumbre_VLM"

# Prompt para el VLM
# Prompt for the two-image check (FULL tile first, CROP second); default prompt of
# vlm_verify_full_and_crop. Requests JSON with is_bus_stop, reason and confidence.
PROMPT_VLM = (
    "You are a strict verifier of bus-stop predictions on top-down satellite imagery of Uruguay.\n\n"
    "You will receive TWO images of the same location:\n"
    "1) FULL tile with a thin red rectangle marking the candidate.\n"
    "2) CROP: a zoom-in of the marked rectangle with ~20–40 m across.\n\n"
    "Decide if the marked candidate is a BUS STOP.\n\n"
    "Positive cues (all usually present): small shelter (≈2–4 m wide) with a short roof shadow next to the road edge,\n"
    "or a simple sign pole with small shadow right at the road edge. Aligned roughly parallel to the road and within ~0–5 m from the edge.\n\n"
    "Negative cues: houses/roofs (>6 m), sheds, truck cabins, containers, billboards, median road signs, trees/bushes,\n"
    "objects far from the road edge or inside lots, or blurry views. If unsure, reject.\n\n"
    "Output ONLY JSON: {\"is_bus_stop\": true|false, \"reason\": \"short one-line\", \"confidence\": 0.00-1.00}.\n"
    "No extra text."
)

# Prompt for the single-image check; default prompt of vlm_verify_crop_only
# (used for the rotation consensus in verify_image). Same JSON schema as PROMPT_VLM.
PROMPT_VLM_CROP_ONLY = (
    "You are a strict verifier of a bus-stop candidate on top-down satellite imagery.\n"
    "You will receive a single zoomed-in crop around the candidate.\n"
    "Accept only if a small shelter (≈2–4 m) or a bus-stop sign pole is clearly visible near the road edge.\n"
    "If unsure, reject. Output ONLY JSON: {\"is_bus_stop\": true|false, \"reason\": \"short\", \"confidence\": 0.00-1.00}."
)


# ======================
# Utils: bbox desde el rectángulo rojo dibujado
# ======================

def detect_red_box_bbox(img: Image.Image, r_thr: int = 170, gb_thr: int = 120) -> Optional[Tuple[int, int, int, int]]:
    """Find the red rectangle drawn on the image and return it as (cx, cy, w, h) in pixels.

    A pixel counts as red when R >= ``r_thr`` and both G and B <= ``gb_thr``.
    The result is the bounding box of ALL red pixels, expressed as centre plus
    size. Returns None when fewer than 20 red pixels are found or when the red
    area spans less than 8 px in either direction (e.g. stray red text/noise).
    """
    rgb = np.array(img.convert("RGB"))
    red, green, blue = rgb[..., 0], rgb[..., 1], rgb[..., 2]
    is_red = (red >= r_thr) & (green <= gb_thr) & (blue <= gb_thr)

    rows, cols = np.nonzero(is_red)
    if rows.size < 20:
        # too few red pixels to be an overlay rectangle
        return None

    left, right = int(cols.min()), int(cols.max())
    top, bottom = int(rows.min()), int(rows.max())
    if min(right - left, bottom - top) < 8:
        # extent too thin: more likely noise than a drawn box
        return None

    # Inclusive pixel extent -> size, then centre.
    width = right - left + 1
    height = bottom - top + 1
    return (left + width // 2, top + height // 2, width, height)


def crop_from_bbox(img: Image.Image, bbox_xywh: Tuple[int, int, int, int], pad: float = 0.3) -> Image.Image:
    """Crop the region of a centre-format bbox, enlarged by a relative padding.

    Args:
        img: Source image (``size`` and ``crop`` are used).
        bbox_xywh: ``(cx, cy, w, h)`` in pixels, centre plus size.
        pad: Fraction of the box width/height added on each side.

    Returns:
        The cropped image, clipped to the image bounds.
    """
    W, H = img.size
    cx, cy, w, h = bbox_xywh
    pw, ph = int(w * pad), int(h * pad)
    # Derive the far edge from the near edge plus the full size. Using
    # ``cx + w // 2`` would drop the last column/row whenever w or h is odd.
    left = cx - w // 2
    top = cy - h // 2
    x0 = max(0, left - pw)
    y0 = max(0, top - ph)
    x1 = min(W, left + w + pw)
    y1 = min(H, top + h + ph)
    return img.crop((x0, y0, x1, y1))


# ======================
# Utils: geo (Web Mercator helpers)
# ======================

def _latlon_to_meters(lat: float, lon: float) -> Tuple[float, float]:
    origin_shift = 2 * math.pi * 6378137 / 2.0
    mx = lon * origin_shift / 180.0
    my = math.log(math.tan((90 + lat) * math.pi / 360.0)) * 6378137
    return mx, my


def _meters_to_latlon(mx: float, my: float) -> Tuple[float, float]:
    lon = (mx / (2 * math.pi * 6378137 / 2.0)) * 180.0
    lat = (2 * math.atan(math.exp(my / 6378137)) - math.pi / 2) * 180.0 / math.pi
    return lat, lon


def pixel_to_latlon(center_lat: float, center_lon: float, zoom: int, tile_size: Tuple[int, int],
                    px: float, py: float) -> Tuple[float, float]:
    """Convert a pixel position inside the tile (0..W, 0..H) to (lat, lon).

    The tile centre is given as lat/lon; the pixel offset from the centre is
    scaled by the Web Mercator resolution for ``zoom`` and applied in projected
    space. Web Mercator approximation.
    """
    width, height = tile_size
    # 156543.03392804097 is the resolution at zoom 0; it halves with every zoom level.
    meters_per_px = 156543.03392804097 / (2 ** zoom)

    center_x, center_y = _latlon_to_meters(center_lat, center_lon)
    offset_x = (px - width / 2) * meters_per_px
    offset_y = (py - height / 2) * meters_per_px
    # Image rows grow downwards while projected y grows upwards, hence the minus.
    lat, lon = _meters_to_latlon(center_x + offset_x, center_y - offset_y)
    return lat, lon


# ======================
# VLM: prompts y parsing JSON
# ======================

def parse_json_strict(txt: str) -> Dict[str, Any]:
    """Extract and normalise the verifier's JSON answer from model output.

    The text may hold several brace-delimited fragments (for instance an
    echoed prompt containing the ``{"is_bus_stop": true|false, ...}``
    template). Every ``{`` is tried as the start of a JSON document and the
    last decodable object is used, preferring objects that carry an
    ``is_bus_stop`` key. Python-style ``True``/``False`` and trailing commas
    are repaired before decoding.

    Args:
        txt: Decoded model output.

    Returns:
        dict with exactly three keys:
        ``is_bus_stop`` (bool; the string "false" is treated as False),
        ``reason`` (str, truncated to 160 chars) and
        ``confidence`` (float clamped to [0, 1]; non-numeric or non-finite -> 0.0).
        On failure ``reason`` is ``"no JSON"`` or ``"JSON parse error"``.
    """
    def _rejected(reason: str) -> Dict[str, Any]:
        return {"is_bus_stop": False, "reason": reason, "confidence": 0.0}

    if not re.search(r"\{.*\}", txt, flags=re.S):
        return _rejected("no JSON")

    cleaned = txt.replace("True", "true").replace("False", "false")
    # Drop trailing commas before a closing brace/bracket.
    cleaned = re.sub(r",\s*(?=[}\]])", "", cleaned)

    decoder = json.JSONDecoder()
    answer = None      # last object that has an "is_bus_stop" key
    fallback = None    # last decodable object without that key
    pos = cleaned.find("{")
    while pos != -1:
        try:
            candidate, consumed = decoder.raw_decode(cleaned[pos:])
        except ValueError:
            # Not a JSON document starting here (e.g. the prompt template).
            pos = cleaned.find("{", pos + 1)
            continue
        if isinstance(candidate, dict):
            if "is_bus_stop" in candidate:
                answer = candidate
            else:
                fallback = candidate
        pos = cleaned.find("{", pos + consumed)

    obj = answer if answer is not None else fallback
    if obj is None:
        return _rejected("JSON parse error")

    flag = obj.get("is_bus_stop", False)
    if isinstance(flag, str):
        # bool("false") would be True; only an explicit "true" string accepts.
        ibs = flag.strip().lower() == "true"
    else:
        ibs = bool(flag)

    reason = str(obj.get("reason", ""))[:160]

    try:
        conf = float(obj.get("confidence", 0.0))
    except (TypeError, ValueError, OverflowError):
        conf = 0.0
    if not math.isfinite(conf):
        # NaN would otherwise slip through the clamp below as 1.0.
        conf = 0.0
    conf = max(0.0, min(1.0, conf))
    return {"is_bus_stop": ibs, "reason": reason, "confidence": conf}


def vlm_verify_full_and_crop(full_img: Image.Image, crop_img: Image.Image,
                             model, processor, prompt: str = PROMPT_VLM) -> Dict[str, Any]:
    """Ask the VLM to verify a candidate using the full tile and its crop.

    Args:
        full_img: Full tile with the candidate marked.
        crop_img: Zoomed-in crop around the candidate.
        model: Generative vision-language model exposing ``generate`` and ``device``.
        processor: Matching processor (chat template, tensor building, decoding).
        prompt: User instruction; the images follow it in the order FULL, CROP.

    Returns:
        dict: Result of ``parse_json_strict`` on the generated answer.
    """
    messages = [
        {"role": "system", "content": "Act as a strict annotation verifier for bus stops on top-down satellite imagery. Be conservative."},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": full_img},
            {"type": "image", "image": crop_img},
        ]}
    ]
    chat_text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    # Pass batch lists: text=[...], images=[[...]] to satisfy processor batching
    inputs = processor(text=[chat_text], images=[[full_img, crop_img]], return_tensors="pt", padding=True).to(model.device)
    # Greedy decoding; sampling knobs (temperature/top_p) are not passed because
    # they have no effect when do_sample=False.
    out = model.generate(**inputs, max_new_tokens=128, do_sample=False)
    # generate() returns prompt + completion. The prompt itself contains a
    # "{...}" answer template, so only the newly generated tokens are decoded.
    new_tokens = out[0][inputs["input_ids"].shape[1]:]
    txt = processor.decode(new_tokens, skip_special_tokens=True)
    return parse_json_strict(txt)


def vlm_verify_crop_only(crop_img: Image.Image, model, processor, prompt: str = PROMPT_VLM_CROP_ONLY) -> Dict[str, Any]:
    """Ask the VLM to verify a candidate from a single zoomed-in crop.

    Args:
        crop_img: Crop around the candidate.
        model: Generative vision-language model exposing ``generate`` and ``device``.
        processor: Matching processor (chat template, tensor building, decoding).
        prompt: User instruction placed before the image.

    Returns:
        dict: Result of ``parse_json_strict`` on the generated answer.
    """
    messages = [
        {"role": "system", "content": "Act as a strict annotation verifier for bus stops on top-down satellite imagery. Be conservative."},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": crop_img},
        ]}
    ]
    chat_text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    # Batch lists to avoid processor quirks across versions
    inputs = processor(text=[chat_text], images=[[crop_img]], return_tensors="pt", padding=True).to(model.device)
    # Greedy decoding; sampling knobs (temperature/top_p) are not passed because
    # they have no effect when do_sample=False.
    out = model.generate(**inputs, max_new_tokens=128, do_sample=False)
    # Decode only the completion: the echoed prompt contains a "{...}" template
    # that would otherwise be mixed into the JSON extraction.
    new_tokens = out[0][inputs["input_ids"].shape[1]:]
    txt = processor.decode(new_tokens, skip_special_tokens=True)
    return parse_json_strict(txt)


# ======================
# SigLIP/CLIP (opcional)
# ======================

def load_siglip_model(device: str = "cuda"):
    """Load the SigLIP checkpoint and its processor.

    Returns ``(model, processor)``. When torch was imported successfully the
    model is moved to ``device`` and switched to eval mode.

    Raises:
        RuntimeError: If transformers could not be imported.
    """
    if AutoProcessor is None or AutoModel is None:
        raise RuntimeError("Transformers no disponible. Instala: pip install transformers accelerate")

    checkpoint = "google/siglip-so400m-patch14-384"
    siglip_processor = AutoProcessor.from_pretrained(checkpoint)
    siglip_model = AutoModel.from_pretrained(checkpoint)

    if torch is None:
        return siglip_model, siglip_processor

    siglip_model = siglip_model.to(device)
    siglip_model.eval()
    return siglip_model, siglip_processor


def siglip_embed(img: Image.Image, mdl, proc, device: str = "cuda") -> np.ndarray:
    """Return the L2-normalised image embedding of ``img`` as a NumPy vector.

    ``proc`` turns the image into model inputs, ``mdl.get_image_features``
    produces the features, and the first row is normalised (with a small
    epsilon to avoid dividing by zero).

    Raises:
        RuntimeError: If torch could not be imported.
    """
    if torch is None:
        raise RuntimeError("PyTorch no disponible")

    batch = proc(images=img, return_tensors="pt")
    if device:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        features = mdl.get_image_features(**batch)

    vec = features[0].detach().cpu().numpy()
    return vec / (np.linalg.norm(vec) + 1e-9)


def train_light_classifier(train_csv: str, images_dir: str, device: str = "cuda",
                           save_path: Optional[str] = None) -> Any:
    """Train a LogisticRegression on SigLIP embeddings of labelled images.

    Args:
        train_csv: CSV with ``filename`` and ``label`` columns (label is cast to int).
        images_dir: Directory the filenames are resolved against.
        device: Device passed to the SigLIP loader and embedder.
        save_path: If given (and joblib is available) the classifier is dumped
            there as ``{"clf": clf}``.

    Returns:
        The fitted classifier.

    Raises:
        RuntimeError: If scikit-learn/joblib could not be imported.
        ValueError: If none of the listed images exists under ``images_dir``.
    """
    if LogisticRegression is None:
        raise RuntimeError("scikit-learn no disponible. Instala: pip install scikit-learn joblib")
    mdl, proc = load_siglip_model(device)
    X, y = [], []
    with open(train_csv, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            fn = row['filename']
            label = int(row['label'])
            p = os.path.join(images_dir, fn)
            if not os.path.isfile(p):
                # Rows pointing at missing files are skipped.
                continue
            # Context manager so the file handle is released deterministically.
            with Image.open(p) as raw:
                img = raw.convert('RGB')
            X.append(siglip_embed(img, mdl, proc, device))
            y.append(label)
    if not X:
        # np.stack([]) would fail with an unhelpful message; say what is wrong.
        raise ValueError(
            f"No training images listed in {train_csv} were found under {images_dir}"
        )
    X = np.stack(X, axis=0)
    y = np.array(y)
    clf = LogisticRegression(max_iter=300, class_weight='balanced')
    clf.fit(X, y)
    if save_path and joblib is not None:
        joblib.dump({"clf": clf}, save_path)
    return clf


def load_light_classifier(path: str):
    """Load a classifier saved as ``{"clf": ...}`` with joblib.

    Raises:
        RuntimeError: If joblib could not be imported.
    """
    if joblib is None:
        raise RuntimeError("joblib no disponible. Instala: pip install joblib")
    # NOTE(review): joblib files are pickle-based; only load files from a trusted source.
    return joblib.load(path)["clf"]


def clf_score_crop(crop_img: Image.Image, clf, siglip_mdl, siglip_proc, device: str = "cuda") -> float:
    """Embed the crop with SigLIP and return the classifier probability of column 1."""
    embedding = siglip_embed(crop_img, siglip_mdl, siglip_proc, device)
    probabilities = clf.predict_proba(embedding.reshape(1, -1))
    return float(probabilities[0, 1])


# ======================
# Gate geo
# ======================

def load_roads(path: str):
    """Read the roads layer, reproject it to EPSG:3857 and add a ``_geom_len_m`` length column.

    Raises:
        RuntimeError: If GeoPandas could not be imported.
    """
    if gpd is None:
        raise RuntimeError("GeoPandas no disponible. Instala: pip install geopandas shapely pyproj")
    roads_3857 = gpd.read_file(path).to_crs(3857)
    roads_3857["_geom_len_m"] = roads_3857.geometry.length
    return roads_3857


def load_buildings(path: str):
    """Read the buildings layer and reproject it to EPSG:3857.

    Raises:
        RuntimeError: If GeoPandas could not be imported.
    """
    if gpd is None:
        raise RuntimeError("GeoPandas no disponible")
    return gpd.read_file(path).to_crs(3857)


def dist_to_nearest_road_m(lat: float, lon: float, roads_gdf) -> float:
    """Approximate ground distance in metres from a point to the nearest road.

    The point is projected to Web Mercator and compared with ``roads_gdf``
    (expected in EPSG:3857, as produced by ``load_roads``). Web Mercator
    stretches lengths by ``1 / cos(latitude)``, so the projected distance is
    multiplied by ``cos(lat)`` to obtain real metres.

    Args:
        lat: Latitude of the point in degrees.
        lon: Longitude of the point in degrees.
        roads_gdf: GeoDataFrame of road geometries.

    Returns:
        float: Distance in metres, or ``inf`` when GeoPandas is unavailable.
    """
    if gpd is None:
        return float('inf')
    mx, my = _latlon_to_meters(lat, lon)
    p = Point(mx, my)
    # Use the spatial index to shortlist candidates when it supports this call;
    # any failure falls back to an exhaustive distance scan.
    try:
        idx = roads_gdf.sindex
        cand_idx = list(idx.nearest(p.bounds, num_results=8))
        d = min(roads_gdf.iloc[i].geometry.distance(p) for i in cand_idx)
    except Exception:
        d = roads_gdf.distance(p).min()
    # Convert projected (Mercator) units to ground metres at this latitude.
    return float(d) * math.cos(math.radians(lat))


def intersects_building(lat: float, lon: float, buildings_gdf) -> bool:
    """Tell whether a 1.5-unit buffer around the point touches any building.

    The point is projected with ``_latlon_to_meters``. Candidates are
    shortlisted through the spatial index using a 3-unit buffer; if anything in
    that path fails, every geometry is tested instead. Returns False when
    GeoPandas is unavailable.
    """
    if gpd is None:
        return False

    x, y = _latlon_to_meters(lat, lon)
    point = Point(x, y)
    try:
        spatial_index = buildings_gdf.sindex
        shortlisted = list(spatial_index.intersection(point.buffer(3).bounds))  # 3 m buffer
        for i in shortlisted:
            # buffer(0) is applied to the building geometry before the intersection test
            if buildings_gdf.iloc[i].geometry.buffer(0).intersects(point.buffer(1.5)):
                return True
        return False
    except Exception:
        return bool(buildings_gdf.intersects(point.buffer(1.5)).any())


# ======================
# Meta utils
# ======================

@dataclass
class ImageMeta:
    """Optional per-image metadata, as filled in by read_meta_csv and consumed by verify_image."""
    filename: str  # key used in the metadata CSV
    bbox_xywh: Optional[Tuple[int, int, int, int]] = None  # px; unpacked as (cx, cy, w, h) by verify_image
    center_lat: Optional[float] = None  # latitude of the tile centre
    center_lon: Optional[float] = None  # longitude of the tile centre
    zoom: Optional[int] = None  # when None, verify_image uses DEFAULT_ZOOM


def read_meta_csv(path: str) -> Dict[str, ImageMeta]:
    """Read per-image metadata from a CSV into a ``filename -> ImageMeta`` map.

    Expected columns: ``filename`` (required) and optionally ``x, y, width,
    height, center_lat, center_lon, zoom``. Missing, blank, non-numeric or
    non-finite values leave the corresponding field as None:

    * ``bbox_xywh`` is set only when all four box columns parse (truncated to int).
    * ``center_lat``/``center_lon`` are set only when both parse.
    * ``zoom`` is set when the value is integer-valued (``"20"`` or ``"20.0"``).

    Args:
        path: Path of the CSV file (UTF-8).

    Returns:
        dict mapping each filename to its ``ImageMeta``.

    Raises:
        KeyError: If a row has no ``filename`` column.
    """
    def _number(row: Dict[str, Any], key: str) -> Optional[float]:
        # Finite float value of row[key], or None when absent/blank/invalid.
        try:
            value = float(row.get(key, ''))
        except (TypeError, ValueError):
            return None
        return value if math.isfinite(value) else None

    meta: Dict[str, ImageMeta] = {}
    with open(path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            fn = row['filename']
            im = ImageMeta(filename=fn)

            box = [_number(row, key) for key in ('x', 'y', 'width', 'height')]
            if all(v is not None for v in box):
                im.bbox_xywh = tuple(int(v) for v in box)

            lat = _number(row, 'center_lat')
            lon = _number(row, 'center_lon')
            if lat is not None and lon is not None:
                im.center_lat, im.center_lon = lat, lon

            # Parse zoom like the other numeric columns so "20.0" is accepted too.
            zoom = _number(row, 'zoom')
            if zoom is not None and zoom.is_integer():
                im.zoom = int(zoom)

            meta[fn] = im
    return meta


def parse_latlon_from_name(name: str) -> Tuple[Optional[float], Optional[float]]:
    """Extract (lat, lon) from a file name using FILENAME_LATLON_REGEX.

    Returns ``(None, None)`` when the pattern does not occur in ``name``.
    """
    match = FILENAME_LATLON_REGEX.search(name)
    if match is None:
        return None, None
    lat_text, lon_text = match.group(1, 2)
    return float(lat_text), float(lon_text)


# ======================
# Pipeline principal
# ======================

def verify_image(
    img_path: str,
    model,
    processor,
    roads_gdf=None,
    buildings_gdf=None,
    meta: Optional[ImageMeta] = None,
    clf=None,
    siglip_mdl=None,
    siglip_proc=None,
    device: str = "cuda",
) -> Dict[str, Any]:
    """Process one image and return a dict with the consolidated result.

    Steps:
      1. Get the candidate bbox (from ``meta`` or from the drawn red box) and crop it.
      2. Geo gate: when ``roads_gdf`` is provided and a tile-centre lat/lon is
         known, reject candidates farther than ``MAX_DIST_TO_ROAD_M`` from the
         nearest road; when ``buildings_gdf`` is also provided, reject
         candidates that intersect a building.
      3. VLM check on FULL + CROP.
      4. Rotation consensus of crop-only VLM checks.
      5. Optional SigLIP + classifier score on the crop.
      6. Simple rule-based final decision.

    The gates are driven by the layers passed in (not by the module-level
    ``USE_GEO_GATE``/``USE_BUILDINGS_GATE`` defaults), so a caller that loads
    and passes a layer actually gets the corresponding gate.

    Args:
        img_path: Path of the image to verify.
        model, processor: VLM and its processor, forwarded to the vlm_verify_* helpers.
        roads_gdf: Roads layer, or None to skip the geo gate.
        buildings_gdf: Buildings layer, or None to skip the building check.
        meta: Optional per-image metadata (bbox, tile centre, zoom).
        clf, siglip_mdl, siglip_proc: Optional light classifier and SigLIP model/processor.
        device: Device used for the SigLIP embedding.

    Returns:
        dict: See the ``out`` initialisation below for the keys. Any exception
        is caught and reported through ``status`` with ``final_label=False``.
    """
    out: Dict[str, Any] = {
        "filename": os.path.basename(img_path),
        "status": "ok",
        "has_bbox": False,
        "geo_gate_used": False,
        "geo_ok": True,
        "dist_to_road_m": None,
        "building_intersection": None,
        "vlm_is_bus_stop": False,
        "vlm_conf": 0.0,
        "vlm_reason": "",
        "vote_is_bus_stop": False,
        "vote_conf_mean": None,
        "vote_raw": None,
        "clf_p": None,
        "final_label": None,
        "final_reason": "",
    }
    try:
        img = Image.open(img_path).convert('RGB')
        # 1) get the bbox: metadata wins, otherwise detect the drawn red rectangle
        bbox = None
        if meta and meta.bbox_xywh is not None:
            bbox = meta.bbox_xywh
        else:
            bbox = detect_red_box_bbox(img)
        if bbox is None:
            out["status"] = "no_bbox"
            out["final_label"] = False
            out["final_reason"] = "No se detectó rectángulo rojo ni bbox meta"
            return out
        out["has_bbox"] = True
        crop = crop_from_bbox(img, bbox, pad=0.3)

        # 2) Geo gate (if available)
        lat_c = None
        lon_c = None
        zoom = DEFAULT_ZOOM
        if meta:
            if meta.center_lat is not None and meta.center_lon is not None:
                lat_c, lon_c = meta.center_lat, meta.center_lon
            if meta.zoom is not None:
                zoom = meta.zoom
        if (lat_c is None or lon_c is None):
            # try to parse the tile centre from the file name
            lat_c, lon_c = parse_latlon_from_name(os.path.basename(img_path))
        # The gate runs whenever a roads layer was supplied; the module-level
        # USE_GEO_GATE flag is only the default of run_pipeline's parameter.
        if gpd is not None and lat_c is not None and lon_c is not None and roads_gdf is not None:
            # bbox centre (px) to lat/lon
            cx, cy, w, h = bbox
            lat_px, lon_px = pixel_to_latlon(lat_c, lon_c, zoom, TILE_SIZE, cx, cy)
            out["geo_gate_used"] = True
            d = dist_to_nearest_road_m(lat_px, lon_px, roads_gdf)
            out["dist_to_road_m"] = round(d, 2)
            if d > MAX_DIST_TO_ROAD_M:
                out["geo_ok"] = False
            # Buildings (optional): active whenever a buildings layer was supplied
            if buildings_gdf is not None:
                inter = intersects_building(lat_px, lon_px, buildings_gdf)
                out["building_intersection"] = bool(inter)
                if inter:
                    out["geo_ok"] = False

        if not out["geo_ok"]:
            out["final_label"] = False
            out["final_reason"] = f"Geo gate: distancia a ruta > {MAX_DIST_TO_ROAD_M} m o en edificio"
            return out

        # 3) VLM (FULL + CROP)
        r0 = vlm_verify_full_and_crop(img, crop, model, processor)
        out["vlm_is_bus_stop"] = bool(r0.get("is_bus_stop", False))
        out["vlm_conf"] = float(r0.get("confidence", 0.0))
        out["vlm_reason"] = str(r0.get("reason", ""))

        # 4) Rotation consensus (CROP only): each vote is +confidence for
        #    "bus stop" and -confidence for "not a bus stop".
        votes = []
        for ang in ROTATIONS:
            if ang == 0:
                crop_r = crop
            else:
                crop_r = crop.rotate(ang, expand=True)
            ri = vlm_verify_crop_only(crop_r, model, processor)
            votes.append(float(ri.get("confidence", 0.0)) * (1.0 if ri.get("is_bus_stop", False) else -1.0))
        # NOTE(review): the mean uses |vote|, so confident "false" votes also
        # raise it — confirm this is the intended consensus score.
        conf_mean = float(np.mean([abs(v) for v in votes])) if votes else None
        is_true_majority = sum(1 for v in votes if v > 0) > len(votes) / 2.0
        out["vote_conf_mean"] = None if conf_mean is None else round(conf_mean, 3)
        out["vote_is_bus_stop"] = bool(is_true_majority and (conf_mean is not None and conf_mean >= VOTE_THR))
        out["vote_raw"] = ",".join(f"{v:.3f}" for v in votes)

        # 5) Light classifier (optional)
        p_clf = None
        if clf is not None and siglip_mdl is not None and siglip_proc is not None:
            p_clf = clf_score_crop(crop, clf, siglip_mdl, siglip_proc, device)
            out["clf_p"] = round(float(p_clf), 3)

        # 6) Final decision (simple, adjustable rules)
        reasons = []
        label = False

        # base: use the consensus when it accepts
        if out["vote_is_bus_stop"]:
            label = True
            reasons.append(f"consenso_rotaciones_conf≈{out['vote_conf_mean']}")
        else:
            # fall back to the base VLM decision if it is very confident
            if out["vlm_is_bus_stop"] and out["vlm_conf"] >= 0.75:
                label = True
                reasons.append(f"vlm_full+crop_conf={out['vlm_conf']:.2f}")

        # the light classifier overrides the label at either extreme
        if p_clf is not None:
            if p_clf >= 0.80:
                label = True
                reasons.append(f"clf_siglip_p={p_clf:.2f}")
            elif p_clf <= 0.20:
                label = False
                reasons.append(f"clf_siglip_p={p_clf:.2f}")

        # grey zone: tag the row so it can be reviewed manually
        if not label and out["vlm_conf"] >= UNCERTAIN_RANGE[0] and out["vlm_conf"] <= UNCERTAIN_RANGE[1]:
            reasons.append("incertidumbre_VLM")

        out["final_label"] = bool(label)
        out["final_reason"] = "; ".join([out["vlm_reason"]] + reasons if out["vlm_reason"] else reasons)
        return out

    except Exception as e:
        # Best effort per image: record the failure instead of aborting the batch.
        out["status"] = f"error:{type(e).__name__}:{e}"
        out["final_label"] = False
        out["final_reason"] = "exception"
        return out


def run_pipeline(
    images_dir: str,
    output_csv: str,
    model,
    processor,
    use_meta_bbox: bool = USE_META_BBOX,
    path_bbox_meta: Optional[str] = PATH_BBOX_META,
    use_geo_gate: bool = USE_GEO_GATE,
    path_roads: Optional[str] = PATH_ROADS,
    use_buildings_gate: bool = USE_BUILDINGS_GATE,
    path_buildings: Optional[str] = PATH_BUILDINGS,
    clf_path: Optional[str] = None,          # path to the light classifier joblib (optional)
    siglip_device: str = "cuda",
    recursive: bool = False                  # also look for images in subfolders
):
    """Run ``verify_image`` on every image under ``images_dir`` and write one CSV row per image.

    Args:
        images_dir: Folder holding ``.png``/``.jpg``/``.jpeg`` images.
        output_csv: Destination CSV path; its parent folder is created if needed.
        model, processor: Forwarded unchanged to ``verify_image``.
        use_meta_bbox, path_bbox_meta: When enabled and the file exists, per-image
            metadata is read with ``read_meta_csv`` and looked up by image basename.
        use_geo_gate, path_roads: When enabled and the file exists, roads are loaded
            with ``load_roads`` and forwarded to ``verify_image``.
        use_buildings_gate, path_buildings: Same for ``load_buildings``; only
            considered when ``use_geo_gate`` is also true.
        clf_path: Optional light-classifier file; when it exists the classifier and
            the SigLIP model/processor are loaded and forwarded to ``verify_image``.
        siglip_device: Device string passed to ``load_siglip_model`` and ``verify_image``.
        recursive: Walk subfolders instead of listing only the top level.

    Raises:
        FileNotFoundError: ``images_dir`` is not a directory or contains no images.
    """
    image_exts = (".png", ".jpg", ".jpeg")

    # Fail fast: validate the input folder before loading any heavy resource.
    if not os.path.isdir(images_dir):
        raise FileNotFoundError(f"images_dir no existe: {images_dir}")

    file_paths: List[str] = []
    if recursive:
        for root, _, fnames in os.walk(images_dir):
            file_paths.extend(
                os.path.join(root, fn) for fn in fnames if fn.lower().endswith(image_exts)
            )
    else:
        file_paths = [
            os.path.join(images_dir, f)
            for f in os.listdir(images_dir)
            if f.lower().endswith(image_exts)
        ]

    file_paths.sort()
    if not file_paths:
        raise FileNotFoundError(f"No se encontraron imágenes (*.png|*.jpg|*.jpeg) en {images_dir}")

    def _available(path: Optional[str], what: str) -> bool:
        """True when ``path`` is set and exists; warn when it is set but missing."""
        if not path:
            return False
        if os.path.isfile(path):
            return True
        # A configured-but-missing file used to disable the feature silently.
        print(f"Aviso: no se encontró {what}: {path} (se omite)")
        return False

    # Per-image metadata, keyed by basename (see the lookup in the loop below).
    meta_map: Dict[str, ImageMeta] = {}
    if use_meta_bbox and _available(path_bbox_meta, "el CSV de metadatos"):
        meta_map = read_meta_csv(path_bbox_meta)

    # Geo layers.
    roads = None
    buildings = None
    if use_geo_gate and _available(path_roads, "el archivo de calles"):
        roads = load_roads(path_roads)
    if use_geo_gate and use_buildings_gate and _available(path_buildings, "el archivo de edificios"):
        buildings = load_buildings(path_buildings)

    # Light classifier (optional).
    clf = None
    siglip_mdl = None
    siglip_proc = None
    if _available(clf_path, "el clasificador ligero"):
        clf = load_light_classifier(clf_path)
        siglip_mdl, siglip_proc = load_siglip_model(siglip_device)

    rows = []
    for i, p in enumerate(file_paths, 1):
        fn = os.path.basename(p)
        mm = meta_map.get(fn)
        res = verify_image(p, model, processor, roads, buildings, mm, clf, siglip_mdl, siglip_proc, siglip_device)
        rows.append(res)
        if i % 25 == 0:
            print(f"Procesadas {i}/{len(file_paths)} imágenes...")

    # Header = union of keys over all rows, in first-seen order. Taking only
    # rows[0] makes DictWriter raise ValueError as soon as a later row carries a
    # key the first one lacks (rows may differ, e.g. when one of them errored early).
    fieldnames: List[str] = []
    seen_keys = set()
    for row in rows:
        for key in row:
            if key not in seen_keys:
                seen_keys.add(key)
                fieldnames.append(key)

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    out_parent = os.path.dirname(output_csv)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)
    with open(output_csv, "w", newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Listo. Guardado CSV en: {output_csv}")


# ======================
# Post-proceso: unir CSV + imágenes, bucketing y galería (opcional)
# ======================

def _collect_image_index(images_dir: str) -> Dict[str, str]:
    """Índice {basename -> ruta completa} (recursivo). Si hay duplicados, se elige el path más corto."""
    idx: Dict[str, str] = {}
    for root, _, files in os.walk(images_dir):
        for fn in files:
            if fn.lower().endswith((".png", ".jpg", ".jpeg")):
                p = os.path.join(root, fn)
                b = os.path.basename(fn)
                if b not in idx or len(p) < len(idx[b]):
                    idx[b] = p
    return idx


def join_csv_with_images(results_csv: str, images_dir: str):
    """Load the results CSV and attach the on-disk location of every image.

    Returns a DataFrame with the CSV columns plus:
      * ``image_path``: full path found under ``images_dir`` for the row's
        ``filename`` (matched by basename), or None when there is no match;
      * ``exists``: True when ``image_path`` is a string pointing to a file.

    Raises:
        RuntimeError: pandas is not available.
        FileNotFoundError: the CSV or the images folder does not exist.
        ValueError: the CSV has no ``filename`` column.
    """
    if pd is None:
        raise RuntimeError("Pandas no disponible. Instala: pip install pandas")
    if not os.path.isfile(results_csv):
        raise FileNotFoundError(f"No existe CSV: {results_csv}")
    if not os.path.isdir(images_dir):
        raise FileNotFoundError(f"No existe carpeta de imágenes: {images_dir}")

    table = pd.read_csv(results_csv)
    if "filename" not in table.columns:
        raise ValueError("El CSV debe tener columna 'filename'.")

    index = _collect_image_index(images_dir)

    def _lookup(name):
        # Match on basename so the CSV may hold bare names or full paths.
        return index.get(os.path.basename(str(name)))

    def _is_file(path):
        return bool(isinstance(path, str) and os.path.isfile(path))

    table["image_path"] = table["filename"].apply(_lookup)
    table["exists"] = table["image_path"].apply(_is_file)
    return table


def bucketize_for_review(df, out_base: str, uncertain_range: Tuple[float, float] = (0.35, 0.60),
                         label_col: str = "final_label") -> Dict[str, int]:
    """Copy each row's image into ``out_base/true``, ``out_base/false`` or ``out_base/review``.

    A row goes to ``review`` when its ``status`` is not ``"ok"``, when ``has_bbox``
    is false or missing, or when ``vlm_conf`` / ``vote_conf_mean`` falls inside
    ``uncertain_range`` (inclusive). Otherwise the boolean in ``label_col`` picks
    ``true`` or ``false``. Rows whose ``image_path`` is not an existing file are
    skipped.

    Args:
        df: DataFrame with at least an ``image_path`` column.
        out_base: Destination root; the three sub-folders are created.
        uncertain_range: ``(lo, hi)`` confidence band that forces manual review.
        label_col: Column holding the label to bucket by (default ``final_label``).

    Returns:
        Number of images successfully copied per bucket:
        ``{"true": n, "false": n, "review": n}``.
    """
    import shutil

    bucket_dirs = {name: os.path.join(out_base, name) for name in ("true", "false", "review")}
    os.makedirs(out_base, exist_ok=True)
    for folder in bucket_dirs.values():
        os.makedirs(folder, exist_ok=True)

    lo, hi = uncertain_range
    stats = {"true": 0, "false": 0, "review": 0}

    def _as_float(value):
        """float(value), or None when the value is missing, NaN or not numeric."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return None if number != number else number  # NaN != NaN

    def _as_bool(value) -> bool:
        """Truthiness that treats None/NaN as False (plain bool(NaN) is True)."""
        if value is None:
            return False
        if isinstance(value, str):
            return value.strip().lower() in ("true", "1")
        if isinstance(value, float) and value != value:
            return False
        try:
            return bool(value)
        except (TypeError, ValueError):
            return False

    def _in_band(conf) -> bool:
        return conf is not None and lo <= conf <= hi

    for _, r in df.iterrows():
        p = r.get("image_path")
        if not (isinstance(p, str) and os.path.isfile(p)):
            continue
        b = os.path.basename(p)
        status = str(r.get("status", ""))
        has_bbox = _as_bool(r.get("has_bbox", False))
        label = _as_bool(r.get(label_col, False))
        vlm_conf = _as_float(r.get("vlm_conf", 0.0))
        vote_conf = _as_float(r.get("vote_conf_mean"))

        uncertain = (status != "ok") or (not has_bbox) or _in_band(vlm_conf) or _in_band(vote_conf)
        key = "review" if uncertain else ("true" if label else "false")
        try:
            shutil.copy2(p, os.path.join(bucket_dirs[key], b))
        except OSError as exc:
            # Best effort: keep going, but report the failure instead of hiding it.
            print("No pude copiar:", p, "->", exc)
            continue
        stats[key] += 1
    return stats


def build_html_gallery(df, out_dir: str, copy_images: bool = True, cols: int = 4, max_width: int = 420) -> str:
    """Write a static HTML gallery for visual validation and return the path to ``index.html``.

    One card is rendered per row whose ``image_path`` is an existing file. The
    image border is green when ``final_label`` is truthy and red otherwise; the
    card also shows ``vlm_conf``, ``vote_conf_mean`` (blank when missing) and the
    first 120 characters of ``final_reason``.

    Args:
        df: DataFrame with an ``image_path`` column plus the fields listed above.
        out_dir: Output folder (created if needed).
        copy_images: Copy images into ``out_dir/imgs`` and reference them
            relatively; otherwise, or when a copy fails, the original path is used.
        cols: Number of grid columns.
        max_width: Maximum image width in pixels.
    """
    import html
    import shutil

    os.makedirs(out_dir, exist_ok=True)
    imgs_dir = os.path.join(out_dir, "imgs")
    if copy_images:
        os.makedirs(imgs_dir, exist_ok=True)

    cards = []
    for _, r in df.iterrows():
        p = r.get("image_path")
        if not (isinstance(p, str) and os.path.isfile(p)):
            continue
        b = os.path.basename(p)
        label = bool(r.get("final_label", False))
        vlm_conf = float(r.get("vlm_conf", 0.0))
        vote_conf = r.get("vote_conf_mean")
        vote_conf = float(vote_conf) if (pd is not None and pd.notna(vote_conf)) else None

        # An empty CSV cell comes back as NaN; show it as blank rather than "nan".
        raw_reason = r.get("final_reason", "")
        if raw_reason is None or (isinstance(raw_reason, float) and raw_reason != raw_reason):
            raw_reason = ""
        reason = str(raw_reason)[:120]

        if copy_images:
            dst_rel = f"imgs/{b}"
            try:
                shutil.copy2(p, os.path.join(imgs_dir, b))
            except Exception:
                dst_rel = p  # fall back to the original location
        else:
            dst_rel = p
        border = "#2e7d32" if label else "#c62828"

        # The reason text is model-generated and file names are arbitrary:
        # escape everything interpolated into the markup.
        cards.append((
            html.escape(dst_rel, quote=True),
            border,
            label,
            vlm_conf,
            vote_conf,
            html.escape(reason),
            html.escape(b, quote=True),
        ))

    html_path = os.path.join(out_dir, "index.html")
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(f"""<!doctype html>
<html lang="es"><head><meta charset="utf-8"/>
<title>Revisión de paradas</title>
<style>
body{{font-family:system-ui,Segoe UI,Arial,sans-serif;background:#fafafa;margin:16px}}
.grid{{display:grid;grid-template-columns:repeat({cols},1fr);gap:14px}}
.card{{background:white;border-radius:12px;box-shadow:0 2px 8px rgba(0,0,0,.08);padding:10px}}
.card img{{width:100%;height:auto;max-width:{max_width}px;border-radius:10px;border:4px solid var(--b)}}
.meta{{font-size:12px;color:#333;margin-top:6px;white-space:pre-wrap}}
.badge{{display:inline-block;padding:2px 8px;border-radius:999px;background:#eee;margin-right:6px;font-size:12px}}
.true{{background:#e8f5e9}} .false{{background:#ffebee}}
</style></head><body>
<h1>Revisión de paradas</h1>
<div class="badge true">Verde=True</div><div class="badge false">Rojo=False</div>
<div class="grid">""")
        for src_attr, border, label, vlm_conf, vote_conf, reason_html, name_html in cards:
            vote_txt = "" if vote_conf is None else f"{vote_conf:.2f}"
            f.write(f"""
<div class="card" style="--b:{border}">
  <img src="{src_attr}" alt="{name_html}">
  <div class="meta">
    <b>{'TRUE' if label else 'FALSE'}</b> · vlm={vlm_conf:.2f} · vote={vote_txt}<br>
    <b>file:</b> {name_html}<br>
    <b>reason:</b> {reason_html}
  </div>
</div>""")
        f.write("""
</div></body></html>""")
    return html_path


def posthoc_recalibrate_and_bucket(results_csv: str, images_dir: str, out_dir: str,
                                   vote_thr: float = 0.45, vlm_thr: float = 0.60,
                                   uncertain_range: Tuple[float, float] = (0.30, 0.55),
                                   make_gallery: bool = False) -> Dict[str, Any]:
    """Relabel an existing results CSV with new thresholds (no re-inference) and bucket the images.

    A row is relabelled positive when ``vote_is_bus_stop`` is set and
    ``vote_conf_mean >= vote_thr`` (a missing mean counts as 0.0), or when
    ``vlm_is_bus_stop`` is set and ``vlm_conf >= vlm_thr``. Images are then copied
    to ``out_dir/true|false|review`` and, optionally, an HTML gallery is written
    to ``out_dir/galeria``.

    Returns:
        ``{"counts": <per-bucket copy counts>}`` plus ``"gallery"`` (path to the
        generated ``index.html``) when ``make_gallery`` is true.

    Raises:
        RuntimeError: pandas is not available.
    """
    if pd is None:
        raise RuntimeError("Pandas no disponible. Instala: pip install pandas")
    df = join_csv_with_images(results_csv, images_dir)
    # Copy the filtered frame: columns are assigned below, and doing that on a
    # slice triggers pandas' SettingWithCopyWarning.
    df = df[df["exists"]].copy()

    # Recalibrated label.
    new_true = []
    for _, r in df.iterrows():
        vlm_ok = bool(r.get("vlm_is_bus_stop", False))
        vlm_conf = float(r.get("vlm_conf", 0.0))
        vote_ok = bool(r.get("vote_is_bus_stop", False))
        vote_conf = r.get("vote_conf_mean")
        vote_conf = float(vote_conf) if pd.notna(vote_conf) else 0.0
        is_true = (vote_ok and vote_conf >= vote_thr) or (vlm_ok and vlm_conf >= vlm_thr)
        new_true.append(bool(is_true))
    df["new_true"] = new_true

    # bucketize_for_review and build_html_gallery both read `final_label`.
    # Previously `new_true` was computed and then ignored, so the recalibration
    # never reached the buckets or the gallery. `df` is local to this call, so
    # overwriting the column does not affect the CSV on disk.
    df["final_label"] = df["new_true"]

    stats = bucketize_for_review(df, out_dir, uncertain_range=uncertain_range)

    out = {"counts": stats}
    if make_gallery:
        html = build_html_gallery(df, os.path.join(out_dir, "galeria"), copy_images=True)
        out["gallery"] = html
    return out


# ======================
# Debug de crops (exportar FULL+BOX y CROP)
# ======================

def _draw_box(img: Image.Image, bbox_xywh: Tuple[int,int,int,int], color=(255,0,0), width: int = 3) -> Image.Image:
    """Return a copy of ``img`` with a rectangular outline drawn for ``bbox_xywh``.

    ``bbox_xywh`` is unpacked as (center_x, center_y, width, height). The outline
    is built from ``width`` nested rectangles, each one pixel further out than the
    previous one. The input image is left untouched.
    """
    center_x, center_y, box_w, box_h = bbox_xywh
    half_w, half_h = box_w // 2, box_h // 2
    left, top = center_x - half_w, center_y - half_h
    right, bottom = center_x + half_w, center_y + half_h

    annotated = img.copy()
    pen = ImageDraw.Draw(annotated)
    for grow in range(width):
        corners = [left - grow, top - grow, right + grow, bottom + grow]
        pen.rectangle(corners, outline=color)
    return annotated


def export_debug_crops(images_dir: str, out_dir: str, sample_n: int = 30,
                       r_thr: int = 170, gb_thr: int = 120, pad: float = 0.3,
                       seed: Optional[int] = None) -> Dict[str,int]:
    """Export debug images showing which box was detected on a sample of images.

    For each sampled image, ``detect_red_box_bbox`` is called with ``r_thr`` and
    ``gb_thr``. When it returns None the image is saved as ``MISS_<name>``;
    otherwise ``FULL_<name>`` (image with the box outlined) and ``CROP_<name>``
    (result of ``crop_from_bbox`` with ``pad``) are saved.

    Args:
        images_dir: Folder searched recursively for ``.png``/``.jpg``/``.jpeg`` files.
        out_dir: Output folder (created if needed).
        sample_n: Maximum number of images; a falsy value processes all of them.
        r_thr, gb_thr, pad: Forwarded to the helpers named above.
        seed: Optional seed for a reproducible sample. ``None`` keeps using the
            module-level ``random`` state, as before.

    Returns:
        ``{"ok": <images with a box>, "miss": <images without one>}``.
    """
    import random

    os.makedirs(out_dir, exist_ok=True)
    paths: List[str] = []
    for root, _, files in os.walk(images_dir):
        for fn in files:
            if fn.lower().endswith((".png", ".jpg", ".jpeg")):
                paths.append(os.path.join(root, fn))
    paths.sort()
    if sample_n and len(paths) > sample_n:
        sampler = random if seed is None else random.Random(seed)
        paths = sampler.sample(paths, sample_n)

    ok = miss = 0
    for p in paths:
        name = os.path.basename(p)
        # Release the file handle as soon as the pixels are in memory.
        with Image.open(p) as src:
            img = src.convert("RGB")
        bbox = detect_red_box_bbox(img, r_thr=r_thr, gb_thr=gb_thr)
        if bbox is None:
            miss += 1
            img.save(os.path.join(out_dir, f"MISS_{name}"))
            continue
        ok += 1
        crop = crop_from_bbox(img, bbox, pad=pad)
        with_box = _draw_box(img, bbox)
        with_box.save(os.path.join(out_dir, f"FULL_{name}"))
        crop.save(os.path.join(out_dir, f"CROP_{name}"))
    return {"ok": ok, "miss": miss}


# ======================
# Ejemplo de uso (referencia)
# ======================
"""
# Requisitos (ejecutar una vez por sesión):
# Opción estable:
# !pip -q install -U "transformers>=4.44.2" "accelerate>=0.33.0" "qwen-vl-utils>=0.0.8" "bitsandbytes>=0.43.0"
# Si ves errores de 'qwen2_vl' o de configuración, instala desde fuente (más reciente):
# !pip -q install -U git+https://github.com/huggingface/transformers
# Reiniciar el runtime si te lo pide.

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch

model_name = "Qwen/Qwen2-VL-7B-Instruct"

# Carga recomendada para T4 (FP16). Para errores de mapeo ('qwen2_vl' / config), usa la última versión de Transformers
# o instala desde fuente: pip install -U git+https://github.com/huggingface/transformers
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,         # T4 no soporta bfloat16
    device_map="auto",
    trust_remote_code=True,
)

# Ahora correr el pipeline
run_pipeline(
    images_dir=IMAGES_DIR,
    output_csv=OUTPUT_CSV,
    model=model,
    processor=processor,
    use_meta_bbox=USE_META_BBOX,
    path_bbox_meta=PATH_BBOX_META,
    use_geo_gate=USE_GEO_GATE,
    path_roads=PATH_ROADS,
    use_buildings_gate=USE_BUILDINGS_GATE,
    path_buildings=PATH_BUILDINGS,
    clf_path=None,   # o ".../clf_siglip.joblib" si ya entrenaste
    siglip_device="cuda",
)
"""


'\n# Requisitos (ejecutar una vez por sesión):\n# Opción estable:\n# !pip -q install -U "transformers>=4.44.2" "accelerate>=0.33.0" "qwen-vl-utils>=0.0.8" "bitsandbytes>=0.43.0"\n# Si ves errores de \'qwen2_vl\' o de configuración, instala desde fuente (más reciente):\n# !pip -q install -U git+https://github.com/huggingface/transformers\n# Reiniciar el runtime si te lo pide.\n\nfrom transformers import Qwen2VLForConditionalGeneration, AutoProcessor\nimport torch\n\nmodel_name = "Qwen/Qwen2-VL-7B-Instruct"\n\n# Carga recomendada para T4 (FP16). Para errores de mapeo (\'qwen2_vl\' / config), usa la última versión de Transformers\n# o instala desde fuente: pip install -U git+https://github.com/huggingface/transformers\nprocessor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\nmodel = Qwen2VLForConditionalGeneration.from_pretrained(\n    model_name,\n    torch_dtype=torch.float16,         # T4 no soporta bfloat16\n    device_map="auto",\n    trust_remote_code=True,\n)\

In [None]:
# Requirements (run once per session):
# Stable option:
# !pip -q install -U "transformers>=4.44.2" "accelerate>=0.33.0" "qwen-vl-utils>=0.0.8" "bitsandbytes>=0.43.0"
# NOTE(review): Qwen2-VL support appears to have first shipped in transformers 4.45.0,
# so the ">=4.44.2" floor may be too low — confirm and raise the pin if so.
# If you see 'qwen2_vl' or configuration errors, install from source (latest):
# !pip -q install -U git+https://github.com/huggingface/transformers
# Restart the runtime if prompted.

from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch

model_name = "Qwen/Qwen2-VL-7B-Instruct"

# Recommended load for a T4 (FP16). For mapping errors ('qwen2_vl' / config), use the latest Transformers release
# or install from source: pip install -U git+https://github.com/huggingface/transformers
# NOTE(review): this rebinds `model`, replacing the 4-bit quantized model loaded
# at the top of the notebook — confirm which of the two loads is intended.
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,         # T4 does not support bfloat16
    device_map="auto",
    trust_remote_code=True,
)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 25/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 50/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 75/754 imágenes...
Procesadas 100/754 imágenes...
Procesadas 125/754 imágenes...
Procesadas 150/754 imágenes...
Procesadas 175/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 200/754 imágenes...
Procesadas 225/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 250/754 imágenes...
Procesadas 275/754 imágenes...
Procesadas 300/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 325/754 imágenes...
Procesadas 350/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 375/754 imágenes...
Procesadas 400/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 425/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 450/754 imágenes...
Procesadas 475/754 imágenes...
Procesadas 500/754 imágenes...
Procesadas 525/754 imágenes...
Procesadas 550/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 575/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 600/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 625/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 650/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 675/754 imágenes...
Procesadas 700/754 imágenes...
Procesadas 725/754 imágenes...
Procesadas 750/754 imágenes...
Listo. Guardado CSV en: /content/drive/My Drive/Tesis/Resultados/verificacion_paradas_v1.csv


In [None]:
# Run the verification pipeline over IMAGES_DIR and write the results to OUTPUT_CSV.
# Arguments not listed here fall back to the defaults in run_pipeline's signature.
run_pipeline(
    images_dir=IMAGES_DIR,
    output_csv=OUTPUT_CSV,
    model=model,
    processor=processor,
    recursive=True,   # set to True if the images live in subfolders
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 25/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 50/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 75/754 imágenes...
Procesadas 100/754 imágenes...
Procesadas 125/754 imágenes...
Procesadas 150/754 imágenes...
Procesadas 175/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 200/754 imágenes...
Procesadas 225/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 250/754 imágenes...
Procesadas 275/754 imágenes...
Procesadas 300/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 325/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 350/754 imágenes...
Procesadas 375/754 imágenes...
Procesadas 400/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 425/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 450/754 imágenes...
Procesadas 475/754 imágenes...
Procesadas 500/754 imágenes...
Procesadas 525/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 550/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 575/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 600/754 imágenes...
Procesadas 625/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 650/754 imágenes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Procesadas 675/754 imágenes...
Procesadas 700/754 imágenes...
Procesadas 725/754 imágenes...
Procesadas 750/754 imágenes...
Listo. Guardado CSV en: /content/drive/My Drive/Tesis/Resultados/verificacion_paradas_v1.csv


In [None]:
# === Bucket images into true/false/review (+ optional: recalibration and gallery) ===
import os, re, shutil
import pandas as pd

# ---------- CONFIG (adjust) ----------
# NOTE(review): IMAGES_DIR, UNCERTAIN_RANGE and VOTE_THR are rebound here at notebook
# scope. The run_pipeline(...) cell passes IMAGES_DIR, and the verification code
# defined earlier reads names VOTE_THR and UNCERTAIN_RANGE (apparently as globals),
# so re-running the pipeline after this cell may pick up these values — confirm
# this is intended or rename the constants.
RESULTS_CSV = "/content/drive/My Drive/Tesis/Resultados/verificacion_paradas_v1.csv"
IMAGES_DIR  = "/content/drive/My Drive/Tesis/Datos/Ruta8/paradas_no_seguras"
OUT_DIR     = "/content/drive/My Drive/Tesis/Resultados/revision_paradas_v1"
UNCERTAIN_RANGE = (0.35, 0.60)  # rows with intermediate confidence => review

# Recalibration (more permissive). If False, the CSV labels are used as-is.
USE_RECALIBRATION = True
VOTE_THR   = 0.45   # positive if vote_is_bus_stop and vote_conf_mean >= VOTE_THR
VLM_THR    = 0.60   # or if vlm_is_bus_stop and vlm_conf >= VLM_THR

# HTML gallery (optional)
MAKE_GALLERY = True
GALLERY_DIR  = os.path.join(OUT_DIR, "galeria")
GALLERY_COLS = 4
GALLERY_MAXW = 420
# -------------------------------------

# Create the output root and the three destination folders up front.
os.makedirs(OUT_DIR, exist_ok=True)
OUT_TRUE  = os.path.join(OUT_DIR, "true");  os.makedirs(OUT_TRUE, exist_ok=True)
OUT_FALSE = os.path.join(OUT_DIR, "false"); os.makedirs(OUT_FALSE, exist_ok=True)
OUT_REV   = os.path.join(OUT_DIR, "review");os.makedirs(OUT_REV, exist_ok=True)

def collect_image_index(images_dir: str):
    """Recursively index ``images_dir`` as {basename -> full path}.

    Only ``.png``/``.jpg``/``.jpeg`` files (case-insensitive) are considered. If a
    basename occurs more than once, the shortest path is kept (first one on ties)
    and a single notice with the number of duplicated names is printed.
    """
    index = {}
    repeated_names = set()
    for folder, _subdirs, names in os.walk(images_dir):
        for name in names:
            if not name.lower().endswith((".png", ".jpg", ".jpeg")):
                continue
            full_path = os.path.join(folder, name)
            key = os.path.basename(name)
            if key not in index:
                index[key] = full_path
                continue
            repeated_names.add(key)
            # simple heuristic: prefer the shorter path
            if len(full_path) < len(index[key]):
                index[key] = full_path
    if repeated_names:
        print(f"Aviso: {len(repeated_names)} nombres de archivo duplicados; se eligió un path por nombre.")
    return index

def sanitize(s: str, maxlen=120) -> str:
    """Return ``str(s)`` cleaned up for display.

    The characters ``\\ / : * ? " < > |`` become ``_``, every whitespace run is
    collapsed to one space, the ends are stripped, and the result is cut to at
    most ``maxlen`` characters.
    """
    without_reserved = re.sub(r'[\\/:*?"<>|]', "_", str(s))
    single_spaced = re.sub(r"\s+", " ", without_reserved)
    trimmed = single_spaced.strip()
    return trimmed[:maxlen]

# 1) Load the CSV and map each row to its image
if not os.path.isfile(RESULTS_CSV):
    raise FileNotFoundError(f"No existe CSV: {RESULTS_CSV}")
if not os.path.isdir(IMAGES_DIR):
    raise FileNotFoundError(f"No existe carpeta de imágenes: {IMAGES_DIR}")

df = pd.read_csv(RESULTS_CSV)
if "filename" not in df.columns:
    raise ValueError("El CSV debe tener columna 'filename' (nombre del archivo).")

# Images are looked up by basename, so `filename` may hold a bare name or a path.
idx = collect_image_index(IMAGES_DIR)
df["image_path"] = df["filename"].apply(lambda x: idx.get(os.path.basename(str(x))))
df["exists"] = df["image_path"].apply(lambda p: bool(isinstance(p, str) and os.path.isfile(p)))
# Keep only rows whose image was found; .copy() so later column assignments
# operate on an independent frame rather than a slice of `df`.
df_ex = df[df["exists"]].copy()

print("Total filas CSV:", len(df))
print("Con imagen encontrada:", len(df_ex))
print("Sin imagen:", len(df) - len(df_ex))

# 2) Decidir etiqueta final (con o sin recalibración)
def decide_label(row):
    """Return the label used to bucket one CSV row.

    With ``USE_RECALIBRATION`` off, the stored ``final_label`` is returned as a
    bool. Otherwise the row is positive when the vote flag is set and its mean
    confidence reaches ``VOTE_THR`` (a missing mean counts as 0.0), or when the
    VLM flag is set and its confidence reaches ``VLM_THR``.
    """
    if USE_RECALIBRATION:
        # More permissive recalibration
        vlm_flag = bool(row.get("vlm_is_bus_stop", False))
        vlm_score = float(row.get("vlm_conf", 0.0))
        vote_flag = bool(row.get("vote_is_bus_stop", False))
        vote_mean = row.get("vote_conf_mean")
        if pd.notna(vote_mean):
            vote_mean = float(vote_mean)
        else:
            vote_mean = 0.0
        vote_pass = vote_flag and vote_mean >= VOTE_THR
        vlm_pass = vlm_flag and vlm_score >= VLM_THR
        return vote_pass or vlm_pass
    return bool(row.get("final_label", False))

# Label every matched row (recalibrated, or taken from the CSV, per USE_RECALIBRATION).
df_ex["label_new"] = df_ex.apply(decide_label, axis=1)

# 3) Send to true / false / review according to uncertainty
lo, hi = UNCERTAIN_RANGE
# Copy counters per destination; "omitidas" counts copies that failed.
cop = {"true":0, "false":0, "review":0, "omitidas":0}

for _, r in df_ex.iterrows():
    p = r["image_path"]
    b = os.path.basename(p)

    status = str(r.get("status", ""))
    has_bbox = bool(r.get("has_bbox", False))
    vlm_conf = float(r.get("vlm_conf", 0.0))
    vote_conf = r.get("vote_conf_mean")
    vote_conf = float(vote_conf) if pd.notna(vote_conf) else None

    # uncertain if: status is not "ok", no bbox, or a confidence inside [lo, hi]
    # NOTE(review): the recorded run sent all 754 images to review. A missing
    # `status`/`has_bbox` column (defaults "" / False) or status values other
    # than exactly "ok" would cause that — check the CSV contents.
    uncertain = (
        status != "ok" or (not has_bbox) or
        (lo <= vlm_conf <= hi) or
        (vote_conf is not None and lo <= vote_conf <= hi)
    )

    lbl = bool(r["label_new"])
    dst = OUT_REV if uncertain else (OUT_TRUE if lbl else OUT_FALSE)

    # Output name carries the label and confidences: T|F_vlm<conf>[_vote<conf>]_<original name>
    name_out = f"{'T' if lbl else 'F'}_vlm{vlm_conf:.2f}{'' if vote_conf is None else f'_vote{vote_conf:.2f}'}_{b}"
    try:
        shutil.copy2(p, os.path.join(dst, name_out))
        key = "review" if dst==OUT_REV else ("true" if dst==OUT_TRUE else "false")
        cop[key] += 1
    except Exception as e:
        # Best effort: count and report the failure, then continue with the next image.
        cop["omitidas"] += 1
        print("No pude copiar:", p, "->", e)

print("\nCopias:", cop)
print("Carpetas listas:\n", OUT_TRUE, "\n", OUT_FALSE, "\n", OUT_REV)

# 4) (Optional) HTML gallery with one card per image
if MAKE_GALLERY:
    os.makedirs(GALLERY_DIR, exist_ok=True)
    imgs_dir = os.path.join(GALLERY_DIR, "imgs"); os.makedirs(imgs_dir, exist_ok=True)

    # Collect one tuple per card: (img src, border color, label, vlm conf, vote conf, reason, file name).
    rows = []
    for _, r in df_ex.iterrows():
        p = r["image_path"]
        if not os.path.isfile(p):
            continue
        b = os.path.basename(p)
        lbl = bool(r["label_new"])
        vlm_conf = float(r.get("vlm_conf", 0.0))
        vote_conf = r.get("vote_conf_mean")
        vote_conf = float(vote_conf) if pd.notna(vote_conf) else None
        reason = sanitize(r.get("final_reason", ""))

        # Prefer a copy inside the gallery folder (relative src); if the copy
        # fails, point the card at the original file instead.
        dst_rel = f"imgs/{b}"
        try:
            shutil.copy2(p, os.path.join(imgs_dir, b))
        except Exception:
            dst_rel = p

        # Green border for a positive label, red otherwise.
        border = "#2e7d32" if lbl else "#c62828"
        rows.append((dst_rel, border, lbl, vlm_conf, vote_conf, reason, b))

    # Write the page: header + styles, one card per row, then the closing tags.
    html = os.path.join(GALLERY_DIR, "index.html")
    with open(html, "w", encoding="utf-8") as f:
        f.write(f"""<!doctype html>
<html lang="es"><head><meta charset="utf-8"/>
<title>Revisión de paradas</title>
<style>
body{{font-family:system-ui,Segoe UI,Arial,sans-serif;background:#fafafa;margin:16px}}
.grid{{display:grid;grid-template-columns:repeat({GALLERY_COLS},1fr);gap:14px}}
.card{{background:white;border-radius:12px;box-shadow:0 2px 8px rgba(0,0,0,.08);padding:10px}}
.card img{{width:100%;height:auto;max-width:{GALLERY_MAXW}px;border-radius:10px;border:4px solid var(--b)}}
.meta{{font-size:12px;color:#333;margin-top:6px;white-space:pre-wrap}}
.badge{{display:inline-block;padding:2px 8px;border-radius:999px;background:#eee;margin-right:6px;font-size:12px}}
.true{{background:#e8f5e9}} .false{{background:#ffebee}}
</style></head><body>
<h1>Revisión de paradas</h1>
<div class="badge true">Verde=True</div> <div class="badge false">Rojo=False</div>
<div class="grid">""")
        for dst_rel, border, lbl, vc, votec, reason, b in rows:
            f.write(f"""
<div class="card" style="--b:{border}">
  <img src="{dst_rel}" alt="{b}">
  <div class="meta">
    <b>{'TRUE' if lbl else 'FALSE'}</b> · vlm={vc:.2f} · vote={'' if votec is None else f'{votec:.2f}'}<br>
    <b>file:</b> {b}<br>
    <b>reason:</b> {reason}
  </div>
</div>""")
        f.write("""
</div></body></html>""")
    print("Galería:", html)



Total filas CSV: 754
Con imagen encontrada: 754
Sin imagen: 0

Copias: {'true': 0, 'false': 0, 'review': 754, 'omitidas': 0}
Carpetas listas:
 /content/drive/My Drive/Tesis/Resultados/revision_paradas_v1/true 
 /content/drive/My Drive/Tesis/Resultados/revision_paradas_v1/false 
 /content/drive/My Drive/Tesis/Resultados/revision_paradas_v1/review
Galería: /content/drive/My Drive/Tesis/Resultados/revision_paradas_v1/galeria/index.html
