# Paquetes necesarios

In [None]:
import cv2  
import math 

from ultralytics import YOLO

Modelos preentrenados, visualizando con las utilidades de ultralytics

In [None]:
# Load the model. Swap the checkpoint to change the task:
#model = YOLO('yolo11n.pt')      # bounding boxes
#model = YOLO('yolo11n-seg.pt')  # segmentation masks
model = YOLO('yolo11n-pose.pt')  # pose estimation

# Run inference on a video file, showing every annotated frame on screen.
filename = "TGC23_PdH_C0056cut.mp4"
try:
    # stream=True makes the call return a generator that yields one result per
    # frame. Without it the results of every frame are collected in a list,
    # which grows with the length of the video. Nothing is done with the
    # individual results here: the on-screen display (show=True) is the output.
    for frame_result in model(filename, show=True, stream=True):
        pass
finally:
    # Close the display windows even if the run is interrupted.
    cv2.destroyAllWindows()

Desde cámara, detección con yolo11, modelo nano. Visualización propia con OpenCV

In [None]:
# Load the model; the weights are downloaded to disk if not already in the folder
model = YOLO('yolo11n.pt')  # bounding-box detection

# Class labels, indexed by the numeric class id reported for each detection
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]


# Capture from the default webcam
vid = cv2.VideoCapture(0)

try:
    while True:
        # Frame by frame
        ret, img = vid.read()

        # Stop as soon as no frame can be read. Polling on would spin forever
        # when the camera is unavailable: no window has been created yet, so
        # the ESC key below could never be received.
        if not ret:
            break

        # Run detection on the frame
        results = model(img, stream=True)

        # For each result
        for r in results:
            boxes = r.boxes

            for box in boxes:
                # Bounding box, converted to integer pixel coordinates
                x1, y1, x2, y2 = box.xyxy[0]
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)

                # Confidence, rounded up to two decimals
                confidence = math.ceil((box.conf[0]*100))/100
                print("Confianza --->",confidence)

                # Class
                cls = int(box.cls[0])
                print("Clase -->", classNames[cls])

                # Map the numeric class id onto an (R, G, B) colour: the scale
                # fills the red channel first, then green, then blue.
                escala = int((cls / len(classNames)) * 255 * 3)
                if escala >= 255*2:
                    R, G, B = 255, 255, escala - 255*2
                elif escala >= 255:
                    R, G, B = 255, escala - 255, 0
                else:
                    R, G, B = escala, 0, 0

                # Draw the box and the class name. OpenCV takes colours in
                # B, G, R order, so the channels are passed in that order.
                cv2.rectangle(img, (x1, y1), (x2, y2), (B, G, R), 3)
                cv2.putText(img, classNames[cls], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (B, 0, 255), 2)

        # Show the frame
        cv2.imshow('Vid', img)

        # Stop when ESC is pressed
        if cv2.waitKey(20) == 27:
            break
finally:
    # Always release the capture device and close the windows, even when the
    # loop is left through an exception or a kernel interrupt.
    vid.release()
    cv2.destroyAllWindows()

Seguimiento. Requiere instalar lap con pip install lap

In [None]:
from collections import defaultdict
import numpy as np

# Load the model; the weights are downloaded to disk if not already in the folder
model = YOLO('yolo11n.pt')  # bounding-box detection

# Class labels for the three class ids requested from the tracker below
classNames = ["person", "bicycle", "car"]


# Capture from the default webcam
vid = cv2.VideoCapture(0)

try:
    while True:
        # Frame by frame
        ret, img = vid.read()

        # Stop as soon as no frame can be read. Polling on would spin forever
        # when the camera is unavailable: no window has been created yet, so
        # the ESC key below could never be received.
        if not ret:
            break

        # Tracking, with persistence between frames, restricted to ids 0, 1, 2
        results = model.track(img, persist=True, classes = [0,1,2])

        # For each result
        for r in results:
            boxes = r.boxes

            for box in boxes:
                # Bounding box, converted to integer pixel coordinates
                x1, y1, x2, y2 = box.xyxy[0]
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)

                # Tracking label; a box may carry no id, in which case it is left blank
                if box.id is not None:
                    track_id = str(int(box.id[0].tolist()))
                else:
                    track_id = ''

                # Confidence, rounded up to two decimals
                confidence = math.ceil((box.conf[0]*100))/100
                print("Confianza --->",confidence)

                # Class
                cls = int(box.cls[0])
                print("Clase -->", classNames[cls])

                # Map the numeric class id onto an (R, G, B) colour: the scale
                # fills the red channel first, then green, then blue.
                escala = int((cls / len(classNames)) * 255 * 3)
                if escala >= 255*2:
                    R, G, B = 255, 255, escala - 255*2
                elif escala >= 255:
                    R, G, B = 255, escala - 255, 0
                else:
                    R, G, B = escala, 0, 0

                # Draw the box with the track id and class name. OpenCV takes
                # colours in B, G, R order, so the channels are passed that way.
                cv2.rectangle(img, (x1, y1), (x2, y2), (B, G, R), 3)
                cv2.putText(img, track_id + ' ' + classNames[cls], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (B, 0, 255), 2)

        # Show the frame
        cv2.imshow('Vid', img)

        # Stop when ESC is pressed
        if cv2.waitKey(20) == 27:
            break
finally:
    # Always release the capture device and close the windows, even when the
    # loop is left through an exception or a kernel interrupt.
    vid.release()
    cv2.destroyAllWindows()



In [12]:
"""
PIPELINE GPU - TAREA 4.6
- Usa GPU si está disponible
- Todo en vídeo + CSV
- Limpio y rápido
"""

# ========================= CONFIGURATION =========================
# Input video. The two output file names below are joined onto OUTPUT_DIR
# further down in this cell, so they end up inside that folder.
VIDEO_IN_PATH = "C0142.MP4"
VIDEO_OUT_PATH = "salida_anonimizada.mp4"
CSV_OUT_PATH = "detecciones.csv"
OUTPUT_DIR = "outputs"

# Weights for the general detector/tracker and for the licence-plate detector.
GENERAL_MODEL = "yolo11n.pt"
PLATE_MODEL = "best.pt"

# Confidence passed as `conf` to the plate model's predict() call.
CONF_THRESHOLD = 0.25
# Gaussian kernel size used by blur_region for regions wider than 30 px
# (blur_region bumps an even value to the next odd one).
BLUR_INTENSITY = 51
# Plate detection method; "yolo" is the only value the code in this cell acts on.
PLATE_METHOD = "yolo"
# Language code handed to easyocr.Reader.
OCR_LANG = "en"
# A track counts as moving left/right only if the horizontal distance between
# its first and last box centres is greater than this value.
MOVEMENT_THRESHOLD = 50

# =================================================================
import os, csv, cv2, numpy as np, matplotlib.pyplot as plt
from ultralytics import YOLO
import easyocr
from collections import defaultdict
import torch
%matplotlib inline

# Pick the compute device: CUDA when torch can see a GPU, the CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Dispositivo: {device.upper()}")
if device == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Make sure the output folder exists and place both output files inside it
os.makedirs(OUTPUT_DIR, exist_ok=True)
CSV_OUT_PATH = os.path.join(OUTPUT_DIR, CSV_OUT_PATH)
VIDEO_OUT_PATH = os.path.join(OUTPUT_DIR, VIDEO_OUT_PATH)

# Models, moved to the selected device. The plate detector is only loaded
# when the "yolo" plate method is configured.
model_general = YOLO(GENERAL_MODEL).to(device)
if PLATE_METHOD == "yolo":
    model_plate = YOLO(PLATE_MODEL).to(device)
else:
    model_plate = None
reader = easyocr.Reader([OCR_LANG], gpu=(device == 'cuda'))

# Class labels, indexed by the numeric class id of each detection
classNames = [
    "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
    "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
    "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
    "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
    "teddy bear", "hair drier", "toothbrush",
]
# Labels that are searched for a licence plate
VEHICLE_CLASSES = {"bicycle", "bus", "car", "motorbike", "truck"}

# Utilidades
def blur_region(img, x1, y1, x2, y2):
    """Gaussian-blur the rectangle (x1, y1)-(x2, y2) of ``img`` in place.

    The rectangle is clipped to the image bounds first. If nothing is left
    after clipping, the image is returned untouched. Regions wider than 30
    pixels use ``BLUR_INTENSITY`` as kernel size, narrower ones use 15; an
    even kernel size is bumped to the next odd value.

    Args:
        img: Image array indexed as ``img[row, column]``; it is modified in place.
        x1, y1: Top-left corner of the region.
        x2, y2: Bottom-right corner of the region (exclusive).

    Returns:
        The same ``img`` object that was passed in.
    """
    height, width = img.shape[:2]
    left, top = max(x1, 0), max(y1, 0)
    right, bottom = min(x2, width), min(y2, height)

    if right > left and bottom > top:
        # Wide regions get the configured kernel, narrow ones a smaller one
        kernel = BLUR_INTENSITY if (right - left) > 30 else 15
        if kernel % 2 != 1:
            kernel += 1
        img[top:bottom, left:right] = cv2.GaussianBlur(
            img[top:bottom, left:right], (kernel, kernel), 0
        )
    return img

# Processing
# The capture is opened only to read the stream metadata. The frames are
# decoded by the tracker further down, so it is released right away.
cap = cv2.VideoCapture(VIDEO_IN_PATH)
if not cap.isOpened():
    cap.release()
    raise IOError(f"No se pudo abrir el vídeo: {VIDEO_IN_PATH}")
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 25  # fall back to 25 when 0 is reported
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.release()

class_counts = defaultdict(set)  # label -> set of track ids seen with that label
track_history = {}               # track id -> first/last box centre and label
frame_id = 0

csv_file = open(CSV_OUT_PATH, "w", newline="", encoding="utf-8")
out = cv2.VideoWriter(VIDEO_OUT_PATH, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))

try:
    writer = csv.writer(csv_file)
    writer.writerow(["fotograma", "tipo_objeto", "confianza_objeto", "identificador_tracking",
                     "x1", "y1", "x2", "y2", "matrícula_en_su_caso", "confianza_matricula",
                     "mx1", "my1", "mx2", "my2", "texto_matricula"])

    results_stream = model_general.track(source=VIDEO_IN_PATH, tracker="botsort.yaml", stream=True, device=device)

    print(f"Procesando {total_frames} frames en {device.upper()}...\n")

    for res in results_stream:
        frame_id += 1
        # `clean_frame` keeps the untouched pixels for plate detection and OCR;
        # `frame` is the copy that receives the blurs and the overlays.
        clean_frame = res.orig_img
        frame = clean_frame.copy()

        for box in res.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
            conf = float(box.conf)
            cls_id = int(box.cls)
            track_id = int(box.id) if box.id is not None else -1  # -1: no track id
            label = classNames[cls_id]

            if track_id != -1:
                class_counts[label].add(track_id)
                # Only the horizontal centre is used later for the flow count
                cx = (x1 + x2) // 2
                if track_id not in track_history:
                    track_history[track_id] = {"first": (cx, 0), "label": label}
                track_history[track_id]["last"] = (cx, 0)

            plate_found = False
            mx1, my1, mx2, my2 = -1, -1, -1, -1
            plate_conf = 0.0
            plate_text = ""

            if label == "person":
                frame = blur_region(frame, x1, y1, x2, y2)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
                cv2.putText(frame, f"Persona {track_id}", (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2)

            elif label in VEHICLE_CLASSES:
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 128, 255), 2)
                cv2.putText(frame, f"{label.capitalize()} {track_id}", (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,128,255), 2)

                # Look for the plate on the untouched pixels, so the overlays
                # and blurs already drawn on `frame` cannot disturb the
                # detector. The crop origin is clamped to the image so a
                # negative coordinate cannot wrap around when slicing.
                vx1, vy1 = max(0, x1), max(0, y1)
                crop = clean_frame[vy1:y2, vx1:x2]
                if crop.size > 0 and PLATE_METHOD == "yolo" and model_plate is not None:
                    lp_res = model_plate.predict(crop, conf=CONF_THRESHOLD, device=device, verbose=False)
                    if len(lp_res[0].boxes) > 0:
                        # Only the first plate box of the vehicle is used
                        lbox = lp_res[0].boxes[0]
                        lx1, ly1, lx2, ly2 = map(int, lbox.xyxy[0].cpu().numpy())
                        # Crop-relative coordinates -> full-frame coordinates
                        mx1, my1 = vx1 + lx1, vy1 + ly1
                        mx2, my2 = vx1 + lx2, vy1 + ly2
                        plate_conf = float(lbox.conf)
                        plate_found = True

                if plate_found:
                    # Read the plate BEFORE blurring it: OCR on the blurred
                    # pixels cannot recover the text.
                    plate_crop = clean_frame[my1:my2, mx1:mx2]
                    if plate_crop.size > 0:
                        ocr = reader.readtext(plate_crop)
                        if ocr:
                            plate_text = ocr[0][1].upper().replace(" ", "")
                    frame = blur_region(frame, mx1, my1, mx2, my2)
                    cv2.rectangle(frame, (mx1, my1), (mx2, my2), (0, 255, 0), 2)
                    # NOTE(review): this writes the recognised text onto the
                    # anonymised video next to the blurred plate. Confirm that
                    # is intended; otherwise draw a fixed label here.
                    cv2.putText(frame, f"{plate_text or 'Plate'}", (mx1, my1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

            writer.writerow([frame_id, label, round(conf, 3), track_id, x1, y1, x2, y2,
                             int(plate_found), round(plate_conf, 3), mx1, my1, mx2, my2, plate_text])

        out.write(frame)
        if frame_id % 30 == 0 or frame_id == total_frames:
            print(f"  Frame {frame_id}/{total_frames}")
finally:
    # Finalise the video and flush the CSV even if processing stops half-way
    out.release()
    csv_file.close()

# Flow: per class, count the tracks whose last box centre lies to the right
# or to the left of the first one by more than MOVEMENT_THRESHOLD.
flow = defaultdict(lambda: defaultdict(int))
for data in track_history.values():
    first, last = data.get("first"), data.get("last")
    if not first or not last:
        continue
    dx = last[0] - first[0]
    if abs(dx) > MOVEMENT_THRESHOLD:
        # Named `direction` rather than `dir`: a module-level `dir` would
        # shadow the builtin for the rest of the kernel session.
        direction = "derecha" if dx > 0 else "izquierda"
        flow[data["label"]][direction] += 1

# Report: output locations, unique track ids per class, and flow per class
print("\n" + "="*50)
print("RESULTADOS")
print(f"Vídeo: {VIDEO_OUT_PATH}")
print(f"CSV: {CSV_OUT_PATH}")
print("\nCONTEO:")
for class_name, ids in sorted(class_counts.items()):
    print(f"  {class_name}: {len(ids)}")
print("\nFLUJO:")
for class_name, directions in sorted(flow.items()):
    print(f"  {class_name}:")
    for direction, count in sorted(directions.items()):
        print(f"    {direction}: {count}")
print("="*50)

Dispositivo: CUDA
GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Procesando 2832 frames en CUDA...


video 1/1 (frame 1/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 23.1ms
video 1/1 (frame 2/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 17.7ms
video 1/1 (frame 3/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 25.0ms
video 1/1 (frame 4/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 14.2ms
video 1/1 (frame 5/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 24.8ms
video 1/1 (frame 6/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 15.1ms
video 1/1 (frame 7/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 12.5ms
video 1/1 (frame 8/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 11.2ms
video 1/1 (frame 9/2832) c:\Users\User\VC-ULPGC\VC_P4\P4\C0142.MP4: 384x640 4 cars, 1 bus, 10.1ms
video 1/1 (frame 10/2832

Integración con seguimiento (tracking)
**Nota:** he tenido que bajar a la versión 3.9.5 de Python e instalar `lap` con `pip install lap`.

In [None]:
# Load the model. Swap the checkpoint to change the task:
model = YOLO('yolo11n.pt')  # bounding boxes
#model = YOLO('yolo11n-seg.pt')   # segmentation masks
#model = YOLO('yolo11n-pose.pt')  # pose estimation

# Track objects through a video file, showing every annotated frame on screen.
filename = "TGC23_PdH_C0056cut.mp4"
try:
    # stream=True makes the call return a generator that yields one result per
    # frame. Without it the results of every frame are collected in a list,
    # which grows with the length of the video. Nothing is done with the
    # individual results here: the on-screen display (show=True) is the output.
    # BoT-SORT is the default tracker; pass tracker="bytetrack.yaml" for ByteTrack.
    for frame_result in model.track(source=filename, show=True, stream=True):
        pass
finally:
    # Close the display windows even if the run is interrupted.
    cv2.destroyAllWindows()