# DEMO: applicazione di YOLO-World + tracking IoU al conteggio delle mele

Codice di dimostrazione del funzionamento algoritmico (logica in batch)

Il progetto completo permette invece di caricare un video e di osservarne l'elaborazione in tempo reale.

In [1]:
# -----------------------
# LIBRERIE
# -----------------------
import cv2
import supervision as sv
from inference.models.yolo_world.yolo_world import YOLOWorld
import numpy as np
from utils.SimpleIoUTracker import SimpleIoUTracker



In [2]:
# -----------------------
# PARAMETERS
# -----------------------

# Path of the video that is opened with cv2.VideoCapture in the main cell.
INPUT_VIDEO = "apple_tree_video.mp4"
# Output file names. Only OUTPUT_VIDEO_obj_and_track is written by the code
# visible in this notebook; OUTPUT_VIDEO_obj_only is presumably used by a
# detection-only variant elsewhere — TODO confirm.
OUTPUT_VIDEO_obj_only = "output_annotated_obj_only.mp4"
OUTPUT_VIDEO_obj_and_track = "output_annotated_obj_and_track.mp4"

# Value passed as `confidence` to model.infer() for every frame.
CONFIDENCE = 0.05
TRAIL_LENGTH = 30  # trail length --> only used for tracking (TraceAnnotator trace_length)

classes = ["apple", "red apple", "ripe apple", "fruit"]  # YOLO-World text prompt; "yellow apple" could be added as well

In [3]:
# -----------------------
# Selected Model
# -----------------------

# Instantiate the YOLO-World detector once; it is reused for every frame in
# the main loop. "yolo_world/s" is presumably the small variant — confirm
# against the `inference` package documentation.
model = YOLOWorld(model_id="yolo_world/s")

Creating inference sessions




CLIP model loaded in 1.49 seconds


[0;93m2026-02-01 12:34:08.413268 [W:onnxruntime:, coreml_execution_provider.cc:112 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 17 number of nodes in the graph: 985 number of nodes supported by CoreML: 43[m
[0;93m2026-02-01 12:34:09.620445 [W:onnxruntime:, helper.cc:83 IsInputSupported] CoreML does not support input dim > 16384. Input:token_embedding.weight, shape: {49408,512}[m
[0;93m2026-02-01 12:34:09.620659 [W:onnxruntime:, coreml_execution_provider.cc:112 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 15 number of nodes in the graph: 1003 number of nodes supported by CoreML: 32[m


## Main Code

In [4]:
# -----------------------
# VIDEO INPUT
# -----------------------
# Batch pipeline: read every frame of INPUT_VIDEO, run YOLO-World on it, pass
# the detections through SimpleIoUTracker, draw trace / translucent fill /
# box / ID label, and write the annotated frame to OUTPUT_VIDEO_obj_and_track.
# Summary statistics are printed once the whole video has been processed.
cap = cv2.VideoCapture(INPUT_VIDEO)
out = None  # created further down; stays None if we fail before that point

# Running statistics, reported after the loop.
frame_count = 0
total_tracked = 0
unique_ids = set()

try:
    if not cap.isOpened():
        raise RuntimeError("Errore nell'apertura del video")

    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    print(f"ðŸ“¹ Video: {width}x{height} @ {fps}fps, {total_frames} frames")

    # -----------------------
    # VIDEO OUTPUT
    # -----------------------
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(
        OUTPUT_VIDEO_obj_and_track,
        fourcc,
        fps,
        (width, height)
    )
    # A writer that failed to open drops every frame silently, so fail loudly
    # instead of producing no output after a full inference pass.
    if not out.isOpened():
        raise RuntimeError("Errore nell'apertura del video di output")

    # -----------------------
    # TRACKER
    # -----------------------
    tracker = SimpleIoUTracker(iou_threshold=0.3, max_age=60)

    # -----------------------
    # ANNOTATORS
    # -----------------------
    box_annotator = sv.BoxAnnotator(color=sv.Color.RED, thickness=2)

    label_annotator = sv.LabelAnnotator(
        text_thickness=1,
        text_scale=0.3,
        text_color=sv.Color.WHITE,
        color=sv.Color.RED
    )

    trace_annotator = sv.TraceAnnotator(
        color=sv.Color.RED,
        position=sv.Position.CENTER,
        trace_length=TRAIL_LENGTH,
        thickness=2
    )

    FILL_COLOR = (0, 0, 255)  # red in OpenCV's BGR channel order
    FILL_ALPHA = 0.5          # weight of the filled overlay in the blend

    # -----------------------
    # FRAME LOOP
    # -----------------------
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1

        results = model.infer(
            frame,
            text=classes,
            confidence=CONFIDENCE
        )
        detections = sv.Detections.from_inference(results)
        detections = tracker.update(detections)

        # sv.Detections.tracker_id is an optional field; whether
        # SimpleIoUTracker always fills it is not visible from here, so treat
        # a missing array as "no tracked objects" instead of crashing on len().
        tracker_ids = detections.tracker_id
        if tracker_ids is None:
            tracker_ids = []

        # Debug output for the first 10 frames
        if frame_count <= 10:
            has_ids = len(tracker_ids) > 0
            ids_str = str(tracker_ids[:5]) if has_ids else "[]"
            print(f"Frame {frame_count}: {len(detections)} det, IDs={ids_str}...")

        # Nothing to draw: write the frame untouched so the output keeps the
        # same number of frames as the input.
        if len(detections) == 0 or len(tracker_ids) == 0:
            out.write(frame)
            continue

        total_tracked += len(tracker_ids)
        unique_ids.update(tracker_ids)

        labels = [str(int(tid)) for tid in tracker_ids]

        annotated_frame = frame.copy()

        # Trace first, so that fill, boxes and labels are drawn on top of it
        annotated_frame = trace_annotator.annotate(
            scene=annotated_frame,
            detections=detections
        )

        # Translucent red fill: paint solid rectangles on a copy, then blend
        overlay = annotated_frame.copy()
        for box in detections.xyxy.astype(int):
            x1, y1, x2, y2 = box
            cv2.rectangle(overlay, (x1, y1), (x2, y2), FILL_COLOR, -1)

        annotated_frame = cv2.addWeighted(
            overlay,
            FILL_ALPHA,
            annotated_frame,
            1 - FILL_ALPHA,
            0
        )

        # Box + label on top
        annotated_frame = box_annotator.annotate(
            scene=annotated_frame,
            detections=detections
        )

        annotated_frame = label_annotator.annotate(
            scene=annotated_frame,
            detections=detections,
            labels=labels
        )

        out.write(annotated_frame)

        if frame_count % 30 == 0:
            print(
                f"ðŸ“¹ Frame {frame_count}: "
                f"{len(tracker_ids)} tracked, "
                f"{len(unique_ids)} ID univoci totali"
            )
finally:
    # -----------------------
    # CLEANUP
    # -----------------------
    # Always release the handles, also when inference/annotation raises;
    # releasing the writer is what finalises the output file.
    cap.release()
    if out is not None:
        out.release()

# Guard the average against a video that opened but yielded no frames.
avg_per_frame = total_tracked / frame_count if frame_count else 0.0

print(f"\nâœ… Video salvato: {OUTPUT_VIDEO_obj_and_track}")
print(f"ðŸ“Š Frame totali: {frame_count}")
print(f"ðŸ“Š Oggetti tracciati totali: {total_tracked}")
print(f"ðŸ“Š ID univoci assegnati: {len(unique_ids)}")
print(f"ðŸ“Š Media detection/frame: {avg_per_frame:.1f}")

ðŸ“¹ Video: 360x640 @ 24.0fps, 192 frames
Frame 1: 19 det, IDs=[1 2 3 4 5]...
Frame 2: 18 det, IDs=[1 3 5 4 2]...
Frame 3: 16 det, IDs=[1 3 2 5 6]...
Frame 4: 16 det, IDs=[ 1  3  7 11 12]...
Frame 5: 18 det, IDs=[ 1  7  3 12  4]...
Frame 6: 19 det, IDs=[1 7 3 4 8]...
Frame 7: 16 det, IDs=[ 7  3  4  1 10]...
Frame 8: 18 det, IDs=[ 7  3  4  1 10]...
Frame 9: 17 det, IDs=[ 7  3  4 10  8]...
Frame 10: 17 det, IDs=[ 3 10  7  1  8]...
ðŸ“¹ Frame 30: 16 tracked, 22 ID univoci totali
ðŸ“¹ Frame 60: 12 tracked, 26 ID univoci totali
ðŸ“¹ Frame 90: 13 tracked, 33 ID univoci totali
ðŸ“¹ Frame 120: 20 tracked, 42 ID univoci totali
ðŸ“¹ Frame 150: 14 tracked, 46 ID univoci totali
ðŸ“¹ Frame 180: 15 tracked, 52 ID univoci totali

âœ… Video salvato: output_annotated_obj_and_track.mp4
ðŸ“Š Frame totali: 192
ðŸ“Š Oggetti tracciati totali: 2767
ðŸ“Š ID univoci assegnati: 56
ðŸ“Š Media detection/frame: 14.4
