In [2]:
import cv2
import numpy as np
from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe import YOLOEVPSegPredictor

In [9]:
# YOLOE is already imported in the first cell; repeating the import here is harmless.
from ultralytics import YOLOE
# Build the YOLOE model from the "yoloe-11s-seg.pt" weights file.
# The captured output under this cell shows the weights being downloaded on this run.
model = YOLOE("yoloe-11s-seg.pt")


[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yoloe-11s-seg.pt to 'yoloe-11s-seg.pt': 100% ━━━━━━━━━━━━ 26.5MB 192.2KB/s 2:21 2:21<0.0ss


In [11]:
import cv2
import time
import numpy as np
# NOTE: YOLOE and YOLOEVPSegPredictor are not imported in this cell; they come from the ultralytics imports in the earlier cells.

# 1) Load model
# Ensure you are using a model that supports visual prompting
model = YOLOE("yoloe-11s-seg.pt")

# 2) Grab a prompt frame with OpenCV (WITH 10-SEC DELAY)
# Flow: live preview -> SPACE starts the countdown -> when it expires the current
# raw frame is stored in `prompt_frame`. Pressing 'q' aborts with SystemExit.
delay_duration = 10  # seconds between pressing SPACE and the capture
MAX_FAILED_READS = 100  # consecutive failed reads tolerated before giving up
READ_RETRY_PAUSE = 0.05  # seconds to wait after a failed read before retrying

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Cannot open camera")

print("Put ALL THREE: GLASS, METAL, and BIO-WASTE in front of the camera.")
print("Press SPACE to start the 10-second countdown.")

countdown_started = False
start_time = 0.0
failed_reads = 0

# try/finally guarantees the camera and the preview window are released on every
# exit path: normal capture, 'q', a read failure, or a kernel interrupt.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # Tolerate a short run of dropped frames, but never spin forever on a
            # camera that has stopped delivering images.
            failed_reads += 1
            if failed_reads >= MAX_FAILED_READS:
                raise RuntimeError("Camera stopped delivering frames")
            time.sleep(READ_RETRY_PAUSE)
            continue
        failed_reads = 0

        # Draw overlays on a copy so the frame that gets captured stays clean.
        display_frame = frame.copy()

        if countdown_started:
            # Monotonic clock: the countdown is unaffected by system clock changes.
            remaining_time = delay_duration - (time.monotonic() - start_time)

            if remaining_time <= 0:
                # Time is up: keep the CLEAN frame (not display_frame).
                prompt_frame = frame.copy()
                print("Captured!")
                break

            # +1 so the overlay counts 10..1 instead of 9..0.
            text = f"Capturing in: {int(remaining_time) + 1}"
            cv2.putText(display_frame, text, (50, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 255), 3)
        else:
            cv2.putText(display_frame, "Press SPACE to start timer", (20, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

        cv2.imshow("Setup Phase", display_frame)
        key = cv2.waitKey(1) & 0xFF

        if key == 32 and not countdown_started:  # SPACE
            print("Countdown started...")
            countdown_started = True
            start_time = time.monotonic()
        elif key == ord("q"):
            raise SystemExit("Quit before defining prompts")
finally:
    cv2.destroyAllWindows()
    cap.release()  # the camera is reopened later for streaming

# 3) Select ROIs for classes
# One box is requested per label on the captured `prompt_frame`. A label whose box
# has zero width or height is skipped. `cls_ids` stores, for every kept box, the
# index of its label in `class_labels` (0->glass, 1->metal, 2->bio-waste).
class_labels = ["glass", "metal", "bio-waste"]
bboxes, cls_ids = [], []

print("Captured frame ready. Prepare to draw boxes.")

for class_id, label in enumerate(class_labels):
    window_title = f"Select {label}"
    print(f"Draw a box around {label}, then press ENTER, then ESC.")

    # The selection is made on the clean frame captured after the countdown.
    selection = cv2.selectROI(window_title, prompt_frame, showCrosshair=True)
    cv2.destroyWindow(window_title)

    left, top, width, height = selection
    if width != 0 and height != 0:
        # Store the box as corner coordinates: [x_min, y_min, x_max, y_max].
        bboxes.append([int(left), int(top), int(left + width), int(top + height)])
        cls_ids.append(class_id)
    else:
        # Cancelled selection or empty box.
        print(f"Skipped {label} (no ROI selected).")

if not bboxes:
    raise SystemExit("No prompts defined, exiting.")

visual_prompts = {
    "bboxes": np.array(bboxes, dtype=np.float32),
    "cls": np.array(cls_ids, dtype=np.int32),
}

# Build the index -> label mapping from the prompts that were actually drawn.
# YOLOE numbers its output classes 0..k-1 over the DISTINCT prompted ids in sorted
# order (per the Ultralytics visual-prompt handling), so when a label was skipped
# (e.g. cls_ids == [0, 2]) a fixed three-entry table would put the wrong name on
# every class after the gap.
prompted_ids = sorted(set(cls_ids))
detection_names = {
    index: class_labels[class_id] for index, class_id in enumerate(prompted_ids)
}

print("Prompts defined. Starting real-time detection... Press 'q' to quit.")

# 4) Start YOLOE on webcam as a stream
# `prompt_frame` is the reference image the boxes in `visual_prompts` were drawn on.
stream = model.predict(
    source=0,
    visual_prompts=visual_prompts,
    refer_image=prompt_frame,
    predictor=YOLOEVPSegPredictor,
    stream=True,
    verbose=False,
)

# 5) Loop over result frames
window_title = "YOLOE Detection (Glass, Metal, Bio-waste)"
try:
    for results in stream:
        # Replace the model's own class names with the prompted labels before plotting.
        results.names = detection_names

        annotated = results.plot()
        cv2.imshow(window_title, annotated)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Close the generator explicitly: the notebook keeps `stream` alive as a global,
    # so it would otherwise stay suspended mid-inference until the kernel restarts.
    stream.close()
    cv2.destroyAllWindows()

Put ALL THREE: GLASS, METAL, and BIO-WASTE in front of the camera.
Press SPACE to start the 10-second countdown.
Countdown started...
Captured!
Captured frame ready. Prepare to draw boxes.
Draw a box around glass, then press ENTER, then ESC.
Draw a box around metal, then press ENTER, then ESC.
Draw a box around bio-waste, then press ENTER, then ESC.
Prompts defined. Starting real-time detection... Press 'q' to quit.
Ultralytics 8.3.228  Python-3.13.3 torch-2.9.1+cpu CPU (AMD Ryzen 7 5700U with Radeon Graphics)
YOLOe-11s-seg summary (fused): 137 layers, 13,693,398 parameters, 1,857,958 gradients
1/1: 0... Success  (inf frames of shape 640x480 at 30.00 FPS)

