In [1]:
%pip install ultralytics

Collecting ultralytics
  Using cached ultralytics-8.3.228-py3-none-any.whl.metadata (37 kB)
Collecting matplotlib>=3.3.0 (from ultralytics)
  Using cached matplotlib-3.10.7-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting torchvision>=0.9.0 (from ultralytics)
  Using cached torchvision-0.24.1-cp313-cp313-win_amd64.whl.metadata (5.9 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Using cached ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Using cached ultralytics-8.3.228-py3-none-any.whl (1.1 MB)
Using cached matplotlib-3.10.7-cp313-cp313-win_amd64.whl (8.1 MB)
Using cached torchvision-0.24.1-cp313-cp313-win_amd64.whl (4.3 MB)
Using cached ultralytics_thop-2.0.18-py3-none-any.whl (28 kB)
Installing collected packages: matplotlib, ultralytics-thop, torchvision, ultralytics
Successfully installed matplotlib-3.10.7 torchvision-0.24.1 ultralytics-8.3.228 ultralytics-thop-2.0.18
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import cv2
import numpy as np
from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe import YOLOEVPSegPredictor

In [3]:
# OpenCV GUI smoke test: open a window with a plain white image and wait for a
# key press. If no window appears, the installed OpenCV build has no HighGUI
# support and the interactive cells below cannot work.
# `cv2` and `np` come from the imports cell above.

# Build the image explicitly. The previous version compared the contents of a
# freshly constructed cv2.UMat against 0; that buffer is never given a fill
# value, so the result was only white if the memory happened to be zeroed.
img = np.full((400, 400), 255, dtype=np.uint8)  # 400x400 single-channel white image

cv2.imshow("test", img)
cv2.waitKey(0)  # 0 = block until any key is pressed while the window has focus
cv2.destroyAllWindows()


In [4]:


# --- Configuration (tune these, then re-run the cell) -------------------------
MODEL_WEIGHTS = "yoloe-11l-seg.pt"
CAMERA_INDEX = 0          # OpenCV device index, used for both capture and streaming
MAX_FAILED_READS = 100    # consecutive failed cap.read() calls tolerated before giving up
KEY_SPACE = 32            # key code compared against cv2.waitKey(...) & 0xFF


def capture_prompt_frame(camera_index=CAMERA_INDEX, max_failed_reads=MAX_FAILED_READS):
    """Show a live preview and return a copy of the frame visible when SPACE is pressed.

    Args:
        camera_index: Device index passed to ``cv2.VideoCapture``.
        max_failed_reads: Number of consecutive failed reads after which the
            function gives up instead of spinning forever.

    Returns:
        A copy of the captured frame (whatever ``cap.read()`` returned).

    Raises:
        RuntimeError: If the camera cannot be opened, or stops delivering frames.
        SystemExit: If the user presses "q" before capturing a frame.
    """
    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError("Cannot open camera")

    print("Put BOTH: GLASS and METAL in front of the camera.")
    print("Press SPACE to capture a frame for drawing prompts.")

    failed_reads = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # A failed read skips waitKey(), so without a bound the user
                # could never quit a loop that keeps failing.
                failed_reads += 1
                if failed_reads >= max_failed_reads:
                    raise RuntimeError(
                        f"Camera returned no frame {max_failed_reads} times in a row"
                    )
                continue
            failed_reads = 0

            cv2.imshow("Press SPACE to capture prompt frame", frame)
            key = cv2.waitKey(1) & 0xFF

            if key == KEY_SPACE:
                return frame.copy()
            if key == ord("q"):
                raise SystemExit("Quit before defining prompts")
    finally:
        # Runs on capture, on quit and on errors alike; the camera is reopened
        # later by the streaming predictor.
        cap.release()
        cv2.destroyAllWindows()


def select_prompt_boxes(frame, labels):
    """Ask the user to draw one box per label on ``frame``.

    Args:
        frame: Image on which the boxes are drawn.
        labels: Class names to prompt for, in order.

    Returns:
        ``(boxes, selected_labels)`` where ``boxes[i]`` is
        ``[x_min, y_min, x_max, y_max]`` for ``selected_labels[i]``. Labels
        whose selection had zero width or height are left out of both lists.
    """
    boxes = []
    selected_labels = []

    for label in labels:
        # The old prompt said "press ENTER, then ESC". ESC is itself one of the
        # keys that ends cv2.selectROI, so that extra key press could dismiss
        # the NEXT label's selector with an empty box.
        print(f"Draw a box around {label}, then press ENTER or SPACE to confirm (press C to skip).")

        window_name = f"Select {label}"
        x, y, w, h = cv2.selectROI(window_name, frame, showCrosshair=True)
        cv2.destroyWindow(window_name)

        if w == 0 or h == 0:
            print(f"Skipped {label} (no ROI selected).")
            continue

        # selectROI returns (x, y, width, height); the prompt wants corners.
        boxes.append([int(x), int(y), int(x + w), int(y + h)])
        selected_labels.append(label)

    return boxes, selected_labels


# 1) Load model
model = YOLOE(MODEL_WEIGHTS)

# 2) Grab a prompt frame with OpenCV
prompt_frame = capture_prompt_frame()

# 3) Select ROIs for classes
class_labels = ["glass", "metal"]
bboxes, selected_labels = select_prompt_boxes(prompt_frame, class_labels)

if len(bboxes) == 0:
    raise SystemExit("No prompts defined, exiting.")

# Class ids are numbered 0..n-1 over the labels that were actually selected, so
# id i always means selected_labels[i]. With both labels selected this is the
# same 0 -> glass, 1 -> metal assignment as before; with one label skipped the
# remaining one gets id 0 and is still displayed under its own name.
cls_ids = list(range(len(selected_labels)))
names = dict(enumerate(selected_labels))

visual_prompts = dict(
    bboxes=np.array(bboxes, dtype=np.float32),
    cls=np.array(cls_ids, dtype=np.int32),
)

print("Prompts defined. Starting real-time detection... Press 'q' to quit.")

# 4) Start YOLOE on the webcam as a *stream*. prompt_frame is passed as
#    refer_image together with the boxes drawn on it, so the prompts refer to
#    that frame rather than to the first frame of the live stream.
stream = model.predict(
    source=CAMERA_INDEX,              # webcam
    visual_prompts=visual_prompts,
    refer_image=prompt_frame,         # VERY IMPORTANT
    predictor=YOLOEVPSegPredictor,
    stream=True,
    verbose=False,
)

# 5) Loop over result frames
try:
    for results in stream:
        # Map class IDs -> names for this result
        results.names = names

        annotated = results.plot()
        cv2.imshow("YOLOE Glass & Metal Detection", annotated)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Stop the prediction generator explicitly instead of leaving it suspended.
    # NOTE(review): whether this also releases the webcam depends on the
    # ultralytics stream loader - confirm if the camera stays busy afterwards.
    close_stream = getattr(stream, "close", None)
    if callable(close_stream):
        close_stream()
    cv2.destroyAllWindows()


[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yoloe-11l-seg.pt to 'yoloe-11l-seg.pt': 100% ━━━━━━━━━━━━ 67.7MB 2.8MB/s 24.0s24.0s<0.0sss
Put BOTH: GLASS and METAL in front of the camera.
Press SPACE to capture a frame for drawing prompts.
Draw a box around glass, then press ENTER, then ESC.
Draw a box around metal, then press ENTER, then ESC.
Prompts defined. Starting real-time detection... Press 'q' to quit.
Ultralytics 8.3.228  Python-3.13.3 torch-2.9.1+cpu CPU (AMD Ryzen 7 5700U with Radeon Graphics)
YOLOe-11l-seg summary (fused): 227 layers, 35,117,862 parameters, 2,254,374 gradients
1/1: 0... Success  (inf frames of shape 640x480 at 30.00 FPS)

