In [None]:
# install once in your environment:
# pip install ultralytics opencv-python numpy mss torch seaborn pillow transformers
from ultralytics import YOLO
import cv2
import numpy as np
import mss
import torch
import seaborn
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
# Pick the compute device: the GPU when PyTorch reports CUDA as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the YOLO weights from disk, then move the model onto the selected device.
# NOTE(review): the weights file is yolo11x.pt, so this is a YOLO11 checkpoint.
model = YOLO("model/yolo11x.pt")
model = model.to(device)

In [None]:
# Run prediction on a video clip.
#   show=True   opens a window with the predicted boxes drawn on each frame.
#   stream=True yields one result per frame instead of building a list for the
#               whole video, so memory use stays flat on long clips.
# Forward slashes keep the path portable and avoid the invalid "\c" escape.
results = model("TestFootage/carspassingby.mp4", show=True, stream=True)

# Print detailed results for every detection in every frame
for result in results:
    boxes = result.boxes
    for box in boxes:
        # model.names maps a class index to its label (same lookup as the image cell).
        print(f"Class: {model.names[int(box.cls)]}, "
              f"Conf: {float(box.conf):.2f}, "
              f"Box: {box.xyxy.tolist()}")

In [None]:

# Run inference on a single image.
# Forward slashes keep the path portable and avoid the invalid "\F" / "\c"
# escape sequences that the backslash form contained.
results = model("TestFootage/Frames/cartestframe10.jpg")   # path to your image

# Print detected classes, confidences and box coordinates
for result in results:
    boxes = result.boxes
    for box in boxes:
        print(f"Class: {model.names[int(box.cls)]}, "
              f"Conf: {float(box.conf):.2f}, "
              f"Box: {box.xyxy.tolist()}")



In [None]:
# Live people detection on the screen; press 'q' in the preview window to stop.
PERSON_CLASS_ID = 0    # class 0 = person in COCO
MIN_CONFIDENCE = 0.5   # detections at or below this score are not drawn
BOX_COLOR = (0, 255, 0)

# Set up screen capture. The context manager releases the capture handle even
# if the loop below raises or the kernel is interrupted.
with mss.mss() as sct:
    monitor = sct.monitors[1]  # 1 = primary screen, [0] = virtual all

    try:
        while True:
            # Grab screen
            screenshot = sct.grab(monitor)
            frame = np.array(screenshot)

            # Convert BGRA -> BGR
            frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

            # Run YOLO inference
            results = model(frame)

            # Draw only people
            for r in results:
                for box in r.boxes:
                    cls = int(box.cls[0])
                    conf = float(box.conf[0])
                    if cls != PERSON_CLASS_ID or conf <= MIN_CONFIDENCE:
                        continue
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)
                    cv2.putText(frame, f"Person {conf:.2f}", (x1, y1 - 10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, BOX_COLOR, 2)

            # Show window
            cv2.imshow("Screen Capture - People Detection", frame)

            # Exit with 'q'
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always close the preview window, including on errors / interrupts.
        cv2.destroyAllWindows()


In [19]:
# Load the BLIP captioning processor and model.
# The model is placed on `device` (selected in the setup cell) rather than a
# hard-coded "cuda", so this cell also runs on machines without a GPU.
BLIP_MODEL_ID = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_MODEL_ID)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL_ID).to(device)

def describe_image(frame):
    """Generate a text caption for a frame with the BLIP model.

    Args:
        frame: Image as a numpy array. It is converted with
            ``cv2.COLOR_BGR2RGB``, so it is expected to be in OpenCV's BGR
            channel order.

    Returns:
        The decoded caption string, with special tokens removed.
    """
    # Convert numpy frame -> PIL image
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # Send the inputs to whichever device the model lives on instead of a
    # hard-coded "cuda", so captioning also works when the model is on the CPU.
    inputs = processor(images=image, return_tensors="pt").to(blip_model.device)
    out = blip_model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

In [None]:

# Live YOLO detection + BLIP captioning on the screen; press 'q' to stop.
with mss.mss() as sct:
    monitor = sct.monitors[1]  # full screen
    try:
        while True:
            # Grab screen
            sct_img = sct.grab(monitor)

            # mss delivers BGRA pixels, so only the alpha channel has to go.
            # The previous RGB2BGR conversion swapped red and blue, which fed
            # colour-swapped frames to both YOLO and describe_image (which
            # itself expects BGR input).
            frame = cv2.cvtColor(np.array(sct_img), cv2.COLOR_BGRA2BGR)

            # Run YOLO detection
            results = model(frame)[0]
            annotated_frame = results.plot()

            # Run BLIP caption and overlay it on the annotated frame
            caption = describe_image(frame)
            cv2.putText(annotated_frame, caption, (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Show results
            cv2.imshow("YOLOv10 + BLIP Captioning", annotated_frame)

            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always close the preview window, including on errors / interrupts.
        cv2.destroyAllWindows()
