In [1]:
# install once in your environment:
# pip install ultralytics opencv-python
from ultralytics import YOLO
import cv2
import numpy as np
import mss
import torch
import seaborn
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import os
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load YOLOv8 model on GPU
model = YOLO("model/yolo11x.pt").to(device)
# Load this if you dont have cuda
#model = YOLO("model/yolo11x.pt")

Using device: cpu


In [2]:
INPUT_FOLDER = "shared_input"

os.makedirs(INPUT_FOLDER, exist_ok=True)




def get_latest_upload():
    files = [os.path.join(INPUT_FOLDER, f) for f in os.listdir(INPUT_FOLDER)]
    if not files:
        return None
    # Sort by modification time, latest last
    latest_file = max(files, key=os.path.getmtime)
    return latest_file

latest = get_latest_upload()

In [10]:
# Run prediction on an image
results = model(latest, show=True)  # show=True opens a window with boxes

# Print detailed results
for result in results:
    boxes = result.boxes
    for box in boxes:
        print(f"Class: {model.keypoints[int(box.cls)]}, "
              f"Conf: {float(box.conf):.2f}, "
              f"Box: {box.xyxy.tolist()}")


inference results will accumulate in RAM unless `stream=True` is passed, causing potential out-of-memory
errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/44965) /Users/amytran/Documents/GitHub/ImageCaptioner/shared_input/Truth Speakers 5 Meeting 1 ECL308W.mp4: 384x640 (no detections), 511.2ms
video 1/1 (frame 2/44965) /Users/amytran/Documents/GitHub/ImageCaptioner/shared_input/Truth Speakers 5 Meeting 1 ECL308W.mp4: 384x640 (no detections), 398.3ms
video 1/1 (frame 3/44965) /Users/amytran/Documents/GitHub/ImageCaptioner/shared_input/Truth Speakers 5 Meeting 1 ECL308W.mp4: 384x640 1 person, 1 donu

KeyboardInterrupt: 

In [8]:

#Run inference on an image
results = model(latest)   # path to your image

#Print detected classes and confidence
for result in results:
    boxes = result.boxes
    for box in boxes:
        print(f"Class: {model.names[int(box.cls)]}, "
              f"Conf: {float(box.conf):.2f}, "
              f"Box: {box.xyxy.tolist()}")




inference results will accumulate in RAM unless `stream=True` is passed, causing potential out-of-memory
errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/44965) /Users/amytran/Documents/GitHub/ImageCaptioner/shared_input/Truth Speakers 5 Meeting 1 ECL308W.mp4: 384x640 (no detections), 509.4ms
video 1/1 (frame 2/44965) /Users/amytran/Documents/GitHub/ImageCaptioner/shared_input/Truth Speakers 5 Meeting 1 ECL308W.mp4: 384x640 (no detections), 420.4ms
video 1/1 (frame 3/44965) /Users/amytran/Documents/GitHub/ImageCaptioner/shared_input/Truth Speakers 5 Meeting 1 ECL308W.mp4: 384x640 1 person, 1 donu

KeyboardInterrupt: 

In [5]:
# Set up screen capture
sct = mss.mss()
monitor = sct.monitors[1]  # 1 = primary screen, [0] = virtual all

while True:
    # Grab screen
    screenshot = sct.grab(monitor)
    frame = np.array(screenshot)

    # Convert BGRA -> BGR
    frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

    # Run YOLO inference
    results = model(frame)

    # Draw only people (class 0 in COCO)
    for r in results:
        for box in r.boxes:
            cls = int(box.cls[0])
            conf = float(box.conf[0])
            if cls == 0 and conf > 0.5:  # class 0 = person
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f"Person {conf:.2f}", (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    # Show window
    cv2.imshow("Screen Capture - People Detection", frame)

    # Exit with 'q'
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cv2.destroyAllWindows()



0: 416x640 2 persons, 1 dog, 541.1ms
Speed: 3.2ms preprocess, 541.1ms inference, 0.7ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 1 person, 2 dogs, 474.3ms
Speed: 2.2ms preprocess, 474.3ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 1 person, 3 dogs, 453.2ms
Speed: 2.1ms preprocess, 453.2ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 4 dogs, 461.0ms
Speed: 2.9ms preprocess, 461.0ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 3 dogs, 2 sheeps, 448.3ms
Speed: 3.0ms preprocess, 448.3ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 3 dogs, 2 sheeps, 447.7ms
Speed: 2.3ms preprocess, 447.7ms inference, 0.4ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 3 dogs, 3 sheeps, 460.0ms
Speed: 3.2ms preprocess, 460.0ms inference, 0.5ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 3 dogs, 3 sheeps, 452.5ms
Speed: 2.0ms preproc

KeyboardInterrupt: 

In [6]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")

def describe_image(frame):
    # Convert numpy frame -> PIL image
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = processor(images=image, return_tensors="pt").to("cuda")
    out = blip_model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [7]:

with mss.mss() as sct:
    monitor = sct.monitors[1]  # full screen
    while True:
        # Grab screen
        sct_img = sct.grab(monitor)
        frame = np.array(sct_img)[:, :, :3]  # drop alpha channel
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # Run YOLO detection
        results = model(frame)[0]
        annotated_frame = results.plot()

        # Run BLIP caption
        caption = describe_image(frame)
        cv2.putText(annotated_frame, caption, (20, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show results
        cv2.imshow("YOLOv10 + BLIP Captioning", annotated_frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

cv2.destroyAllWindows()



0: 416x640 3 dogs, 2 sheeps, 512.0ms
Speed: 3.0ms preprocess, 512.0ms inference, 3.3ms postprocess per image at shape (1, 3, 416, 640)


NameError: name 'describe_image' is not defined