In [1]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Checkpoint identifier handed to `from_pretrained` for both the processor
# and the zero-shot object-detection model.
model_id = "IDEA-Research/grounding-dino-base"

processor = AutoProcessor.from_pretrained(model_id)
# Use the GPU when one is visible; otherwise fall back to CPU so the notebook
# still runs (slowly) instead of failing inside `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)


2024-07-25 11:24:02.082844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 11:24:02.098265: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 11:24:02.098290: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-25 11:24:02.107861: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from pathlib import Path

def run_sam(image, text, threshold=0.35):
    """Detect the objects described by ``text`` in ``image``.

    Despite its name, this function only runs the text-prompted detection
    model held in the notebook-level ``model`` / ``processor`` globals (on
    ``device``); no segmentation step happens here.

    Args:
        image: Image object accepted by ``processor`` that exposes ``size``
            as ``(width, height)`` (e.g. a PIL image).
        text: Text prompt passed to ``processor`` (the notebook uses
            ``"a pen."``).
        threshold: Minimum detection score forwarded to the post-processing
            step. Defaults to 0.35, the value that used to be hard-coded.

    Returns:
        A list of ``(box, conf)`` tuples, one per detection kept by the
        post-processing step. ``box`` is a list of four floats rounded to one
        decimal (the caller treats them as ``x1, y1, x2, y2``) and ``conf``
        is the score rounded to two decimals. The list contains only plain
        Python types, so it can be dumped to JSON directly.
    """
    inputs = processor(images=image, text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Boxes are rescaled to the original image during post-processing, which
    # takes sizes as (height, width); `image.size` is (width, height), hence
    # the reversal.
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.image_processor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=target_sizes
    )[0]

    box_list = []
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        box = [round(coord, 1) for coord in box.tolist()]
        conf = round(score.item(), 2)
        box_list.append((box, conf))
        print(f"Detected {label.item()} with confidence {conf} at location {box}")
    return box_list

In [7]:
import json

import ultralytics
from ultralytics.utils.plotting import Annotator
import cv2

# Text prompt for the detector; the same prompt is used for every image.
text = "a pen."

file_list = sorted(Path("plot/f1").glob("*jpg"))
print("episodes:", len(file_list))
# file_list = file_list[:2]  # uncomment for a quick smoke test on two images

# Create the output locations once, up front, instead of on every iteration
# (the JSON target directory was previously never created at all).
boxes_path = Path("data/boxes.json")
boxes_path.parent.mkdir(parents=True, exist_ok=True)
plot_dir = Path("plot/detect")
plot_dir.mkdir(parents=True, exist_ok=True)

box_list = []  # list(image); image=(filename, boxes); boxes=((x,y,x,y), conf), ...

for file in file_list:
    print(file)
    # Read the pixels and release the file handle immediately. Forcing RGB
    # keeps the RGB->BGR conversion below valid for grayscale/CMYK JPEGs too.
    with Image.open(file) as img:
        image = img.convert("RGB")

    boxes = run_sam(image, text)  # detections=list( ((x,y,x,y), conf) )
    box_list.append((file.stem, boxes))
    # Better safe than sorry: rewrite the full result file after every image
    # so an interrupted run keeps everything processed so far.
    with open(boxes_path, "w", encoding="utf-8") as f:
        json.dump(box_list, f, indent=4, ensure_ascii=False)

    annotator = Annotator(image)
    for box, conf in boxes:
        # The bounding box coordinates (x1, y1, x2, y2)
        annotator.box_label(box, label=str(conf), color=(0, 204, 0))

    result = annotator.result()
    plot_path = plot_dir / file.name
    # Pass the path as `str` (not every OpenCV build accepts Path objects) and
    # check the return value: imwrite reports failure by returning False.
    written = cv2.imwrite(
        str(plot_path),
        cv2.cvtColor(result, cv2.COLOR_RGB2BGR),
        [cv2.IMWRITE_JPEG_QUALITY, 100],
    )
    if not written:
        print(f"WARNING: could not write {plot_path}")

episodes: 100
plot/f1/2023-03-02-15h-03m-35s.jpg
plot/f1/2023-03-02-16h-51m-12s.jpg
Detected 1 with confidence 0.42 at location [637.4, 431.0, 708.3, 452.1]
plot/f1/2023-03-02-18h-13m-38s.jpg
Detected 1 with confidence 0.53 at location [565.0, 562.0, 673.2, 601.1]
plot/f1/2023-03-03-15h-30m-17s.jpg
Detected 1 with confidence 0.42 at location [538.0, 359.1, 682.0, 407.5]
plot/f1/2023-03-06-16h-02m-37s.jpg
Detected 1 with confidence 0.49 at location [740.7, 427.2, 886.6, 489.2]
plot/f1/2023-03-06-17h-06m-52s.jpg
Detected 1 with confidence 0.43 at location [651.6, 379.2, 696.3, 446.9]
plot/f1/2023-03-07-15h-36m-10s.jpg
Detected 1 with confidence 0.43 at location [676.0, 351.4, 741.2, 382.8]
plot/f1/2023-03-07-16h-20m-41s.jpg
plot/f1/2023-03-07-17h-59m-19s.jpg
plot/f1/2023-03-08-13h-29m-47s.jpg
plot/f1/2023-03-08-14h-45m-19s.jpg
Detected 1 with confidence 0.49 at location [780.7, 403.2, 858.7, 451.7]
plot/f1/2023-03-08-16h-45m-10s.jpg
Detected 1 with confidence 0.55 at location [816.3, 435

In [8]:
# Show the collected detections: one (image stem, [(box, conf), ...]) entry per processed image.
box_list

[('2023-03-02-15h-03m-35s', []),
 ('2023-03-02-16h-51m-12s', [([637.4, 431.0, 708.3, 452.1], 0.42)]),
 ('2023-03-02-18h-13m-38s', [([565.0, 562.0, 673.2, 601.1], 0.53)]),
 ('2023-03-03-15h-30m-17s', [([538.0, 359.1, 682.0, 407.5], 0.42)]),
 ('2023-03-06-16h-02m-37s', [([740.7, 427.2, 886.6, 489.2], 0.49)]),
 ('2023-03-06-17h-06m-52s', [([651.6, 379.2, 696.3, 446.9], 0.43)]),
 ('2023-03-07-15h-36m-10s', [([676.0, 351.4, 741.2, 382.8], 0.43)]),
 ('2023-03-07-16h-20m-41s', []),
 ('2023-03-07-17h-59m-19s', []),
 ('2023-03-08-13h-29m-47s', []),
 ('2023-03-08-14h-45m-19s', [([780.7, 403.2, 858.7, 451.7], 0.49)]),
 ('2023-03-08-16h-45m-10s', [([816.3, 435.1, 900.4, 498.0], 0.55)]),
 ('2023-03-08-19h-32m-03s', [([578.2, 562.9, 611.4, 631.6], 0.55)]),
 ('2023-03-08-19h-51m-18s', [([550.4, 457.0, 599.1, 496.6], 0.4)]),
 ('2023-03-09-18h-14m-34s', [([722.0, 452.9, 740.3, 498.6], 0.41)]),
 ('2023-03-09-18h-47m-35s', [([594.0, 377.2, 664.2, 405.3], 0.38)]),
 ('2023-03-09-19h-48m-15s', [([703.0, 327