In [3]:
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection 

In [4]:
# Checkpoint identifier shared by the processor and the detection model.
model_id = "IDEA-Research/grounding-dino-tiny"

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load both pieces from the same checkpoint, then place the model on `device`.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [5]:
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of letting PIL fail
# on a response body that is not an image.
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()

# convert("RGB") decodes the pixels while the response is still open and
# guarantees three RGB channels, which the later RGB -> BGR conversion for
# OpenCV relies on.
image = Image.open(response.raw).convert("RGB")

# Check for cats and remote controls
# VERY important: text queries need to be lowercased + end with a dot
text = "a cat. a remote control."

In [7]:
# Build the model inputs from the image and the text prompt as PyTorch
# tensors, then move them to the same device as the model.
inputs = processor(images=image, text=text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass only; gradients are not needed for inference.
with torch.no_grad():
    outputs = model(**inputs)

# PIL exposes size as (width, height); the post-processor is given
# (height, width) so that boxes are scaled to the original image.
# Note: box_threshold=0.4 is commented out in this call, so the processor's
# default box threshold applies.
results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    text_threshold=0.3,
    target_sizes=[(image.height, image.width)],
)

In [23]:
# image

import cv2
import numpy as np
from PIL import Image

# `image` was loaded with PIL in an earlier cell. To start from a local file
# instead, load it there with Image.open('input_image.jpg').

# np.array(image) yields the pixels in PIL's RGB channel order; OpenCV works
# in BGR, so the channels are swapped in the same step. The result is a NumPy
# array ready for OpenCV functions.
opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

# Optional preview in an OpenCV window:
# cv2.imshow("OpenCV Image", opencv_image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()


In [8]:
# Display the post-processed detections: one dict per input image holding the
# scores, boxes and text labels.
results



[{'scores': tensor([0.4785, 0.4379, 0.4761, 0.3309]),
  'boxes': tensor([[344.6982,  23.1083, 637.1821, 374.2747],
          [ 12.2693,  51.9104, 316.8566, 472.4341],
          [ 38.5852,  70.0090, 176.7768, 118.1755],
          [332.1796,  74.5635, 370.6935, 186.9436]]),
  'text_labels': ['a cat', 'a cat', 'a remote control', 'a remote control'],
  'labels': ['a cat', 'a cat', 'a remote control', 'a remote control']}]

In [39]:
# Tensor.tolist() moves the data to the CPU when necessary, so this works for
# both values of `device`; calling .numpy() directly on a CUDA tensor raises.
results[0]['boxes'].tolist()

[[344.69818115234375,
  23.10826873779297,
  637.1820678710938,
  374.27471923828125],
 [12.269268035888672,
  51.91035842895508,
  316.85662841796875,
  472.43414306640625],
 [38.585243225097656,
  70.00904846191406,
  176.77679443359375,
  118.17546844482422],
 [332.17962646484375,
  74.56353759765625,
  370.6935119628906,
  186.94363403320312]]

In [49]:
# Display the results

import cv2
import numpy as np

def cv_bb(img, bounding_boxes, output_path="image_with_boxes.jpg", show=True,
          box_color=(0, 255, 0), line_thickness=2):
    """Draw rectangular boxes on ``img``, save the result, and optionally show it.

    The rectangles are drawn directly onto ``img``, so the array passed in is
    modified in place.

    Args:
        img: Image array accepted by ``cv2.rectangle``. No particular number
            of channels is required by this function itself.
        bounding_boxes: Iterable of ``(x_min, y_min, x_max, y_max)`` boxes.
            Each coordinate is truncated with ``int()`` before drawing.
        output_path: File the annotated image is written to with
            ``cv2.imwrite``.
        show: When true, open an OpenCV window and block until a key is
            pressed. Pass ``False`` to skip the window and the blocking wait.
        box_color: Colour tuple handed to ``cv2.rectangle``; the default is
            green in (B, G, R) order.
        line_thickness: Thickness of the box outline passed to
            ``cv2.rectangle``.

    Returns:
        None.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the image was not written.
    """
    for x_min, y_min, x_max, y_max in bounding_boxes:
        cv2.rectangle(
            img,
            pt1=(int(x_min), int(y_min)),   # top-left corner
            pt2=(int(x_max), int(y_max)),   # bottom-right corner
            color=box_color,
            thickness=line_thickness,
        )

    # Save before showing so that a failing or interrupted GUI window cannot
    # lose the output, and only report success when imwrite confirms it.
    if not cv2.imwrite(output_path, img):
        raise OSError(f"Could not write annotated image to {output_path}")
    print(f"Bounding boxes drawn. Output saved to {output_path}")

    if show:
        cv2.imshow("Image with Bounding Boxes", img)
        try:
            cv2.waitKey(0)
        finally:
            # Always close the window, even if the wait is interrupted.
            cv2.destroyAllWindows()


In [50]:
# Tensor.tolist() moves the data to the CPU when necessary, so this works for
# both values of `device`; calling .numpy() directly on a CUDA tensor raises.
bounding_boxes = results[0]['boxes'].tolist()

# Draws the boxes onto `opencv_image` and writes the annotated image to disk.
cv_bb(opencv_image, bounding_boxes)

Bounding boxes drawn. Output saved to image_with_boxes.jpg
