In [43]:
import requests

import torch
from PIL import Image, ImageDraw, ImageFilter
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from accelerate import Accelerator

import os

In [2]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_id = "IDEA-Research/grounding-dino-tiny"
# Device chosen by Accelerate's Accelerator (presumably a GPU when one is available,
# otherwise CPU -- confirm on the target machine).
device = Accelerator().device

# The processor and the model are loaded from the same checkpoint id.
processor = AutoProcessor.from_pretrained(model_id)
# The model is moved to `device`; run_model later sends its inputs to model.device.
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

The image processor of type `GroundingDinoImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 
Loading weights: 100%|██████████| 990/990 [00:05<00:00, 180.26it/s, Materializing param=model.text_projection.weight]                                                                           


In [38]:
def run_model(image, text_labels, processor, model, threshold=0.4, text_threshold=0.3):
    """Run zero-shot object detection on one image and return the detected boxes.

    Args:
        image: Image to search. Its ``size`` attribute is reversed to build the
            single ``target_sizes`` entry (PIL images are used in this notebook).
        text_labels: Text prompts handed to the processor, e.g.
            ``[["a cat", "a remote control"]]``.
        processor: Object used both to preprocess the inputs and to call
            ``post_process_grounded_object_detection`` on the model outputs.
        model: Detection model; the inputs are moved to ``model.device``.
        threshold: Box-confidence threshold forwarded to post-processing.
            Defaults to 0.4, the value that used to be hard-coded.
        text_threshold: Text threshold forwarded to post-processing.
            Defaults to 0.3, the value that used to be hard-coded.

    Returns:
        A list with one entry per detection. Each entry is the box converted to
        a list of floats rounded to 2 decimals (consumed elsewhere in this
        notebook as ``[x1, y1, x2, y2]``). Boxes keep the order produced by
        post-processing, which is not necessarily sorted by confidence. The
        list is empty when nothing is detected.

    Side effects:
        Prints the prompts and one line per detection.
    """
    print('text_labels: ', text_labels)
    inputs = processor(images=image, text=text_labels, return_tensors="pt").to(model.device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        threshold=threshold,
        text_threshold=text_threshold,
        # image.size is (width, height) for PIL images; reversed to (height, width).
        target_sizes=[image.size[::-1]],
    )

    # A single image was passed in, so only the first result entry is relevant.
    detections = results[0]
    coordinates = []
    for box, score, label in zip(detections["boxes"], detections["scores"], detections["labels"]):
        box = [round(value, 2) for value in box.tolist()]
        coordinates.append(box)
        print(f"Detected {label} with confidence {round(score.item(), 3)} at location {box}")

    return coordinates

# run_model(image, text_labels, processor, model)


In [None]:
image_path = "vegetables.jpg"    # low-quality image: the model still recognizes a bulb, but not all of the bulbs
image = Image.open(image_path)
# Prompts for this run: a bulb and a lady.
# Single-prompt alternative kept for quick experiments:
# text_labels = [["a lady"]]
text_labels = [["a bulb", "a lady"]]
coordinates = run_model(image, text_labels, processor, model)

text_labels:  [['a bulb', 'a lady']]
Detected a lady with confidence 0.806 at location [284.4, 127.31, 347.06, 307.8]
Detected a bulb with confidence 0.485 at location [148.9, 12.48, 168.5, 36.74]


In [71]:
image_path = "vegetables_hig_quality.jpg"    # higher-quality version of the vegetables image (per the file name)
image = Image.open(image_path)
# "watermelons" is added to the prompts; the recorded run detected only the lady and one bulb.
text_labels = [["a bulb", "a lady", "watermelons"]]
coordinates = run_model(image, text_labels, processor, model)

text_labels:  [['a bulb', 'a lady', 'watermelons']]
Detected a lady with confidence 0.598 at location [848.58, 382.67, 1044.35, 920.51]
Detected a bulb with confidence 0.404 at location [448.62, 41.67, 504.18, 110.46]




In [None]:
# this code provides an image but the transition from the blurred background to the clear foreground is not smooth
def blur_outside_bbox_old(image_path, coords, blur_radius=10):
    """Blur the whole image, then paste the untouched box region back on top.

    The edge between the sharp region and the blurred background is hard,
    because the crop is pasted back without any mask.

    Args:
        image_path: Path of the image to read; also used to derive the output path.
        coords: Four values ``[x1, y1, x2, y2]`` describing the region to keep sharp.
        blur_radius: Radius passed to the Gaussian blur applied to the image.

    Returns:
        The path the result was saved to: ``<root>_modified<ext>``.

    Raises:
        ValueError: If ``coords`` does not contain exactly four values.
    """
    if len(coords) != 4:
        raise ValueError("coords must be a list or tuple of [x1, y1, x2, y2].")

    left, top, right, bottom = coords
    source = Image.open(image_path)

    result = source.filter(ImageFilter.GaussianBlur(radius=blur_radius))
    # Coordinates are truncated to whole pixels for both the crop and the paste.
    keep_box = (int(left), int(top), int(right), int(bottom))
    result.paste(source.crop(keep_box), keep_box)

    stem, suffix = os.path.splitext(image_path)
    destination = f"{stem}_modified{suffix}"
    result.save(destination)
    return destination

In [49]:
def blur_outside_bbox(image_path, coords, blur_radius=10, feather_radius=15, blur_amount=1.0):
    """Keep the box region sharp and blur the rest, with a feathered transition.

    Args:
        image_path: Path of the image to read; also used to derive the output path.
        coords: Four values ``[x1, y1, x2, y2]`` describing the region to keep sharp.
        blur_radius: Radius of the Gaussian blur applied to the background.
        feather_radius: Radius of the Gaussian blur applied to the mask; the
            mask is left hard-edged when this is not greater than 0.
        blur_amount: Blend factor between the original (0) and the fully
            blurred image (1) used as background.

    Returns:
        The path the result was saved to: ``<root>_modified<ext>``.

    Raises:
        ValueError: If ``coords`` does not contain exactly four values, or if
            ``blur_amount`` is outside ``[0, 1]``.
    """
    if len(coords) != 4:
        raise ValueError("coords must be a list or tuple of [x1, y1, x2, y2].")
    amount_in_range = 0.0 <= blur_amount <= 1.0
    if not amount_in_range:
        raise ValueError("blur_amount must be between 0 and 1.")

    left, top, right, bottom = coords
    sharp = Image.open(image_path).convert("RGB")

    # Background: the fully blurred image, optionally mixed back with the original.
    background = sharp.filter(ImageFilter.GaussianBlur(radius=blur_radius))
    if blur_amount < 1.0:
        background = Image.blend(sharp, background, blur_amount)

    keep_box = (int(left), int(top), int(right), int(bottom))

    # Mask is 255 inside the box and 0 elsewhere; blurring it softens the edge.
    keep_mask = Image.new("L", sharp.size, 0)
    ImageDraw.Draw(keep_mask).rectangle(keep_box, fill=255)
    if feather_radius > 0:
        keep_mask = keep_mask.filter(ImageFilter.GaussianBlur(radius=feather_radius))

    # The mask picks the sharp image where it is white and the background where it is black.
    combined = Image.composite(sharp, background, keep_mask)

    stem, suffix = os.path.splitext(image_path)
    destination = f"{stem}_modified{suffix}"
    combined.save(destination)
    return destination

In [19]:
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of as an image-decoding failure.
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
# Check for cats and remote controls
text_labels = [["a cat", "a remote control"]]
run_model(image, text_labels, processor, model)

text_labels:  [['a cat', 'a remote control']]
Detected a cat with confidence 0.468 at location [344.4, 23.25, 637.28, 374.46]
Detected a remote control with confidence 0.45 at location [38.56, 69.9, 176.79, 118.05]
Detected a cat with confidence 0.435 at location [12.37, 51.98, 316.9, 472.5]


In [57]:
image_path = "vegetables_hig_quality.jpg"
image = Image.open(image_path)
# Detect bulbs only; the first returned box is the region kept sharp below.
text_labels = [["a bulb"]]
coordinates = run_model(image, text_labels, processor, model)
# this recognition is perfect

# coordinates[0] would raise a bare IndexError when nothing is detected,
# so fail early with a message that names the prompt and the image.
if not coordinates:
    raise RuntimeError(f"No detections for {text_labels} in {image_path!r}; nothing to keep sharp.")

blur_radius = 18  # Gaussian blur strength applied to the background
feather_radius = 20  # controls softness of the transition edge
blur_amount = 0.85  # 0 keeps background sharp, 1 uses the fully blurred background
blur_outside_bbox(
    image_path,
    coordinates[0],
    blur_radius=blur_radius,
    feather_radius=feather_radius,
    blur_amount=blur_amount,
)


text_labels:  [['a bulb']]
Detected a bulb with confidence 0.59 at location [450.25, 42.73, 501.94, 108.35]
Detected a bulb with confidence 0.5 at location [681.66, 224.3, 710.03, 268.53]
Detected a bulb with confidence 0.467 at location [716.31, 267.13, 740.77, 299.47]


'vegetables_hig_quality_modified.jpg'

In [68]:
image_path = "vegetables.jpg"
image = Image.open(image_path)
# Detect bulbs only; the first returned box is the region kept sharp below.
text_labels = [["a bulb"]]
coordinates = run_model(image, text_labels, processor, model)
# this recognition is perfect

# coordinates[0] would raise a bare IndexError when nothing is detected,
# so fail early with a message that names the prompt and the image.
if not coordinates:
    raise RuntimeError(f"No detections for {text_labels} in {image_path!r}; nothing to keep sharp.")

blur_radius = 18  # Gaussian blur strength applied to the background
# Controls softness of the transition edge. Low values (e.g., 0-5) keep the mask
# edge tight, so the blur starts almost immediately outside the box.
feather_radius = 8
blur_amount = 0.90  # 0 keeps background sharp, 1 uses the fully blurred background
blur_outside_bbox(
    image_path,
    coordinates[0],
    blur_radius=blur_radius,
    feather_radius=feather_radius,
    blur_amount=blur_amount,
)


text_labels:  [['a bulb']]
Detected a bulb with confidence 0.591 at location [149.73, 13.23, 167.74, 36.14]
Detected a bulb with confidence 0.441 at location [227.09, 76.02, 236.69, 89.29]


'vegetables_modified.jpg'

In [29]:
image = Image.open("vegetables.jpg")
# Prompt with the two-word spelling "water melon".
text_labels = [["water melon"]]
run_model(image, text_labels, processor, model)
# Observation: the prompt "watermelons" did not work, but "water melon" did.
# Hypothesis: the model may handle the two-word form better than the single word,
# or it may simply have a better representation for "water melon" than for "watermelon".
# TODO: this is interesting and worth investigating further.

text_labels:  [['water melon']]
Detected water melon with confidence 0.509 at location [207.06, 167.66, 276.57, 210.42]


In [33]:
image = Image.open("family.jpg")
# Prompts: "water melon" and "pikachu"; the recorded run printed no detections for them.
text_labels = [["water melon", "pikachu"]]
run_model(image, text_labels, processor, model)

text_labels:  [['water melon', 'pikachu']]


In [27]:
image = Image.open("family.jpg")
# Prompts: "a man" and "a small girl".
text_labels = [["a man", "a small girl"]]
run_model(image, text_labels, processor, model)

text_labels:  [['a man', 'a small girl']]
Detected a man with confidence 0.557 at location [303.17, 153.06, 393.09, 387.56]
Detected a small girl with confidence 0.43 at location [212.71, 131.34, 274.23, 266.34]


In [28]:
image = Image.open("family.jpg")
# Same image, with the shorter prompt "girl" in place of "a small girl".
text_labels = [["a man", "girl"]]
run_model(image, text_labels, processor, model)
# The prediction is wrong. Possible reasons: the image might not be clear enough,
# and an adult is also being labelled as a girl.


text_labels:  [['a man', 'girl']]
Detected a man with confidence 0.667 at location [303.39, 152.89, 392.7, 387.53]
Detected girl with confidence 0.442 at location [209.92, 131.78, 275.62, 272.19]
Detected girl with confidence 0.409 at location [205.61, 175.25, 288.67, 391.57]


### things to consider
#### different images, images with different resolutions, multiple objects within an image, and the keywords given to the DINO model
##### there is still difficulty for the model to identify the small objects in low resolution images

In [None]:
image_path = "kids_football.jpg"
image = Image.open(image_path)
# Detect the kid in the orange t-shirt; the first returned box is kept sharp below.
text_labels = [["orange tshirt kid"]]
coordinates = run_model(image, text_labels, processor, model)
# this recognition is perfect
# Note: coordinates[0] is the first box returned, not necessarily the most confident
# one (the recorded run lists a 0.564 detection before a 0.587 detection).

# coordinates[0] would raise a bare IndexError when nothing is detected,
# so fail early with a message that names the prompt and the image.
if not coordinates:
    raise RuntimeError(f"No detections for {text_labels} in {image_path!r}; nothing to keep sharp.")

blur_radius = 18  # Gaussian blur strength applied to the background
feather_radius = 20  # controls softness of the transition edge
blur_amount = 0.85  # 0 keeps background sharp, 1 uses the fully blurred background
blur_outside_bbox(
    image_path,
    coordinates[0],
    blur_radius=blur_radius,
    feather_radius=feather_radius,
    blur_amount=blur_amount,
)



text_labels:  [['orange tshirt kid']]
Detected orange tshirt kid with confidence 0.564 at location [777.46, 230.21, 935.8, 405.11]
Detected orange tshirt kid with confidence 0.587 at location [429.41, 243.01, 666.31, 435.45]


'kids_football_modified.jpg'