In [1]:
import time
import os
os.sys.path.append("..")
from GroundingDINO.groundingdino.util.inference import (
    predict,
    batch_predict,
    preprocess_caption,
)
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util.misc import clean_state_dict
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from PIL import Image
import torch
import GroundingDINO.groundingdino.datasets.transforms as T
import numpy as np



In [2]:
# Locations of the GroundingDINO config file and model checkpoint handed to load_model().
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
GROUNDING_DINO_CONFIG = "/home/farouk-gpu/zia-vision/zia_vision/deploy/segmenter/grounding_dino_config.py"
GROUNDING_DINO_MODEL = "/home/farouk-gpu/models/checkpoint_best_regular-2.pth"

# Text prompt used both for the local caption encoding and for the HF processor.
CAPTION = "single . multipack . price . promo ."


def load_model(
    model_config_path: str, model_checkpoint_path: str, device: str = "cuda"
):
    """Build a GroundingDINO model from a config file and load checkpoint weights.

    Args:
        model_config_path: Path to the config file read with ``SLConfig.fromfile``.
        model_checkpoint_path: Path to a checkpoint whose ``"model"`` entry holds
            the state dict to load.
        device: Stored on the config as ``args.device`` before the model is
            built. This function does not call ``.to(device)`` itself, and the
            checkpoint is deserialized with ``map_location="cpu"``.

    Returns:
        The built model, switched to eval mode.

    Warns:
        UserWarning: If the checkpoint's keys do not exactly match the model's
            (the weights are loaded with ``strict=False``, so a mismatch would
            otherwise pass silently).
    """
    import warnings

    args = SLConfig.fromfile(model_config_path)
    args.device = device
    model = build_model(args)

    # SECURITY: weights_only=False allows arbitrary objects to be unpickled;
    # only load checkpoints from a trusted source.
    checkpoint = torch.load(
        model_checkpoint_path, map_location="cpu", weights_only=False
    )

    # strict=False tolerates key mismatches, so report them instead of
    # discarding the result: a wrong checkpoint would otherwise leave layers
    # at their initial values without any indication.
    load_result = model.load_state_dict(
        clean_state_dict(checkpoint["model"]), strict=False
    )
    missing = list(load_result.missing_keys)
    unexpected = list(load_result.unexpected_keys)
    if missing or unexpected:
        warnings.warn(
            f"load_state_dict(strict=False) for {model_checkpoint_path!r}: "
            f"{len(missing)} missing key(s) {missing[:5]}, "
            f"{len(unexpected)} unexpected key(s) {unexpected[:5]}",
            stacklevel=2,
        )

    model.eval()
    return model


# Sample image used by the comparison cells, plus run settings.
image_file = "/home/farouk-gpu/Grounded-Segment-Anything/scripts/data/a7d6a974-652b-4818-b7db-8fb6e9dbc896.jpg"
batch_size = 4
device = "cuda"

# Build the model from the config/checkpoint constants, then move it onto `device`.
grounding_model = load_model(GROUNDING_DINO_CONFIG, GROUNDING_DINO_MODEL, device)
grounding_model = grounding_model.to(device)



final text_encoder_type: bert-base-uncased


In [24]:
# Run the prompt through preprocess_caption, then encode it with the model's
# encode_captions as a batch containing that single caption.
# NOTE(review): device is hard-coded to "cuda" here rather than reusing the
# notebook's `device` variable.
caption = preprocess_caption(caption=CAPTION)
text_dict = grounding_model.encode_captions(captions=[caption], device="cuda")

In [4]:
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
# Hugging Face checkpoint id; only its processor is loaded in this cell.
model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda"  # same value as the earlier assignment in the notebook
processor = AutoProcessor.from_pretrained(model_id)

In [5]:
 from PIL import Image
 # Open the sample image and convert it to RGB.
 image = Image.open(image_file).convert("RGB")

In [15]:
# Run the HF processor on the single image with the raw CAPTION string,
# asking for PyTorch tensors ("pt").
inputs_single = processor(images=image, text=CAPTION, return_tensors="pt")

In [10]:
# Shape of the image tensor produced by the HF processor.
inputs_single['pixel_values'].shape

torch.Size([1, 3, 1066, 800])

In [11]:
def pre_process_image(image: Image.Image) -> tuple[np.ndarray, torch.Tensor]:
    """Convert a PIL image into the array/tensor pair used for inference.

    The image is passed through ``T.RandomResize([800], max_size=1333)``,
    ``T.ToTensor()`` and ``T.Normalize(...)`` from the GroundingDINO transforms
    module, with ``None`` as the target.

    Args:
        image: The PIL image to process.

    Returns:
        A ``(image_np, image_transformed)`` tuple: ``np.asarray`` of the input
        image, and the output of the transform pipeline.
    """
    # Per-channel normalization constants (presumably ImageNet statistics — TODO confirm).
    norm_mean = [0.485, 0.456, 0.406]
    norm_std = [0.229, 0.224, 0.225]

    steps = [
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize(norm_mean, norm_std),
    ]
    pipeline = T.Compose(steps)

    image_np: np.ndarray = np.asarray(image)
    # The pipeline returns (image, target); the target is None here and is discarded.
    image_transformed, _ = pipeline(image, None)
    return image_np, image_transformed

In [18]:
# Apply pre_process_image (the GroundingDINO transforms) to the same image.
image_np, image_tensor = pre_process_image(image)

In [19]:
# Check that the local transform matches the HF processor output exactly.
# The batch dimension is added out-of-place (unsqueeze, not unsqueeze_) so
# image_tensor is left unchanged and re-running this cell gives the same result.
torch.equal(image_tensor.unsqueeze(0), inputs_single['pixel_values'])

True

In [25]:
# Token ids the HF processor produced for CAPTION.
inputs_single['input_ids']

tensor([[  101,  2309,  1012,  4800, 23947,  1012,  3976,  1012, 19430,  1012,
           102]])

In [28]:
# Keys of the dict returned by grounding_model.encode_captions.
text_dict.keys()

dict_keys(['encoded_text', 'text_token_mask', 'position_ids', 'text_self_attention_masks'])

In [29]:
# Position ids from the local caption encoding.
# NOTE(review): in the recorded output they appear to restart after each "."
# separator — confirm against encode_captions.
text_dict['position_ids']

tensor([[0, 0, 1, 0, 1, 2, 0, 1, 0, 1, 0]], device='cuda:0')

In [33]:
# Keys returned when only the image processor (no text) is applied to the image.
processor.image_processor(image).keys()

dict_keys(['pixel_values', 'pixel_mask'])

In [35]:
from pathlib import Path
# Load every entry of the sample-data directory as an RGB image. iterdir()
# yields entries in arbitrary order, so the paths are sorted to keep the index
# of each image stable between runs.
image_paths = sorted(Path("/home/farouk-gpu/Grounded-Segment-Anything/scripts/data").iterdir())
images = [Image.open(path).convert("RGB") for path in image_paths]
# Batch the images through the HF image processor only (no text).
output = processor.image_processor(images)

In [None]:
# Pixel mask of the first image in the batch.
output['pixel_mask'][0]

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [43]:
# PIL size, i.e. (width, height), of each loaded image before processing.
[image.size for image in images]

[(3840, 5120), (3000, 4000), (2448, 3264)]