In [48]:
import torch
import requests
from io import BytesIO
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

In [50]:
processor = AutoProcessor.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
model = AutoModelForZeroShotObjectDetection.from_pretrained(
    "omlab/omdet-turbo-swin-tiny-hf"
)

classes = ["cat", "remote", "car", "truck", "bus", "traffic light"]
task = "Detect {}.".format(", ".join(classes))

url1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
image1 = Image.open(BytesIO(requests.get(url1).content)).convert("RGB")


url2 = "http://images.cocodataset.org/train2017/000000257813.jpg"
image2 = Image.open(BytesIO(requests.get(url2).content)).convert("RGB")

url3 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
image3 = Image.open(BytesIO(requests.get(url3).content)).convert("RGB")

image4 = Image.open(
    "/home/dbogdoll/mcity_data_engine/scripts/fisheye_small.png"
).convert("RGB")
image5 = Image.open(
    "/home/dbogdoll/mcity_data_engine/scripts/fisheye_test.png"
).convert("RGB")

images = [image1, image2, image3, image4, image5]
batch_classes = [classes] * len(images)
batch_task = [task] * len(images)
print(batch_task)
print(batch_classes)
print(batch_task)

for image in images:
    # Print image information
    print(f"Image size: {image.size}")
    print(f"Image mode: {image.mode}")
    print(f"Image format: {image.format}")

inputs = processor(
    images=images,
    text=batch_classes,
    task=batch_task,
    return_tensors="pt",
)
print(inputs)
with torch.amp.autocast("cuda"):
    with torch.no_grad():
        outputs = model(**inputs)

print(outputs)
# convert outputs (bounding boxes and class logits)
target_sizes = [img.size[::-1] for img in images]
print(target_sizes)
results = processor.post_process_grounded_object_detection(
    outputs,
    classes=batch_classes,
    target_sizes=target_sizes,
    score_threshold=0.2,
    nms_threshold=0.3,
)
print(results)
for i, result in enumerate(results):
    for score, class_name, box in zip(
        result["scores"], result["classes"], result["boxes"]
    ):
        box = [round(i, 1) for i in box.tolist()]
        print(
            f"Detected {class_name} with confidence "
            f"{round(score.item(), 2)} at location {box} in image {i}"
        )

['Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.']
[['cat', 'remote', 'car', 'truck', 'bus', 'traffic light'], ['cat', 'remote', 'car', 'truck', 'bus', 'traffic light'], ['cat', 'remote', 'car', 'truck', 'bus', 'traffic light'], ['cat', 'remote', 'car', 'truck', 'bus', 'traffic light'], ['cat', 'remote', 'car', 'truck', 'bus', 'traffic light']]
['Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.', 'Detect cat, remote, car, truck, bus, traffic light.']
Image size: (640, 480)
Image mode: RGB
Image format: None
Image size: (640, 360)
Image mode: RGB
Image format: None
Image size: (1600, 1067)
Image mode:

In [47]:
# Draw bounding boxes on the image
i = 4
image = images[i]
draw = ImageDraw.Draw(image)

boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["classes"]


for box, label in zip(boxes, labels):
    draw.rectangle(box.tolist(), outline="red", width=2)
    draw.text((box[0], box[1]), label, fill="red")

# Display the image
display(image)

IndexError: list index out of range

In [23]:
import os

tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        key, value = line.strip().split("=")
        tokens[key] = value

os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [24]:
import sys

sys.path.append("..")

import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO

try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51", max_samples=32)
except:
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)

Downloading config file fiftyone.yml from dbogdollumich/mcity_fisheye_v51
Loading dataset


In [54]:
from torchvision import transforms
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader

from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name = "omlab/omdet-turbo-swin-tiny-hf"

batch_size = 2
pred_key = "predictions_omdet"
eval_key = "eval"
transform = transforms.Compose([transforms.ToTensor()])

pytorch_dataset = FiftyOneTorchDatasetCOCO(
    dataset_v51,
    # transforms=transform,
)
data_loader = DataLoader(
    pytorch_dataset,
    batch_size=batch_size,
    num_workers=8,
    pin_memory=True,
    collate_fn=lambda x: list(zip(*x)),
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

classes_v51 = dataset_v51.default_classes


processor = AutoProcessor.from_pretrained(model_name)
batch_classes = [classes_v51] * batch_size
task = "Detect {}.".format(", ".join(classes_v51))

batch_tasks = [task] * batch_size
print(classes_v51)
print(batch_classes)
print(batch_tasks)

model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name).to(device)

for step, (images, targets) in enumerate(
    tqdm(data_loader, desc="Zero Shot Teacher Model " + model_name)
):

    for image in images:
        # Print image information
        print(f"Image size: {image.size}")
        print(f"Image mode: {image.mode}")
        print(f"Image format: {image.format}")

    target_sizes = [img.size[::-1] for img in images]

    # target_sizes = [
    #    tuple(img.shape[1:]) for img in images
    # ]  # style [(480,640),(480,640)] h,w
    # images = [(image).to(device, non_blocking=True) for image in images]

    print(images)
    print(target_sizes)

    inputs = processor(
        text=batch_classes,
        images=images,
        task=batch_tasks,
        return_tensors="pt",
    ).to(device)

    print(inputs)
    with torch.amp.autocast("cuda"):
        with torch.no_grad():
            outputs = model(**inputs)

    print(outputs)
    results = processor.post_process_grounded_object_detection(
        outputs,
        classes=batch_classes,
        score_threshold=0.2,
        nms_threshold=0.3,
        target_sizes=target_sizes,
    )
    print(results)
    for result, target in zip(results, targets):
        boxes, scores, labels = result["boxes"], result["scores"], result["classes"]

        detections = []
        for box, score, label in zip(boxes, scores, labels):
            # Get image size
            img_path = pytorch_dataset.img_paths[
                target["image_id"].item()
            ]  # ID is stored in annotation
            sample = dataset_v51[img_path]
            img_width = sample.metadata.width
            img_height = sample.metadata.height

            label = label
            # Convert bbox to V51 type
            top_left_x = box[0].item() / img_width
            top_left_y = box[1].item() / img_height
            box_width = (box[2].item() - box[0].item()) / img_width
            box_height = (box[3].item() - box[1].item()) / img_height

            detection = fo.Detection(
                label=label,
                bounding_box=[top_left_x, top_left_y, box_width, box_height],
                confidence=score.item(),
            )
            detections.append(detection)

        img_path = pytorch_dataset.img_paths[
            target["image_id"].item()
        ]  # ID is stored in annotation
        sample = dataset_v51[img_path]
        sample[pred_key] = fo.Detections(detections=detections)
        sample.save()

['car', 'truck', 'bus', 'trailer', 'motorbike/cycler', 'pedestrian', 'van', 'pickup']
[['car', 'truck', 'bus', 'trailer', 'motorbike/cycler', 'pedestrian', 'van', 'pickup'], ['car', 'truck', 'bus', 'trailer', 'motorbike/cycler', 'pedestrian', 'van', 'pickup']]
['Detect car, truck, bus, trailer, motorbike/cycler, pedestrian, van, pickup.', 'Detect car, truck, bus, trailer, motorbike/cycler, pedestrian, van, pickup.']


Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:   0%|          | 0/16 [00:00<?, ?it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B40B9FEAB0>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B40B9FFCB0>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:   6%|▋         | 1/16 [00:00<00:07,  1.95it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B40B9FE330>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B2860AF830>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  19%|█▉        | 3/16 [00:00<00:03,  3.92it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B40B9FD160>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B40B9FFEF0>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  25%|██▌       | 4/16 [00:01<00:02,  4.71it/s]

OmDetTurboObjectDetectionOutput(loss=None, decoder_coord_logits=tensor([[[0.4298, 0.3025, 0.0833, 0.0508],
         [0.3678, 0.2465, 0.0613, 0.0482],
         [0.7713, 0.4137, 0.0421, 0.0462],
         ...,
         [0.6536, 0.0582, 0.0546, 0.1169],
         [0.7208, 0.8510, 0.1627, 0.1863],
         [0.0466, 0.0877, 0.0494, 0.1014]],

        [[0.4110, 0.2961, 0.0816, 0.0545],
         [0.4137, 0.2392, 0.0687, 0.0500],
         [0.7269, 0.3893, 0.0558, 0.0594],
         ...,
         [0.7621, 0.6264, 0.0094, 0.0250],
         [0.3780, 0.8950, 0.0285, 0.0209],
         [0.2051, 0.7050, 0.1876, 0.4297]]], device='cuda:0'), decoder_class_logits=tensor([[[ 0.3713, -2.2285, -2.6035,  ..., -3.9688, -0.1764, -4.1719],
         [ 1.2012, -2.8848, -2.7070,  ..., -4.4766, -1.5586, -4.1133],
         [ 0.2708, -2.1562, -2.3848,  ..., -3.7520, -1.1787, -3.2129],
         ...,
         [-4.5625, -4.5078, -4.0664,  ..., -4.5273, -5.2461, -6.1953],
         [-4.0898, -4.4570, -4.2305,  ..., -4.5547,

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  38%|███▊      | 6/16 [00:01<00:01,  5.26it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B2860AE7E0>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B2860ACEF0>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  44%|████▍     | 7/16 [00:01<00:01,  4.92it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B42519E150>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B2860AC290>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  50%|█████     | 8/16 [00:01<00:01,  4.07it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B40B9FD940>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B278FC6D20>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  62%|██████▎   | 10/16 [00:02<00:01,  4.32it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B40B9ADA90>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B278FC7980>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  75%|███████▌  | 12/16 [00:02<00:00,  4.48it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B2D862BFE0>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B27B7E2390>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  88%|████████▊ | 14/16 [00:03<00:00,  5.08it/s]

Image size: (1280, 960)
Image mode: RGB
Image format: None
Image size: (1280, 960)
Image mode: RGB
Image format: None
[<PIL.Image.Image image mode=RGB size=1280x960 at 0x74B42519F650>, <PIL.Image.Image image mode=RGB size=1280x960 at 0x74B278FC4470>]
[(960, 1280), (960, 1280)]
{'tasks_input_ids': tensor([[49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33341,   270,  9791,   528,   267, 18256,   267,  2451,   267, 15382,
           269, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407],
        [49406, 22744,  1615,   267,  4629,   267,  2840,   267,  4700,   267,
         33

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf:  94%|█████████▍| 15/16 [00:03<00:00,  5.33it/s]

[{'boxes': tensor([[ 933.5062,  420.6128,  993.9875,  465.5684],
        [ 627.6803,   50.6561,  654.6149,   68.0243],
        [ 603.2359,   64.6306,  630.5883,   84.1653],
        [ 551.3030,   73.1690,  596.5028,  102.6445],
        [ 667.2302,   41.3726,  691.4952,   54.3660],
        [ 422.7753,  129.2444,  490.6756,  171.7367],
        [ 465.0864,  837.9359,  521.8938,  866.5502],
        [ 353.5757,  813.1107,  377.2507,  838.2526],
        [ 488.3422,  113.6239,  536.4702,  148.8517],
        [1010.8802,  453.0645, 1051.2993,  547.1017],
        [ 503.2277,   98.4449,  537.4465,  118.6158],
        [ 933.5062,  420.6128,  993.9875,  465.5684],
        [ 371.1394,  847.9821,  392.8131,  869.6762],
        [ 555.2268,   89.1848,  578.6306,  104.7550],
        [ 539.4781,  824.0859,  558.2230,  859.1075],
        [ 551.3030,   73.1690,  596.5028,  102.6445],
        [ 808.8660,  388.8534,  916.7565,  531.3670],
        [ 488.5841,  101.9962,  545.1633,  148.2708],
        [ 578.790

Zero Shot Teacher Model omlab/omdet-turbo-swin-tiny-hf: 100%|██████████| 16/16 [00:03<00:00,  4.38it/s]


In [55]:
session = fo.launch_app(dataset_v51)