In [None]:
import torch
import requests
from io import BytesIO
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

In [None]:
# Load the processor and the zero-shot detection model from the same
# checkpoint (processor first, then model).
processor, model = (
    loader.from_pretrained("omlab/omdet-turbo-swin-tiny-hf")
    for loader in (AutoProcessor, AutoModelForZeroShotObjectDetection)
)

# Candidate class names; the task prompt is the same list joined into a single
# sentence.
classes = ["cat", "remote", "car", "truck", "bus", "traffic light"]
task = "Detect {}.".format(", ".join(classes))


def _fetch_rgb_image(url, timeout=30):
    """Download ``url`` and return its content as an RGB PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the kernel.

    Returns:
        The downloaded image converted to mode "RGB".

    Raises:
        requests.HTTPError: If the server answers with an error status; this
            keeps an error page from being handed to PIL as image data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")


url1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
image1 = _fetch_rgb_image(url1)

url2 = "http://images.cocodataset.org/train2017/000000257813.jpg"
image2 = _fetch_rgb_image(url2)

url3 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
image3 = _fetch_rgb_image(url3)

# NOTE(review): absolute paths to local files; presumably these only exist on
# the author's machine - confirm before running elsewhere.
image4 = Image.open(
    "/home/dbogdoll/mcity_data_engine/scripts/fisheye_small.png"
).convert("RGB")
image5 = Image.open(
    "/home/dbogdoll/mcity_data_engine/scripts/fisheye_test.png"
).convert("RGB")

# One class list and one task prompt per image in the batch.
images = [image1, image2, image3, image4, image5]
batch_classes = [classes] * len(images)
batch_task = [task] * len(images)
print(batch_task)
print(batch_classes)

# Show size, mode and format of every input image, one attribute per line.
for image in images:
    print(
        f"Image size: {image.size}",
        f"Image mode: {image.mode}",
        f"Image format: {image.format}",
        sep="\n",
    )

# Run on the GPU when one is available, otherwise stay on the CPU. Model and
# inputs must live on the same device for autocast to have any effect.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# The processor receives one class list and one task prompt per image.
inputs = processor(
    images=images,
    text=batch_classes,
    task=batch_task,
    return_tensors="pt",
).to(device)
print(inputs)

# Inference only: no gradients are needed. Mixed precision is enabled only on
# CUDA; on a CPU-only host the autocast context is switched off explicitly.
with torch.no_grad(), torch.amp.autocast(
    device_type=device.type, enabled=device.type == "cuda"
):
    outputs = model(**inputs)

print(outputs)
# Post-processing takes each image size as (height, width); the image's
# `size` attribute is ordered the other way round, so the pair is swapped.
target_sizes = [(height, width) for width, height in (img.size for img in images)]
print(target_sizes)

# Convert the raw model outputs into per-image boxes, scores and class names,
# using a 0.2 score threshold and a 0.3 NMS threshold.
results = processor.post_process_grounded_object_detection(
    outputs,
    target_sizes=target_sizes,
    classes=batch_classes,
    nms_threshold=0.3,
    score_threshold=0.2,
)
print(results)
# Print one human-readable line per detection; `i` is the index of the image
# the detection belongs to.
for i, result in enumerate(results):
    per_detection = zip(result["scores"], result["classes"], result["boxes"])
    for score, class_name, box in per_detection:
        # Round the box coordinates to one decimal place for display
        box = [round(coordinate, 1) for coordinate in box.tolist()]
        print(
            f"Detected {class_name} with confidence "
            f"{round(score.item(), 2)} at location {box} in image {i}"
        )

In [None]:
# Draw the detections of one image. Drawing happens on a copy so the image
# stored in `images` stays unmodified: the cell can be re-run, or `i` changed,
# without boxes accumulating on the data that is fed to the model.
i = 4
image = images[i].copy()
draw = ImageDraw.Draw(image)

boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["classes"]

for box, label in zip(boxes, labels):
    # Convert the tensor to plain Python floats before handing it to PIL
    x_min, y_min, x_max, y_max = box.tolist()
    draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
    # Class name at the top-left corner of the box
    draw.text((x_min, y_min), label, fill="red")

# Display the annotated copy
display(image)

In [None]:
import os

# Read KEY=VALUE pairs from the local secrets file and export the Hugging Face
# token to the environment. A missing HF_TOKEN entry raises KeyError.
tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines, comment lines and anything that is not KEY=VALUE
        if not line or line.startswith("#") or "=" not in line:
            continue
        # Split on the first "=" only, so a value may itself contain "="
        key, value = line.split("=", 1)
        tokens[key.strip()] = value.strip()

os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [None]:
import sys

sys.path.append("..")

import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO

# Fetch up to 32 samples from the Hugging Face Hub. If that fails (presumably
# because a dataset with this name already exists locally - confirm), fall
# back to loading the dataset of the same name from the local FiftyOne store.
try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51", max_samples=32)
except Exception as hub_error:
    # `Exception` rather than a bare `except`, so Ctrl-C / kernel interrupts
    # are not swallowed; the reason for the fallback is reported.
    print(f"load_from_hub failed ({hub_error!r}); loading the local dataset instead")
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)

In [None]:
from torchvision import transforms
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader

from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Turn off parallelism in the tokenizers library for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Run configuration
model_name = "omlab/omdet-turbo-swin-tiny-hf"
batch_size = 2
pred_key, eval_key = "predictions_omdet", "eval"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tensor conversion is defined here but not handed to the dataset below.
transform = transforms.Compose([transforms.ToTensor()])

# Torch view on the FiftyOne dataset. The collate function regroups a batch of
# (image, target) pairs into an (images, targets) pair of tuples.
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)
data_loader = DataLoader(
    dataset=pytorch_dataset,
    batch_size=batch_size,
    collate_fn=lambda samples: list(zip(*samples)),
    num_workers=8,
    pin_memory=True,
)

# The classes to detect come from the dataset; the task prompt lists them all.
classes_v51 = dataset_v51.default_classes
processor = AutoProcessor.from_pretrained(model_name)
task = "Detect {}.".format(", ".join(classes_v51))

# One class list and one task prompt per image of a full batch.
batch_classes = [classes_v51 for _ in range(batch_size)]
batch_tasks = [task for _ in range(batch_size)]
print(classes_v51, batch_classes, batch_tasks, sep="\n")

# Load the detector and move it to the selected device.
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name)
model = model.to(device)

# Run the zero-shot detector over the whole dataset and store its predictions
# on every FiftyOne sample under `pred_key`.
for step, (images, targets) in enumerate(
    tqdm(data_loader, desc="Zero Shot Teacher Model " + model_name)
):
    # Size the text inputs from the actual batch: the final batch is shorter
    # than `batch_size` whenever the dataset size is not a multiple of it, and
    # the processor needs one class list and one task prompt per image.
    n_images = len(images)
    step_classes = [classes_v51] * n_images
    step_tasks = [task] * n_images

    # (height, width) per image; boxes are scaled to these sizes below
    target_sizes = [img.size[::-1] for img in images]

    inputs = processor(
        text=step_classes,
        images=images,
        task=step_tasks,
        return_tensors="pt",
    ).to(device)

    # Inference only: no gradients. Mixed precision is enabled only when the
    # model actually runs on CUDA.
    with torch.no_grad(), torch.amp.autocast(
        device_type=device.type, enabled=device.type == "cuda"
    ):
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        classes=step_classes,
        score_threshold=0.2,
        nms_threshold=0.3,
        target_sizes=target_sizes,
    )

    for result, target, (img_height, img_width) in zip(
        results, targets, target_sizes
    ):
        detections = []
        for box, score, label in zip(
            result["boxes"], result["scores"], result["classes"]
        ):
            # Pixel corners of the box, in the coordinate frame of target_sizes
            x_min, y_min, x_max, y_max = box.tolist()

            # Convert bbox to V51 type: [top-left-x, top-left-y, width, height],
            # each divided by the image width / height
            detections.append(
                fo.Detection(
                    label=label,
                    bounding_box=[
                        x_min / img_width,
                        y_min / img_height,
                        (x_max - x_min) / img_width,
                        (y_max - y_min) / img_height,
                    ],
                    confidence=score.item(),
                )
            )

        # The image ID stored in the annotation indexes the dataset's image
        # paths; the path in turn identifies the FiftyOne sample. Looked up
        # once per image, not once per detection.
        img_path = pytorch_dataset.img_paths[target["image_id"].item()]
        sample = dataset_v51[img_path]
        sample[pred_key] = fo.Detections(detections=detections)
        sample.save()

In [None]:
session = fo.launch_app(dataset_v51)