In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import requests
import os
import torch
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import torchvision.transforms as T

import sys

# Add the parent of the current working directory to the import search path.
# NOTE(review): presumably so the project-local `utils` package imported in a
# later cell resolves when the notebook runs from a subfolder — confirm.
sys.path.append("..")

In [None]:
# Read KEY=VALUE pairs from the local secrets file and expose the Hugging Face
# token through the environment.
# NOTE(review): the path is an absolute, user-specific location; anyone else
# running this notebook has to adjust it.
tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        line = line.strip()
        # Blank lines and '#' comments carry no credentials; unpacking them
        # as key/value pairs used to raise ValueError.
        if not line or line.startswith("#"):
            continue
        # Split on the first '=' only so values that themselves contain '='
        # (e.g. base64 padding) are kept intact.
        key, separator, value = line.partition("=")
        if not separator:
            # Not a KEY=VALUE line; nothing to store.
            continue
        tokens[key.strip()] = value.strip()

# Raises KeyError if the file does not define HF_TOKEN.
os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [None]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO


# Fetch a 16-sample subset from the Hugging Face Hub. If that fails, fall back
# to a FiftyOne dataset already registered locally under the same name.
try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51", max_samples=16)
except Exception as load_error:
    # Catch Exception rather than everything so Ctrl-C / kernel interrupts
    # still propagate, and report why the hub download was skipped.
    print(f"load_from_hub failed ({load_error!r}); loading the local dataset instead")
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")

# Wrap the FiftyOne dataset as a torch dataset, then convert it to a
# Hugging Face dataset using the project's converter.
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)
pt_to_hf_converter = TorchToHFDatasetCOCO(pytorch_dataset)
hf_dataset = pt_to_hf_converter.convert()

In [None]:
from torch.utils.data import DataLoader


def zeroshot_collate_fn(batch):
    """Transpose a batch of per-sample tuples into per-field tuples.

    Args:
        batch: Iterable of equally structured samples, e.g. ``(image, target)``.

    Returns:
        A list with one tuple per field, e.g. ``[(img0, img1), (tgt0, tgt1)]``.
        An empty batch yields an empty list.
    """
    per_field = zip(*batch)
    return [*per_field]


# Loader over the torch dataset: two samples per batch, eight worker
# processes, pinned host memory, and the tuple-transposing collate function.
data_loader = DataLoader(
    dataset=pytorch_dataset,
    collate_fn=zeroshot_collate_fn,
    batch_size=2,
    num_workers=8,
    pin_memory=True,
)

In [None]:
# Pull one batch for manual inspection. With the tuple-transposing collate
# function, `batch[0]` is the first field of every sample in the batch and
# `item[0]` is that field's first element.
batch = next(iter(data_loader))
item = batch[0]
image = item[0]

print(batch)
print(item)

In [None]:
# Dataset class names; a "/" joins alternative names for the same class.
classes = [
    "car",
    "truck",
    "bus",
    "trailer",
    "motorbike/cycler",
    "pedestrian",
    "van",
    "pickup",
]

# Split every class name on "/" into individual prompt terms, and remember
# which original class name each term came from.
processed_classes = []
class_parts_dict = {}
for classname in classes:
    for part in classname.split("/"):
        processed_classes.append(part)
        class_parts_dict[part] = classname

# From here on `classes` refers to the flattened prompt terms.
classes = processed_classes
print(classes)

In [None]:
model_id = "IDEA-Research/grounding-dino-tiny"
# Use the GPU when one is present, otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

# Build the text prompt: class terms joined by ". " with a trailing period.
text = ". ".join(classes) + "."
print(text)

# Tokenize the prompt on its own; a later cell reuses `tokenized_text`.
tokenized_text = processor.tokenizer(text, return_tensors="pt")
print(tokenized_text)
inputs = processor(images=image, text=text, return_tensors="pt").to(device)
print(inputs)
with torch.no_grad():
    outputs = model(**inputs)

# One size entry for the single image, taken from every dimension after the
# first. NOTE(review): assumes `image` is a channel-first (C, H, W) tensor — confirm.
target_size = [tuple(image.shape[1:])]

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.2,
    text_threshold=0.2,
    target_sizes=target_size
)
print(results)

In [None]:
# Visualize the detections of the single-image run: `results[0]` holds the
# boxes and labels for that one image.
boxes = results[0]["boxes"]
labels = results[0]["labels"]

image_pil = T.ToPILImage()(image)

# Draw bounding boxes on the image
draw = ImageDraw.Draw(image_pil)
for box, label in zip(boxes, labels):
    # Convert the box to plain Python floats once, so PIL never receives
    # (possibly GPU-resident) tensor scalars as coordinates.
    x_min, y_min, x_max, y_max = box.tolist()
    draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
    # str() keeps the text call valid even if a label is not already a string.
    draw.text((x_min, y_min), str(label), fill="red")

# Display the image
display(image_pil)

In [None]:
# Repeat the tokenized prompt once per batch slot. Every entry is the same
# object, not a copy.
batch_tokenized_text = [tokenized_text for _ in range(data_loader.batch_size)]
print(batch_tokenized_text)

# Same replication for the raw prompt string.
batch_text = [text for _ in range(data_loader.batch_size)]
print(batch_text)

In [None]:
import os
from tqdm import tqdm

# One copy of the prompt per slot of a full batch (also reused further down).
batch_text = [text] * data_loader.batch_size
print(batch_text)

os.environ["TOKENIZERS_PARALLELISM"] = "true"
# First run: the processor tokenizes the prompt again for every batch.
for images, targets in tqdm(data_loader):
    # The last batch may hold fewer than batch_size images, so size the prompt
    # list from the batch itself to keep text and image counts aligned.
    prompts = [text] * len(images)
    inputs = processor(text=prompts, images=images, return_tensors="pt").to(device)
    print("Inputs shape (first run):", {k: v.shape for k, v in inputs.items()})
    with torch.no_grad():
        outputs = model(**inputs)
    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=0.2,
    )

# Second run: tokenize the prompt once up front and reuse it for every batch.
tokenized_texts = processor.tokenizer(
    batch_text,
    padding="max_length",
    return_tensors="pt",
    max_length=256,  # Adjust max_length to match vision hidden state
).to(device)
print("Tokenized texts shape:", {k: v.shape for k, v in tokenized_texts.items()})

for images, targets in tqdm(data_loader):
    # Only the images go through the processor here; the text tensors are
    # merged in from the pre-tokenized batch.
    inputs = processor(text=None, images=images, return_tensors="pt").to(device)
    # The last batch may hold fewer than batch_size images; keep only as many
    # text rows as there are images so both modalities share one batch size.
    num_images = len(images)
    inputs.update({k: v[:num_images] for k, v in tokenized_texts.items()})
    print("Inputs shape (second run):", {k: v.shape for k, v in inputs.items()})
    with torch.no_grad():
        outputs = model(**inputs)
    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=0.2,
    )
    # Report how many detections each image in the batch produced.
    for result in results:
        boxes, scores, labels = (
            result["boxes"],
            result["scores"],
            result["labels"],
        )
        print(len(labels))

In [None]:
import os

# Read KEY=VALUE pairs from the local secrets file and expose the Hugging Face
# token through the environment.
# NOTE(review): the path is an absolute, user-specific location; anyone else
# running this notebook has to adjust it.
tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        line = line.strip()
        # Blank lines and '#' comments carry no credentials; unpacking them
        # as key/value pairs used to raise ValueError.
        if not line or line.startswith("#"):
            continue
        # Split on the first '=' only so values that themselves contain '='
        # (e.g. base64 padding) are kept intact.
        key, separator, value = line.partition("=")
        if not separator:
            # Not a KEY=VALUE line; nothing to store.
            continue
        tokens[key.strip()] = value.strip()

# Raises KeyError if the file does not define HF_TOKEN.
os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [None]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO

# Fetch the dataset from the Hugging Face Hub (no sample limit this time).
# If that fails, fall back to a FiftyOne dataset already registered locally
# under the same name.
try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51")
except Exception as load_error:
    # Catch Exception rather than everything so Ctrl-C / kernel interrupts
    # still propagate, and report why the hub download was skipped.
    print(f"load_from_hub failed ({load_error!r}); loading the local dataset instead")
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")

# Wrap the FiftyOne dataset as a torch dataset, then convert it to a
# Hugging Face dataset using the project's converter.
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)
pt_to_hf_converter = TorchToHFDatasetCOCO(pytorch_dataset)
hf_dataset = pt_to_hf_converter.convert()

In [None]:
from tqdm import tqdm

# Run zero-shot detection on a fixed random subset of 16 samples and store the
# detections on each sample under the "predictions" field.
predictions_view = dataset_v51.take(16, seed=51)

for sample in tqdm(predictions_view):
    # The context manager closes the image file once the processor has
    # converted it into model inputs.
    with Image.open(sample.filepath) as image:
        inputs = processor(text=text, images=image, return_tensors="pt").to(device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores.
    # No target_sizes are passed. NOTE(review): the boxes are therefore expected
    # to stay in relative [0, 1] corner coordinates, which the conversion below
    # relies on — confirm against the installed transformers version.
    results = processor.post_process_grounded_object_detection(
        outputs, inputs.input_ids, box_threshold=0.2, text_threshold=0.2
    )

    # Start from an empty list for every sample so an empty `results` can never
    # reuse the previous sample's detections (or hit an undefined name).
    detections = []
    for result in results:
        boxes, scores, labels = (
            result["boxes"],
            result["scores"],
            result["labels"],
        )
        print(len(labels))

        # Convert to V51 format
        # Convert to [top-left-x, top-left-y, width, height]
        # in relative coordinates in [0, 1] x [0, 1]
        for box, score, label in zip(boxes, scores, labels):
            top_left_x = box[0].item()
            top_left_y = box[1].item()
            box_width = (box[2] - box[0]).item()
            box_height = (box[3] - box[1]).item()

            detection = fo.Detection(
                label=label,
                bounding_box=[top_left_x, top_left_y, box_width, box_height],
                confidence=score.item(),
            )
            detections.append(detection)

    sample["predictions"] = fo.Detections(detections=detections)
    sample.save()

In [None]:
# Open the FiftyOne App on the view holding the 16 samples with predictions.
session = fo.launch_app(view=predictions_view)