In [None]:
import os
import sys

sys.path.append("..")
import fiftyone as fo

from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import (
    FiftyOneTorchDatasetCOCO,
    FiftyOneTorchDatasetCOCOFilepaths,
    TorchToHFDatasetCOCO,
)

from torch.utils.data import DataLoader

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. utils.data_loader) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Read KEY=VALUE pairs from the local secrets file.
# NOTE(review): the path is an absolute, user-specific location — consider making
# it configurable before sharing this notebook.
tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines and anything that is not a KEY=VALUE pair
        if not line or "=" not in line:
            continue
        # Split on the first "=" only so that values containing "=" stay intact
        key, value = line.split("=", 1)
        tokens[key] = value

# Expose the Hugging Face token to libraries that read it from the environment
os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51")
except Exception:
    # Fall back to a dataset of the same name in the local FiftyOne database
    # (presumably the hub load fails when it was already imported — TODO confirm).
    # `except Exception` keeps KeyboardInterrupt/SystemExit propagating.
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")

# stats() and summary() are methods; call them so the actual content is printed
# instead of the bound-method repr.
print(dataset_v51.stats())
print(dataset_v51.summary())
print(dataset_v51.name)
print(len(dataset_v51))

In [None]:
# Inspect the first sample of the dataset and its ground-truth annotations
sample = dataset_v51.first()
print(sample)
# The same dataset can also be indexed with a sample's filepath
sample2 = dataset_v51[sample.filepath]

ground_truth = sample["ground_truth"]
print(ground_truth)
gt_detections = ground_truth["detections"]
print(len(gt_detections))

# Print every annotated object together with its bounding box
for detection in gt_detections:
    print(detection)
    print(detection.bounding_box)

In [None]:
# Batch size shared by both DataLoaders below and by the pre-tokenized text prompts
batch_size = 12
# Optional: uncomment to work on a 500-sample subset of the dataset
# dataset_v51 = dataset_v51.take(500)

In [None]:
# Dataset/loader pair whose targets are later resolved through
# target_batch_to_detections (which indexes the FiftyOne dataset by filepath).
torch_dataset_filepaths = FiftyOneTorchDatasetCOCOFilepaths(dataset_v51)

torch_dataloader_filepaths = DataLoader(
    dataset=torch_dataset_filepaths,
    batch_size=batch_size,
    shuffle=False,
    # Worker configuration
    num_workers=32,
    prefetch_factor=2,
    persistent_workers=True,
    pin_memory=True,
)

In [None]:
# Dataset/loader pair that yields (image, target) items directly
torch_dataset_managed = FiftyOneTorchDatasetCOCO(dataset_v51)

torch_dataloader_managed = DataLoader(
    dataset=torch_dataset_managed,
    batch_size=batch_size,
    shuffle=False,
    # Worker configuration
    num_workers=32,
    prefetch_factor=2,
    pin_memory=True,
    # Transpose a list of per-item tuples into per-field tuples instead of
    # stacking them into tensors
    collate_fn=lambda batch: list(zip(*batch)),
)

# Show the classes and splits reported by the dataset wrapper
for report in (torch_dataset_managed.get_classes, torch_dataset_managed.get_splits):
    print(report())

In [None]:
# Wrap the managed Torch dataset in the project's converter and run the conversion
# (presumably produces a Hugging Face dataset, judging by the class name —
# confirm in utils.data_loader).
converter = TorchToHFDatasetCOCO(torch_dataset_managed)
hf_dataset = converter.convert()

In [None]:
import fiftyone.utils.coco as fouc
import torch


def target_batch_to_detections(targets, dataset, gt_field="ground_truth"):
    """Build per-sample target dictionaries for a batch of sample keys.

    Each entry of ``targets`` is used to index ``dataset``; the detections stored
    in ``gt_field`` of the resulting sample are converted with
    ``fouc.COCOObject.from_label`` and collected into tensors.

    Args:
        targets: iterable of keys accepted by ``dataset[...]`` (the notebook
            passes the filepaths yielded by the filepath DataLoader).
        dataset: dataset providing ``default_classes`` and item lookup.
        gt_field: name of the sample field holding the ground-truth detections.

    Returns:
        A list with one dict per entry in ``targets`` containing:
            ``boxes``    float32 tensor of shape (N, 4) with the ``[x, y, w, h]``
                         values of each COCO object's ``bbox``
            ``labels``   int64 tensor with the index of each label in
                         ``dataset.default_classes``
            ``image_id`` the ``id`` of the sample
            ``area``     float32 tensor with each COCO object's ``area``
            ``iscrowd``  int64 tensor with each COCO object's ``iscrowd`` flag

    Raises:
        ValueError: if a detection label is not in ``dataset.default_classes``.
    """
    # Use the dataset that was passed in rather than a notebook-level global
    classes = dataset.default_classes
    batch_targets = []
    for filepath in targets:
        sample = dataset[filepath]
        metadata = sample.metadata

        # A sample without the label field is treated as having no objects
        ground_truth = sample[gt_field]
        detections = ground_truth["detections"] if ground_truth is not None else []

        boxes = []
        labels = []
        area = []
        iscrowd = []
        for detection in detections:
            coco_obj = fouc.COCOObject.from_label(
                detection,
                metadata,
                category_id=classes.index(detection.label),
            )
            x, y, w, h = coco_obj.bbox
            boxes.append([x, y, w, h])
            labels.append(coco_obj.category_id)
            area.append(coco_obj.area)
            iscrowd.append(coco_obj.iscrowd)

        batch_targets.append(
            {
                # reshape keeps an (N, 4) layout even when there are no boxes
                "boxes": torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4),
                "labels": torch.as_tensor(labels, dtype=torch.int64),
                "image_id": sample.id,
                "area": torch.as_tensor(area, dtype=torch.float32),
                "iscrowd": torch.as_tensor(iscrowd, dtype=torch.int64),
            }
        )
    return batch_targets

In [None]:
from transformers import (
    AutoProcessor,
    AutoConfig,
    AutoTokenizer,
    AutoModelForZeroShotObjectDetection,
)

# Zero-shot detector checkpoint and the text prompts it is queried with
model_name = "google/owlv2-base-patch16-ensemble"
texts = [
    "car",
    "truck",
    "bus",
    "trailer",
    "motorbike/cycler",
    "pedestrian",
    "van",
    "pickup",
]

config = AutoConfig.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the weights once and move them straight to the target device
# (the checkpoint was previously instantiated twice, discarding the first copy)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name).to(device)

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [None]:
# Repeat the full prompt list once per image of a batch and tokenize it a single
# time up front, so the text inputs can be reused for every batch.
batch_classes = [prompt for _ in range(batch_size) for prompt in texts]
tokenized_text = processor.tokenizer(
    batch_classes,
    padding="max_length",
    return_tensors="pt",
)
tokenized_text = tokenized_text.to(device)

In [None]:
# Sanity check: iterate the managed DataLoader once and compare the number of
# batches it yields with the expected count.
n_samples = len(dataset_v51)
# Ceiling division: the final, partially filled batch still counts as a step
# (the DataLoader above is created without drop_last).
estimated_steps = (n_samples + batch_size - 1) // batch_size
torch.cuda.empty_cache()
steps = 0
for batch_idx, (images, targets) in enumerate(torch_dataloader_managed):
    print(batch_idx)
    steps += 1
print("Estimated steps: ", estimated_steps)
print("Performed steps: ", steps)

In [None]:
torch.cuda.empty_cache()

# Zero-shot inference over the filepath DataLoader; predictions are written to the
# "prediction" field of each FiftyOne sample.
for batch_idx, (images, targets) in enumerate(torch_dataloader_filepaths):
    targets = target_batch_to_detections(targets, dataset_v51)
    # Per-image size without the leading dimension
    # (assumes images are laid out as (C, H, W) — TODO confirm)
    target_sizes = [tuple(img.shape[1:]) for img in images]
    inputs = processor(text=None, images=images, return_tensors="pt").to(device)

    # tokenized_text holds prompts for a full batch; the last batch may contain
    # fewer images, so keep only as many prompt rows as this batch needs.
    n_text_rows = len(images) * len(texts)
    inputs.update(
        {key: value[:n_text_rows] for key, value in tokenized_text.items()}
    )

    with torch.amp.autocast("cuda"), torch.no_grad():
        outputs = model(**inputs)
    results = processor.post_process_object_detection(
        outputs=outputs,
        threshold=0.2,
        target_sizes=target_sizes,
    )

    for result, target in zip(results, targets):
        # Look the sample up once per image instead of once per predicted box.
        # Image size comes from the sample metadata (assumes it is populated).
        sample = dataset_v51[target["image_id"]]
        img_width = sample.metadata.width
        img_height = sample.metadata.height

        detections = []
        for box, score, label in zip(
            result["boxes"], result["scores"], result["labels"]
        ):
            # Convert absolute corner coordinates to the relative
            # [top-left-x, top-left-y, width, height] format used by V51
            x_min, y_min, x_max, y_max = box.tolist()
            detections.append(
                fo.Detection(
                    label=texts[label.item()],
                    bounding_box=[
                        x_min / img_width,
                        y_min / img_height,
                        (x_max - x_min) / img_width,
                        (y_max - y_min) / img_height,
                    ],
                    confidence=score.item(),
                )
            )

        # Attach label to V51 dataset
        sample["prediction"] = fo.Detections(detections=detections)
        sample.save()

In [None]:
torch.cuda.empty_cache()

# Zero-shot inference over the managed DataLoader; predictions are written to the
# "prediction_managed" field of each FiftyOne sample.
for batch_idx, (images, targets) in enumerate(torch_dataloader_managed):
    # Per-image size without the leading dimension
    # (assumes images are laid out as (C, H, W) — TODO confirm)
    target_sizes = [tuple(img.shape[1:]) for img in images]
    inputs = processor(text=None, images=images, return_tensors="pt").to(device)

    # tokenized_text holds prompts for a full batch; the last batch may contain
    # fewer images, so keep only as many prompt rows as this batch needs.
    n_text_rows = len(images) * len(texts)
    inputs.update(
        {key: value[:n_text_rows] for key, value in tokenized_text.items()}
    )

    with torch.amp.autocast("cuda"), torch.no_grad():
        outputs = model(**inputs)
    results = processor.post_process_object_detection(
        outputs=outputs,
        threshold=0.2,
        target_sizes=target_sizes,
    )

    # target_sizes has one entry per IMAGE, so it is zipped with the per-image
    # results here. Zipping it with the per-box tensors (as before) truncated the
    # detections to the batch length and normalized boxes with the wrong size.
    for result, target, img_size in zip(results, targets, target_sizes):
        img_height, img_width = img_size[0], img_size[1]

        detections = []
        for box, score, label in zip(
            result["boxes"], result["scores"], result["labels"]
        ):
            # Convert absolute corner coordinates to the relative
            # [top-left-x, top-left-y, width, height] format used by V51
            x_min, y_min, x_max, y_max = box.tolist()
            detections.append(
                fo.Detection(
                    label=texts[label.item()],
                    bounding_box=[
                        x_min / img_width,
                        y_min / img_height,
                        (x_max - x_min) / img_width,
                        (y_max - y_min) / img_height,
                    ],
                    confidence=score.item(),
                )
            )

        # Attach label to V51 dataset
        sample = dataset_v51[target["image_id"]]
        sample["prediction_managed"] = fo.Detections(detections=detections)
        sample.save()

Filepaths: 10.51 with batch_size 8  
Managed: 9.56 with batch_size 8 