https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/grounding-dino
https://huggingface.co/IDEA-Research/grounding-dino-tiny

https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/owlv2
https://huggingface.co/google/owlv2-base-patch16-ensemble

https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/owlvit
https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/omdet-turbo

https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForZeroShotObjectDetection

https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection


In [1]:
from PIL import Image
import numpy as np
import torch
from transformers import (
    AutoProcessor,
    AutoConfig,
    AutoTokenizer,
    AutoModelForZeroShotObjectDetection,
)
import sys

sys.path.append("..")

In [2]:
import os

tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        key, value = line.strip().split("=")
        tokens[key] = value

os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [3]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO

try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51")
except:
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)
pt_to_hf_converter = TorchToHFDatasetCOCO(pytorch_dataset)
hf_dataset = pt_to_hf_converter.convert()

Downloading config file fiftyone.yml from dbogdollumich/mcity_fisheye_v51
Loading dataset
Importing samples...
 100% |███████████████| 2744/2744 [79.7ms elapsed, 0s remaining, 34.4K samples/s]   


In [4]:
model_name = "google/owlv2-base-patch16-ensemble"
texts = [
    [
        "car",
        "truck",
        "bus",
        "trailer",
        "motorbike/cycler",
        "pedestrian",
        "van",
        "pickup",
    ]
]

config = AutoConfig.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(config)
print(processor)
print(model)
print(tokenizer)

Owlv2Config {
  "_name_or_path": "google/owlv2-base-patch16-ensemble",
  "architectures": [
    "Owlv2ForObjectDetection"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "owlv2",
  "projection_dim": 512,
  "text_config": {
    "model_type": "owlv2_text_model"
  },
  "torch_dtype": "float32",
  "transformers_version": "4.46.0",
  "vision_config": {
    "image_size": 960,
    "model_type": "owlv2_vision_model"
  }
}

Owlv2Processor:
- image_processor: Owlv2ImageProcessor {
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Owlv2ImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Owlv2Processor",
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 960,
    "width": 960
  }
}

- tokenizer: CLIPTokenizerFast(name_or_path='google/owlv2-base

Training not yet implemented in HF, but maybe coming
- https://github.com/huggingface/transformers/pull/34057/commits/a4f3d660b7ba9ac269c1e0870ea6e9048f72bdc0
- https://github.com/huggingface/transformers/issues/33664
- https://github.com/huggingface/transformers/issues/20091
- https://github.com/stevebottos/owl-vit-object-detection
- https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit#fine-tuning


In [5]:
from tqdm import tqdm

predictions_view = dataset_v51.take(16, seed=51)

for sample in tqdm(predictions_view):
    image = Image.open(sample.filepath)
    inputs = processor(text=texts, images=image, return_tensors="pt")

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # Get original image size
    original_size = torch.Tensor([image.size[::-1]])

    # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
    results = processor.post_process_object_detection(
        outputs=outputs, threshold=0.2, target_sizes=original_size
    )
    i = 0  # Retrieve predictions for the first image for the corresponding text queries
    text = texts[i]
    boxes, scores, labels = (
        results[i]["boxes"],
        results[i]["scores"],
        results[i]["labels"],
    )

    # Convert to V51 format
    # Convert to [top-left-x, top-left-y, width, height]
    # in relative coordinates in [0, 1] x [0, 1]
    width, height = image.size
    detections = []
    for box, score, label in zip(boxes, scores, labels):
        top_left_x = box[0].item() / width
        top_left_y = box[1].item() / height
        box_width = (box[2] - box[0]).item() / width
        box_height = (box[3] - box[1]).item() / height

        detection = fo.Detection(
            label=texts[0][label],
            bounding_box=[top_left_x, top_left_y, box_width, box_height],
            confidence=score.item(),
        )
        detections.append(detection)

    sample["predictions"] = fo.Detections(detections=detections)
    sample.save()

100%|██████████| 16/16 [00:53<00:00,  3.34s/it]


In [6]:
counts = predictions_view.count_values("ground_truth.detections.label")
classes = sorted(counts, key=counts.get, reverse=True)[:10]

results = predictions_view.evaluate_detections(
    "predictions",
    gt_field="ground_truth",
    eval_key="eval",
    compute_mAP=True,
)
results.print_report(classes=classes)
print("mAP: ", results.mAP())

# Print some statistics about the total TP/FP/FN counts
print("TP: %d" % predictions_view.sum("eval_tp"))
print("FP: %d" % predictions_view.sum("eval_fp"))
print("FN: %d" % predictions_view.sum("eval_fn"))

Evaluating detections...
 100% |███████████████████| 16/16 [331.5ms elapsed, 0s remaining, 48.3 samples/s]      
Performing IoU sweep...
 100% |███████████████████| 16/16 [215.2ms elapsed, 0s remaining, 74.4 samples/s]      
              precision    recall  f1-score   support

         car       0.21      0.36      0.27       172
      pickup       0.00      0.00      0.00         9
         van       0.00      0.00      0.00         5
  pedestrian       0.00      0.00      0.00         4
         bus       0.00      0.00      0.00         3
     trailer       0.00      0.00      0.00         2
       truck       0.00      0.00      0.00         1

   micro avg       0.19      0.32      0.23       196
   macro avg       0.03      0.05      0.04       196
weighted avg       0.19      0.32      0.23       196

mAP:  0.008406406202513959
TP: 62
FP: 271
FN: 134


In [7]:
plot = results.plot_pr_curves(classes=classes)
plot.show()



FigureWidget({
    'data': [{'customdata': array([0.47194517, 0.35150869, 0.33640278, 0.28810968, 0.25011544, 0.20931489,
                                   0.2029481 , 0.19749833, 0.19400765, 0.18415645, 0.15167029, 0.15070338,
                                   0.14392901, 0.11528651, 0.11283644, 0.11201805, 0.11009784, 0.10722623,
                                   0.10648181, 0.1046691 , 0.10093615, 0.09720277, 0.09624246, 0.06809093,
                                   0.06583647, 0.06385861, 0.0621736 , 0.06123711, 0.0554347 , 0.03243491,
                                   0.03147317, 0.02908981, 0.02823469, 0.02767637, 0.02727703, 0.02385185,
                                   0.02259308, 0.        , 0.        , 0.        , 0.        , 0.        ,
                                   0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
                                   0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
                      

In [8]:
plot = results.plot_confusion_matrix(classes=classes)
plot.show()



FigureWidget({
    'data': [{'mode': 'markers',
              'opacity': 0.1,
              'type': 'scatter',
              'uid': 'f157334b-ac8a-488b-89ec-8c552646e334',
              'x': array([0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
                          0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
                          0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]),
              'y': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
                          3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
                          6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7])},
             {'colorscale': [[0.0, 'rgb(255,245,235)'], [0.125,
                             'rgb(254,230,206)'], [0.25, 'rgb(253,208,162)'],
                             [0.375, 'rgb(253,174,107)'], [0.5, 'rgb(253,141,60)'],
                             [0.625, 'rgb(241,105,19)'], 

In [9]:
session = fo.launch_app(view=predictions_view)

Connected to FiftyOne on port 5151 at localhost.
If you are not connecting to a remote session, you may need to start a new session and specify a port


In [10]:
categories = dataset_v51.default_classes
id2label = {index: x for index, x in enumerate(categories, start=0)}
label2id = {v: k for k, v in id2label.items()}


def collate_fn(batch):
    print(batch)
    data = {}
    data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
    data["labels"] = [x["labels"] for x in batch]
    if "pixel_mask" in batch[0]:
        data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch])

    input_ids = tokenizer(texts, padding=True, return_tensors="pt")["input_ids"]
    data["input_ids"] = input_ids.repeat(len(batch), 1)  # Match batch size
    return data


def transform_batch(examples, image_processor, return_pixel_mask=False):
    """Apply format annotations in COCO format for object detection task"""

    images = []
    annotations = []

    for image_path, annotation in zip(examples["image"], examples["target"]):
        image = Image.open(image_path).convert("RGB")
        image_np = np.array(image)
        images.append(image_np)

        # Annotation needs to be in COCO style annotation per bounding box
        coco_annotations = []
        for i, bbox in enumerate(annotation["bbox"]):

            # Convert bbox x_min, y_min, w, h to YOLO format x_center, y_center, w, h
            bbox[0] = bbox[0] + bbox[2] / 2.0
            bbox[1] = bbox[1] + bbox[3] / 2.0

            # Ensure bbox values are within the expected range
            assert all(0 <= coord <= 1 for coord in bbox), f"Invalid bbox: {bbox}"

            coco_annotation = {
                "image_id": annotation["image_id"],
                "bbox": bbox,
                "category_id": annotation["category_id"][i],
                "area": annotation["area"][i],
                "iscrowd": 0,
            }
            coco_annotations.append(coco_annotation)
        detr_annotation = {
            "image_id": annotation["image_id"],
            "annotations": coco_annotations,
        }
        annotations.append(detr_annotation)

    # Apply the image processor transformations: resizing, rescaling, normalization
    result = image_processor(images=images, text=texts, return_tensors="pt")

    if not return_pixel_mask:
        result.pop("pixel_mask", None)

    return result

In [11]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import (
    AutoConfig,
    AutoProcessor,
    AutoModelForZeroShotObjectDetection,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

import evaluate
from datasets import Split
from functools import partial

import wandb

In [12]:
# Finetuning the model on our data
image_processor = AutoProcessor.from_pretrained(
    model_name,
    do_resize=False,
    do_pad=False,  # Assumes all images have the same size
    do_convert_annotations=False,  # expects YOLO (center_x, center_y, width, height) between [0,1]
)

hf_model_config = AutoConfig.from_pretrained(model_name)
train_transform_batch = partial(transform_batch, image_processor=image_processor)
validation_transform_batch = partial(transform_batch, image_processor=image_processor)

hf_dataset[Split.TRAIN] = hf_dataset[Split.TRAIN].with_transform(train_transform_batch)
hf_dataset[Split.VALIDATION] = hf_dataset[Split.VALIDATION].with_transform(
    validation_transform_batch
)

model = AutoModelForZeroShotObjectDetection.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)


training_args = TrainingArguments(
    run_name=model_name,
    output_dir="output/models/teacher/" + model_name,
    num_train_epochs=3,
    fp16=False,
    per_device_train_batch_size=8,
    auto_find_batch_size=True,  # Automates the lowering process if CUDA OOM
    dataloader_num_workers=8,
    learning_rate=5e-05,
    lr_scheduler_type="cosine",
    weight_decay=0.0001,
    max_grad_norm=0.01,
    metric_for_best_model="eval_loss",  # eval_map,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    eval_do_concat_batches=False,
    save_safetensors=False,
    push_to_hub=False,
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset[Split.TRAIN],
    eval_dataset=hf_dataset[Split.VALIDATION],
    tokenizer=image_processor,
    data_collator=collate_fn,
    callbacks=[early_stopping_callback],
    # compute_metrics=eval_compute_metrics_fn, # TODO Write eval function
)

trainer.train()


`tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdaniel-bogdoll[0m ([33mmcity[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/750 [00:00<?, ?it/s]

[{'input_ids': tensor([49406,  1615, 49407,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'pixel_values': tensor([[[-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         ...,
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923]],

        [[-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         ...,
         [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         [-1.7521, -1.7

KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/dbogdoll/mcity_data_engine/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/dbogdoll/mcity_data_engine/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_3231154/1339239284.py", line 10, in collate_fn
    data["labels"] = [x["labels"] for x in batch]
                      ~^^^^^^^^^^
KeyError: 'labels'


[{'input_ids': tensor([49406,  1615, 49407,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'pixel_values': tensor([[[-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         ...,
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923],
         [-1.7923, -1.7923, -1.7923,  ..., -1.7923, -1.7923, -1.7923]],

        [[-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         ...,
         [-1.7521, -1.7521, -1.7521,  ..., -1.7521, -1.7521, -1.7521],
         [-1.7521, -1.7