https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/grounding-dino
https://huggingface.co/IDEA-Research/grounding-dino-tiny

https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/owlv2
https://huggingface.co/google/owlv2-base-patch16-ensemble

https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/owlvit
https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/omdet-turbo

https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForZeroShotObjectDetection

https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection


In [None]:
from PIL import Image
import numpy as np
import torch
from transformers import (
    AutoProcessor,
    AutoConfig,
    AutoTokenizer,
    AutoModelForZeroShotObjectDetection,
)
import sys

# Put the parent directory on the import path so that modules living one level
# above this notebook can be imported (a later cell imports `utils.data_loader`).
sys.path.append("..")

In [None]:
import os

# Read KEY=VALUE pairs from the local secrets file and expose the Hugging Face
# token through the environment.
tokens = {}
with open(
    "/home/dbogdoll/mcity_data_engine/.secret", "r", encoding="utf-8"
) as file:
    for line in file:
        entry = line.strip()
        # Blank lines and comment lines carry no key/value pair.
        if not entry or entry.startswith("#"):
            continue
        # Split on the first "=" only, so values that themselves contain "="
        # (e.g. base64 padding) are kept intact.
        key, separator, value = entry.partition("=")
        if not separator:
            continue
        tokens[key.strip()] = value.strip()

# Raises KeyError if the secrets file does not define HF_TOKEN.
os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [None]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO

# Try to fetch the dataset from the Hugging Face Hub first and fall back to
# the FiftyOne dataset of the same name if that fails.
# NOTE(review): presumably the fallback covers re-runs where the dataset was
# already downloaded and registered locally — confirm.
try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51")
except Exception as hub_error:  # not a bare except: Ctrl-C must still interrupt
    print(f"load_from_hub failed ({hub_error!r}); loading the local dataset instead")
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")

# FiftyOne dataset -> PyTorch dataset -> Hugging Face dataset
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)
pt_to_hf_converter = TorchToHFDatasetCOCO(pytorch_dataset)
hf_dataset = pt_to_hf_converter.convert()

In [None]:
# Checkpoint under test and the text queries it is prompted with.
# `texts` is nested: one group of queries, wrapped in an outer list.
model_name = "google/owlv2-base-patch16-ensemble"
texts = [
    [
        "car", "truck", "bus", "trailer",
        "motorbike/cycler", "pedestrian", "van", "pickup",
    ]
]

# Load every component of the checkpoint through the Auto* factories.
config = AutoConfig.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dump each component on its own line for inspection.
print(config, processor, model, tokenizer, sep="\n")

Training is not yet implemented in Hugging Face Transformers, but support may be coming. Related links:
- https://github.com/huggingface/transformers/pull/34057/commits/a4f3d660b7ba9ac269c1e0870ea6e9048f72bdc0
- https://github.com/huggingface/transformers/issues/33664
- https://github.com/huggingface/transformers/issues/20091
- https://github.com/stevebottos/owl-vit-object-detection
- https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit#fine-tuning


In [None]:
from tqdm import tqdm

# Run zero-shot detection on a fixed random subset of the dataset and store
# the detections on every sample in the "predictions" field.
predictions_view = dataset_v51.take(16, seed=51)

# A single group of text queries is shared by all images; the label indices
# returned by the post-processing step index into this group.
class_names = texts[0]

# OWLv2 pads every image to a square (padding on the bottom/right) before
# resizing, so its normalized boxes refer to that padded square and not to the
# original image. Rescaling with a square target of side max(height, width)
# yields pixel coordinates that line up with the original image's top-left
# corner.
pads_to_square = getattr(model.config, "model_type", None) == "owlv2"

for sample in tqdm(predictions_view):
    # Read the pixels eagerly so the file handle is released right away, and
    # force three channels so grayscale/RGBA files are handled as well.
    with Image.open(sample.filepath) as image_file:
        image = image_file.convert("RGB")
    width, height = image.size

    inputs = processor(text=texts, images=image, return_tensors="pt")

    # Forward pass (inference only, no gradients needed)
    with torch.no_grad():
        outputs = model(**inputs)

    # Target size as (height, width) used to rescale the normalized boxes
    if pads_to_square:
        padded_side = max(height, width)
        target_sizes = torch.tensor(
            [[padded_side, padded_side]], dtype=torch.float32
        )
    else:
        target_sizes = torch.tensor([[height, width]], dtype=torch.float32)

    # Convert raw outputs (boxes and class logits) to thresholded detections
    results = processor.post_process_object_detection(
        outputs=outputs, threshold=0.2, target_sizes=target_sizes
    )
    image_results = results[0]  # only one image was passed in

    # Convert to the FiftyOne format:
    # [top-left-x, top-left-y, width, height], relative to the image size
    detections = []
    for box, score, label in zip(
        image_results["boxes"], image_results["scores"], image_results["labels"]
    ):
        x_min, y_min, x_max, y_max = box.tolist()
        detections.append(
            fo.Detection(
                label=class_names[label.item()],
                bounding_box=[
                    x_min / width,
                    y_min / height,
                    (x_max - x_min) / width,
                    (y_max - y_min) / height,
                ],
                confidence=score.item(),
            )
        )

    sample["predictions"] = fo.Detections(detections=detections)
    sample.save()

In [None]:
# Limit the report to the (at most) ten most frequent ground-truth labels.
counts = predictions_view.count_values("ground_truth.detections.label")
classes = [
    label
    for label, _ in sorted(
        counts.items(), key=lambda item: item[1], reverse=True
    )[:10]
]

# Compare the "predictions" field against "ground_truth"; the evaluation is
# stored under the key "eval".
results = predictions_view.evaluate_detections(
    "predictions",
    gt_field="ground_truth",
    eval_key="eval",
    compute_mAP=True,
)
results.print_report(classes=classes)
print("mAP: ", results.mAP())

# Totals of the true-positive / false-positive / false-negative counts
for prefix, count_field in (("TP", "eval_tp"), ("FP", "eval_fp"), ("FN", "eval_fn")):
    print("%s: %d" % (prefix, predictions_view.sum(count_field)))

In [None]:
# Plot precision-recall curves for the classes selected in the evaluation cell.
plot = results.plot_pr_curves(classes=classes)
plot.show()

In [None]:
# Plot the confusion matrix for the classes selected in the evaluation cell.
plot = results.plot_confusion_matrix(classes=classes)
plot.show()

In [None]:
# Open the FiftyOne app on the evaluated subset to inspect predictions visually.
session = fo.launch_app(view=predictions_view)

In [None]:
# Bidirectional mapping between class indices (starting at 0) and class names,
# built from the dataset's default class list.
categories = dataset_v51.default_classes
id2label = dict(enumerate(categories))
label2id = {name: idx for idx, name in id2label.items()}


def collate_fn(batch):
    """Merge per-sample feature dicts into a single batch dict.

    Args:
        batch: List of per-sample dicts. Each must provide ``pixel_values``
            and ``labels``; ``pixel_mask`` is stacked as well when the first
            sample has it.

    Returns:
        dict with ``pixel_values`` (stacked tensor), ``labels`` (list with one
        entry per sample), optionally ``pixel_mask`` (stacked tensor), and
        ``input_ids`` (the tokenized text queries, repeated once per sample).
    """
    data = {
        "pixel_values": torch.stack([x["pixel_values"] for x in batch]),
        "labels": [x["labels"] for x in batch],
    }
    if "pixel_mask" in batch[0]:
        data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch])

    # `texts` wraps its single group of queries in an outer list; the tokenizer
    # needs the flat list of query strings, so pass the group itself.
    input_ids = tokenizer(texts[0], padding=True, return_tensors="pt")["input_ids"]
    # One copy of the queries per sample: (len(batch) * num_queries, seq_len)
    data["input_ids"] = input_ids.repeat(len(batch), 1)
    return data


def transform_batch(examples, image_processor, return_pixel_mask=False):
    """Load a batch of images, validate its boxes and run the processor.

    Args:
        examples: Batch dict with ``image`` (image file paths) and ``target``
            (per-image annotations holding ``bbox``, ``category_id``, ``area``
            and ``image_id``). Boxes are read as ``[x_min, y_min, w, h]``.
        image_processor: Callable invoked as
            ``image_processor(images=..., text=texts, return_tensors="pt")``.
        return_pixel_mask: When False, ``pixel_mask`` is removed from the
            processor output if present.

    Returns:
        The processor output for the batch.

    Raises:
        ValueError: If a box, after conversion to
            ``[x_center, y_center, w, h]``, has a value outside ``[0, 1]``.
    """
    images = []
    annotations = []

    for image_path, annotation in zip(examples["image"], examples["target"]):
        # Read the pixels eagerly so the file handle is closed immediately.
        with Image.open(image_path) as image_file:
            images.append(np.array(image_file.convert("RGB")))

        # One COCO-style annotation dict per bounding box
        coco_annotations = []
        for i, bbox in enumerate(annotation["bbox"]):
            # Convert x_min, y_min, w, h to x_center, y_center, w, h on a copy,
            # so the caller's data is not shifted again on a repeated call.
            center_bbox = list(bbox)
            center_bbox[0] = center_bbox[0] + center_bbox[2] / 2.0
            center_bbox[1] = center_bbox[1] + center_bbox[3] / 2.0

            # Explicit check instead of `assert`, which is stripped under -O.
            if not all(0 <= coord <= 1 for coord in center_bbox):
                raise ValueError(f"Invalid bbox: {center_bbox}")

            coco_annotations.append(
                {
                    "image_id": annotation["image_id"],
                    "bbox": center_bbox,
                    "category_id": annotation["category_id"][i],
                    "area": annotation["area"][i],
                    "iscrowd": 0,
                }
            )
        annotations.append(
            {
                "image_id": annotation["image_id"],
                "annotations": coco_annotations,
            }
        )

    # NOTE(review): `annotations` is assembled and validated above but is not
    # handed to the processor or returned, so no labels leave this function.

    # Apply the processor to the images together with the global text queries
    result = image_processor(images=images, text=texts, return_tensors="pt")

    if not return_pixel_mask:
        result.pop("pixel_mask", None)

    return result

In [None]:
import os

# Set before the `transformers` import below.
# NOTE(review): presumably disables tokenizer-internal parallelism to avoid
# fork-related warnings with multiple dataloader workers — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import (
    AutoConfig,
    AutoProcessor,
    AutoModelForZeroShotObjectDetection,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

import evaluate
from datasets import Split
from functools import partial

import wandb

In [None]:
# Fine-tune the model on our data.

# Processor with resizing, padding and annotation conversion switched off.
image_processor = AutoProcessor.from_pretrained(
    model_name,
    do_convert_annotations=False,  # expects YOLO (center_x, center_y, width, height) between [0,1]
    do_pad=False,  # Assumes all images have the same size
    do_resize=False,
)
hf_model_config = AutoConfig.from_pretrained(model_name)

# Both splits are preprocessed on the fly by the same batch transform.
train_transform_batch = partial(transform_batch, image_processor=image_processor)
validation_transform_batch = partial(transform_batch, image_processor=image_processor)
hf_dataset[Split.TRAIN] = hf_dataset[Split.TRAIN].with_transform(
    train_transform_batch
)
hf_dataset[Split.VALIDATION] = hf_dataset[Split.VALIDATION].with_transform(
    validation_transform_batch
)

# Reload the checkpoint with the dataset's label mapping attached.
model = AutoModelForZeroShotObjectDetection.from_pretrained(
    model_name,
    ignore_mismatched_sizes=True,
    label2id=label2id,
    id2label=id2label,
)

training_args = TrainingArguments(
    # Run bookkeeping
    run_name=model_name,
    output_dir="output/models/teacher/" + model_name,
    push_to_hub=False,
    # Optimization
    num_train_epochs=3,
    learning_rate=5e-05,
    lr_scheduler_type="cosine",
    weight_decay=0.0001,
    max_grad_norm=0.01,
    fp16=False,
    # Batching and data loading
    per_device_train_batch_size=8,
    auto_find_batch_size=True,  # Automates the lowering process if CUDA OOM
    dataloader_num_workers=8,
    remove_unused_columns=False,
    # Evaluation and checkpointing, once per epoch
    eval_strategy="epoch",
    eval_do_concat_batches=False,
    save_strategy="epoch",
    save_total_limit=2,
    save_safetensors=False,
    metric_for_best_model="eval_loss",  # eval_map,
    load_best_model_at_end=True,
)

# Stop once the monitored metric has not improved for three evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.0,
    early_stopping_patience=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=hf_dataset[Split.TRAIN],
    eval_dataset=hf_dataset[Split.VALIDATION],
    tokenizer=image_processor,
    callbacks=[early_stopping_callback],
    # compute_metrics=eval_compute_metrics_fn, # TODO Write eval function
)

trainer.train()