https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/grounding-dino
https://huggingface.co/IDEA-Research/grounding-dino-tiny

https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/owlv2
https://huggingface.co/google/owlv2-base-patch16-ensemble

https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/owlvit
https://huggingface.co/docs/transformers/v4.45.2/en/model_doc/omdet-turbo

https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForZeroShotObjectDetection

https://huggingface.co/docs/transformers/en/tasks/zero_shot_object_detection


In [1]:
import requests
from PIL import Image, ImageDraw
import numpy as np
import torch
from transformers import AutoProcessor, AutoConfig, AutoModelForZeroShotObjectDetection
import sys

sys.path.append("..")

In [2]:
import os

tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        key, value = line.strip().split("=")
        tokens[key] = value

os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [3]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO

try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51")
except:
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)
pt_to_hf_converter = TorchToHFDatasetCOCO(pytorch_dataset)
hf_dataset = pt_to_hf_converter.convert()

Downloading config file fiftyone.yml from dbogdollumich/mcity_fisheye_v51
Loading dataset


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [4]:
model_name = "google/owlv2-base-patch16-ensemble"
texts = [
    [
        "car",
        "truck",
        "bus",
        "trailer",
        "motorbike/cycler",
        "pedestrian",
        "van",
        "pickup",
    ]
]

config = AutoConfig.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name)

In [5]:
predictions_view = dataset_v51.take(20, seed=51)


with fo.ProgressBar() as pb:
    for sample in predictions_view:
        image = Image.open(sample.filepath)
        inputs = processor(text=texts, images=image, return_tensors="pt")

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # Get original image size
        original_size = torch.Tensor([image.size[::-1]])

        # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
        results = processor.post_process_object_detection(
            outputs=outputs, threshold=0.2, target_sizes=original_size
        )
        i = 0  # Retrieve predictions for the first image for the corresponding text queries
        text = texts[i]
        boxes, scores, labels = (
            results[i]["boxes"],
            results[i]["scores"],
            results[i]["labels"],
        )
        print(boxes)
        print(scores)
        print(labels)
        # Convert to [top-left-x, top-left-y, width, height]
        # in relative coordinates in [0, 1] x [0, 1]

        # Convert to V51 format
        width, height = image.size
        detections = []
        for box, score, label in zip(boxes, scores, labels):
            top_left_x = box[0].item() / width
            top_left_y = box[1].item() / height
            box_width = (box[2] - box[0]).item() / width
            box_height = (box[3] - box[1]).item() / height

            detection = fo.Detection(
                label=texts[0][label],
                bounding_box=[top_left_x, top_left_y, box_width, box_height],
                confidence=score.item(),
            )
            detections.append(detection)

        sample["predictions"] = fo.Detections(detections=detections)
        sample.save()

tensor([[ 335.3813,   64.1252,  367.6791,   91.6695],
        [ 324.7593,   61.8342,  372.4485,   99.0956],
        [ 596.6906,  112.2897,  686.4918,  162.0679],
        [ 617.3747,  162.1780,  725.4578,  219.8753],
        [ 298.4452,  183.7451,  328.8307,  218.2401],
        [ 374.4344,  178.7748,  410.8152,  222.6373],
        [ 967.3642,  314.2137,  988.6469,  337.5050],
        [ 977.0450,  332.9518, 1015.5792,  384.9277],
        [ 242.4142,  334.8897,  278.7531,  402.7682],
        [ 194.3034,  463.8850,  211.0045,  506.2927],
        [ 211.3878,  461.8627,  227.6301,  514.8760],
        [ 214.7865,  460.1653,  236.1884,  519.4133],
        [ 351.5726,  487.1773,  417.7432,  562.0014],
        [ 723.4384,  632.1711,  818.1383,  689.3246],
        [ 507.4895,  696.1407,  587.0485,  786.5597],
        [ 930.6060,  768.7997,  953.2947,  797.3034],
        [ 930.7635,  762.5784,  973.6462,  801.0005],
        [ 250.5635,  792.4049,  276.1303,  818.9023],
        [ 398.6930,  855.222

In [6]:
counts = predictions_view.count_values("ground_truth.detections.label")
classes = sorted(counts, key=counts.get, reverse=True)[:10]

results = predictions_view.evaluate_detections(
    "predictions",
    gt_field="ground_truth",
    eval_key="eval",
    compute_mAP=True,
)
results.print_report(classes=classes)
print("mAP: ", results.mAP())

# Print some statistics about the total TP/FP/FN counts
print("TP: %d" % predictions_view.sum("eval_tp"))
print("FP: %d" % predictions_view.sum("eval_fp"))
print("FN: %d" % predictions_view.sum("eval_fn"))

Evaluating detections...
 100% |███████████████████| 20/20 [419.4ms elapsed, 0s remaining, 47.7 samples/s]      
Performing IoU sweep...
 100% |███████████████████| 20/20 [363.5ms elapsed, 0s remaining, 55.0 samples/s]      
              precision    recall  f1-score   support

         car       0.23      0.37      0.29       224
      pickup       0.00      0.00      0.00        11
         van       0.00      0.00      0.00         6
         bus       0.00      0.00      0.00         4
  pedestrian       0.00      0.00      0.00         4
     trailer       0.00      0.00      0.00         2
       truck       0.00      0.00      0.00         2

   micro avg       0.21      0.33      0.25       253
   macro avg       0.03      0.05      0.04       253
weighted avg       0.21      0.33      0.25       253

mAP:  0.00903812703921962
TP: 83
FP: 315
FN: 170


In [7]:
plot = results.plot_pr_curves(classes=classes)
plot.show()



FigureWidget({
    'data': [{'customdata': array([0.47194517, 0.34224844, 0.29770085, 0.28677287, 0.24730159, 0.23642839,
                                   0.201217  , 0.19736231, 0.1931077 , 0.18859136, 0.18616994, 0.17895363,
                                   0.1497874 , 0.14731388, 0.14381127, 0.14042687, 0.11204458, 0.10919358,
                                   0.10706955, 0.10616252, 0.10412776, 0.1019895 , 0.10053787, 0.09723197,
                                   0.09474057, 0.08648565, 0.06343989, 0.06149893, 0.06051799, 0.05963548,
                                   0.03164314, 0.03147317, 0.03063828, 0.02908981, 0.02823469, 0.02741362,
                                   0.02660383, 0.02259308, 0.        , 0.        , 0.        , 0.        ,
                                   0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
                                   0.        , 0.        , 0.        , 0.        , 0.        , 0.        ,
                      

In [8]:
plot = results.plot_confusion_matrix(classes=classes)
plot.show()



FigureWidget({
    'data': [{'mode': 'markers',
              'opacity': 0.1,
              'type': 'scatter',
              'uid': 'd8f06fae-e396-4836-9c4a-bc340a7b242b',
              'x': array([0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
                          0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
                          0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]),
              'y': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
                          3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
                          6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7])},
             {'colorscale': [[0.0, 'rgb(255,245,235)'], [0.125,
                             'rgb(254,230,206)'], [0.25, 'rgb(253,208,162)'],
                             [0.375, 'rgb(253,174,107)'], [0.5, 'rgb(253,141,60)'],
                             [0.625, 'rgb(241,105,19)'], 

In [9]:
session = fo.launch_app(dataset_v51)
session.view = predictions_view
session.show()