In [97]:
import requests
from PIL import Image, ImageDraw
import torch

from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

import sys

sys.path.append("..")

In [116]:
processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
model = AutoModelForZeroShotObjectDetection.from_pretrained(
    "google/owlvit-base-patch32"
)

device = "cuda"

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

texts = ["cat", "remote control"]
tokenized_texts = processor.tokenizer(texts, padding="max_length", return_tensors="pt")
print(tokenized_texts)

# Preprocess image
inputs = processor.image_processor(text=None, images=image, return_tensors="pt")

# Update inputs with tokenized text fields
inputs.update(tokenized_texts)
# inputs.update({
#    "input_ids": tokenized_texts["input_ids"],
#    "attention_mask": tokenized_texts["attention_mask"]
# })

# Forward pass
outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.1, target_sizes=target_sizes
)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

# Print detected objects and rescaled box coordinates
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}"
    )

{'input_ids': tensor([[49406,  2368, 49407,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [49406,  9687,  3366, 49407,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
Detected a with confidence 0.328 at location [40.03, 72.43, 177.76, 115.58]
Detected a with confidence 0.159 at location [335.71, 74.17, 371.88, 187.55]
Detected c with confidence 0.287 at location [324.97, 20.44, 640.58, 373.29]
Detected c with confidence 0.254 at location [1.46, 55.26, 315.55, 472.17]


In [103]:
images = [image, image]
text_queries = [
    ["human face", "rocket", "nasa badge", "star-spangled banner"],
    ["hat", "book", "sunglasses", "camera"],
]

classes = ["human face", "rocket", "nasa badge", "star-spangled banner"]
text_queries = classes * 2

tokenized_text = processor.tokenizer(
    text_queries,
    padding="max_length",
    return_tensors="pt",
)

inputs = processor(text=tokenized_text, images=images, return_tensors="pt")

results = processor.post_process_object_detection(outputs=outputs, threshold=0.1)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

# Print detected objects and rescaled box coordinates
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}"
    )

TypeError: Input text should be a string, a list of strings or a nested list of strings

In [None]:
# Draw bounding boxes on the image
draw = ImageDraw.Draw(image)
for box, label in zip(boxes, labels):
    draw.rectangle(box.tolist(), outline="red", width=2)
    draw.text((box[0], box[1]), text[label], fill="red")

# Display the image
display(image)

In [72]:
import os

tokens = {}
with open("/home/dbogdoll/mcity_data_engine/.secret", "r") as file:
    for line in file:
        key, value = line.strip().split("=")
        tokens[key] = value

os.environ["HF_TOKEN"] = tokens["HF_TOKEN"]

In [None]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub
from utils.data_loader import FiftyOneTorchDatasetCOCO, TorchToHFDatasetCOCO

try:
    dataset_v51 = load_from_hub("dbogdollumich/mcity_fisheye_v51")
except:
    dataset_v51 = fo.load_dataset("dbogdollumich/mcity_fisheye_v51")
pytorch_dataset = FiftyOneTorchDatasetCOCO(dataset_v51)
pt_to_hf_converter = TorchToHFDatasetCOCO(pytorch_dataset)
hf_dataset = pt_to_hf_converter.convert()

In [None]:
texts = [
    "car",
    "truck",
    "bus",
    "trailer",
    "motorbike/cycler",
    "pedestrian",
    "van",
    "pickup",
]

processed_classes = [part for classname in texts for part in classname.split("/")]
class_parts_dict = {
    part: classname for classname in texts for part in classname.split("/")
}
classes_v51 = processed_classes
print(processed_classes)

In [75]:
from torchvision import transforms
from torch.utils.data import DataLoader

dataset_v51 = dataset_v51.take(16, seed=51)
transform = transforms.Compose([transforms.ToTensor()])

pytorch_dataset = FiftyOneTorchDatasetCOCO(
    dataset_v51,
    transforms=transform,
)
data_loader = DataLoader(
    pytorch_dataset,
    batch_size=4,
    num_workers=4,
    pin_memory=True,
    collate_fn=lambda x: list(zip(*x)),
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
batch_classes = processed_classes * data_loader.batch_size
tokenized_text = processor.tokenizer(
    batch_classes, padding="max_length", return_tensors="pt"
).to(device)
print(tokenized_text)

In [77]:
from teacher import CustomOwlViTProcessor

processor = CustomOwlViTProcessor.from_pretrained(
    "google/owlvit-base-patch32", do_rescale=False
)

model = AutoModelForZeroShotObjectDetection.from_pretrained(
    "google/owlvit-base-patch32"
)

In [96]:
import os
from tqdm import tqdm

batch_text = [
    ["human face", "rocket", "nasa badge", "star-spangled banner"],
    ["hat", "book", "sunglasses", "camera"],
]

batch_text = text * data_loader.batch_size
print(batch_text)

os.environ["TOKENIZERS_PARALLELISM"] = "true"
for images, targets in tqdm(data_loader):
    inputs = processor(text=batch_text, images=images, return_tensors="pt").to(device)
    print("Inputs shape (first run):", {k: v.shape for k, v in inputs.items()})
    with torch.no_grad():
        outputs = model(**inputs)
    results = processor.post_process_object_detection(outputs=outputs, threshold=0.1)

tokenized_texts = processor.tokenizer(
    batch_text,
    padding="max_length",
    return_tensors="pt",
).to(device)
print("Tokenized texts shape:", {k: v.shape for k, v in tokenized_texts.items()})

for images, targets in tqdm(data_loader):
    inputs = processor(text=None, images=images, return_tensors="pt").to(device)
    inputs.update(tokenized_texts)
    print("Inputs shape (second run):", {k: v.shape for k, v in inputs.items()})
    with torch.no_grad():
        outputs = model(**inputs)
    results = processor.post_process_object_detection(outputs=outputs, threshold=0.1)

catcatcatcatcatcatcatcat


  0%|          | 0/2 [00:00<?, ?it/s]

Inputs shape (first run): {'input_ids': torch.Size([1, 16]), 'attention_mask': torch.Size([1, 16]), 'pixel_values': torch.Size([8, 3, 768, 768])}





RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
session = fo.launch_app(view=dataset)