In [1]:
import io
import os
import urllib.request

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch

from transformers import Owlv2Processor, Owlv2ForObjectDetection
from PIL import Image

# (captured cell output, tail of a tqdm warning) from .autonotebook import tqdm as notebook_tqdm


In [2]:
def plot_results(results, texts, image):
    """
    Draw the detections of the first image of a batch on top of that image.

    For every detection one rectangle and one "<query>: <score>" caption are
    added to the axes, and one summary line is printed.

    Args:
        results (list): Per-image detection dicts; only ``results[0]`` is used.
            It must provide "boxes" (iterable of 4-element tensors ordered
            xmin, ymin, xmax, ymax), "scores" (tensors exposing ``.item()``)
            and "labels" (indices into ``texts[0]``).
        texts (list): Text queries, one inner list per image; only ``texts[0]``
            is used.
        image: Image handed to ``Axes.imshow`` as the background.
    """
    detections = results[0]
    queries = texts[0]
    # Fixed seed so a given query keeps the same colour across calls and re-runs.
    colors = np.random.default_rng(0).random((len(queries), 3))

    fig, ax = plt.subplots()
    ax.imshow(image)

    # Iterate over every box together with its score and label
    for box, score, label in zip(detections["boxes"], detections["scores"], detections["labels"]):
        # Convert to plain Python floats: rounding np.float32 values keeps them
        # np.float32, which NumPy 2 prints as "np.float32(...)" inside a list.
        coords = [round(float(value), 2) for value in box.detach().cpu().numpy()]
        query_index = int(label)
        query = queries[query_index]
        color = colors[query_index]
        confidence = score.item()

        # Convert the corner coordinates to Matplotlib's (x, y, width, height)
        xmin, ymin, xmax, ymax = coords
        rect = patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, linewidth=1, edgecolor=color, facecolor='none')
        ax.add_patch(rect)

        # Draw on `ax` explicitly rather than on whatever axes pyplot considers current.
        ax.text(xmin, ymin - 10, f'{query}: {round(confidence, 2)}', color='white', fontsize=8, backgroundcolor=color)

        print(f"Detected {query} with confidence {round(confidence, 3)} at location {coords}")

    ax.axis('off')
    plt.show()


In [3]:
# Single checkpoint name shared by the processor and the model so the two cannot drift apart.
MODEL_CHECKPOINT = "google/owlv2-base-patch16-ensemble"

# Fall back to the CPU when no CUDA device is present instead of failing in .cuda().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Owlv2Processor.from_pretrained(MODEL_CHECKPOINT)
model = Owlv2ForObjectDetection.from_pretrained(MODEL_CHECKPOINT).to(device)

In [None]:
def _open_image_from_url(url, timeout=30):
    """Download ``url`` and return its content as a PIL image backed by an in-memory buffer."""
    # Read the whole payload inside the context manager so the connection is
    # closed before PIL (which decodes lazily) touches the data.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return Image.open(io.BytesIO(response.read()))


# NOTE(review): target_url and source_url are not defined in the visible part of
# the notebook — define them before running this cell.

# Target image
target_image = _open_image_from_url(target_url)
# PIL's size is (width, height); reversed to (height, width), shape [batch_size, 2].
# Uses target_image: the name `image` is not defined until a later cell.
target_sizes = torch.Tensor([target_image.size[::-1]])

# Source image
source_image = _open_image_from_url(source_url)

In [None]:
# Initialize processor and model
processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Load the image
path = "/home/omilab-gpu/OWLv2-For_SAP_scenes_recognition/data/iLoveIMG IMG 6542.jpg"
image = Image.open(path)

# Text queries, one inner list per image
texts = [["businessmen figure", "oldwomen front", "Bus figure"]]

# Preprocess once: neither the queries nor the image change between runs, so
# rebuilding and re-transferring the inputs on every iteration is wasted work.
inputs = processor(text=texts, images=image, return_tensors="pt")

# Move each tensor in the inputs dictionary to the selected device
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Number of forward passes; every pass sees the same inputs and only the last
# `outputs` is used below.
N_INFERENCE_RUNS = 5

for _ in range(N_INFERENCE_RUNS):
    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
# NOTE(review): how OWLv2 boxes are rescaled for non-square images (which the
# processor pads) appears to depend on the installed transformers version —
# confirm the plotted boxes line up with the image.
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
plot_results(results, texts, image)