In [2]:
import torch, torchvision
from torch.utils.data import Dataset
from transformers import OwlViTProcessor, OwlViTForObjectDetection
from PIL import Image

In [None]:
# Load the OWL-ViT processor and the object-detection model from one shared
# checkpoint id, so the two can never drift apart.
OWLVIT_CHECKPOINT = "google/owlvit-base-patch32"
processor = OwlViTProcessor.from_pretrained(OWLVIT_CHECKPOINT)
model = OwlViTForObjectDetection.from_pretrained(OWLVIT_CHECKPOINT)

In [3]:
# Show the built-in help text for the processor's `from_pretrained` class method.
help(OwlViTProcessor.from_pretrained)

Help on method from_pretrained in module transformers.processing_utils:

from_pretrained(pretrained_model_name_or_path, **kwargs) method of builtins.type instance
    Instantiate a processor associated with a pretrained model.
    
    <Tip>
    
    This class method is simply calling the feature extractor
    [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
    [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
    methods above for more information.
    
    </Tip>
    
    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:
    
            - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
              namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
   

In [None]:
# Run zero-shot detection on one local image and keep the boxes whose score
# reaches `score_threshold`. The result is `thresholded_boxes`: a list of
# 4-element lists of floats rounded to two decimals.
IMAGE_PATH = "/content/gorceryitems.webp"

# Open inside a context manager so the file handle is released, and force
# three-channel RGB: WebP files may carry an alpha channel or a palette,
# which the processor's per-channel normalization is not guaranteed to accept.
with Image.open(IMAGE_PATH) as image_file:
    image = image_file.convert("RGB")

# `texts` is indexed by image: texts[i] holds the text queries for image i.
texts = [["food item", "a grocery item"]]
inputs = processor(text=texts, images=image, return_tensors="pt")

# Inference only: skip autograd bookkeeping so no graph is built or retained.
with torch.no_grad():
    outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process(outputs=outputs, target_sizes=target_sizes)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

score_threshold = 0.1
thresholded_boxes = []
for box, score in zip(boxes, scores):
    # Filter first, then round: low-score boxes are never converted or rounded.
    if score >= score_threshold:
        # `coord` (not `i`) so the image index above is not visually shadowed.
        thresholded_boxes.append([round(coord, 2) for coord in box.tolist()])