In [2]:
# huggingface models
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 61.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 24.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


# google model, test

In [11]:
import requests
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
url = "https://raw.githubusercontent.com/nils-holmberg/cca-cce/main/res/img/penguin.png"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["a photo of a cat", "a photo of a dog"]]
texts = [["penguin", "camera", "person"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process(outputs=outputs, target_sizes=target_sizes)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

# Print detected objects and rescaled box coordinates
score_threshold = 0.1
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    if score >= score_threshold:
        print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")


ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


Detected camera with confidence 0.135 at location [393.15, 87.05, 679.15, 171.64]
Detected penguin with confidence 0.63 at location [120.32, 229.09, 326.95, 581.86]
Detected person with confidence 0.136 at location [657.78, 112.98, 937.76, 584.98]


# facebook model, test

In [9]:
from transformers import DetrFeatureExtractor, DetrForObjectDetection
import torch
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
url = "https://raw.githubusercontent.com/nils-holmberg/cca-cce/main/res/img/penguin.png"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)

# convert outputs (bounding boxes and class logits) to COCO API
target_sizes = torch.tensor([image.size[::-1]])
results = feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    # let's only keep detections with score > 0.9
    if score > 0.9:
        print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
        )


Downloading:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/167M [00:00<?, ?B/s]

Detected bird with confidence 0.988 at location [124.92, 229.17, 323.88, 584.06]
Detected person with confidence 0.998 at location [637.4, 108.07, 961.02, 582.93]




# testing, dont execute

In [3]:
import requests
from PIL import Image
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection


In [4]:
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
url = "https://raw.githubusercontent.com/nils-holmberg/cca-cce/main/res/img/penguin.png"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["penguin", "camera", "person"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
outputs = model(**inputs)


Downloading:   0%|          | 0.00/392 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/460 [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


Downloading:   0%|          | 0.00/4.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/613M [00:00<?, ?B/s]

In [5]:
# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process(outputs=outputs, target_sizes=target_sizes)


In [6]:
i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

score_threshold = 0.1
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    if score >= score_threshold:
        print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")


Detected camera with confidence 0.135 at location [393.15, 87.05, 679.15, 171.64]
Detected penguin with confidence 0.63 at location [120.32, 229.09, 326.95, 581.86]
Detected person with confidence 0.136 at location [657.78, 112.98, 937.76, 584.98]


In [10]:
!pip install timm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.6.11-py3-none-any.whl (548 kB)
[K     |████████████████████████████████| 548 kB 4.2 MB/s 
Installing collected packages: timm
Successfully installed timm-0.6.11


In [1]:
import io
import requests
from PIL import Image
import torch
import numpy
#import timm

In [2]:
from transformers import DetrFeatureExtractor, DetrForSegmentation
from transformers.models.detr.feature_extraction_detr import rgb_to_id

feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50-panoptic")
model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/172M [00:00<?, ?B/s]

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1_0-14fe96d1.pth" to /root/.cache/torch/hub/checkpoints/resnet50_a1_0-14fe96d1.pth


In [3]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
url = "https://raw.githubusercontent.com/nils-holmberg/cca-cce/main/res/img/penguin.png"
image = Image.open(requests.get(url, stream=True).raw)

# prepare image for the model
inputs = feature_extractor(images=image, return_tensors="pt")

# forward pass
outputs = model(**inputs)

# Use the `post_process_panoptic_segmentation` method of `DetrFeatureExtractor` to retrieve post-processed panoptic segmentation maps
# Segmentation results are returned as a list of dictionaries
result = feature_extractor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])

# A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
panoptic_seg = result[0]["segmentation"]
# Get prediction score and segment_id to class_id mapping of each segment
panoptic_segments_info = result[0]["segments_info"]



In [4]:
print(panoptic_segments_info)

[{'id': 1, 'label_id': 1, 'was_fused': False, 'score': 0.99966}, {'id': 2, 'label_id': 159, 'was_fused': False, 'score': 0.969677}, {'id': 3, 'label_id': 187, 'was_fused': False, 'score': 0.99265}, {'id': 4, 'label_id': 16, 'was_fused': False, 'score': 0.987813}]


In [5]:
len(panoptic_segments_info)

4

In [8]:
print(panoptic_seg.shape)

torch.Size([300, 500])
