# Transformer for object detection

# Way 1: Simple way (only returns objects with a confidence score above 90 percent)

In [6]:
from io import BytesIO

import requests
from PIL import Image
from transformers import pipeline

# Allocate a pipeline for object detection
object_detector = pipeline('object-detection', model="facebook/detr-resnet-50")

# Download the sample image. Fail fast on HTTP errors or a stalled connection,
# and decode from the fully-read body instead of the undecoded raw socket stream.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

# Run object detection; the list of detections is the cell output
object_detector(image)

[{'score': 0.9075493216514587,
  'label': 'truck',
  'box': {'xmin': 620, 'ymin': 147, 'xmax': 641, 'ymax': 177}},
 {'score': 0.9951134324073792,
  'label': 'car',
  'box': {'xmin': 1367, 'ymin': 627, 'xmax': 1546, 'ymax': 757}},
 {'score': 0.9889329671859741,
  'label': 'car',
  'box': {'xmin': 1063, 'ymin': 385, 'xmax': 1164, 'ymax': 451}},
 {'score': 0.9581573009490967,
  'label': 'car',
  'box': {'xmin': 572, 'ymin': 246, 'xmax': 617, 'ymax': 281}},
 {'score': 0.9419158101081848,
  'label': 'truck',
  'box': {'xmin': 727, 'ymin': 552, 'xmax': 867, 'ymax': 684}},
 {'score': 0.9308367967605591,
  'label': 'truck',
  'box': {'xmin': 1110, 'ymin': 339, 'xmax': 1186, 'ymax': 410}},
 {'score': 0.9344722628593445,
  'label': 'truck',
  'box': {'xmin': 458, 'ymin': 372, 'xmax': 608, 'ymax': 578}},
 {'score': 0.9239617586135864,
  'label': 'truck',
  'box': {'xmin': 458, 'ymin': 372, 'xmax': 597, 'ymax': 577}},
 {'score': 0.985849916934967,
  'label': 'truck',
  'box': {'xmin': 863, 'ymin':

In [59]:
#https://github.com/christianversloot/machine-learning-articles/blob/main/easy-object-detection-with-python-huggingface-transformers-and-machine-learning.md
# Draw bounding box definition
from PIL import Image, ImageDraw, ImageFont
def draw_bounding_box(im, score, label, xmin, ymin, xmax, ymax, index, num_boxes):
    """Draw one labelled bounding box onto ``im`` and report progress.

    Args:
        im: Image to draw on; the drawing is applied to this object.
        score: Detection score. Accepted for call compatibility; not drawn.
        label: Text written next to the box.
        xmin, ymin, xmax, ymax: Corner coordinates of the box.
        index: Position of this box, used only in the progress message.
        num_boxes: Total number of boxes, used only in the progress message.

    Returns:
        The same ``im`` object that was passed in.
    """
    print(f"Drawing bounding box {index} of {num_boxes}...")

    # Draw the actual bounding box
    canvas = ImageDraw.Draw(im)
    canvas.rounded_rectangle((xmin, ymin, xmax, ymax), outline="red", width=5, radius=10)

    # The label sits 25 px above the box; clamp it so that boxes touching the
    # top edge do not push the text off the canvas.
    label_y = max(ymin - 25, 0)

    # stroke_fill only takes effect together with a non-zero stroke_width.
    canvas.text((xmin + 35, label_y), label, fill="white", stroke_fill="red", stroke_width=1)

    # Return the intermediate result
    return im

In [81]:
# Open the image. Fail fast on HTTP errors or a stalled connection, and decode
# from the fully-read body instead of the undecoded raw socket stream.
url = "https://images.data.gov.sg/api/traffic-images/2022/03/881b8734-cca2-49d2-844f-96f16e53a1ac.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()
im = Image.open(BytesIO(response.content))

with im:
    # Perform object detection
    bounding_boxes = object_detector(im)
    num_boxes = len(bounding_boxes)

    # Draw a bounding box for each result. Count from 1 so the progress
    # message ends at "N of N" instead of "N-1 of N".
    for index, bounding_box in enumerate(bounding_boxes, start=1):
        box = bounding_box["box"]
        im = draw_bounding_box(im, bounding_box["score"], bounding_box["label"],
                               box["xmin"], box["ymin"], box["xmax"], box["ymax"],
                               index, num_boxes)

    # Save image
    im.save("street_bboxes.jpg")
    print("Done!")

Drawing bounding box 0 of 12...
Drawing bounding box 1 of 12...
Drawing bounding box 2 of 12...
Drawing bounding box 3 of 12...
Drawing bounding box 4 of 12...
Drawing bounding box 5 of 12...
Drawing bounding box 6 of 12...
Drawing bounding box 7 of 12...
Drawing bounding box 8 of 12...
Drawing bounding box 9 of 12...
Drawing bounding box 10 of 12...
Drawing bounding box 11 of 12...
Done!


# Way 2: Detect objects at all confidence levels, then set a threshold and filter the results as needed

In [89]:
# Draw bounding box definition
from PIL import Image, ImageDraw, ImageFont
def draw_bbox_without_total_items(im, score, label, xmin, ymin, xmax, ymax):
    """Draw one labelled bounding box onto ``im`` without any progress output.

    Args:
        im: Image to draw on; the drawing is applied to this object.
        score: Detection score. Accepted for call compatibility; not drawn.
        label: Text written next to the box.
        xmin, ymin, xmax, ymax: Corner coordinates of the box.

    Returns:
        The same ``im`` object that was passed in.
    """
    # Draw the actual bounding box
    canvas = ImageDraw.Draw(im)
    canvas.rounded_rectangle((xmin, ymin, xmax, ymax), outline="red", width=2, radius=5)

    # The label sits 25 px above the box; clamp it so that boxes touching the
    # top edge do not push the text off the canvas.
    label_y = max(ymin - 25, 0)

    # stroke_fill only takes effect together with a non-zero stroke_width.
    canvas.text((xmin + 35, label_y), label, fill="white", stroke_fill="red", stroke_width=1)

    # Return the intermediate result
    return im

In [101]:
#https://huggingface.co/facebook/detr-resnet-50
from transformers import DetrFeatureExtractor, DetrForObjectDetection
import torch
from PIL import Image
import requests

# Filter settings read by save_bounding_box_as_img when it is called.
NEEDED_LABELS = ["bus", "car", "truck"]
DETECTION_THRESHOLD = 0.4

# Preprocessor and model are loaded from the same pretrained DETR checkpoint.
# NOTE: `model` and `DETECTION_THRESHOLD` are assigned again further down in
# this notebook (OWL-ViT section); re-run this cell before re-running the
# DETR cells after that section has been executed.
DETR_CHECKPOINT = "facebook/detr-resnet-50"
feature_extractor = DetrFeatureExtractor.from_pretrained(DETR_CHECKPOINT)
model = DetrForObjectDetection.from_pretrained(DETR_CHECKPOINT)

def detect_object(image):
    """Run the DETR model on one image and return its post-processed detections.

    Uses the module-level ``feature_extractor`` and ``model``.

    Args:
        image: Input image; its ``size`` attribute is reversed to build the
            target size passed to post-processing.

    Returns:
        The first element of ``feature_extractor.post_process(...)``. Other
        code in this notebook reads its "scores", "labels" and "boxes" entries.
    """
    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: skip autograd bookkeeping so no graph is kept alive.
    with torch.no_grad():
        outputs = model(**inputs)

    # convert outputs (bounding boxes and class logits) to COCO API
    target_sizes = torch.tensor([image.size[::-1]])
    return feature_extractor.post_process(outputs, target_sizes=target_sizes)[0]

##https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/object_detection.py

def save_bounding_box_as_img(im, results, filename, threshold=None, needed_labels=None):
    """Draw the detections that pass the filters onto ``im`` and save it.

    A detection is kept when its score is strictly greater than ``threshold``
    and its label name (looked up in ``model.config.id2label``) is in
    ``needed_labels``. Each kept detection is drawn and printed.

    Args:
        im: Image to annotate; used as a context manager and saved to
            ``filename`` before the context exits.
        results: Mapping with "scores", "labels" and "boxes" entries that are
            iterated in parallel.
        filename: Path the annotated image is written to.
        threshold: Minimum score (exclusive). When omitted, the module-level
            ``DETECTION_THRESHOLD`` is read at call time.
        needed_labels: Label names to keep. When omitted, the module-level
            ``NEEDED_LABELS`` is read at call time.
    """
    if threshold is None:
        threshold = DETECTION_THRESHOLD
    if needed_labels is None:
        needed_labels = NEEDED_LABELS

    with im:
        for score, label_id, boxlist in zip(results["scores"], results["labels"], results["boxes"]):
            # Cheap score test first, so low-score detections skip the label lookup
            if not score > threshold:
                continue

            label = model.config.id2label[label_id.item()]
            if label not in needed_labels:
                continue

            box = [int(coord) for coord in boxlist.tolist()]

            # Draw the bounding box
            im = draw_bbox_without_total_items(im, score.item(), label,
                                               box[0], box[1], box[2], box[3])

            print(f"Detected {label} with confidence "
                  f"{round(score.item(), 3)} at location {box}")

        # Save image
        im.save(filename)

        # Done
        print("Done!")

In [102]:
# Download the traffic image. Fail fast on HTTP errors or a stalled connection,
# and decode from the fully-read body instead of the undecoded raw socket stream.
url = "https://images.data.gov.sg/api/traffic-images/2022/03/881b8734-cca2-49d2-844f-96f16e53a1ac.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

# Detect, then draw and save the detections that pass the filters
result = detect_object(image)
save_bounding_box_as_img(image, result, "street_bboxes_0.4.jpg")

Detected car with confidence 0.484 at location [700, 193, 732, 216]
Detected truck with confidence 0.653 at location [696, 167, 724, 193]
Detected truck with confidence 0.908 at location [620, 147, 641, 177]
Detected car with confidence 0.79 at location [728, 553, 866, 683]
Detected truck with confidence 0.859 at location [742, 244, 793, 287]
Detected car with confidence 0.646 at location [555, 215, 589, 246]
Detected car with confidence 0.575 at location [546, 151, 570, 171]
Detected car with confidence 0.562 at location [509, 153, 530, 171]
Detected car with confidence 0.995 at location [1367, 627, 1546, 757]
Detected car with confidence 0.517 at location [487, 168, 510, 192]
Detected car with confidence 0.615 at location [482, 153, 502, 172]
Detected truck with confidence 0.536 at location [1069, 315, 1127, 379]
Detected car with confidence 0.485 at location [563, 154, 585, 172]
Detected car with confidence 0.989 at location [1063, 385, 1164, 451]
Detected car with confidence 0.625 

# Way 3: Using Google's pretrained OWL-ViT model, to which we can pass text labels (only for indoor objects)

In [None]:
# For the available labels, refer to https://www.lvisdataset.org/dataset
#https://github.com/google-research/scenic/blob/main/scenic/projects/owl_vit/preprocessing/label_ops.py

In [84]:
import requests
from PIL import Image
import torch

#https://huggingface.co/docs/transformers/model_doc/owlvit
from transformers import OwlViTProcessor, OwlViTForObjectDetection

# Score cut-off read by save_bb when it is called.
# NOTE: this assignment and the `model` assignment below reuse names that the
# DETR section of this notebook also assigns, so the DETR functions
# (which read them as globals) see these values after this cell has run.
DETECTION_THRESHOLD = 0.25

# Processor and model are loaded from the same pretrained OWL-ViT checkpoint.
OWLVIT_CHECKPOINT = "google/owlvit-base-patch32"
processor = OwlViTProcessor.from_pretrained(OWLVIT_CHECKPOINT)
model = OwlViTForObjectDetection.from_pretrained(OWLVIT_CHECKPOINT)

In [85]:
def detect_object_OwlViT(image, texts):
    """Run the OWL-ViT model on one image for the given text queries.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image: Input image; its ``size`` attribute is reversed to build the
            target size passed to post-processing.
        texts: Text queries forwarded to the processor as ``text=``.

    Returns:
        Whatever ``processor.post_process(...)`` returns. ``save_bb`` indexes
        it per entry of ``texts`` and reads "boxes", "scores" and "labels".
    """
    inputs = processor(text=texts, images=image, return_tensors="pt")

    # Inference only: skip autograd bookkeeping so no graph is kept alive.
    with torch.no_grad():
        outputs = model(**inputs)

    # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
    target_sizes = torch.Tensor([image.size[::-1]])

    # Convert outputs (bounding boxes and class logits) to COCO API
    return processor.post_process(outputs=outputs, target_sizes=target_sizes)

def save_bb(im, texts, results, filename, threshold=None):
    """Draw the detections above the score threshold onto ``im`` and save it.

    ``results[i]`` is matched with ``texts[i]``; each detection's label value
    is used as an index into that entry of ``texts`` to get the drawn text.

    Args:
        im: Image to annotate; used as a context manager and saved to
            ``filename`` before the context exits.
        texts: Sequence of query lists, one per entry of ``results``.
        results: Sequence of mappings with "boxes", "scores" and "labels".
        filename: Path the annotated image is written to.
        threshold: Minimum score (exclusive). When omitted, the module-level
            ``DETECTION_THRESHOLD`` is read at call time.
    """
    if threshold is None:
        threshold = DETECTION_THRESHOLD

    with im:
        for batch_idx, text in enumerate(texts):
            detections = results[batch_idx]
            boxes, scores, labels = detections["boxes"], detections["scores"], detections["labels"]

            for score, label, boxlist in zip(scores, labels, boxes):
                # let's only keep detections with score > threshold
                if not score > threshold:
                    continue

                box = [int(coord) for coord in boxlist.tolist()]
                query = text[label]

                # Draw the bounding box
                im = draw_bbox_without_total_items(im, score.item(), query,
                                                   box[0], box[1], box[2], box[3])

                print(f"Detected {query} with confidence "
                      f"{round(score.item(), 3)} at location {box}")

        # Save image
        im.save(filename)

        # Done
        print("Done!")

In [86]:
# Download the sample image. Fail fast on HTTP errors or a stalled connection,
# and decode from the fully-read body instead of the undecoded raw socket stream.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))

# One list of text queries per image
texts = [["cat", "dog"]]

results = detect_object_OwlViT(image, texts)
save_bb(image, texts, results, "temp.jpg")

Detected cat with confidence 0.287 at location [324, 20, 640, 373]
Detected cat with confidence 0.254 at location [1, 55, 315, 472]
Done!
