##### Offline: `pipeline` API (the code below prints labels and scores only, not box locations)

In [1]:
from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoModelForObjectDetection, pipeline

# Offline setup: the checkpoint must already be on disk.
# Download the model files once from huggingface.co/facebook/detr-resnet-50
# and save them into the directory named by `model_path` below.

# Then load the model from local files
model_path = "./model_detr_resnet_50"  # Path to where you saved the model files

# Vision feature extractors are deprecated in Transformers in favour of image
# processors, so load the preprocessing config through AutoImageProcessor and
# hand it to the pipeline via `image_processor=`.
# local_files_only=True restricts the lookup to files already on disk.
image_processor = AutoImageProcessor.from_pretrained(model_path, local_files_only=True)
feature_extractor = image_processor  # alias kept so code using the old name still works
model = AutoModelForObjectDetection.from_pretrained(model_path, local_files_only=True)
detector = pipeline("object-detection", model=model, image_processor=image_processor)

Some weights of the model checkpoint at ./model_detr_resnet_50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [2]:
# Rest of your code remains the same
from PIL import Image

# Open the sample picture from disk.
image_path = "./pictures/img_1.jpg"
image = Image.open(image_path)

# Feed the picture to the object-detection pipeline stored in `detector`.
results = detector(image)

# Report every detection as "label (confidence: NN.NN%)".
# NOTE: per the Transformers object-detection pipeline docs, each entry also
# carries a "box" dict (xmin, ymin, xmax, ymax); this loop does not print it.
print("Detected objects:")
for detection in results:
    label, score = detection["label"], round(detection["score"] * 100, 2)
    print(f"- {label} (confidence: {score}%)")

Detected objects:
- cup (confidence: 98.68%)
- book (confidence: 99.41%)


##### Online: load the model from the Hugging Face Hub

In [3]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

# Fetch a sample COCO validation image over HTTP.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than letting PIL fail
# later on a body that is not an image.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# you can specify the revision tag if you don't want the timm dependency
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")

inputs = processor(images=image, return_tensors="pt")
# Inference only: no_grad() skips building the autograd graph, which saves
# memory and time without changing the predictions.
with torch.no_grad():
    outputs = model(**inputs)

# Convert the raw outputs (bounding boxes and class logits) to per-image
# detections, keeping only those with score > 0.9.
# image.size is reversed before being passed as the target size.
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]


##### Offline: load the model from a local directory and print box locations

In [5]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image

# Load image from local file system
image_path = "./pictures/img_1.jpg"  # Update with your actual path
# The context manager closes the file handle once the pixels are read.
# convert("RGB") loads the data and normalises grayscale/RGBA inputs to three
# channels; for an image that is already RGB it simply returns a copy.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")

# Load the processor and model from the local checkpoint directory (after
# you've saved them once).
# NOTE(review): an earlier version of this cell pointed at "./detr_processor"
# and "./detr_model" ("Downloaded one more time"); presumably those were a
# second copy of the same checkpoint - confirm before deleting them.
model_dir = "./model_detr_resnet_50"
# local_files_only=True restricts the lookup to files already on disk.
processor = DetrImageProcessor.from_pretrained(model_dir, local_files_only=True)
model = DetrForObjectDetection.from_pretrained(model_dir, local_files_only=True)

# Process the image
inputs = processor(images=image, return_tensors="pt")
# Inference only: no_grad() skips building the autograd graph, which saves
# memory and time without changing the predictions.
with torch.no_grad():
    outputs = model(**inputs)

# Process results: keep detections with score > 0.9.
# image.size is reversed before being passed as the target size.
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )

Some weights of the model checkpoint at ./model_detr_resnet_50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Detected cup with confidence 0.987 at location [788.98, 316.17, 1143.04, 1005.37]
Detected book with confidence 0.994 at location [291.63, 1125.75, 1779.65, 2647.75]
