In [2]:
import cv2
import torch
import torchvision.transforms as T
from PIL import Image
import numpy as np

# Fetch the pre-trained DETR (ResNet-50) model through torch.hub and switch it
# to evaluation mode in one step; nn.Module.eval() returns the module itself.
model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True).eval()

# Per-channel statistics used to normalize the RGB input tensor.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing applied to every frame before it is fed to the model:
# resize, convert the PIL image to a tensor, then normalize each channel.
preprocessing_steps = [
    T.Resize(800),
    T.ToTensor(),
    T.Normalize(channel_mean, channel_std),
]
transform = T.Compose(preprocessing_steps)

def get_bounding_boxes(outputs, threshold=0.7):
    """Select the predicted boxes whose best class score exceeds a threshold.

    Args:
        outputs: Mapping with 'pred_logits' and 'pred_boxes' tensors; only the
            first element along the leading (batch) dimension is used.
        threshold: Minimum class probability a prediction needs to be kept.

    Returns:
        The rows of ``outputs['pred_boxes'][0]`` that pass the threshold.
    """
    # Softmax over the class dimension, then drop the last class column
    # (in DETR this is the "no object" class) for the first batch element.
    class_probs = outputs['pred_logits'].softmax(-1)
    object_probs = class_probs[0, :, :-1]

    # Best score per prediction; the argmax indices are not needed here.
    best_scores, _ = object_probs.max(-1)
    confident = best_scores > threshold

    first_image_boxes = outputs['pred_boxes'][0]
    return first_image_boxes[confident]

def rescale_bboxes(out_bbox, size):
    """Convert center-format boxes to corner format and scale them by an image size.

    Args:
        out_bbox: Tensor of boxes in [c_x, c_y, w, h] form, one box per row.
        size: ``(width, height)`` pair the coordinates are multiplied by.

    Returns:
        A float tensor of [x_min, y_min, x_max, y_max] boxes on the CPU.
    """
    width, height = size
    # x-coordinates are scaled by the width, y-coordinates by the height.
    scale = torch.tensor([width, height, width, height], dtype=torch.float32)

    # Work on a CPU copy so the caller's tensor is left untouched.
    corners = box_cxcywh_to_xyxy(out_bbox.cpu().clone())
    return corners * scale

def box_cxcywh_to_xyxy(x):
    """Convert boxes from [c_x, c_y, w, h] to [x_min, y_min, x_max, y_max].

    Args:
        x: 2-D tensor with one box per row and four columns.

    Returns:
        A tensor of the same shape holding the corner coordinates.
    """
    center_x, center_y, width, height = x.unbind(1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return torch.stack(corners, dim=1)

def draw_boxes(frame, boxes):
    """Draw a green rectangle on the frame for every box and return the frame.

    Args:
        frame: Image array passed to ``cv2.rectangle``; it is drawn on in place.
        boxes: Iterable of tensors in [x_min, y_min, x_max, y_max] form.

    Returns:
        The same ``frame`` object that was passed in.
    """
    green = (0, 255, 0)
    thickness = 2
    for corners in boxes:
        # Truncate the coordinates to integers before handing them to OpenCV.
        left, top, right, bottom = corners.int().numpy()
        cv2.rectangle(frame, (left, top), (right, bottom), green, thickness)
    return frame

# Capture video from the webcam (device index 0)
cap = cv2.VideoCapture(0)

# The loop is wrapped in try/finally so the camera is released and the window
# closed even when something inside raises (e.g. a model error or the cell
# being interrupted); otherwise the device would stay locked.
try:
    while True:
        # Capture a new frame; stop when no frame could be read
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV delivers BGR frames; convert to RGB before building the PIL image
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Apply the transformations and add a leading batch dimension
        img = transform(pil_img).unsqueeze(0)

        # Make predictions without tracking gradients
        with torch.no_grad():
            outputs = model(img)

        # Get bounding boxes that pass the default confidence threshold
        boxes = get_bounding_boxes(outputs)

        # Rescale the bounding boxes to the original image size (width, height)
        boxes = rescale_bboxes(boxes, pil_img.size)

        # Draw the bounding boxes on the frame
        frame = draw_boxes(frame, boxes)

        # Display the frame
        cv2.imshow("Tracking", frame)

        # Exit on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and destroy any OpenCV windows
    cap.release()
    cv2.destroyAllWindows()


Using cache found in C:\Users\user/.cache\torch\hub\facebookresearch_detr_main
