# In [None]:  (notebook cell marker left over from export)
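# Test script: YOLOv5 object detection combined with MiDaS depth estimation on a live webcam feed.
# Prerequisite: clone the YOLOv5 repository from https://github.com/ultralytics/yolov5.git
# (or download it directly) and point the local path passed to torch.hub.load at that checkout.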

import cv2
import torch
import numpy as np

# Local checkout of the YOLOv5 repository, used as the torch.hub source
YOLOV5_REPO_DIR = '/Users/josephsketl/yolov5'
# Confidence threshold assigned to the YOLOv5 model
CONFIDENCE_THRESHOLD = 0.25
# Title of the OpenCV preview window
WINDOW_TITLE = "YOLOv5 + MiDaS Object Detection and Depth Estimation"
# Colour used for boxes and labels; frames are handled as BGR, so this is green
BOX_COLOR = (0, 255, 0)


def load_models(device, yolo_repo_dir=YOLOV5_REPO_DIR, conf_threshold=CONFIDENCE_THRESHOLD):
    """Load the YOLOv5 detector, the MiDaS depth model and its input transform.

    Args:
        device: torch device both models are moved to.
        yolo_repo_dir: path of the local YOLOv5 checkout handed to torch.hub.
        conf_threshold: value assigned to the detector's ``conf`` attribute.

    Returns:
        Tuple ``(yolo_model, midas, midas_transforms)``.
    """
    # Load YOLOv5 model
    yolo_model = torch.hub.load(yolo_repo_dir, 'yolov5s', source='local', pretrained=True)
    yolo_model.conf = conf_threshold  # Set confidence threshold
    yolo_model.to(device)

    # Load MiDaS model
    midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
    midas.to(device)
    # Switch to inference mode so layers such as batch-norm/dropout (if present)
    # do not behave as in training; the MiDaS hub usage example does the same.
    midas.eval()
    midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms").small_transform

    return yolo_model, midas, midas_transforms


def estimate_depth(midas, midas_transforms, rgb_frame, device) -> np.ndarray:
    """Run MiDaS on one RGB frame and return a depth map at frame resolution.

    Args:
        midas: callable depth model; its output is given a channel axis and
            resized, so it is assumed to be (1, H', W') -- TODO confirm.
        midas_transforms: callable turning the frame into the model input.
        rgb_frame: image array whose first two axes are (height, width).
        device: torch device the input batch is moved to.

    Returns:
        2-D numpy array with the same height and width as ``rgb_frame``.
    """
    input_batch = midas_transforms(rgb_frame).to(device)

    with torch.no_grad():
        prediction = midas(input_batch)
        resized = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=rgb_frame.shape[:2],
            mode="bicubic",
            align_corners=False,
        )

    # Index the batch/channel axes explicitly instead of a bare squeeze(), which
    # would also drop a spatial axis that happens to have size 1.
    return resized[0, 0].cpu().numpy()


def box_center_depth(depth_map: np.ndarray, x1: int, y1: int, x2: int, y2: int) -> float:
    """Return the depth-map value at the centre of a bounding box.

    The centre is clamped to the map so that a box lying on the right/bottom
    border cannot raise ``IndexError`` and a negative coordinate cannot wrap
    around to the opposite side of the image.
    """
    height, width = depth_map.shape[:2]
    cx = min(max((x1 + x2) // 2, 0), width - 1)
    cy = min(max((y1 + y2) // 2, 0), height - 1)
    return float(depth_map[cy, cx])


def annotate_detections(frame, detections, class_names, depth_map):
    """Draw each detection's box and a label with its sampled depth on ``frame``.

    Args:
        frame: image drawn on in place.
        detections: iterable of ``(x1, y1, x2, y2, confidence, class_id)`` rows.
        class_names: mapping/sequence from integer class id to label text.
        depth_map: 2-D depth array aligned with ``frame``.
    """
    for x1, y1, x2, y2, confidence, class_id in detections:
        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
        label = class_names[int(class_id)]

        # Get depth value at the center of the bounding box.
        # NOTE(review): MiDaS is documented as predicting relative inverse depth,
        # so this number is presumably not a metric distance -- confirm before
        # presenting it as one.
        object_depth = box_center_depth(depth_map, x1, y1, x2, y2)

        # Display distance and bounding box
        distance_text = f"{label} ({confidence:.2f}): {object_depth:.2f} depth units"
        cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)
        # Keep the label inside the image when the box touches the top edge
        text_origin = (x1, max(y1 - 10, 15))
        cv2.putText(frame, distance_text, text_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, BOX_COLOR, 2)


def run_webcam_demo(yolo_model, midas, midas_transforms, device, camera_index=0):
    """Show webcam frames with detections next to a colourised depth map.

    Runs until a frame cannot be read or the user presses ``q``. The capture
    device and the preview window are released even if processing raises.
    """
    # Capture video from webcam
    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        cap.release()
        print("Error: Could not open video source.")
        return

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            # Convert frame to RGB
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Run YOLO detection; one bulk conversion to Python numbers avoids a
            # tensor-to-scalar conversion for every coordinate.
            detections = yolo_model(rgb_frame).xyxy[0].tolist()

            # Run MiDaS depth estimation
            depth_map = estimate_depth(midas, midas_transforms, rgb_frame, device)

            # Normalize depth map for visualization
            depth_map_visual = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
            depth_map_color = cv2.applyColorMap(depth_map_visual, cv2.COLORMAP_MAGMA)

            # Process detections
            annotate_detections(frame, detections, yolo_model.names, depth_map)

            # Display results
            combined_frame = np.hstack((frame, depth_map_color))
            cv2.imshow(WINDOW_TITLE, combined_frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    yolo_model, midas, midas_transforms = load_models(device)
    run_webcam_demo(yolo_model, midas, midas_transforms, device)
