In [1]:
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import cv2

In [None]:
# import cv2
# from ultralytics import YOLO
# from deep_sort import DeepSort

# Initialize YOLO model
yolo_model = YOLO('yolov8s.pt')

# Get the list of class names from the model
class_names = yolo_model.names

# Initialize DeepSORT tracker
tracker = DeepSort(max_age=30, nn_budget=100)

# Constants for distance calculation
dist_class_width = {"bottle":8,"person":39,"laptop":35}
W = 8  # Actual width of the object in the real world (e.g., width of a laptop in cm)
F = 1071.4285714285713  # Focal length (derived from camera calibration)

# Path to your video file
video_path = 'http://192.168.142.63:8080/video'  # Ensure the correct extension
cap = cv2.VideoCapture(video_path)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Perform object detection
    results = yolo_model(frame)

    # Store detections for DeepSORT (only for laptops)
    detections = []
    widths = []
    for r in results:
        boxes = r.boxes  # Detected bounding boxes
        for box in boxes:
            # Get the object class index and map it to the class name
            class_idx = int(box.cls[0])  # Get the class index (as integer)
            class_name = class_names[class_idx]  # Map index to class name

            # Get bounding box coordinates
            xyxy = box.xyxy[0].tolist()
            x1, y1, x2, y2 = xyxy[:4]
            width = x2 - x1  # Width of the bounding box
            height = y2 - y1  # Height of the bounding box
            conf = box.conf.item()  # Confidence score

            # Filter for laptop objects only
            if class_name == 'bottle':
                detections.append([[x1, y1, width, height], conf])
                widths.append(width)  # Append the width for distance calculation

    # Update DeepSORT tracker with filtered detections
    tracks = tracker.update_tracks(detections, frame=frame)

    # Draw tracking results and calculate distance on the frame
    for i, track in enumerate(tracks):
        if not track.is_confirmed():
            continue
        track_id = track.track_id
        bbox = track.to_tlbr()  # Get the bounding box in (x1, y1, x2, y2) format

        # Draw bounding box and track ID
        cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 2)
        cv2.putText(frame, f'ID: {track_id}', (int(bbox[0]), int(bbox[1] - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)

        # Calculate the distance using the width of the object (P = width of the bounding box)
        if i < len(widths):  # Ensure there is a width for this track
            P = widths[i]
            D_dash = (W * F) / P 
            print(D_dash)
            cv2.putText(frame, f'Dist: {D_dash:.2f} cm', (int(bbox[0]), int(bbox[1] - 30)), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)

    # Display the frame with tracking information
    cv2.imshow('Object Tracking and Distance Estimation', frame)

    # Exit loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

MULTIPLE OBJECTS DICTIONARY


In [2]:
import cv2
from ultralytics import YOLO
# from deep_sort import DeepSort

# Initialize YOLO model
yolo_model = YOLO('yolov8s.pt')

# Get the list of class names from the model
class_names = yolo_model.names

# Initialize DeepSORT tracker
tracker = DeepSort(max_age=30, nn_budget=100)

# Constants for distance calculation
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
dist_class_width = {"bottle": 8, "person": 39, "laptop": 35}  # Actual widths in cm
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
F = 1071.4285714285713  # Focal length (derived from camera calibration)

# Path to your video file
video_path = 'http://192.168.220.3:8080/video'  # Ensure the correct extension
cap = cv2.VideoCapture(video_path)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Perform object detection
    results = yolo_model(frame)

    # Store detections for DeepSORT (for laptops, persons, and bottles)
    detections = []
    widths = []
    for r in results:
        boxes = r.boxes  # Detected bounding boxes
        for box in boxes:
            # Get the object class index and map it to the class name
            class_idx = int(box.cls[0])  # Get the class index (as integer)
            class_name = class_names[class_idx]  # Map index to class name

            # Get bounding box coordinates
            xyxy = box.xyxy[0].tolist()
            x1, y1, x2, y2 = xyxy[:4]
            width_pixels = x2 - x1  # Width of the bounding box in pixels
            height_pixels = y2 - y1  # Height of the bounding box in pixels
            conf = box.conf.item()  # Confidence score

            # Filter for specific objects and use their widths for distance calculation
            if class_name in dist_class_width:
                detections.append([[x1, y1, width_pixels, height_pixels], conf])
                widths.append(dist_class_width[class_name])  # Use width from dictionary

    # Update DeepSORT tracker with filtered detections
    tracks = tracker.update_tracks(detections, frame=frame)

    # Draw tracking results and calculate distance on the frame
    for i, track in enumerate(tracks):
        if not track.is_confirmed():
            continue
        
        track_id = track.track_id
        bbox = track.to_tlbr()  # Get the bounding box in (x1, y1, x2, y2) format

        # Draw bounding box and track ID
        cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 2)
        cv2.putText(frame, f'ID: {track_id}', (int(bbox[0]), int(bbox[1] - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)

        # Calculate distance using width from dictionary and bounding box width in pixels
        if i < len(widths):  # Ensure there is a width for this track
            P_pixels = detections[i][0][2]  # Width of bounding box in pixels
            
            if P_pixels > 0:  # Avoid division by zero
                W_class = widths[i]  # Real-world width from dictionary
                D_dash = (W_class * F) / P_pixels 
                print(f'Distance to {class_names[class_idx]}: {D_dash:.2f} cm')
                cv2.putText(frame, f'Dist: {D_dash:.2f} cm', (int(bbox[0]), int(bbox[1] - 30)), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)

    # Display the frame with tracking information
    cv2.imshow('Object Tracking and Distance Estimation', frame)

    # Exit loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


0: 384x640 1 laptop, 1 keyboard, 150.0ms
Speed: 4.0ms preprocess, 150.0ms inference, 7.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 laptop, 1 keyboard, 131.0ms
Speed: 3.0ms preprocess, 131.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 tv, 127.0ms
Speed: 2.0ms preprocess, 127.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 laptop, 112.0ms
Speed: 3.0ms preprocess, 112.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 laptop, 126.0ms
Speed: 2.0ms preprocess, 126.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 tv, 1 laptop, 116.0ms
Speed: 2.0ms preprocess, 116.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Distance to tv: 20.22 cm

0: 384x640 1 person, 1 tv, 1 laptop, 114.0ms
Speed: 2.0ms preprocess, 114.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Distance to person: 20.36 cm

0: 384x

Depth Map

In [None]:
import torch
import cv2
import urllib
from PIL import Image
from torchvision import transforms as T


midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large") 
midas.eval()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)


midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")

transform = midas_transforms.dpt_transform

In [8]:
img = cv2.imread("Test_Images\\bottle2.jpg")  
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


input_batch = transform(img).to(device)

# Predict depth
with torch.no_grad():
    prediction = midas(input_batch)

# Resize the output to match the input size
prediction = torch.nn.functional.interpolate(
    prediction.unsqueeze(1),
    size=img.shape[:2],
    mode="bicubic",
    align_corners=False,
).squeeze()

# Normalize the prediction for better visualization
depth_map = prediction.cpu().numpy()
depth_map = cv2.normalize(depth_map, None, 0, 255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)


cv2.imwrite("Test_Image_results\\depth_map1.jpg", depth_map)
cv2.waitKey(0)

-1