In [1]:
from ultralytics import YOLO
import torch
import cv2 as cv
import cvzone
from sort import *

In [2]:
# Set the device to GPU (MPS) if available, otherwise use CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Load the YOLO model with pre-trained weights
model = YOLO("../yoloweights/yolov8l.pt").to(device)

# Define class names for the detected objects
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"]

# Read the mask image for region selection
mask = cv.imread('../mask/people_mask.png')

# Initialize the SORT tracker for object tracking
tracker = Sort(max_age=20, min_hits=2, iou_threshold=0.3)

# Capture video from the specified file
cap = cv.VideoCapture('../videos/people.mp4')

# Load an overlay graphic image
imagegraphic = cv.imread('../images/graphics.png', cv.IMREAD_UNCHANGED)

# Get video properties: width, height, and frames per second (fps)
width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv.CAP_PROP_FPS))

# Set up the output video writer
output_video = cv.VideoWriter("people_detected_video.mp4", cv.VideoWriter_fourcc(*"mp4v"), fps, (width, height))

# Define the upper and lower limits for counting objects
limitsUp = [103, 161, 296, 161]
limitsDown = [527, 489, 735, 489]

# Initialize lists to keep track of counted objects
total_countUp = []
total_countDown = []

# Process the video frame by frame
while True:
    # Read a frame from the video
    success, img = cap.read()
    if not success:
        break  # Exit loop if no more frames are available

    # Overlay graphic on the current image
    img = cvzone.overlayPNG(img, imagegraphic, (730, 260))

    # Apply the mask to select the region of interest
    imageRegion = cv.bitwise_and(img, mask)

    # Initialize detections array
    detections = np.empty((0, 5))
    
    # Perform inference on the image region using the YOLO model
    results = model(imageRegion, stream=True)

    # Iterate through results and extract bounding box information
    for r in results:
        boxes = r.boxes
        for box in boxes:
            x, y, w, h = box.xywh[0]  # Get bounding box coordinates
            x, y, w, h = int(x), int(y), int(w), int(h)
            
            x1, y1, x2, y2 = box.xyxy[0]  # Get corners of the bounding box
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)

            conf = round(float(box.conf), 2)  # Confidence score of the detection
            cls = int(box.cls[0])  # Class index of the detected object
            currentClass = classNames[cls]  # Get the class name
            
            # Check if the detected object is a person and the confidence is above the threshold
            if currentClass == 'person' and conf > 0.3:
                # Display the class name and confidence on the image
                cvzone.putTextRect(img, f'{currentClass} {conf}', (max(0, int(x - w/2)), max(35, int(y - h/2))), scale=0.7, thickness=1, offset=3)
                # Optionally draw a rectangle around the detected person (commented out)
                # cvzone.cornerRect(img, (int(x - w / 2), int(y - h / 2), w, h), l=5)
                
                # Store the detection in the detections array
                currentArray = np.array([x1, y1, x2, y2, conf])
                detections = np.vstack((detections, currentArray))

    # Draw lines for counting limits
    cv.line(img, (limitsUp[0], limitsUp[1]), (limitsUp[2], limitsUp[3]), (0, 0, 255), thickness=3)
    cv.line(img, (limitsDown[0], limitsDown[1]), (limitsDown[2], limitsDown[3]), (0, 0, 255), thickness=3)

    # Update the tracker with the new detections
    resultsTracker = tracker.update(detections)
    
    # Iterate through the tracked results
    for result in resultsTracker:
        x1, y1, x2, y2, ID = result  # Get coordinates and ID
        x1, y1, x2, y2, ID = int(x1), int(y1), int(x2), int(y2), int(ID)
        w, h = x2 - x1, y2 - y1  # Calculate width and height
        
        # Draw a rectangle around the tracked object
        cvzone.cornerRect(img, (x1, y1, w, h), l=9, rt=2, colorR=(255, 0, 0))
        # Optionally display the ID of the tracked object (commented out)
        # cvzone.putTextRect(img, f'{ID}', (max(0, x1), max(35, y1)), scale=1, thickness=3, offset=10)
        
        # Calculate the center of the bounding box
        cx, cy = x1 + w // 2, y1 + h // 2
        cv.circle(img, (cx, cy), 5, (255, 0, 0), -1)  # Draw a circle at the center

        # Check if the center of the object crosses the upper limit
        if limitsUp[0] < cx < limitsUp[2] and limitsUp[1] - 15 < cy < limitsUp[1] + 15:
            if total_countUp.count(ID) == 0:  # Only count if not counted before
                total_countUp.append(ID)  # Add ID to the count list
                cv.line(img, (limitsUp[0], limitsUp[1]), (limitsUp[2], limitsUp[3]), (0, 255, 0), thickness=3)

        # Check if the center of the object crosses the lower limit
        if limitsDown[0] < cx < limitsDown[2] and limitsDown[1] - 15 < cy < limitsDown[1] + 15:
            if total_countDown.count(ID) == 0:  # Only count if not counted before
                total_countDown.append(ID)  # Add ID to the count list
                cv.line(img, (limitsDown[0], limitsDown[1]), (limitsDown[2], limitsDown[3]), (0, 255, 0), thickness=3)

    # Display the total count of objects that crossed the upper and lower limits
    cv.putText(img, str(len(total_countUp)), (929, 345), cv.FONT_HERSHEY_PLAIN, 5, (139, 195, 75), 7)
    cv.putText(img, str(len(total_countDown)), (1191, 345), cv.FONT_HERSHEY_PLAIN, 5, (50, 50, 230), 7)

    # Show the processed image in a window
    cv.imshow("Image", img)
    
    # Write the processed frame to the output video file
    output_video.write(img)
    
    # Exit the loop if 'q' key is pressed
    if cv.waitKey(1) & 0xff == ord('q'):
        break

# Clean up resources
cv.destroyAllWindows()
cap.release()  # Release the video capture object
output_video.release()  # Release the video writer object



0: 384x640 2 persons, 317.2ms
Speed: 15.5ms preprocess, 317.2ms inference, 128.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 76.7ms
Speed: 3.0ms preprocess, 76.7ms inference, 16.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 62.0ms
Speed: 1.9ms preprocess, 62.0ms inference, 23.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 58.2ms
Speed: 2.8ms preprocess, 58.2ms inference, 23.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 59.3ms
Speed: 2.4ms preprocess, 59.3ms inference, 21.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 71.6ms
Speed: 1.9ms preprocess, 71.6ms inference, 19.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 61.8ms
Speed: 2.4ms preprocess, 61.8ms inference, 20.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 54.4ms
Speed: 1.9ms preprocess, 54.4ms inference, 22.1ms postprocess per imag

In [4]:
model.device

device(type='mps', index=0)