In [15]:
from ultralytics import YOLO
import cv2

# Load YOLO object detection model (pre-trained YOLO model)
yolo_object = YOLO('yolov8n.pt')  # Load YOLOv8 model

cap = cv2.VideoCapture('ch01_20240909095216 (yolo-pose training).mp4')  # Open video file (change path if necessary)

while cap.isOpened():
    ret, frame = cap.read()  # Read frame from the video
    if not ret:
        break

    # Step 1: Run YOLO object detection on the frame
    results = yolo_object(frame, conf=0.5)  # Detect objects, set a confidence threshold

    # Step 2: Extract bounding boxes for detected people (class 0 = person in COCO dataset)
    for result in results:  # Iterate over the results list
        boxes = result.boxes  # Get the detected boxes
        for box in boxes:
            xywh = box.xywh[0]  # Get the box coordinates (x, y, width, height)
            conf = box.conf[0]  # Get confidence
            cls = int(box.cls[0])  # Get class ID

            # Check if the class corresponds to a person (class 0 in COCO)
            if cls == 0:  # Class 0 corresponds to 'person' in COCO dataset
                x_min, y_min, w, h = xywh  # Unpack the coordinates
                x_max, y_max = int(x_min + w), int(y_min + h)  # Calculate the bottom-right corner
                
                # Draw bounding box around detected person
                cv2.rectangle(frame, (int(x_min), int(y_min)), (x_max, y_max), (0, 255, 0), 2)

    # Step 3: Display the frame with bounding boxes
    cv2.imshow('YOLO Object Detection', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):  # Press 'q' to exit
        break

cap.release()
cv2.destroyAllWindows()



0: 384x640 7 persons, 2 laptops, 61.0ms
Speed: 0.0ms preprocess, 61.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 laptops, 66.5ms
Speed: 0.0ms preprocess, 66.5ms inference, 0.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 laptops, 52.4ms
Speed: 0.0ms preprocess, 52.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 laptops, 50.0ms
Speed: 1.2ms preprocess, 50.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 laptops, 52.9ms
Speed: 2.4ms preprocess, 52.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 1 laptop, 49.8ms
Speed: 1.0ms preprocess, 49.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 1 laptop, 50.1ms
Speed: 0.5ms preprocess, 50.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 laptops, 49

In [22]:
from ultralytics import YOLO
import cv2

# Load YOLO object detection model (pre-trained YOLO model for object detection)
yolo_object = YOLO('yolov8n.pt')  # Load YOLOv8 model for object detection

# Load YOLO pose estimation model (pre-trained YOLO model for pose estimation)
yolo_pose = YOLO('yolov8n-pose.pt')  # Load YOLO Pose model (correct file)

cap = cv2.VideoCapture('ch01_20240909095216 (yolo-pose training).mp4')  # Open the correct video file

while cap.isOpened():
    ret, frame = cap.read()  # Read frame from the video
    if not ret:
        break

    # Step 1: Run YOLO object detection on the frame (detect people)
    results = yolo_object(frame, conf=0.5)  # Detect objects, set a confidence threshold

    # Step 2: Extract bounding boxes for detected people (class 0 = person in COCO dataset)
    for result in results:  # Iterate over the results list
        boxes = result.boxes  # Get the detected boxes
        for box in boxes:
            xywh = box.xywh[0]  # Get the box coordinates (x, y, width, height)
            conf = box.conf[0]  # Get confidence
            cls = int(box.cls[0])  # Get class ID

            # Check if the class corresponds to a person (class 0 in COCO)
            if cls == 0:  # Class 0 corresponds to 'person' in COCO dataset
                x_min, y_min, w, h = xywh  # Unpack the coordinates
                x_max, y_max = int(x_min + w), int(y_min + h)  # Calculate the bottom-right corner
                
                # Crop the detected person's area from the frame
                person_crop = frame[int(y_min):int(y_max), int(x_min):int(x_max)]

                # Step 3: Run YOLO Pose on the cropped area (person's body)
                pose_results = yolo_pose(person_crop)

                # Step 4: Draw Pose keypoints on the frame
                for pose in pose_results:
                    keypoints = pose.keypoints  # Get pose keypoints

                    # Debug: Print the entire keypoint data to inspect its structure
                    print("Full Keypoint Data:", keypoints)

                    # Step 5: Handle each keypoint based on its structure
                    for keypoint in keypoints:
                        # Print the keypoint to inspect it
                        print("Single Keypoint:", keypoint)

                        # Check the type of keypoint data
                        if isinstance(keypoint, list) or isinstance(keypoint, tuple):
                            if len(keypoint) == 3:  # Expecting (x, y, confidence)
                                x, y, conf = keypoint
                            elif len(keypoint) == 2:  # If it's just (x, y)
                                x, y = keypoint
                                conf = 1.0  # Set default confidence if missing
                            else:
                                print("Unexpected keypoint format:", keypoint)
                        else:
                            print("Invalid keypoint structure:", keypoint)

                        if conf > 0.5:  # Only draw keypoints with confidence > 0.5
                            cv2.circle(frame, (int(x + x_min), int(y + y_min)), 5, (0, 0, 255), -1)  # Draw keypoint

                # Draw bounding box around detected person
                cv2.rectangle(frame, (int(x_min), int(y_min)), (x_max, y_max), (0, 255, 0), 2)

    # Step 6: Display the frame with bounding boxes and pose keypoints
    cv2.imshow('YOLO Object Detection + Pose Estimation', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):  # Press 'q' to exit
        break

cap.release()
cv2.destroyAllWindows()



0: 384x640 7 persons, 2 laptops, 57.4ms
Speed: 15.3ms preprocess, 57.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 640x640 1 person, 76.9ms
Speed: 4.3ms preprocess, 76.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
Full Keypoint Data: ultralytics.engine.results.Keypoints object with attributes:

conf: tensor([[1.2708e-02, 8.4761e-04, 7.7161e-02, 2.9633e-03, 8.3934e-01, 1.3918e-02, 4.3021e-01, 7.4395e-04, 2.9970e-02, 6.4100e-03, 6.7710e-02, 1.7934e-03, 5.6815e-03, 2.0327e-03, 5.2268e-03, 1.3245e-03, 2.1047e-03]])
data: tensor([[[0.0000e+00, 0.0000e+00, 1.2708e-02],
         [0.0000e+00, 0.0000e+00, 8.4761e-04],
         [0.0000e+00, 0.0000e+00, 7.7161e-02],
         [0.0000e+00, 0.0000e+00, 2.9633e-03],
         [1.0919e+02, 5.9387e+01, 8.3934e-01],
         [0.0000e+00, 0.0000e+00, 1.3918e-02],
         [0.0000e+00, 0.0000e+00, 4.3021e-01],
         [0.0000e+00, 0.0000e+00, 7.4395e-04],
         [0.0000e+00, 0.0000e+00, 2.9970e-02],
   

NameError: name 'x' is not defined