In [40]:
import cv2
import time
import torch
import argparse
import numpy as np
import os
from utils.datasets import letterbox
from utils.torch_utils import select_device
from models.experimental import attempt_load
from utils.plots import output_to_keypoint, plot_skeleton_kpts
from utils.general import non_max_suppression_kpt, strip_optimizer
from torchvision import transforms

In [41]:
@torch.no_grad()
def run(poseweights='yolov7-w6-pose.pt', source='pose.mp4', device='cpu', display=True):
    """
    Run pose estimation on a video or webcam feed and save the annotated video.

    Each frame is letterboxed, passed through the pose model, annotated with the
    detected skeletons and appended to ``output/<name>_result.mp4``. The capture
    and the writer are always released, even if processing fails part-way.

    Args:
        poseweights: Path to the YOLOv7 pose weights
        source: Path to a .mp4/.webm/.avi file (a bare filename or a path using
            '/' or '\\' separators) or a webcam ID (0, 1, etc.), given as a
            string or an int
        device: Device to run inference on ('cpu' or '0', '1', etc. for GPU)
        display: Whether to show video with pose estimation in real-time
            (press 'q' in the window to stop early)

    Returns:
        None. Progress, the output location and the average FPS are printed.
    """
    video_exts = ("mp4", "webm", "avi")
    path = str(source)  # also accept an int webcam ID such as 0
    is_webcam = path.isnumeric()

    # Take the extension from the last path component, whichever separator is
    # used, so bare filenames like 'pose.mp4' are recognised as well.
    filename = path.replace('\\', '/').split('/')[-1]
    ext = filename.split('.')[-1].strip().lower()
    if not (ext in video_exts or ext.isnumeric() or is_webcam):
        print(f"Unsupported source '{path}': expected a .mp4/.webm/.avi file or a numeric webcam ID")
        return

    # Create output directory if it doesn't exist
    os.makedirs('output', exist_ok=True)

    input_path = int(path) if is_webcam else path
    cap = cv2.VideoCapture(input_path)
    out = None
    output_path = None
    frame_count, total_fps = 0, 0

    try:
        if not cap.isOpened():
            print('Error while trying to read video. Please check path again')
            return

        # Load the model only once the source is known to be readable.
        device = select_device(device)
        model = attempt_load(poseweights, map_location=device)
        model.eval()

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

        # Read first frame to get the letterboxed size used for the writer
        ret, first_frame = cap.read()
        if not ret:
            print('Error reading the first frame')
            return

        vid_write_image = letterbox(first_frame, (frame_width), stride=64, auto=True)[0]
        resize_height, resize_width = vid_write_image.shape[:2]

        # Rewind so the first frame is processed too.
        # NOTE(review): a live webcam presumably ignores this seek - confirm.
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

        # Set output video name and path
        if is_webcam:
            out_video_name = "webcam_output"
        else:
            out_video_name = os.path.basename(path).split('.')[0]

        # Save to output directory.
        # NOTE: the writer uses a fixed 30 fps regardless of the source frame rate.
        output_path = os.path.join('output', f"{out_video_name}_result.mp4")
        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), 30, (resize_width, resize_height))

        print(f"Output will be saved to: {output_path}")

        to_tensor = transforms.ToTensor()  # loop-invariant, build once

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            print(f"Frame {frame_count} Processing")

            # preprocess image: BGR -> RGB, letterbox, add a batch dimension
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = letterbox(image, (frame_width), stride=64, auto=True)[0]
            image = to_tensor(image).unsqueeze(0)
            image = image.to(device).float()
            start_time = time.perf_counter()

            # Gradients are already disabled by the @torch.no_grad() decorator.
            output, _ = model(image)

            # 0.25 / 0.65 are presumably the confidence and IoU thresholds -
            # TODO confirm against non_max_suppression_kpt's signature.
            output = non_max_suppression_kpt(output, 0.25, 0.65, nc=model.yaml['nc'], nkpt=model.yaml['nkpt'], kpt_label=True)
            output = output_to_keypoint(output)

            # Turn the network input back into a uint8 BGR image to draw on
            img = image[0].permute(1, 2, 0) * 255
            img = img.cpu().numpy().astype(np.uint8)
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            # Columns from index 7 onward are handed to the skeleton plotter
            # (presumably the keypoint values - TODO confirm).
            for idx in range(output.shape[0]):
                plot_skeleton_kpts(img, output[idx, 7:].T, 3)

            # Display the frame with pose estimation if requested
            if display:
                cv2.imshow("Pose Estimation", img)
                if cv2.waitKey(1) == ord('q'):  # Press 'q' to exit
                    break

            elapsed = time.perf_counter() - start_time
            total_fps += 1 / elapsed if elapsed > 0 else 0.0
            frame_count += 1
            out.write(img)
    finally:
        # Always free the capture / writer handles, including on early
        # returns and on exceptions raised while processing a frame.
        cap.release()
        if out is not None:
            out.release()
        if display:
            cv2.destroyAllWindows()

    if frame_count > 0:
        avg_fps = total_fps / frame_count
        print(f"Average FPS: {avg_fps:.3f}")
        print(f"Video saved to: {output_path}")
    else:
        print("No frames were processed.")

In [42]:
def parse_opt(argv=None):
    """
    Parse the command-line options for pose estimation.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the default)
            keeps the original behaviour of reading the process arguments
            (``sys.argv[1:]``). Pass an explicit list, e.g. ``[]`` or
            ``['--source', '0']``, when calling from a notebook, where the
            process arguments belong to the kernel rather than to this code.

    Returns:
        argparse.Namespace with:
            poseweights: the default string 'yolov7-w6-pose.pt', or a *list* of
                strings when ``--poseweights`` is given (because of nargs='+')
            source: the ``--source`` string, or None when it is omitted
            device: the ``--device`` string (default 'cpu')
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--poseweights', nargs='+', type=str, default='yolov7-w6-pose.pt', help='model path(s)')
    parser.add_argument('--source', type=str, help='path to video or 0 for webcam')
    parser.add_argument('--device', type=str, default='cpu', help='cpu/0,1,2,3(gpu)')
    return parser.parse_args(argv)

In [43]:
def main(poseweights='yolov7-w6-pose.pt', source='yoga/yoga8.mp4', device='0', display=True):
    """
    Prepare the weights file and then launch pose estimation.

    Args:
        poseweights: Path to the YOLOv7 pose weights
        source: Path to video file or webcam ID (0, 1, etc.)
        device: Device to run inference on ('cpu' or '0', '1', etc. for GPU)
        display: Whether to display the video with pose estimation in real-time
    """
    # Pre-step on the checkpoint. NOTE(review): strip_optimizer presumably
    # rewrites the weights file without optimizer state - confirm in utils.general.
    strip_optimizer(device, poseweights)

    # Forward every setting to the pose-estimation loop by keyword.
    run_settings = {
        'poseweights': poseweights,
        'source': source,
        'device': device,
        'display': display,
    }
    run(**run_settings)

In [44]:
def _ask(prompt, default):
    """Return the whitespace-stripped answer to ``prompt``, or ``default`` if it is blank."""
    return input(prompt).strip() or default


def _ask_yes_no(prompt, default="y"):
    """Ask a y/n question; True for 'y' or 'yes' in any letter case, False otherwise."""
    return _ask(prompt, default).lower() in ("y", "yes")


def run_interactive():
    """
    Interactive function to run pose estimation with user input

    Prompts for the weights file, the device, the input source (video file or
    webcam) and whether to display the result, prints a summary and, once the
    user confirms, calls ``main`` with the collected settings. Answers are
    stripped of surrounding whitespace and a blank answer selects the default.
    """
    # Get the weights file
    poseweights = _ask("Enter path to weights file [default: yolov7-w6-pose.pt]: ", "yolov7-w6-pose.pt")

    # Get device type
    if _ask_yes_no("Use GPU? (y/n) [default: y]: "):
        device = _ask("Enter GPU device ID [default: 0]: ", "0")
    else:
        device = "cpu"

    # Get source type
    print("\nSelect input source:")
    print("1: Video file")
    print("2: Webcam")
    source_choice = input("Enter choice [1/2]: ").strip()

    if source_choice == "1":
        # Video file
        print("\nDefault video directory: yoga/")
        default_video = "yoga/yoga8.mp4"
        source = _ask(f"Enter video file path [default: {default_video}]: ", default_video)
        # Ask if user wants to display the processed video in real-time
        show_video = _ask_yes_no("Display video with pose estimation in real-time? (y/n) [default: y]: ")
    else:
        # Webcam (any answer other than "1" selects the webcam)
        source = _ask("Enter webcam ID [default: 0]: ", "0")
        show_video = True  # Always display for webcam

    print("\nRunning pose estimation with:")
    print(f"- Weights: {poseweights}")
    print(f"- Device: {device}")
    print(f"- Source: {source}")
    print(f"- Display: {'Yes' if show_video else 'No'}")

    if _ask_yes_no("\nConfirm? (y/n) [default: y]: "):
        # Run the model
        main(poseweights=poseweights, source=source, device=device, display=show_video)
    else:
        print("Operation cancelled")

In [45]:
# Run in interactive mode: prompts (via input()) for the weights file, device,
# input source and display option, then starts pose estimation once confirmed.
run_interactive()

Enter path to weights file [default: yolov7-w6-pose.pt]:  
Use GPU? (y/n) [default: y]:  y
Enter GPU device ID [default: 0]:  



Select input source:
1: Video file
2: Webcam


Enter choice [1/2]:  1



Default video directory: yoga/


Enter video file path [default: yoga/yoga8.mp4]:  
Display video with pose estimation in real-time? (y/n) [default: y]:  y



Running pose estimation with:
- Weights: yolov7-w6-pose.pt
- Device: 0
- Source: yoga/yoga8.mp4
- Display: Yes



Confirm? (y/n) [default: y]:  y


Optimizer stripped from yolov7-w6-pose.pt, 161.1MB


  ckpt = torch.load(w, map_location=map_location)  # load


Fusing layers... 


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Output will be saved to: output\yoga8_result.mp4
Frame 0 Processing
Frame 1 Processing
Frame 2 Processing
Frame 3 Processing
Frame 4 Processing
Frame 5 Processing
Frame 6 Processing
Frame 7 Processing
Frame 8 Processing
Frame 9 Processing
Frame 10 Processing
Frame 11 Processing
Frame 12 Processing
Frame 13 Processing
Frame 14 Processing
Frame 15 Processing
Frame 16 Processing
Frame 17 Processing
Frame 18 Processing
Frame 19 Processing
Frame 20 Processing
Frame 21 Processing
Frame 22 Processing
Frame 23 Processing
Frame 24 Processing
Frame 25 Processing
Frame 26 Processing
Frame 27 Processing
Frame 28 Processing
Frame 29 Processing
Frame 30 Processing
Frame 31 Processing
Frame 32 Processing
Frame 33 Processing
Frame 34 Processing
Frame 35 Processing
Frame 36 Processing
Frame 37 Processing
Frame 38 Processing
Frame 39 Processing
Frame 40 Processing
Frame 41 Processing
Frame 42 Processing
Frame 43 Processing
Frame 44 Processing
Frame 45 Processing
Frame 46 Processing
Frame 47 Processing
F

In [46]:
# Example usage with GPU
# main(poseweights='yolov7-w6-pose.pt', source='./yoga/yoga8.mp4', device='0')

# For CPU-only inference
# main(poseweights='yolov7-w6-pose.pt', source='./yoga/yoga8.mp4', device='cpu')

# For webcam (usually ID 0)
# main(poseweights='yolov7-w6-pose.pt', source='0', device='0')