In [5]:
import cv2
import mediapipe as mp
import numpy as np
import argparse
from pathlib import Path
import math

# BVH writing utilities
class BVHJoint:
    """One node of a BVH skeleton: a named joint with an offset and child joints."""

    def __init__(self, name, offset=(0, 0, 0), children=None):
        """Store the joint's name and offset and start with no channels.

        Args:
            name: Joint name written into the BVH hierarchy.
            offset: Three-component offset, indexed as offset[0..2] by the writer.
            children: Optional list of child joints; a falsy value (None or an
                empty list) is replaced by a fresh empty list.
        """
        self.name = name
        self.offset = offset
        # A non-empty list passed by the caller is kept as-is (same object).
        self.children = children if children else []
        # Filled in by the writer when the hierarchy is serialised.
        self.channels = []
        # End sites are leaf markers; callers flip this flag after construction.
        self.is_end_site = False

    def add_child(self, child):
        """Append ``child`` to this joint and return it so calls can be chained."""
        self.children.append(child)
        return child

class BVHWriter:
    """Accumulates motion frames and serialises a joint tree plus frames as BVH text."""

    def __init__(self, root_joint, frame_time=0.033333):
        """Remember the skeleton root and the per-frame duration.

        Args:
            root_joint: Root of the joint tree to write under HIERARCHY.
            frame_time: Value written after "Frame Time:" in the MOTION header.
        """
        self.root_joint = root_joint
        self.frame_time = frame_time
        self.motion_data = []

    def add_frame(self, frame_data):
        """Queue one frame (an iterable of channel values) for the MOTION section."""
        self.motion_data.append(frame_data)

    def write_to_file(self, file_path):
        """Write the HIERARCHY section followed by the MOTION section to ``file_path``."""
        with open(file_path, 'w') as out:
            out.write("HIERARCHY\n")
            self._write_joint(out, self.root_joint, 0)

            out.write("MOTION\n")
            out.write(f"Frames: {len(self.motion_data)}\n")
            out.write(f"Frame Time: {self.frame_time}\n")

            # One line per frame, values separated by single spaces.
            out.writelines(
                " ".join(str(value) for value in frame) + "\n"
                for frame in self.motion_data
            )

    def _write_joint(self, file, joint, indent_level):
        """Recursively write ``joint`` and its subtree, indented two spaces per level.

        The joint at indent level 0 is written as ROOT with six channels;
        every other non-end-site joint is written as JOINT with three rotation
        channels. End sites get only an OFFSET. The channel names written are
        also stored on ``joint.channels``.
        """
        pad = "  " * indent_level
        is_root = indent_level == 0

        # Header line: the root check takes priority over the end-site check.
        if is_root:
            header = f"{pad}ROOT {joint.name}\n"
        elif joint.is_end_site:
            header = f"{pad}End Site\n"
        else:
            header = f"{pad}JOINT {joint.name}\n"
        file.write(header)

        file.write(f"{pad}{{\n")
        file.write(f"{pad}  OFFSET {joint.offset[0]} {joint.offset[1]} {joint.offset[2]}\n")

        if not joint.is_end_site:
            if is_root:
                # Root carries translation followed by rotation.
                channel_line = f"{pad}  CHANNELS 6 Xposition Yposition Zposition Zrotation Xrotation Yrotation\n"
                channel_names = ["Xposition", "Yposition", "Zposition", "Zrotation", "Xrotation", "Yrotation"]
            else:
                # Every other joint carries rotation only.
                channel_line = f"{pad}  CHANNELS 3 Zrotation Xrotation Yrotation\n"
                channel_names = ["Zrotation", "Xrotation", "Yrotation"]
            file.write(channel_line)
            joint.channels = channel_names

        deeper = indent_level + 1
        for child in joint.children:
            self._write_joint(file, child, deeper)

        file.write(f"{pad}}}\n")

# MediaPipe pose detection setup
def setup_mediapipe():
    """Build the MediaPipe Pose detector used for video tracking.

    Returns:
        A ``mp.solutions.pose.Pose`` instance configured for video input
        (``static_image_mode=False``), model complexity 2, no segmentation,
        and 0.5 detection/tracking confidence thresholds.
    """
    pose_options = {
        "static_image_mode": False,
        "model_complexity": 2,
        "enable_segmentation": False,
        "min_detection_confidence": 0.5,
        "min_tracking_confidence": 0.5,
    }
    return mp.solutions.pose.Pose(**pose_options)

# Create a simplified skeleton hierarchy for BVH
def create_skeleton():
    """Build the fixed BVH joint tree and return its root ("Hips").

    The tree is Hips -> Spine, with the head chain and both arm chains hanging
    off Spine and both leg chains hanging off Hips. The last joint of every
    chain is flagged as an end site.
    """
    hips = BVHJoint("Hips", offset=(0, 0, 0))
    spine = hips.add_child(BVHJoint("Spine", offset=(0, 10, 0)))

    # (anchor joint, chain of (name, offset)); order here fixes child order.
    chains = (
        (spine, (("Neck", (0, 15, 0)), ("Head", (0, 5, 0)), ("HeadEnd", (0, 3, 0)))),
        (spine, (("LeftShoulder", (5, 12, 0)), ("LeftElbow", (10, 0, 0)),
                 ("LeftWrist", (8, 0, 0)), ("LeftHand", (4, 0, 0)))),
        (spine, (("RightShoulder", (-5, 12, 0)), ("RightElbow", (-10, 0, 0)),
                 ("RightWrist", (-8, 0, 0)), ("RightHand", (-4, 0, 0)))),
        (hips, (("LeftHip", (3.5, 0, 0)), ("LeftKnee", (0, -15, 0)),
                ("LeftAnkle", (0, -15, 0)), ("LeftFoot", (0, -3, 5)))),
        (hips, (("RightHip", (-3.5, 0, 0)), ("RightKnee", (0, -15, 0)),
                ("RightAnkle", (0, -15, 0)), ("RightFoot", (0, -3, 5)))),
    )

    for anchor, chain in chains:
        tip = anchor
        for joint_name, joint_offset in chain:
            tip = tip.add_child(BVHJoint(joint_name, offset=joint_offset))
        # The final link of each chain terminates the branch.
        tip.is_end_site = True

    return hips

# Calculate rotation between two vectors
def calculate_rotation(v1, v2):
    """Return the Euler angles of the shortest rotation taking ``v1`` onto ``v2``.

    The rotation is built as an axis-angle rotation (Rodrigues' formula) and
    then decomposed for the channel order ``Zrotation Xrotation Yrotation``,
    i.e. as the matrix product ``Rz @ Rx @ Ry`` acting on column vectors.
    Earlier versions returned ``axis * angle`` directly, which only equals the
    Euler angles when the rotation axis is a single coordinate axis.

    Args:
        v1: Reference direction (3 components, need not be normalised).
        v2: Target direction (3 components, need not be normalised).

    Returns:
        Tuple ``(z_rot, x_rot, y_rot)`` in degrees. ``(0.0, 0.0, 0.0)`` is
        returned when either input has (near-)zero length, since no direction
        is defined in that case.
    """
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)

    len1 = np.linalg.norm(v1)
    len2 = np.linalg.norm(v2)
    if len1 < 1e-10 or len2 < 1e-10:
        # Avoid dividing by zero and propagating NaN into the motion data.
        return 0.0, 0.0, 0.0
    v1 = v1 / len1
    v2 = v2 / len2

    # Rotation axis; for (anti)parallel vectors pick any axis perpendicular to v1.
    axis = np.cross(v1, v2)
    if np.linalg.norm(axis) < 1e-10:
        if abs(v1[1]) < 0.9:
            axis = np.cross(v1, [0, 1, 0])
        else:
            axis = np.cross(v1, [1, 0, 0])
    axis = axis / np.linalg.norm(axis)

    angle = np.arccos(np.clip(np.dot(v1, v2), -1.0, 1.0))

    # Rodrigues' formula: R = I + sin(a) K + (1 - cos(a)) K^2.
    ax, ay, az = axis
    skew = np.array([
        [0.0, -az, ay],
        [az, 0.0, -ax],
        [-ay, ax, 0.0],
    ])
    rot = np.eye(3) + np.sin(angle) * skew + (1.0 - np.cos(angle)) * (skew @ skew)

    # For R = Rz @ Rx @ Ry:  R[2,1] = sin(x),
    # R[0,1] = -sin(z)cos(x), R[1,1] = cos(z)cos(x),
    # R[2,0] = -cos(x)sin(y), R[2,2] = cos(x)cos(y).
    sin_x = float(np.clip(rot[2, 1], -1.0, 1.0))
    x_rad = np.arcsin(sin_x)
    if abs(sin_x) < 1.0 - 1e-9:
        z_rad = np.arctan2(-rot[0, 1], rot[1, 1])
        y_rad = np.arctan2(-rot[2, 0], rot[2, 2])
    else:
        # Gimbal lock (x = +/-90 deg): z and y are coupled, so fix y = 0.
        z_rad = np.arctan2(rot[1, 0], rot[0, 0])
        y_rad = 0.0

    return float(np.degrees(z_rad)), float(np.degrees(x_rad)), float(np.degrees(y_rad))

# Extract joint position from MediaPipe landmarks
def get_joint_position(landmarks, idx):
    """Return landmark ``idx`` as a 3-element array in the skeleton's units.

    Args:
        landmarks: Indexable collection whose items expose ``x``, ``y``, ``z``.
        idx: Index of the landmark to read.

    Returns:
        ``np.ndarray`` of ``[x, -y, -z]`` scaled by 100. Y and Z are negated
        to flip the landmark axes for the BVH output.
    """
    point = landmarks[idx]
    # Flip Y and Z first, then apply the uniform scale to all three axes.
    flipped = np.array([point.x, -point.y, -point.z])
    return flipped * 100

# Process video and generate BVH
def process_video(video_path, output_path, fps_target=30):
    """Track a pose through a video with MediaPipe and export it as a BVH file.

    The video is subsampled to approximate ``fps_target``. For every sampled
    frame the detected landmarks are turned into a root translation followed
    by one (Z, X, Y) rotation triple per non-end-site joint, visited parent
    first and then children in list order (the same order in which
    ``BVHWriter._write_joint`` emits the hierarchy).

    Timing: the BVH frame time is the real spacing of the sampled frames
    (``frame_skip / video_fps``) rather than ``1 / fps_target``, so clips whose
    frame rate is not an integer multiple of the target keep their real speed.
    Sampled frames without a detection repeat the most recent pose so the
    motion stays aligned with the video; frames before the first detection
    are skipped.

    Known limitation: each joint's rotation is computed from its bone
    direction in the shared landmark coordinate frame and is not expressed
    relative to the parent joint's rotation.

    Args:
        video_path: Input video, passed unchanged to ``cv2.VideoCapture``.
        output_path: Path of the BVH file to write.
        fps_target: Desired output frame rate; must be positive.

    Raises:
        ValueError: If ``fps_target`` is not positive.
        IOError: If the video cannot be opened.
    """
    if fps_target <= 0:
        raise ValueError(f"fps_target must be positive, got {fps_target}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    mp_pose = mp.solutions.pose

    # Mapping from skeleton joints to the MediaPipe landmarks that locate them.
    # Joints with several landmarks are placed at the landmarks' mean.
    landmarks_map = {
        "Hips": [mp_pose.PoseLandmark.LEFT_HIP.value, mp_pose.PoseLandmark.RIGHT_HIP.value],
        "Spine": [mp_pose.PoseLandmark.LEFT_SHOULDER.value, mp_pose.PoseLandmark.RIGHT_SHOULDER.value],
        "Neck": [mp_pose.PoseLandmark.LEFT_SHOULDER.value, mp_pose.PoseLandmark.RIGHT_SHOULDER.value],
        "Head": [mp_pose.PoseLandmark.NOSE.value],
        "LeftShoulder": [mp_pose.PoseLandmark.LEFT_SHOULDER.value],
        "LeftElbow": [mp_pose.PoseLandmark.LEFT_ELBOW.value],
        "LeftWrist": [mp_pose.PoseLandmark.LEFT_WRIST.value],
        "RightShoulder": [mp_pose.PoseLandmark.RIGHT_SHOULDER.value],
        "RightElbow": [mp_pose.PoseLandmark.RIGHT_ELBOW.value],
        "RightWrist": [mp_pose.PoseLandmark.RIGHT_WRIST.value],
        "LeftHip": [mp_pose.PoseLandmark.LEFT_HIP.value],
        "LeftKnee": [mp_pose.PoseLandmark.LEFT_KNEE.value],
        "LeftAnkle": [mp_pose.PoseLandmark.LEFT_ANKLE.value],
        "RightHip": [mp_pose.PoseLandmark.RIGHT_HIP.value],
        "RightKnee": [mp_pose.PoseLandmark.RIGHT_KNEE.value],
        "RightAnkle": [mp_pose.PoseLandmark.RIGHT_ANKLE.value]
    }

    # Rest-pose bone direction of each joint; also the fallback direction when
    # no usable child position exists.
    reference_directions = {
        "Hips": np.array([0, 1, 0]),
        "Spine": np.array([0, 1, 0]),
        "Neck": np.array([0, 1, 0]),
        "Head": np.array([0, 1, 0]),
        "LeftShoulder": np.array([1, 0, 0]),
        "LeftElbow": np.array([1, 0, 0]),
        "LeftWrist": np.array([1, 0, 0]),
        "RightShoulder": np.array([-1, 0, 0]),
        "RightElbow": np.array([-1, 0, 0]),
        "RightWrist": np.array([-1, 0, 0]),
        "LeftHip": np.array([0, -1, 0]),
        "LeftKnee": np.array([0, -1, 0]),
        "LeftAnkle": np.array([0, 0, 1]),
        "RightHip": np.array([0, -1, 0]),
        "RightKnee": np.array([0, -1, 0]),
        "RightAnkle": np.array([0, 0, 1])
    }

    root_joint = create_skeleton()

    def append_rotations(joint, positions, frame_data):
        """Append rotation channels for ``joint`` and its subtree (pre-order)."""
        rotation = (0, 0, 0)  # default for joints without a tracked position
        if joint.name in positions:
            reference_dir = reference_directions[joint.name]
            current_dir = reference_dir
            # The bone points from this joint to its first child, unless that
            # child is an end site or untracked.
            first_child = joint.children[0] if joint.children else None
            if (first_child is not None and not first_child.is_end_site
                    and first_child.name in positions):
                bone = positions[first_child.name] - positions[joint.name]
                bone_length = np.linalg.norm(bone)
                if bone_length > 1e-10:
                    current_dir = bone / bone_length
            rotation = calculate_rotation(reference_dir, current_dir)
        frame_data.extend(rotation)

        for child in joint.children:
            if not child.is_end_site:
                append_rotations(child, positions, frame_data)

    def build_frame_data(landmarks):
        """Convert one set of pose landmarks into a full BVH motion row."""
        positions = {
            name: np.mean([get_joint_position(landmarks, i) for i in indices], axis=0)
            for name, indices in landmarks_map.items()
        }
        hips_pos = positions["Hips"]
        frame_data = [hips_pos[0], hips_pos[1], hips_pos[2]]
        append_rotations(root_joint, positions, frame_data)
        return frame_data

    pose_detector = None
    processed_count = 0
    try:
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        if video_fps > 0:
            frame_skip = max(1, round(video_fps / fps_target))
            # Real spacing between the frames that are actually sampled.
            frame_time = frame_skip / video_fps
        else:
            # Frame rate unknown: keep every frame and trust the target rate.
            frame_skip = 1
            frame_time = 1.0 / fps_target

        pose_detector = setup_mediapipe()
        bvh_writer = BVHWriter(root_joint, frame_time=frame_time)

        print(f"Processing video: {video_path}")
        print(f"Target FPS: {fps_target}, Frame time: {frame_time}")

        frame_count = 0
        last_frame_data = None

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            # Process only selected frames to match target FPS
            if (frame_count - 1) % frame_skip != 0:
                continue

            processed_count += 1

            # MediaPipe expects RGB, OpenCV delivers BGR.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = pose_detector.process(rgb_frame)

            if results.pose_landmarks:
                last_frame_data = build_frame_data(results.pose_landmarks.landmark)

            if last_frame_data is None:
                # Nothing detected yet, so there is no pose to hold.
                continue

            # Copy so repeated (held) frames never share one list object.
            bvh_writer.add_frame(list(last_frame_data))

            if processed_count % 10 == 0:
                print(f"Processed {processed_count} frames...")
    finally:
        cap.release()
        if pose_detector is not None:
            pose_detector.close()

    # Write BVH file
    bvh_writer.write_to_file(output_path)
    print(f"BVH file created: {output_path}")
    print(f"Total frames processed: {processed_count}")



In [4]:

# parser = argparse.ArgumentParser(description="Convert video to BVH motion capture file")
# parser.add_argument("video_path", help="Path to input video file")
# parser.add_argument("--output", help="Path to output BVH file")
# parser.add_argument("--fps", type=int, default=30, help="Target frames per second for BVH file")

# args = parser.parse_args()

# video_path = args.video_path
# output_path = args.output

# if not output_path:
#     # Create output path if not specified
#     video_name = Path(video_path).stem
#     output_path = f"{video_name}.bvh"

# Convert the sample clip; the BVH is written under the same stem as the video.
filename = "fight3"
process_video(
    video_path=f"{filename}.mp4",
    output_path=f"{filename}.bvh",
    fps_target=30,
)

Processing video: fight3.mp4
Target FPS: 30, Frame time: 0.03333333333333333


I0000 00:00:1740594097.180436   13034 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1740594097.184636   18669 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1740594097.270691   18657 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740594097.368446   18663 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Processed 10 frames...
Processed 20 frames...
Processed 30 frames...
BVH file created: fight3.bvh
Total frames processed: 37
