In [1]:
import cv2
import numpy as np
import mediapipe as mp
import math
import argparse
from typing import List, Dict, Tuple
import os

class MediaPipeToBVH:
    """Convert a video of a single person into a BVH file using MediaPipe Pose.

    Pipeline (see ``convert_video_to_bvh``):
        1. ``process_video``       - run MediaPipe Pose on every ``frame_step``-th frame.
        2. ``get_joint_positions`` - average landmarks into named joints, scale, and
                                     convert to a Y-up coordinate system.
        3. ``smooth_positions``    - moving-average filter over time.
        4. ``write_bvh``           - emit the skeleton (HIERARCHY) and per-frame data (MOTION).

    Known limitation: every joint rotation is written as 0, so only the root
    (Hips) translation is animated; the limbs keep the rest pose taken from the
    first frame.
    """

    def __init__(self, fps: int = 30, scale: float = 100.0, frame_step: int = 2):
        """
        Args:
            fps: Frame rate of the source video, used to derive the BVH frame time.
            scale: Multiplier applied to MediaPipe world coordinates.
            frame_step: Keep one frame out of every ``frame_step`` frames read
                (2 = every second frame, 1 = every frame).

        Raises:
            ValueError: If ``fps`` is not positive or ``frame_step`` is below 1.
        """
        if fps <= 0:
            raise ValueError(f"fps must be positive, got {fps}")
        if frame_step < 1:
            raise ValueError(f"frame_step must be >= 1, got {frame_step}")

        self.mp_pose = mp.solutions.pose
        self.fps = fps
        self.scale = scale
        self.frame_step = frame_step

        # BVH skeleton: parent joint -> ordered list of children.
        # Names starting with "End_" become "End Site" leaves.
        self.joint_hierarchy = {
            "Hips": ["Spine", "LeftUpLeg", "RightUpLeg"],
            "Spine": ["Spine1"],
            "Spine1": ["Neck", "LeftShoulder", "RightShoulder"],
            "Neck": ["Head"],
            "Head": ["End_Head"],
            "LeftShoulder": ["LeftArm"],
            "LeftArm": ["LeftForeArm"],
            "LeftForeArm": ["LeftHand"],
            "LeftHand": ["End_LeftHand"],
            "RightShoulder": ["RightArm"],
            "RightArm": ["RightForeArm"],
            "RightForeArm": ["RightHand"],
            "RightHand": ["End_RightHand"],
            "LeftUpLeg": ["LeftLeg"],
            "LeftLeg": ["LeftFoot"],
            "LeftFoot": ["End_LeftFoot"],
            "RightUpLeg": ["RightLeg"],
            "RightLeg": ["RightFoot"],
            "RightFoot": ["End_RightFoot"]
        }

        # Joint name -> MediaPipe landmark indices; a joint's position is the
        # unweighted mean of the listed landmarks.
        self.landmark_to_joint = {
            "Hips": [23, 24],
            "Spine": [11, 12, 23, 24],
            "Spine1": [11, 12],
            "Neck": [11, 12],
            "Head": [0],
            "LeftShoulder": [11],
            "LeftArm": [13],
            "LeftForeArm": [15],
            "LeftHand": [17, 19, 21],
            "RightShoulder": [12],
            "RightArm": [14],
            "RightForeArm": [16],
            "RightHand": [18, 20, 22],
            "LeftUpLeg": [23],
            "LeftLeg": [25],
            "LeftFoot": [27, 31],
            "RightUpLeg": [24],
            "RightLeg": [26],
            "RightFoot": [28, 32]
        }

        # Fixed offsets for the End Site leaves, expressed in the Y-up output
        # space (they are written as-is and are not multiplied by ``scale``).
        self.end_site_offsets = {
            "End_Head": (0, 20, 0),
            "End_LeftHand": (0, -10, 0),
            "End_RightHand": (0, -10, 0),
            "End_LeftFoot": (0, -5, 10),
            "End_RightFoot": (0, -5, 10)
        }

        # Pose estimator, configured for video input (static_image_mode=False).
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

    def process_video(self, video_path) -> List[Dict]:
        """Run pose estimation over a video and collect world landmarks.

        Only every ``self.frame_step``-th frame is processed. When a processed
        frame has no detection, the previous result is repeated so the timeline
        keeps its length; frames without detection before the first successful
        one are dropped.

        Args:
            video_path: Path handed to ``cv2.VideoCapture``.

        Returns:
            One dict per kept frame, mapping landmark index to a dict with keys
            ``'x'``, ``'y'``, ``'z'`` and ``'visibility'``.

        Raises:
            ValueError: If the video cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = 0

        # try/finally guarantees the capture handle is released even when
        # decoding or pose estimation raises part-way through the video.
        try:
            if not cap.isOpened():
                raise ValueError(f"Could not open video file {video_path}")

            while True:
                success, image = cap.read()
                if not success:
                    break

                # Temporal subsampling for speed.
                frame_count += 1
                if frame_count % self.frame_step != 0:
                    continue

                # OpenCV decodes to BGR; the pose model is fed RGB.
                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                results = self.pose.process(image_rgb)

                if results.pose_world_landmarks:
                    landmarks_dict = {
                        i: {
                            'x': landmark.x,
                            'y': landmark.y,
                            'z': landmark.z,
                            'visibility': landmark.visibility
                        }
                        for i, landmark in enumerate(results.pose_world_landmarks.landmark)
                    }
                    frames.append(landmarks_dict)

                    if len(frames) % 10 == 0:
                        print(f"Processed {len(frames)} frames...")
                else:
                    print(f"No pose detected in frame {frame_count}")
                    if frames:
                        # Hold the last known pose (shared dict, never mutated).
                        frames.append(frames[-1])
        finally:
            cap.release()

        print(f"Extracted pose data from {len(frames)} frames")
        return frames

    def get_joint_positions(self, frames) -> List[Dict]:
        """Turn per-frame landmarks into scaled, Y-up joint positions.

        Args:
            frames: Output of ``process_video``.

        Returns:
            One dict per frame mapping joint name to ``{'x', 'y', 'z'}``.
        """
        joint_positions = []

        for frame in frames:
            positions = {}

            for joint_name, indices in self.landmark_to_joint.items():
                weight = 1.0 / len(indices)
                x = sum(frame[idx]['x'] for idx in indices) * weight
                y = sum(frame[idx]['y'] for idx in indices) * weight
                z = sum(frame[idx]['z'] for idx in indices) * weight

                # MediaPipe landmarks use image-style axes: +Y points down and
                # +Z points away from the camera. The skeleton written here is
                # Y-up (see end_site_offsets), so both Y and Z are negated; this
                # is a 180-degree turn about X and keeps left/right intact.
                positions[joint_name] = {
                    'x': x * self.scale,
                    'y': -y * self.scale,
                    'z': -z * self.scale
                }

            joint_positions.append(positions)

        return joint_positions

    def smooth_positions(self, positions, window=5):
        """Apply a centred moving average to every joint over time.

        The first and last frames are passed through unchanged; for the frames
        in between, the window is clipped at the sequence boundaries.

        Args:
            positions: Output of ``get_joint_positions``.
            window: Nominal window length in frames.

        Returns:
            A list of the same length as ``positions``.
        """
        if len(positions) <= 1:
            return positions

        joint_names = positions[0].keys()
        half = window // 2
        total = len(positions)

        smoothed = [positions[0]]

        for i in range(1, total - 1):
            # The window bounds do not depend on the joint, so compute them once.
            start = max(0, i - half)
            end = min(total, i + half + 1)
            smooth_frame = {}

            for joint in joint_names:
                samples = [positions[j][joint] for j in range(start, end)
                           if joint in positions[j]]
                if samples:
                    count = len(samples)
                    smooth_frame[joint] = {
                        'x': sum(s['x'] for s in samples) / count,
                        'y': sum(s['y'] for s in samples) / count,
                        'z': sum(s['z'] for s in samples) / count
                    }
                else:
                    smooth_frame[joint] = positions[i][joint]

            smoothed.append(smooth_frame)

        smoothed.append(positions[-1])
        return smoothed

    def calculate_offsets(self, reference_frame) -> Dict[str, Tuple]:
        """Derive each joint's OFFSET (child minus parent) from one frame.

        Args:
            reference_frame: Joint name -> ``{'x', 'y', 'z'}`` for the rest pose.

        Returns:
            Child joint name -> ``(dx, dy, dz)``. End sites take their value
            from ``self.end_site_offsets``. Joints missing from the frame are
            left out.
        """
        offsets = {}

        for joint, children in self.joint_hierarchy.items():
            if joint not in reference_frame:
                continue

            parent_pos = reference_frame[joint]

            for child in children:
                if child.startswith("End_"):
                    offsets[child] = self.end_site_offsets[child]
                elif child in reference_frame:
                    child_pos = reference_frame[child]
                    offsets[child] = (
                        child_pos['x'] - parent_pos['x'],
                        child_pos['y'] - parent_pos['y'],
                        child_pos['z'] - parent_pos['z']
                    )

        return offsets

    def write_bvh(self, positions, output_path, frame_time=None):
        """Write the skeleton and motion data to a BVH file.

        Args:
            positions: Per-frame joint positions; the first frame defines the
                skeleton offsets.
            output_path: Destination file path.
            frame_time: Seconds between consecutive frames. Defaults to
                ``1 / self.fps``.

        Raises:
            ValueError: If ``positions`` is empty.
        """
        if not positions:
            raise ValueError("Cannot write a BVH file without any frames")

        if frame_time is None:
            frame_time = 1.0 / self.fps

        # The first frame serves as the rest pose.
        offsets = self.calculate_offsets(positions[0])

        # Rotations are not estimated yet: every joint gets three zero channels.
        # The list is identical for all frames, so it is built once.
        zero_rotations = [0.0] * (3 * len(self._get_joint_list("Hips")))

        with open(output_path, 'w') as f:
            f.write("HIERARCHY\n")
            f.write("ROOT Hips\n")
            f.write("{\n")
            f.write("\tOFFSET 0.00 0.00 0.00\n")
            f.write("\tCHANNELS 6 Xposition Yposition Zposition Xrotation Yrotation Zrotation\n")

            self._write_joint_hierarchy(f, "Hips", offsets, 1)

            f.write("}\n")

            f.write("MOTION\n")
            f.write(f"Frames: {len(positions)}\n")
            f.write(f"Frame Time: {frame_time:.6f}\n")

            for frame in positions:
                hips = frame["Hips"]
                line = [hips['x'], hips['y'], hips['z']] + zero_rotations
                f.write(" ".join(f"{val:.6f}" for val in line) + "\n")

        print(f"BVH file written to {output_path}")

    def _write_joint_hierarchy(self, file, joint_name, offsets, indent_level):
        """Recursively write the JOINT / End Site blocks below ``joint_name``.

        Args:
            file: Open text file to write to.
            joint_name: Parent whose children are written.
            offsets: Mapping from ``calculate_offsets``; missing entries fall
                back to ``(0, 0, 0)``.
            indent_level: Number of tab characters prefixed to each line.
        """
        indent = "\t" * indent_level

        for child in self.joint_hierarchy.get(joint_name, []):
            is_end_site = child.startswith("End_")
            offset = offsets.get(child, (0, 0, 0))

            file.write(f"{indent}End Site\n" if is_end_site else f"{indent}JOINT {child}\n")
            file.write(f"{indent}{{\n")
            file.write(f"{indent}\tOFFSET {offset[0]:.6f} {offset[1]:.6f} {offset[2]:.6f}\n")

            if not is_end_site:
                # Non-root joints carry rotation channels only.
                file.write(f"{indent}\tCHANNELS 3 Xrotation Yrotation Zrotation\n")
                self._write_joint_hierarchy(file, child, offsets, indent_level + 1)

            file.write(f"{indent}}}\n")

    def _get_joint_list(self, start_joint):
        """Return ``start_joint`` and its descendants depth-first, skipping End sites.

        The order matches the order in which ``_write_joint_hierarchy`` declares
        channels.
        """
        joints = [start_joint]

        for child in self.joint_hierarchy.get(start_joint, []):
            if not child.startswith("End_"):
                joints.extend(self._get_joint_list(child))

        return joints

    def convert_video_to_bvh(self, video_path, output_path):
        """Run the full video -> BVH pipeline.

        Args:
            video_path: Input video file.
            output_path: Destination BVH file.

        Returns:
            True when a file was written, False when no pose was detected.
        """
        print(f"Processing video: {video_path}")
        frames = self.process_video(video_path)

        if not frames:
            print("No pose data detected. Check your video.")
            return False

        print("Calculating joint positions...")
        positions = self.get_joint_positions(frames)

        print("Smoothing motion...")
        smoothed = self.smooth_positions(positions)

        print(f"Writing BVH file to: {output_path}")
        # Only every frame_step-th source frame was kept, so each BVH frame
        # spans frame_step source frames; otherwise playback would be too fast.
        self.write_bvh(smoothed, output_path, frame_time=self.frame_step / self.fps)

        return True

# def main():
#     parser = argparse.ArgumentParser(description='Convert video to BVH using MediaPipe.')
#     parser.add_argument('--input', type=str, required=True, help='Input video file')
#     parser.add_argument('--output', type=str, help='Output BVH file')
#     parser.add_argument('--fps', type=int, default=30, help='Frames per second for BVH')
#     parser.add_argument('--scale', type=float, default=100.0, help='Scale factor for the skeleton')
    
#     args = parser.parse_args()
    
#     if not args.output:
#         base_name = os.path.splitext(os.path.basename(args.input))[0]
#         args.output = f"{base_name}.bvh"
    
#     converter = MediaPipeToBVH(fps=args.fps, scale=args.scale)
#     converter.convert_video_to_bvh(args.input, args.output)

# if __name__ == "__main__":
#     main()

2025-02-26 13:22:56.977065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740604976.995605   32783 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740604977.001471   32783 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-26 13:22:57.022299: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Convert the sample clip: the input video and the output BVH share a base name.
filename = "fight1"
converter = MediaPipeToBVH(scale=100.0, fps=30)
# Last expression of the cell, so its boolean result is shown as the cell output.
converter.convert_video_to_bvh(filename + ".mp4", filename + ".bvh")

I0000 00:00:1740604980.513083   32783 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1740604980.515587   34100 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1740604980.606802   34079 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740604980.714121   34087 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Processing video: fight1.mp4


W0000 00:00:1740604980.828017   34087 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


Processed 10 frames...
Processed 20 frames...
Processed 30 frames...
Processed 40 frames...
Extracted pose data from 45 frames
Calculating joint positions...
Smoothing motion...
Writing BVH file to: fight1.bvh
BVH file written to fight1.bvh


True