In [1]:
import cv2
import numpy as np
import mediapipe as mp
import math
import argparse
import os
from typing import List, Dict, Tuple

class MediaPipeToBVH:
    def __init__(self, fps: int = 30, scale: float = 100.0):
        self.mp_pose = mp.solutions.pose
        self.mp_drawing = mp.solutions.drawing_utils
        self.fps = fps
        self.scale = scale
        
        # Define the BVH skeleton hierarchy
        self.joint_hierarchy = {
            "Hips": ["Spine", "LeftUpLeg", "RightUpLeg"],
            "Spine": ["Spine1"],
            "Spine1": ["Neck", "LeftShoulder", "RightShoulder"],
            "Neck": ["Head"],
            "Head": ["End_Head"],
            "LeftShoulder": ["LeftArm"],
            "LeftArm": ["LeftForeArm"],
            "LeftForeArm": ["LeftHand"],
            "LeftHand": ["End_LeftHand"],
            "RightShoulder": ["RightArm"],
            "RightArm": ["RightForeArm"],
            "RightForeArm": ["RightHand"],
            "RightHand": ["End_RightHand"],
            "LeftUpLeg": ["LeftLeg"],
            "LeftLeg": ["LeftFoot"],
            "LeftFoot": ["End_LeftFoot"],
            "RightUpLeg": ["RightLeg"],
            "RightLeg": ["RightFoot"],
            "RightFoot": ["End_RightFoot"]
        }
        
        # MediaPipe landmark indices mapping to BVH joints
        self.landmark_to_joint = {
            # Main body
            "Hips": {"indices": [23, 24], "weights": [0.5, 0.5]},
            "Spine": {"indices": [23, 24, 11, 12], "weights": [0.25, 0.25, 0.25, 0.25]},
            "Spine1": {"indices": [11, 12], "weights": [0.5, 0.5]},
            "Neck": {"indices": [11, 12], "weights": [0.5, 0.5]},
            "Head": {"indices": [0], "weights": [1.0]},
            
            # Left arm
            "LeftShoulder": {"indices": [11], "weights": [1.0]},
            "LeftArm": {"indices": [13], "weights": [1.0]},
            "LeftForeArm": {"indices": [15], "weights": [1.0]},
            "LeftHand": {"indices": [15, 17, 19], "weights": [0.4, 0.3, 0.3]},
            
            # Right arm
            "RightShoulder": {"indices": [12], "weights": [1.0]},
            "RightArm": {"indices": [14], "weights": [1.0]},
            "RightForeArm": {"indices": [16], "weights": [1.0]},
            "RightHand": {"indices": [16, 18, 20], "weights": [0.4, 0.3, 0.3]},
            
            # Left leg - Properly weighted to match visual
            "LeftUpLeg": {"indices": [23], "weights": [1.0]},
            "LeftLeg": {"indices": [25], "weights": [1.0]},
            "LeftFoot": {"indices": [27, 31], "weights": [0.8, 0.2]},
            
            # Right leg - Properly weighted to match visual
            "RightUpLeg": {"indices": [24], "weights": [1.0]},
            "RightLeg": {"indices": [26], "weights": [1.0]},
            "RightFoot": {"indices": [28, 32], "weights": [0.8, 0.2]}
        }
        
        # End site offsets (fixed lengths for end joints)
        self.end_site_offsets = {
            "End_Head": (0, 15, 0),         # Up from head
            "End_LeftHand": (-5, 0, 0),     # Left from hand
            "End_RightHand": (5, 0, 0),     # Right from hand
            "End_LeftFoot": (0, -5, 5),     # Down and forward from foot
            "End_RightFoot": (0, -5, 5)     # Down and forward from foot
        }
        
        # Initialize pose tracking
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        
        # Default orientation is facing forward (-Z)
        self.model_forward = -1  # -1 means model faces -Z, 1 means model faces +Z

    def process_video(self, video_path, visualize=False, max_frames=None,
                      visualization_path='pose_tracking.mp4'):
        """Run MediaPipe Pose over a video and collect world landmarks per frame.

        Args:
            video_path: Path of the video to read with OpenCV.
            visualize: When true, also write a copy of the video with the
                detected pose drawn on it.
            max_frames: Optional cap on the number of frames to read.
            visualization_path: Where the annotated video is written when
                ``visualize`` is true.

        Returns:
            A list with one entry per usable frame. Each entry maps a landmark
            index to a dict with ``x``, ``y``, ``z`` and ``visibility``.
            Frames without a detection reuse the last detected landmarks;
            frames before the first detection are skipped.

        Raises:
            ValueError: If the video cannot be opened or no pose is ever found.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file {video_path}")

        out = None
        frames = []
        try:
            # The frame count is container metadata and may be missing (<= 0);
            # in that case read until the stream ends (or max_frames is hit).
            reported_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if reported_frames > 0:
                total_frames = reported_frames
                if max_frames and max_frames < total_frames:
                    total_frames = max_frames
            else:
                total_frames = max_frames if max_frames else None

            fps = cap.get(cv2.CAP_PROP_FPS)
            if total_frames is not None:
                print(f"Video has {total_frames} frames at {fps} FPS")
            else:
                print(f"Video has an unknown number of frames at {fps} FPS")

            # The BVH frame time comes from self.fps, not from the video.
            if fps > 0 and abs(fps - self.fps) > 0.01:
                print(f"Warning: video is {fps} FPS but BVH output uses {self.fps} FPS; "
                      "playback speed will differ from the source")

            # Optional visualization video
            if visualize:
                width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                writer_fps = fps if fps > 0 else self.fps
                out = cv2.VideoWriter(visualization_path, fourcc, writer_fps, (width, height))

            progress_step = max(1, total_frames // 10) if total_frames else None
            frame_count = 0
            last_good_landmarks = None

            while total_frames is None or frame_count < total_frames:
                success, image = cap.read()
                if not success:
                    break

                frame_count += 1
                # Progress indicator (only when the total is known)
                if progress_step and frame_count % progress_step == 0:
                    print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")

                # Convert to RGB for MediaPipe
                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                results = self.pose.process(image_rgb)

                if out is not None:
                    if results.pose_landmarks:
                        annotated_image = image.copy()
                        self.mp_drawing.draw_landmarks(
                            annotated_image,
                            results.pose_landmarks,
                            self.mp_pose.POSE_CONNECTIONS
                        )
                        out.write(annotated_image)
                    else:
                        # Keep the visualization in sync with the source even
                        # when nothing was detected in this frame.
                        out.write(image)

                if results.pose_world_landmarks:
                    landmarks_dict = {}
                    for i, landmark in enumerate(results.pose_world_landmarks.landmark):
                        landmarks_dict[i] = {
                            'x': landmark.x,
                            'y': landmark.y,
                            'z': landmark.z,
                            'visibility': landmark.visibility
                        }
                    frames.append(landmarks_dict)
                    last_good_landmarks = landmarks_dict
                elif last_good_landmarks:
                    # If no pose detected, hold the last good landmarks
                    frames.append(last_good_landmarks.copy())
                else:
                    print(f"Warning: No pose detected in frame {frame_count} and no previous landmarks available")
        finally:
            # Release native handles even if decoding or inference raised.
            cap.release()
            if out is not None:
                out.release()

        if len(frames) == 0:
            raise ValueError("No pose data detected in the video")

        print(f"Extracted pose data from {len(frames)} frames")
        return frames

    def calculate_joint_positions(self, frames, flip_forward=True):
        """Calculate BVH joint positions from MediaPipe landmarks"""
        joint_positions = []
        
        for frame_idx, landmarks in enumerate(frames):
            frame_positions = {}
            
            # Calculate position for each joint
            for joint_name, mapping in self.landmark_to_joint.items():
                indices = mapping["indices"]
                weights = mapping["weights"]
                
                # Calculate weighted average position
                x = sum(landmarks[idx]['x'] * weights[i] for i, idx in enumerate(indices))
                y = sum(landmarks[idx]['y'] * weights[i] for i, idx in enumerate(indices))
                z = sum(landmarks[idx]['z'] * weights[i] for i, idx in enumerate(indices))
                
                # Convert MediaPipe coordinates to standard BVH coordinates:
                # MediaPipe: Y is up, X is right, Z is toward camera
                # Standard BVH: Y is up, X is right, Z is forward (away from camera)
                
                # For 180° rotation, we negate both X and Z
                if flip_forward:
                    frame_positions[joint_name] = {
                        'x': -x * self.scale,       # Flip X to rotate 180°
                        'y': y * self.scale,        # Y stays the same
                        'z': z * self.scale         # Z is toward camera (not flipped to rotate 180°)
                    }
                else:
                    frame_positions[joint_name] = {
                        'x': x * self.scale,        # X stays the same
                        'y': y * self.scale,        # Y stays the same
                        'z': -z * self.scale        # Z is flipped to make forward away from camera
                    }
            
            joint_positions.append(frame_positions)
        
        return joint_positions

    def detect_facing_direction(self, positions):
        """Vote on whether the model should be flipped, using the opening frames.

        For up to the first 30 frames that contain both "Hips" and "Spine1",
        the sign of ``Spine1.z - Hips.z`` casts one vote. Returns True when
        positive votes outnumber the rest. Sequences shorter than 10 frames
        are considered too short to judge and return False (no flip).
        """
        if len(positions) < 10:
            return False

        votes_for_flip = 0
        votes_against = 0

        for sample in positions[:30]:
            if "Hips" not in sample or "Spine1" not in sample:
                continue

            # Depth of the shoulder midpoint relative to the hip midpoint
            depth_gap = sample["Spine1"]["z"] - sample["Hips"]["z"]
            if depth_gap > 0:
                votes_for_flip += 1
            else:
                votes_against += 1

        return votes_for_flip > votes_against

    def smooth_positions(self, positions, window_size=3):
        """Reduce jitter with a centred moving average over neighbouring frames.

        Args:
            positions: List of ``{joint_name: {'x', 'y', 'z'}}`` dicts.
            window_size: Nominal width of the averaging window in frames. The
                window covers ``window_size // 2`` frames on each side of the
                current one, so values of 1 or less leave the data unsmoothed.
                Windows are truncated at the start and end of the sequence.

        Returns:
            A new list of smoothed frames (or ``positions`` itself when it has
            at most one frame). Joints are taken from the first frame; a joint
            that is missing from every frame of a window is omitted from that
            output frame.
        """
        if len(positions) <= 1:
            return positions

        # No forced minimum: window_size=1 must mean "no smoothing".
        half_window = max(0, window_size // 2)
        joint_names = positions[0].keys()
        smoothed = []

        for i in range(len(positions)):
            start = max(0, i - half_window)
            end = min(len(positions), i + half_window + 1)
            window = positions[start:end]

            smooth_frame = {}
            for joint in joint_names:
                samples = [frame[joint] for frame in window if joint in frame]
                if not samples:
                    # Nothing to average and nothing to copy for this joint.
                    continue

                count = len(samples)
                smooth_frame[joint] = {
                    'x': sum(sample['x'] for sample in samples) / count,
                    'y': sum(sample['y'] for sample in samples) / count,
                    'z': sum(sample['z'] for sample in samples) / count
                }

            smoothed.append(smooth_frame)

        return smoothed

    def calculate_joint_offsets(self, reference_frame):
        """Calculate bone offsets from reference frame"""
        offsets = {}
        
        for joint_name, children in self.joint_hierarchy.items():
            if joint_name not in reference_frame:
                continue
                
            parent_pos = reference_frame[joint_name]
            
            for child in children:
                if child.startswith("End_"):
                    # Use predefined end site offset
                    offsets[child] = self.end_site_offsets[child]
                elif child in reference_frame:
                    child_pos = reference_frame[child]
                    offsets[child] = (
                        child_pos['x'] - parent_pos['x'],
                        child_pos['y'] - parent_pos['y'],
                        child_pos['z'] - parent_pos['z']
                    )
        
        return offsets

    def find_tpose_frame(self, positions):
        """Return the index of the most T-pose-like frame among the first 30.

        A frame's score is the horizontal reach of both arms
        (``|dx| - |dy|`` from shoulder to hand, 0 if the hand is missing)
        minus the height difference between the shoulders. Frames lacking
        either shoulder are ignored. Ties keep the earliest frame, and 0 is
        returned when no frame can be scored.
        """
        def horizontal_reach(frame, hand_name, shoulder_name):
            # Larger means the arm points more sideways than up/down
            if hand_name not in frame:
                return 0
            hand, shoulder = frame[hand_name], frame[shoulder_name]
            return abs(hand["x"] - shoulder["x"]) - abs(hand["y"] - shoulder["y"])

        winner = 0
        top_score = -float('inf')

        # Early frames often contain an initialization pose
        for idx, frame in enumerate(positions[:30]):
            if "LeftShoulder" not in frame or "RightShoulder" not in frame:
                continue

            left_reach = horizontal_reach(frame, "LeftHand", "LeftShoulder")
            right_reach = horizontal_reach(frame, "RightHand", "RightShoulder")
            levelness = -abs(frame["LeftShoulder"]["y"] - frame["RightShoulder"]["y"])

            score = left_reach + right_reach + levelness
            if score > top_score:
                top_score = score
                winner = idx

        return winner

    def generate_direct_animation(self, positions, reference_frame, offsets):
        """
        Generate animation data using direct position mode.
        This mode prioritizes visual matching over anatomical correctness.
        """
        animation_data = []
        
        # Get the list of all joints in hierarchical order
        all_joints = self._get_joint_list("Hips")
        
        for frame_idx, frame_positions in enumerate(positions):
            frame_data = []
            
            # Add root position
            root_pos = frame_positions["Hips"]
            frame_data.extend([root_pos['x'], root_pos['y'], root_pos['z']])
            
            # Add a global Y rotation to the hips (facing +Z direction)
            # 180° = pi radians
            frame_data.extend([0.0, 180.0, 0.0])
            
            # For each joint (except Hips which we've already handled), calculate rotations
            for joint_name in all_joints:
                if joint_name == "Hips":
                    continue  # Root joint already handled above
                    
                x_rot = y_rot = z_rot = 0.0
                
                # Find parent of this joint
                parent_name = self._get_parent(joint_name)
                
                if parent_name and parent_name in frame_positions and joint_name in frame_positions:
                    # Get positions of this joint and its parent
                    joint_pos = frame_positions[joint_name]
                    parent_pos = frame_positions[parent_name]
                    
                    # Calculate direction vector from parent to joint
                    dx = joint_pos['x'] - parent_pos['x']
                    dy = joint_pos['y'] - parent_pos['y']
                    dz = joint_pos['z'] - parent_pos['z']
                    
                    # Set rotations to position the joint correctly
                    if abs(dx) > 0.001 or abs(dy) > 0.001 or abs(dz) > 0.001:
                        # X rotation (pitch) - rotation around X axis affects Y and Z
                        x_rot = math.degrees(math.atan2(dz, dy)) if (abs(dy) > 0.001 or abs(dz) > 0.001) else 0
                        
                        # Y rotation (yaw) - rotation around Y axis affects X and Z
                        y_rot = math.degrees(math.atan2(dx, dz)) if (abs(dx) > 0.001 or abs(dz) > 0.001) else 0
                        
                        # Z rotation (roll) - rotation around Z axis affects X and Y
                        z_rot = math.degrees(math.atan2(dx, dy)) if (abs(dx) > 0.001 or abs(dy) > 0.001) else 0
                
                # Add this joint's rotations
                frame_data.extend([x_rot, y_rot, z_rot])
            
            animation_data.append(frame_data)
        
        return animation_data

    def _get_parent(self, joint_name):
        """Find the parent of a joint in the hierarchy"""
        for parent, children in self.joint_hierarchy.items():
            if joint_name in children:
                return parent
        return None  # No parent found

    def write_direct_bvh_file(self, positions, output_path):
        """Write BVH file using direct position approach for maximum visual accuracy"""
        if len(positions) < 2:
            raise ValueError("Need at least 2 frames to create animation")
            
        # Find the best reference frame for skeleton
        ref_idx = self.find_tpose_frame(positions)
        reference_frame = positions[ref_idx]
        print(f"Using frame {ref_idx} as reference for skeleton")
        
        # Calculate joint offsets from reference frame
        offsets = self.calculate_joint_offsets(reference_frame)
        
        # Generate animation data using direct position approach
        animation_data = self.generate_direct_animation(positions, reference_frame, offsets)
        
        with open(output_path, 'w') as f:
            # Write HIERARCHY section
            f.write("HIERARCHY\n")
            f.write("ROOT Hips\n")
            f.write("{\n")
            f.write("\tOFFSET 0.00 0.00 0.00\n")
            f.write("\tCHANNELS 6 Xposition Yposition Zposition Xrotation Yrotation Zrotation\n")
            
            # Write joint hierarchy
            self._write_joint_hierarchy(f, "Hips", offsets, 1)
            
            # End HIERARCHY section
            f.write("}\n")
            
            # Write MOTION section
            f.write("MOTION\n")
            f.write(f"Frames: {len(positions)}\n")
            f.write(f"Frame Time: {1.0/self.fps:.6f}\n")
            
            # Write frame data
            for frame_data in animation_data:
                f.write(" ".join(f"{val:.6f}" for val in frame_data) + "\n")
        
        print(f"BVH file written to {output_path}")

    def _write_joint_hierarchy(self, file, joint_name, offsets, indent_level):
        """Write joint hierarchy to BVH file"""
        indent = "\t" * indent_level
        
        for child in self.joint_hierarchy.get(joint_name, []):
            if child.startswith("End_"):
                # Write end site
                file.write(f"{indent}End Site\n")
                file.write(f"{indent}{{\n")
                
                offset = offsets.get(child, (0, 0, 0))
                file.write(f"{indent}\tOFFSET {offset[0]:.6f} {offset[1]:.6f} {offset[2]:.6f}\n")
                
                file.write(f"{indent}}}\n")
            else:
                # Write child joint
                file.write(f"{indent}JOINT {child}\n")
                file.write(f"{indent}{{\n")
                
                offset = offsets.get(child, (0, 0, 0))
                file.write(f"{indent}\tOFFSET {offset[0]:.6f} {offset[1]:.6f} {offset[2]:.6f}\n")
                
                # All non-root joints have rotation only
                file.write(f"{indent}\tCHANNELS 3 Xrotation Yrotation Zrotation\n")
                
                # Write child's children
                self._write_joint_hierarchy(file, child, offsets, indent_level + 1)
                
                file.write(f"{indent}}}\n")

    def _get_joint_list(self, start_joint):
        """Get a flat list of all joints in hierarchy order"""
        joints = [start_joint]
        
        for child in self.joint_hierarchy.get(start_joint, []):
            if not child.startswith("End_"):
                joints.extend(self._get_joint_list(child))
        
        return joints

    def convert_video_to_bvh(self, video_path, output_path, visualize=False, max_frames=None):
        """Convert video to BVH file"""
        print(f"Processing video: {video_path}")
        
        # Extract pose data from video
        frames = self.process_video(video_path, visualize, max_frames)
        
        # Temporary position calculation to determine facing direction
        temp_positions = self.calculate_joint_positions(frames, flip_forward=False)
        should_flip_forward = self.detect_facing_direction(temp_positions)
        
        if should_flip_forward:
            print("Detected person facing camera - will flip model 180 degrees for correct orientation")
        else:
            print("Detected person facing away from camera - standard orientation will be used")
        
        # Final position calculation with correct orientation
        positions = self.calculate_joint_positions(frames, flip_forward=should_flip_forward)
        
        # Apply minimal smoothing to preserve most motion details
        print("Applying minimal smoothing...")
        smoothed_positions = self.smooth_positions(positions, window_size=3)
        
        # Write BVH file using direct position approach for better visual matching
        print(f"Writing BVH file to: {output_path}")
        self.write_direct_bvh_file(smoothed_positions, output_path)
        
        print("Conversion complete!")
        return True


# def main():
#     parser = argparse.ArgumentParser(description='Convert video to BVH using MediaPipe.')
#     parser.add_argument('--input', type=str, required=True, help='Input video file')
#     parser.add_argument('--output', type=str, help='Output BVH file')
#     parser.add_argument('--fps', type=int, default=30, help='Frames per second for BVH')
#     parser.add_argument('--scale', type=float, default=100.0, help='Scale factor for the skeleton')
#     parser.add_argument('--visualize', action='store_true', help='Save visualization of pose tracking')
#     parser.add_argument('--max-frames', type=int, help='Maximum number of frames to process')
#     parser.add_argument('--flip', action='store_true', help='Force 180 degree flip of the model')
    
#     args = parser.parse_args()
    
#     # If output path is not specified, use input filename with .bvh extension
#     if not args.output:
#         base_name = os.path.splitext(os.path.basename(args.input))[0]
#         args.output = f"{base_name}.bvh"
    
#     try:
#         converter = MediaPipeToBVH(fps=args.fps, scale=args.scale)
#         converter.convert_video_to_bvh(args.input, args.output, args.visualize, args.max_frames)
#         print(f"✓ Successfully created BVH file: {args.output}")
#     except Exception as e:
#         print(f"Error: {str(e)}")
#         return 1
    
#     return 0


# if __name__ == "__main__":
#     main()

2025-02-26 16:10:24.259670: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740615024.274419   19896 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740615024.279174   19896 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-26 16:10:24.294484: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
filename = "dance6"
video_path = f"{filename}.mp4"

# Use the clip's own frame rate for the BVH frame time so playback speed
# matches the source (a fixed 30 would play a 60 FPS clip at half speed).
probe = cv2.VideoCapture(video_path)
source_fps = probe.get(cv2.CAP_PROP_FPS) if probe.isOpened() else 0
probe.release()
bvh_fps = source_fps if source_fps > 0 else 30  # fall back when the FPS is unknown

converter = MediaPipeToBVH(fps=bvh_fps, scale=100.0)
converter.convert_video_to_bvh(video_path, f"{filename}.bvh", visualize=True, max_frames=None)

Processing video: dance6.mp4
Video has 481 frames at 60.0 FPS


I0000 00:00:1740615843.576481   19896 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1740615843.578963   22964 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1740615843.652414   22944 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740615843.734535   22943 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740615843.801130   22945 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


Processing frame 48/481 (10.0%)
Processing frame 96/481 (20.0%)
Processing frame 144/481 (29.9%)
Processing frame 192/481 (39.9%)
Processing frame 240/481 (49.9%)
Processing frame 288/481 (59.9%)
Processing frame 336/481 (69.9%)
Processing frame 384/481 (79.8%)
Processing frame 432/481 (89.8%)
Processing frame 480/481 (99.8%)
Extracted pose data from 481 frames
Detected person facing camera - will flip model 180 degrees for correct orientation
Applying minimal smoothing...
Writing BVH file to: dance6.bvh
Using frame 4 as reference for skeleton
BVH file written to dance6.bvh
Conversion complete!


True