In [16]:
import cv2
import numpy as np
import mediapipe as mp
import math
import argparse
import os
from typing import List, Dict, Tuple

class MediaPipeToBVH:
    """Convert a video into a BVH motion-capture file using MediaPipe Pose.

    The pipeline, driven by ``convert_video_to_bvh``, is:

    1. ``process_video`` - run MediaPipe Pose on every frame and collect
       the world landmarks.
    2. ``calculate_joint_positions`` - blend landmarks into BVH joint positions.
    3. ``smooth_positions`` - moving-average filter over time.
    4. ``calculate_joint_angles`` - simplified per-joint Euler angles.
    5. ``write_bvh_file`` - emit the HIERARCHY and MOTION sections.
    """

    def __init__(self, fps: int = 30, scale: float = 100.0):
        """Set up the skeleton description and the MediaPipe pose tracker.

        Args:
            fps: Frame rate used for the BVH ``Frame Time`` field (and as a
                fallback frame rate for the visualization video).
            scale: Factor applied to every landmark coordinate when joint
                positions are computed.
        """
        self.mp_pose = mp.solutions.pose
        self.mp_drawing = mp.solutions.drawing_utils
        self.fps = fps
        self.scale = scale

        # Define the BVH skeleton hierarchy (parent -> ordered children).
        # Children prefixed with "End_" are written as BVH "End Site" entries.
        self.joint_hierarchy = {
            "Hips": ["Spine", "LeftUpLeg", "RightUpLeg"],
            "Spine": ["Spine1"],
            "Spine1": ["Neck", "LeftShoulder", "RightShoulder"],
            "Neck": ["Head"],
            "Head": ["End_Head"],
            "LeftShoulder": ["LeftArm"],
            "LeftArm": ["LeftForeArm"],
            "LeftForeArm": ["LeftHand"],
            "LeftHand": ["End_LeftHand"],
            "RightShoulder": ["RightArm"],
            "RightArm": ["RightForeArm"],
            "RightForeArm": ["RightHand"],
            "RightHand": ["End_RightHand"],
            "LeftUpLeg": ["LeftLeg"],
            "LeftLeg": ["LeftFoot"],
            "LeftFoot": ["End_LeftFoot"],
            "RightUpLeg": ["RightLeg"],
            "RightLeg": ["RightFoot"],
            "RightFoot": ["End_RightFoot"]
        }

        # MediaPipe landmark indices mapping to BVH joints. Each joint position
        # is the weighted sum of the listed landmarks.
        self.landmark_to_joint = {
            "Hips": {"indices": [23, 24], "weights": [0.5, 0.5]},
            "Spine": {"indices": [23, 24, 11, 12], "weights": [0.25, 0.25, 0.25, 0.25]},
            "Spine1": {"indices": [11, 12], "weights": [0.5, 0.5]},
            "Neck": {"indices": [11, 12], "weights": [0.5, 0.5]},
            "Head": {"indices": [0], "weights": [1.0]},
            "LeftShoulder": {"indices": [11], "weights": [1.0]},
            "LeftArm": {"indices": [13], "weights": [1.0]},
            "LeftForeArm": {"indices": [15], "weights": [1.0]},
            "LeftHand": {"indices": [15, 17, 19], "weights": [0.2, 0.4, 0.4]},
            "RightShoulder": {"indices": [12], "weights": [1.0]},
            "RightArm": {"indices": [14], "weights": [1.0]},
            "RightForeArm": {"indices": [16], "weights": [1.0]},
            "RightHand": {"indices": [16, 18, 20], "weights": [0.2, 0.4, 0.4]},
            "LeftUpLeg": {"indices": [23], "weights": [1.0]},
            "LeftLeg": {"indices": [25], "weights": [1.0]},
            "LeftFoot": {"indices": [27, 31], "weights": [0.7, 0.3]},
            "RightUpLeg": {"indices": [24], "weights": [1.0]},
            "RightLeg": {"indices": [26], "weights": [1.0]},
            "RightFoot": {"indices": [28, 32], "weights": [0.7, 0.3]}
        }

        # End site offsets (fixed lengths for end joints)
        self.end_site_offsets = {
            "End_Head": (0, 15, 0),
            "End_LeftHand": (0, -10, 0),
            "End_RightHand": (0, -10, 0),
            "End_LeftFoot": (0, -3, 12),
            "End_RightFoot": (0, -3, 12)
        }

        # Initialize pose tracking
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            smooth_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )

        # Store joint angles between frames (not read by any method in this class)
        self.joint_angles = {}

    def process_video(self, video_path, visualize=False, visualization_path='pose_tracking.mp4'):
        """Run pose tracking over a video and collect per-frame world landmarks.

        Frames without a detection reuse the most recent successful detection.
        Frames that precede the first detection are skipped with a warning, so
        the returned list can be shorter than the video.

        Args:
            video_path: Path of the video file to read.
            visualize: When true, also write a copy of the video with the
                detected pose drawn on top.
            visualization_path: Output path of that annotated video.

        Returns:
            A list with one dict per frame, mapping landmark index to a dict
            with keys ``'x'``, ``'y'``, ``'z'`` and ``'visibility'``.

        Raises:
            ValueError: If the video cannot be opened or no pose is detected
                in any frame.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Could not open video file {video_path}")

        out = None
        frames = []

        # try/finally guarantees the capture and writer handles are released
        # even if decoding or pose estimation raises part-way through.
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            print(f"Video has {total_frames} frames at {fps} FPS")

            if fps > 0 and abs(fps - self.fps) > 0.01:
                print(f"Note: video runs at {fps} FPS but the BVH will be written at {self.fps} FPS")

            # Output video writer (if visualization is enabled)
            if visualize:
                width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
                # The reported FPS can be 0; fall back to the configured rate.
                writer_fps = fps if fps > 0 else self.fps
                out = cv2.VideoWriter(visualization_path, fourcc, writer_fps, (width, height))

            # Report roughly every 10% when the length is known; the reported
            # frame count can be 0 or negative, in which case report every
            # 100 frames and omit the percentage.
            frame_total_known = total_frames > 0
            progress_step = max(1, total_frames // 10) if frame_total_known else 100

            frame_count = 0
            last_good_landmarks = None

            while True:
                success, image = cap.read()
                if not success:
                    break

                frame_count += 1
                if frame_count % progress_step == 0:
                    if frame_total_known:
                        print(f"Processing frame {frame_count}/{total_frames} ({frame_count/total_frames*100:.1f}%)")
                    else:
                        print(f"Processing frame {frame_count}")

                # Convert to RGB and process with MediaPipe
                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                results = self.pose.process(image_rgb)

                if out is not None:
                    # Write every frame so the visualization stays in sync
                    # with the input; draw the pose only when one was found.
                    annotated_image = image
                    if results.pose_landmarks:
                        annotated_image = image.copy()
                        self.mp_drawing.draw_landmarks(
                            annotated_image,
                            results.pose_landmarks,
                            self.mp_pose.POSE_CONNECTIONS
                        )
                    out.write(annotated_image)

                if results.pose_world_landmarks:
                    # Store landmarks in dictionary format
                    landmarks_dict = {
                        i: {
                            'x': landmark.x,
                            'y': landmark.y,
                            'z': landmark.z,
                            'visibility': landmark.visibility
                        }
                        for i, landmark in enumerate(results.pose_world_landmarks.landmark)
                    }
                    frames.append(landmarks_dict)
                    last_good_landmarks = landmarks_dict
                elif last_good_landmarks:
                    # If no pose detected, use the last good landmarks
                    frames.append(last_good_landmarks.copy())
                else:
                    print(f"Warning: No pose detected in frame {frame_count} and no previous landmarks available")
        finally:
            if out is not None:
                out.release()
            cap.release()

        if len(frames) == 0:
            raise ValueError("No pose data detected in the video")

        print(f"Extracted pose data from {len(frames)} frames")
        return frames

    def calculate_joint_positions(self, frames):
        """Calculate BVH joint positions from MediaPipe landmarks.

        Args:
            frames: Per-frame landmark dicts as returned by ``process_video``.

        Returns:
            A list with one dict per frame, mapping joint name to a dict with
            keys ``'x'``, ``'y'`` and ``'z'`` (scaled by ``self.scale``).
        """
        joint_positions = []

        for landmarks in frames:
            frame_positions = {}

            for joint_name, mapping in self.landmark_to_joint.items():
                pairs = list(zip(mapping["indices"], mapping["weights"]))

                # Weighted average of the contributing landmark positions
                x = sum(landmarks[idx]['x'] * weight for idx, weight in pairs)
                y = sum(landmarks[idx]['y'] * weight for idx, weight in pairs)
                z = sum(landmarks[idx]['z'] * weight for idx, weight in pairs)

                # Scale and convert coordinates (flip Z for BVH convention).
                # NOTE(review): only Z is negated. If MediaPipe world landmarks
                # use an image-style Y-down axis, the skeleton would come out
                # upside down in a Y-up BVH viewer - confirm whether Y needs
                # to be negated as well.
                frame_positions[joint_name] = {
                    'x': x * self.scale,
                    'y': y * self.scale,
                    'z': -z * self.scale  # Flip Z axis
                }

            joint_positions.append(frame_positions)

        return joint_positions

    def smooth_positions(self, positions, window_size=5):
        """Apply a centered moving average over time to reduce jitter.

        The window covers ``window_size // 2`` frames on each side of the
        current frame and is truncated at the start and end of the sequence.
        A ``window_size`` of 1 or less leaves the values unchanged.

        Args:
            positions: Per-frame joint position dicts.
            window_size: Nominal width of the averaging window, in frames.

        Returns:
            A new list of per-frame joint position dicts. Joints are taken
            from the first frame; a joint with no sample anywhere in a
            frame's window is omitted from that frame. Sequences of length
            0 or 1 are returned as-is.
        """
        if len(positions) <= 1:
            return positions

        # No lower bound of 1 here: a window of 1 must mean "no smoothing".
        half_window = max(0, window_size // 2)
        joint_names = list(positions[0].keys())
        total = len(positions)
        smoothed = []

        for i in range(total):
            start = max(0, i - half_window)
            end = min(total, i + half_window + 1)
            smooth_frame = {}

            for joint in joint_names:
                samples = [positions[j][joint] for j in range(start, end) if joint in positions[j]]
                if not samples:
                    # Nothing to average and nothing to fall back on.
                    continue

                count = len(samples)
                smooth_frame[joint] = {
                    'x': sum(sample['x'] for sample in samples) / count,
                    'y': sum(sample['y'] for sample in samples) / count,
                    'z': sum(sample['z'] for sample in samples) / count
                }

            smoothed.append(smooth_frame)

        return smoothed

    def calculate_joint_offsets(self, reference_frame):
        """Calculate bone offsets (child minus parent) from a reference frame.

        Args:
            reference_frame: Joint position dict of the frame that defines
                the rest skeleton.

        Returns:
            A dict mapping child joint name to an ``(x, y, z)`` offset tuple.
            End sites use the fixed values in ``self.end_site_offsets``.
            Parents or children missing from the frame are skipped.
        """
        offsets = {}

        for joint_name, children in self.joint_hierarchy.items():
            if joint_name not in reference_frame:
                continue

            parent_pos = reference_frame[joint_name]

            for child in children:
                if child.startswith("End_"):
                    # Use predefined end site offset
                    offsets[child] = self.end_site_offsets[child]
                elif child in reference_frame:
                    child_pos = reference_frame[child]
                    offsets[child] = (
                        child_pos['x'] - parent_pos['x'],
                        child_pos['y'] - parent_pos['y'],
                        child_pos['z'] - parent_pos['z']
                    )

        return offsets

    def calculate_joint_angles(self, positions):
        """Calculate simplified Euler angles for every joint in every frame.

        Each joint's angles come from the direction to its first child that is
        present, is not an end site, and is not (nearly) coincident with the
        joint. Joints without such a child keep ``[0.0, 0.0, 0.0]``.

        Note: the three angles are independent ``atan2`` projections of the
        bone direction, not a decomposition of a single rotation, so the
        result is only an approximation.

        Args:
            positions: Per-frame joint position dicts.

        Returns:
            A list with one dict per frame, mapping joint name to
            ``[x_angle, y_angle, z_angle]`` in degrees.
        """
        angles = []

        for frame_positions in positions:
            frame_angles = {}

            for joint_name, children in self.joint_hierarchy.items():
                if joint_name not in frame_positions:
                    continue

                # Default: no rotation (X, Y, Z)
                frame_angles[joint_name] = [0.0, 0.0, 0.0]
                parent_pos = frame_positions[joint_name]

                for child in children:
                    if child.startswith("End_") or child not in frame_positions:
                        continue

                    # Direction vector from parent to child
                    child_pos = frame_positions[child]
                    dx = child_pos['x'] - parent_pos['x']
                    dy = child_pos['y'] - parent_pos['y']
                    dz = child_pos['z'] - parent_pos['z']

                    # Skip if direction is too small (would cause instability)
                    if abs(dx) < 0.001 and abs(dy) < 0.001 and abs(dz) < 0.001:
                        continue

                    # X rotation (around X-axis, affects Y and Z)
                    x_angle = math.degrees(math.atan2(dz, dy))
                    # Y rotation (around Y-axis, affects X and Z)
                    y_angle = math.degrees(math.atan2(dx, dz))
                    # Z rotation (around Z-axis, affects X and Y)
                    z_angle = math.degrees(math.atan2(dx, dy))

                    # Add specific offset based on joint type
                    if "Shoulder" in joint_name:
                        # Shoulders point out to sides
                        x_angle += 90 if "Left" in joint_name else -90
                    elif "UpLeg" in joint_name:
                        # Legs point down
                        y_angle += 180

                    frame_angles[joint_name] = [x_angle, y_angle, z_angle]
                    break

            angles.append(frame_angles)

        return angles

    def find_tpose_frame(self, positions):
        """Pick the frame among the first 30 that looks most like a T-pose.

        The score rewards horizontal arms (large |dx|, small |dy| between
        hand and shoulder), outward arm extension, and level shoulders.
        Frames missing either shoulder are not scored.

        Args:
            positions: Per-frame joint position dicts.

        Returns:
            Index of the best-scoring frame, or 0 if no frame could be scored.
        """
        best_idx = 0
        best_score = -float('inf')

        # Check first few frames, they often contain initialization pose
        check_frames = min(30, len(positions))

        for i in range(check_frames):
            frame = positions[i]
            if "LeftShoulder" not in frame or "RightShoulder" not in frame:
                continue

            left_shoulder = frame["LeftShoulder"]
            right_shoulder = frame["RightShoulder"]

            left_arm_hor = 0
            right_arm_hor = 0
            arm_extension = 0

            if "LeftHand" in frame:
                dx = frame["LeftHand"]["x"] - left_shoulder["x"]
                dy = frame["LeftHand"]["y"] - left_shoulder["y"]
                left_arm_hor = abs(dx) - abs(dy)  # Higher is better (more horizontal)
                arm_extension += abs(dx)

            if "RightHand" in frame:
                dx = frame["RightHand"]["x"] - right_shoulder["x"]
                dy = frame["RightHand"]["y"] - right_shoulder["y"]
                right_arm_hor = abs(dx) - abs(dy)  # Higher is better (more horizontal)
                arm_extension += abs(dx)

            # Level shoulders give a penalty close to zero
            shoulder_level = -abs(left_shoulder["y"] - right_shoulder["y"])

            score = left_arm_hor + right_arm_hor + shoulder_level + arm_extension

            if score > best_score:
                best_score = score
                best_idx = i

        return best_idx

    def write_bvh_file(self, positions, angles, output_path):
        """Write a BVH file from joint positions and angles.

        The skeleton offsets are taken from the frame chosen by
        ``find_tpose_frame``. Each motion line holds the Hips position
        followed by X/Y/Z rotations for every joint in hierarchy order;
        joints without angle data get zeros.

        Args:
            positions: Per-frame joint position dicts; each must contain
                ``"Hips"``.
            angles: Per-frame joint angle dicts, one per entry of
                ``positions``.
            output_path: Path of the BVH file to create.

        Raises:
            ValueError: If fewer than 2 frames are given, ``angles`` has
                fewer entries than ``positions``, or ``self.fps`` is not
                positive.
        """
        if len(positions) < 2:
            raise ValueError("Need at least 2 frames to create animation")
        if len(angles) < len(positions):
            raise ValueError("Need one set of joint angles per frame of positions")
        if self.fps <= 0:
            # Checked before opening the file so no partial output is left behind.
            raise ValueError("fps must be positive to compute the BVH frame time")

        # Find the best reference frame for skeleton
        ref_idx = self.find_tpose_frame(positions)
        reference_frame = positions[ref_idx]
        print(f"Using frame {ref_idx} as reference for skeleton")

        # Calculate joint offsets from reference frame
        offsets = self.calculate_joint_offsets(reference_frame)

        # The channel order is the same for every frame, so compute it once.
        joint_order = self._get_joint_list("Hips")

        with open(output_path, 'w') as f:
            # Write HIERARCHY section
            f.write("HIERARCHY\n")
            f.write("ROOT Hips\n")
            f.write("{\n")
            f.write("\tOFFSET 0.00 0.00 0.00\n")
            f.write("\tCHANNELS 6 Xposition Yposition Zposition Xrotation Yrotation Zrotation\n")

            # Write joint hierarchy
            self._write_joint_hierarchy(f, "Hips", offsets, 1)

            # End HIERARCHY section
            f.write("}\n")

            # Write MOTION section
            f.write("MOTION\n")
            f.write(f"Frames: {len(positions)}\n")
            f.write(f"Frame Time: {1.0/self.fps:.6f}\n")

            # Write frame data
            for frame_positions, frame_angles in zip(positions, angles):
                # Root position (Hips)
                hips = frame_positions["Hips"]
                line = [hips['x'], hips['y'], hips['z']]

                # Joint rotations in hierarchy order; zeros when missing
                for joint in joint_order:
                    line.extend(frame_angles.get(joint, [0.0, 0.0, 0.0]))

                f.write(" ".join(f"{val:.6f}" for val in line) + "\n")

        print(f"BVH file written to {output_path}")

    def _write_joint_hierarchy(self, file, joint_name, offsets, indent_level):
        """Recursively write the children of ``joint_name`` to a BVH file.

        Args:
            file: Writable text file object.
            joint_name: Joint whose children are written.
            offsets: Mapping of joint name to ``(x, y, z)`` offset; missing
                entries are written as zeros.
            indent_level: Number of tab characters to indent this level with.
        """
        indent = "\t" * indent_level

        for child in self.joint_hierarchy.get(joint_name, []):
            is_end_site = child.startswith("End_")
            offset = offsets.get(child, (0, 0, 0))

            header = "End Site" if is_end_site else f"JOINT {child}"
            file.write(f"{indent}{header}\n")
            file.write(f"{indent}{{\n")
            file.write(f"{indent}\tOFFSET {offset[0]:.6f} {offset[1]:.6f} {offset[2]:.6f}\n")

            if not is_end_site:
                # All non-root joints have rotation only
                file.write(f"{indent}\tCHANNELS 3 Xrotation Yrotation Zrotation\n")

                # Write child's children
                self._write_joint_hierarchy(file, child, offsets, indent_level + 1)

            file.write(f"{indent}}}\n")

    def _get_joint_list(self, start_joint):
        """Return the joints below ``start_joint`` in depth-first order.

        The order matches the one ``_write_joint_hierarchy`` writes, and end
        sites are excluded because they carry no channels.

        Args:
            start_joint: Name of the joint to start from (included first).

        Returns:
            A flat list of joint names.
        """
        joints = [start_joint]

        for child in self.joint_hierarchy.get(start_joint, []):
            if not child.startswith("End_"):
                joints.extend(self._get_joint_list(child))

        return joints

    def convert_video_to_bvh(self, video_path, output_path, visualize=True,
                             visualization_path='pose_tracking.mp4'):
        """Convert a video to a BVH file.

        Args:
            video_path: Path of the input video.
            output_path: Path of the BVH file to write.
            visualize: When true (the default here), also write the annotated
                tracking video.
            visualization_path: Output path of that annotated video.

        Returns:
            ``True`` once the BVH file has been written.

        Raises:
            ValueError: Propagated from ``process_video`` and
                ``write_bvh_file``.
        """
        print(f"Processing video: {video_path}")

        # Extract pose data from video
        frames = self.process_video(video_path, visualize, visualization_path)

        # Calculate joint positions for each frame
        print("Calculating joint positions...")
        positions = self.calculate_joint_positions(frames)

        # Apply smoothing to reduce jitter
        print("Smoothing motion...")
        smoothed_positions = self.smooth_positions(positions, window_size=7)

        # Calculate joint angles
        print("Calculating joint rotations...")
        angles = self.calculate_joint_angles(smoothed_positions)

        # Write BVH file
        print(f"Writing BVH file to: {output_path}")
        self.write_bvh_file(smoothed_positions, angles, output_path)

        print("Conversion complete!")
        return True


# def main():
#     parser = argparse.ArgumentParser(description='Convert video to BVH using MediaPipe.')
#     parser.add_argument('--input', type=str, required=True, help='Input video file')
#     parser.add_argument('--output', type=str, help='Output BVH file')
#     parser.add_argument('--fps', type=int, default=30, help='Frames per second for BVH')
#     parser.add_argument('--scale', type=float, default=100.0, help='Scale factor for the skeleton')
#     parser.add_argument('--visualize', action='store_true', help='Save visualization of pose tracking')
    
#     args = parser.parse_args()
    
#     # If output path is not specified, use input filename with .bvh extension
#     if not args.output:
#         base_name = os.path.splitext(os.path.basename(args.input))[0]
#         args.output = f"{base_name}.bvh"
    
#     try:
#         converter = MediaPipeToBVH(fps=args.fps, scale=args.scale)
#         converter.convert_video_to_bvh(args.input, args.output, args.visualize)
#         print(f"✓ Successfully created BVH file: {args.output}")
#     except Exception as e:
#         print(f"Error: {str(e)}")
#         return 1
    
#     return 0


# if __name__ == "__main__":
#     main()

In [17]:
# Convert <filename>.mp4 in the working directory into <filename>.bvh.
# convert_video_to_bvh is called with its default visualize=True.
filename = "fight1"
converter = MediaPipeToBVH(fps=30, scale=100.0)

source_video = filename + ".mp4"
target_bvh = filename + ".bvh"

# Last expression of the cell, so its return value is displayed.
converter.convert_video_to_bvh(source_video, target_bvh)

I0000 00:00:1740604874.842905   30947 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1740604874.843658   33897 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1740604874.957901   33887 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Processing video: fight1.mp4
Video has 91 frames at 30.0 FPS


W0000 00:00:1740604875.094148   33894 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Processing frame 9/91 (9.9%)
Processing frame 18/91 (19.8%)
Processing frame 27/91 (29.7%)
Processing frame 36/91 (39.6%)
Processing frame 45/91 (49.5%)
Processing frame 54/91 (59.3%)
Processing frame 63/91 (69.2%)
Processing frame 72/91 (79.1%)
Processing frame 81/91 (89.0%)
Processing frame 90/91 (98.9%)
Extracted pose data from 91 frames
Calculating joint positions...
Smoothing motion...
Calculating joint rotations...
Writing BVH file to: fight1.bvh
Using frame 29 as reference for skeleton
BVH file written to fight1.bvh
Conversion complete!


True