In [1]:
import cv2
import numpy as np
import mediapipe as mp
import argparse
import os
from datetime import datetime
import time

# BVH creation utilities
class Joint:
    """One node of the BVH skeleton tree.

    Attributes:
        name: Joint name as written to the BVH hierarchy.
        parent: Parent ``Joint``, or ``None`` when no parent was given.
        children: Child joints, in the order they were added.
        offset: Length-3 numpy array, initialised to zeros.
        channels: List of channel names, initially empty.
        motion: List of per-frame channel values, initially empty.
    """

    def __init__(self, name, parent=None):
        # Identity and upward link. The constructor does NOT register this
        # joint with its parent; callers use ``parent.add_child(...)`` for that.
        self.name, self.parent = name, parent

        # Per-instance containers (never shared between joints).
        self.children = list()
        self.channels = list()
        self.motion = list()

        # Offset starts at the origin until a caller assigns a real value.
        self.offset = np.zeros(3)

    def add_child(self, child):
        """Append ``child`` to this joint's list of children."""
        self.children += [child]
        
class BVHSkeleton:
    """Builds a simplified humanoid BVH skeleton from MediaPipe Pose landmarks.

    Typical use: ``create_hierarchy`` once, ``update_joint_offsets`` with the
    first detected pose, ``process_frame`` for every pose (the caller stores
    the returned values on each joint's ``motion`` list), then set ``frames``
    and call ``write_bvh``.

    Landmark indices follow the MediaPipe Pose model (0 nose, 11/12 left/right
    shoulder, 13/14 elbows, 15/16 wrists, 19/20 index fingers, 21/22 thumbs,
    23/24 hips, 25/26 knees, 27/28 ankles, 31/32 foot indices). MediaPipe has
    no hip-centre, spine or neck landmark, so those joints are located at the
    average of several landmarks (given as tuples of indices below).
    """

    # Multiplier applied to landmark coordinates for both the rest-pose
    # offsets and the root translation, so the two stay in the same units.
    POSITION_SCALE = 100.0

    # (joint, parent) pairs listed depth-first: a parent always precedes its
    # children, and the resulting dict order equals the hierarchy order.
    _HIERARCHY = (
        ("Hips", None),
        ("Spine", "Hips"),
        ("Neck", "Spine"),
        ("Head", "Neck"),
        ("LeftShoulder", "Spine"),
        ("LeftArm", "LeftShoulder"),
        ("LeftForeArm", "LeftArm"),
        ("LeftHand", "LeftForeArm"),
        ("RightShoulder", "Spine"),
        ("RightArm", "RightShoulder"),
        ("RightForeArm", "RightArm"),
        ("RightHand", "RightForeArm"),
        ("LeftUpLeg", "Hips"),
        ("LeftLeg", "LeftUpLeg"),
        ("LeftFoot", "LeftLeg"),
        ("RightUpLeg", "Hips"),
        ("RightLeg", "RightUpLeg"),
        ("RightFoot", "RightLeg"),
    )

    # Virtual landmarks: a tuple of indices means "average of these landmarks".
    _HIP_CENTER = (23, 24)
    _SHOULDER_CENTER = (11, 12)
    _TORSO_CENTER = (11, 12, 23, 24)

    # Where each joint sits in the pose.
    _JOINT_LANDMARKS = {
        "Hips": _HIP_CENTER,
        "Spine": _TORSO_CENTER,
        "Neck": _SHOULDER_CENTER,
        "Head": 0,             # nose
        "LeftShoulder": 11,    # left shoulder
        "LeftArm": 13,         # left elbow
        "LeftForeArm": 15,     # left wrist
        "LeftHand": 19,        # left index finger
        "RightShoulder": 12,   # right shoulder
        "RightArm": 14,        # right elbow
        "RightForeArm": 16,    # right wrist
        "RightHand": 20,       # right index finger
        "LeftUpLeg": 23,       # left hip
        "LeftLeg": 25,         # left knee
        "LeftFoot": 27,        # left ankle
        "RightUpLeg": 24,      # right hip
        "RightLeg": 26,        # right knee
        "RightFoot": 28,       # right ankle
    }

    # (from, to) landmark pair whose direction drives each joint's angles.
    # The root has no pair and therefore always gets a zero rotation.
    _ROTATION_LANDMARKS = {
        "Hips": (None, _HIP_CENTER),
        "Spine": (_HIP_CENTER, _TORSO_CENTER),         # hip centre -> torso centre
        "Neck": (_TORSO_CENTER, _SHOULDER_CENTER),     # torso centre -> shoulder centre
        "Head": (_SHOULDER_CENTER, 0),                 # shoulder centre -> nose
        "LeftShoulder": (11, 13),    # left shoulder -> left elbow
        "LeftArm": (13, 15),         # left elbow -> left wrist
        "LeftForeArm": (15, 19),     # left wrist -> left index finger
        "LeftHand": (19, 21),        # left index finger -> left thumb
        "RightShoulder": (12, 14),   # right shoulder -> right elbow
        "RightArm": (14, 16),        # right elbow -> right wrist
        "RightForeArm": (16, 20),    # right wrist -> right index finger
        "RightHand": (20, 22),       # right index finger -> right thumb
        "LeftUpLeg": (23, 25),       # left hip -> left knee
        "LeftLeg": (25, 27),         # left knee -> left ankle
        "LeftFoot": (27, 31),        # left ankle -> left foot index
        "RightUpLeg": (24, 26),      # right hip -> right knee
        "RightLeg": (26, 28),        # right knee -> right ankle
        "RightFoot": (28, 32),       # right ankle -> right foot index
    }

    def __init__(self):
        self.root = None
        self.joints = {}
        self.frames = 0
        self.frame_time = 1.0/30.0  # Default to 30fps

    @staticmethod
    def _landmark_position(landmarks, index):
        """Return the xyz position of a landmark as a float array.

        ``index`` is either a single landmark index or a sequence of indices,
        in which case the positions are averaged.
        """
        indices = [index] if np.ndim(index) == 0 else index
        points = np.array(
            [[landmarks[i].x, landmarks[i].y, landmarks[i].z] for i in indices],
            dtype=float,
        )
        return points.mean(axis=0)

    def _depth_first(self, joint):
        """Yield ``joint`` and its descendants in the order the hierarchy is written."""
        yield joint
        for child in joint.children:
            yield from self._depth_first(child)

    def create_hierarchy(self, landmark_names):
        """(Re)build the joint tree and assign channels.

        Args:
            landmark_names: Accepted for backward compatibility; not used.
        """
        self.root = None
        self.joints = {}

        for name, parent_name in self._HIERARCHY:
            if parent_name is None:
                joint = Joint(name)
                self.root = joint
            else:
                parent = self.joints[parent_name]
                joint = Joint(name, parent)
                parent.add_child(joint)
            self.joints[name] = joint

        # Setup channels for each joint
        self.setup_channels()

    def setup_channels(self):
        """Give the root 6 channels (position + rotation) and every other joint 3."""
        for joint in self.joints.values():
            if joint is self.root:
                joint.channels = ["Xposition", "Yposition", "Zposition",
                                  "Zrotation", "Xrotation", "Yrotation"]
            else:
                joint.channels = ["Zrotation", "Xrotation", "Yrotation"]

    def update_joint_offsets(self, landmarks):
        """Set each joint's rest offset from its parent using one pose.

        Args:
            landmarks: Indexable collection of objects exposing ``x``, ``y``
                and ``z``.

        Raises:
            KeyError: If a joint (or its parent) has no landmark mapping.
        """
        for name, joint in self.joints.items():
            if joint.parent is None:  # Root joint
                joint.offset = np.zeros(3)
                continue

            parent_pos = self._landmark_position(
                landmarks, self._JOINT_LANDMARKS[joint.parent.name])
            child_pos = self._landmark_position(
                landmarks, self._JOINT_LANDMARKS[name])
            delta = (child_pos - parent_pos) * self.POSITION_SCALE

            # In BVH, Y is up, but in MediaPipe, Y is down
            joint.offset = np.array([delta[0], -delta[1], delta[2]])

    def calculate_rotation(self, joint_name, parent_idx, child_idx, landmarks):
        """Return heuristic ``[z, x, y]`` angles (degrees) for one bone direction.

        The angles are ``atan2`` projections of the unit vector from
        ``parent_idx`` to ``child_idx``; they are NOT parent-relative Euler
        angles obtained from forward kinematics, so the result is only a rough
        approximation of a real joint rotation.

        Args:
            joint_name: Unused; kept for interface compatibility.
            parent_idx: Start landmark (index or sequence of indices), or None.
            child_idx: End landmark (index or sequence of indices), or None.
            landmarks: Indexable collection of objects with ``x``, ``y``, ``z``.

        Returns:
            List of three floats ordered to match the rotation channels
            (Zrotation, Xrotation, Yrotation); all zeros if either index is None.
        """
        if parent_idx is None or child_idx is None:
            return [0.0, 0.0, 0.0]

        direction = (self._landmark_position(landmarks, child_idx)
                     - self._landmark_position(landmarks, parent_idx))

        # Normalise; a zero-length bone keeps a zero vector (all angles 0).
        length = np.linalg.norm(direction)
        if length > 0:
            direction = direction / length

        dx, dy, dz = direction
        x_rot = float(np.degrees(np.arctan2(dz, dy)))
        y_rot = float(np.degrees(np.arctan2(dx, dz)))
        z_rot = float(np.degrees(np.arctan2(dy, dx)))

        return [z_rot, x_rot, y_rot]

    def process_frame(self, landmarks):
        """Convert one pose into a flat list of channel values.

        Returns:
            The scaled root position (Y negated) followed by three rotation
            values per joint, in ``self.joints`` order.
        """
        hips = self._landmark_position(
            landmarks, self._JOINT_LANDMARKS["Hips"]) * self.POSITION_SCALE

        # Root position (global translation); Y is flipped like the offsets.
        frame_data = [float(hips[0]), float(-hips[1]), float(hips[2])]

        # Every joint, root included, contributes exactly three rotation values.
        for name in self.joints:
            start_idx, end_idx = self._ROTATION_LANDMARKS.get(name, (None, None))
            frame_data.extend(
                self.calculate_rotation(name, start_idx, end_idx, landmarks))

        return frame_data

    def write_bvh(self, filename):
        """Write the hierarchy and ``self.frames`` motion rows to ``filename``.

        Each joint's row is padded with zeros or truncated to its channel
        count; joints without data for a frame contribute zeros.

        Raises:
            ValueError: If no hierarchy has been created yet.
        """
        # Fail before opening the file so an unusable call leaves nothing behind.
        if self.root is None:
            raise ValueError("Cannot write BVH: the skeleton hierarchy has not been created")

        # Motion columns must follow the order joints appear in HIERARCHY.
        ordered_joints = list(self._depth_first(self.root))

        with open(filename, 'w') as f:
            # Write header
            f.write("HIERARCHY\n")

            # Write joints recursively
            self._write_joint(f, self.root, 0)

            # Write motion data
            f.write("MOTION\n")
            f.write(f"Frames: {self.frames}\n")
            f.write(f"Frame Time: {self.frame_time}\n")

            for frame in range(self.frames):
                row = []
                for joint in ordered_joints:
                    channels = len(joint.channels)
                    if joint.motion and frame < len(joint.motion):
                        values = list(joint.motion[frame])
                    else:
                        values = []
                    # Pad short rows and drop extras so every line has the
                    # same number of columns as the declared channels.
                    row.extend((values + [0.0] * channels)[:channels])

                f.write(" ".join(f"{value:.6f}" for value in row) + "\n")

    def _write_joint(self, f, joint, indent_level):
        """Recursively write one joint block (and its subtree) to ``f``."""
        indent = "  " * indent_level

        keyword = "ROOT" if joint.parent is None else "JOINT"
        f.write(f"{indent}{keyword} {joint.name}\n")
        f.write(f"{indent}{{\n")

        # Write offset
        offset_str = " ".join(f"{value:.6f}" for value in joint.offset)
        f.write(f"{indent}  OFFSET {offset_str}\n")

        # Write channels
        channels_str = " ".join(joint.channels)
        f.write(f"{indent}  CHANNELS {len(joint.channels)} {channels_str}\n")

        if joint.children:
            for child in joint.children:
                self._write_joint(f, child, indent_level + 1)
        else:
            # BVH terminates every leaf joint with an End Site block.
            f.write(f"{indent}  End Site\n")
            f.write(f"{indent}  {{\n")
            f.write(f"{indent}    OFFSET 0.000000 0.000000 0.000000\n")
            f.write(f"{indent}  }}\n")

        f.write(f"{indent}}}\n")

def _store_frame_motion(skeleton, motion_data):
    """Split one flat frame of channel values into per-joint motion rows."""
    cursor = 0
    for joint in skeleton.joints.values():
        if getattr(joint, 'motion', None) is None:
            joint.motion = []

        width = len(joint.channels)
        values = list(motion_data[cursor:cursor + width])
        # Pad with zeros if the frame is shorter than the declared channels.
        values.extend([0.0] * (width - len(values)))

        joint.motion.append(values)
        cursor += width


def process_video(input_file, output_file, visualize=False, debug=True):
    """Run MediaPipe Pose over a video and write the detected motion as BVH.

    Args:
        input_file: Path of the video to read.
        output_file: Path of the BVH file to write.
        visualize: If true, show each processed frame with the pose overlay;
            pressing 'q' or ESC stops processing early.
        debug: If true, print per-frame progress and extra diagnostics.

    Returns:
        True if a BVH file was written, False if the input could not be read,
        no pose was detected in any frame, or writing failed.
    """
    # Validate the input first so a bad path fails fast, before any
    # MediaPipe or OpenCV state is created.
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' does not exist.")
        return False

    # Initialize MediaPipe Pose
    mp_pose = mp.solutions.pose
    mp_drawing = mp.solutions.drawing_utils

    # Create BVH skeleton
    skeleton = BVHSkeleton()

    # Open video file
    print(f"Opening video file: {input_file}")
    cap = cv2.VideoCapture(input_file)

    # Check if video opened successfully
    if not cap.isOpened():
        cap.release()
        print(f"Error: Could not open video file '{input_file}'.")
        print("Please check that the file exists and is a valid video format.")
        print("Supported formats include: mp4, avi, mov, etc.")
        return False

    frame_count = 0
    processed_count = 0
    is_first_frame = True

    try:
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            fps = 30  # Default if unable to determine
            print(f"Warning: Could not determine video FPS, using default of {fps}")

        skeleton.frame_time = 1.0 / fps

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        print(f"Video info: {width}x{height}, {fps} FPS, {total_frames} frames")

        if total_frames <= 0:
            print("Warning: Could not determine total frame count")
            total_frames = float('inf')  # Process until end of video

        # Shown in progress messages and the overlay; '?' when unknown.
        total_label = total_frames if total_frames != float('inf') else '?'

        start_time = time.time()

        # Setup Pose detection
        print("Initializing MediaPipe pose detection...")

        pose_config = {
            'min_detection_confidence': 0.5,
            'min_tracking_confidence': 0.5,
            'model_complexity': 1  # 0=Lite, 1=Full, 2=Heavy (more accurate but slower)
        }

        if debug:
            print(f"MediaPipe pose config: {pose_config}")

        with mp_pose.Pose(**pose_config) as pose:
            while cap.isOpened():
                success, image = cap.read()
                if not success:
                    if debug and frame_count < total_frames:
                        print(f"Warning: Failed to read frame {frame_count}. Stopping.")
                    break

                frame_count += 1

                if frame_count % 100 == 0 or debug:
                    print(f"Processing frame {frame_count}/{total_label}")

                # Convert the BGR image to RGB
                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                # For better performance, mark the image as not writeable
                image_rgb.flags.writeable = False

                # Best effort: a frame MediaPipe cannot process is skipped.
                try:
                    results = pose.process(image_rgb)
                except Exception as e:
                    print(f"Error processing frame {frame_count}: {e}")
                    continue

                if not results.pose_landmarks:
                    if debug and frame_count % 10 == 0:
                        print(f"No pose detected in frame {frame_count}")
                    continue

                landmarks = results.pose_landmarks.landmark

                # The first detected pose defines the skeleton's rest offsets.
                if is_first_frame:
                    if debug:
                        for i, landmark in enumerate(landmarks):
                            print(f"Landmark {i}: vis={landmark.visibility:.2f}, pos=({landmark.x:.2f}, {landmark.y:.2f}, {landmark.z:.2f})")

                    print("First pose detected! Initializing skeleton...")
                    landmark_names = [f"landmark_{i}" for i in range(33)]
                    skeleton.create_hierarchy(landmark_names)
                    skeleton.update_joint_offsets(landmarks)
                    is_first_frame = False
                    print("Skeleton initialized")

                # Process frame for motion data
                try:
                    motion_data = skeleton.process_frame(landmarks)
                except Exception as e:
                    print(f"Error processing motion data for frame {frame_count}: {e}")
                    continue

                # Store the frame BEFORE counting it (and before the preview
                # can break out of the loop) so the BVH frame count always
                # equals the number of stored motion rows.
                _store_frame_motion(skeleton, motion_data)
                processed_count += 1

                if visualize:
                    # Draw pose landmarks
                    mp_drawing.draw_landmarks(
                        image,
                        results.pose_landmarks,
                        mp_pose.POSE_CONNECTIONS,
                        mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=2),
                        mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=1)
                    )

                    # Add frame number
                    cv2.putText(
                        image, f"Frame: {frame_count}/{total_label}",
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2
                    )

                    # Display the image with landmarks
                    cv2.imshow('MediaPipe Pose Detection', image)

                    # Break loop on 'q' key press
                    key = cv2.waitKey(5) & 0xFF
                    if key == ord('q') or key == 27:  # 'q' or ESC
                        print("Visualization stopped by user")
                        break
    finally:
        # Release the capture (and preview window) even if something above raised.
        cap.release()
        if visualize:
            cv2.destroyAllWindows()

    # Duration and statistics
    duration = time.time() - start_time
    throughput = frame_count / duration if duration > 0 else 0.0

    print(f"\nProcessing complete: {processed_count} frames with pose data out of {frame_count} total frames")
    print(f"Time taken: {duration:.2f} seconds ({throughput:.2f} FPS)")

    # Check if we have any processed frames
    if processed_count == 0:
        print("\nERROR: No frames were successfully processed!")
        print("Possible issues:")
        print("1. The video doesn't contain a clearly visible person")
        print("2. The lighting conditions make pose detection difficult")
        print("3. The person is too small in the frame or too far from the camera")
        print("4. The video format might be incompatible")
        print("\nTry with a different video or with the person closer to camera")
        return False

    # Set the total number of frames in the BVH file
    skeleton.frames = processed_count

    # Write BVH file
    try:
        print(f"Writing BVH file to {output_file}...")
        skeleton.write_bvh(output_file)
        print(f"BVH file written successfully with {processed_count} frames of motion data")
        return True
    except Exception as e:
        print(f"Error writing BVH file: {e}")
        return False

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description='Convert video to BVH motion file using MediaPipe.')
#     parser.add_argument('input', help='Input video file')
#     parser.add_argument('--output', help='Output BVH file (default: output.bvh)')
#     parser.add_argument('--visualize', action='store_true', help='Show visualization during processing')
#     parser.add_argument('--debug', action='store_true', help='Enable debug output')
    
#     args = parser.parse_args()
    
#     # Set default output filename if not provided
#     if not args.output:
#         base_name = os.path.splitext(os.path.basename(args.input))[0]
#         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
#         args.output = f"{base_name}_{timestamp}.bvh"
    
#     success = process_video(args.input, args.output, args.visualize, args.debug)
    
#     if success:
#         print("\nConversion completed successfully!")
#     else:
#         print("\nConversion failed. Please check the error messages above.")

2025-02-26 16:49:25.804750: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740617365.820725   26298 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740617365.825852   26298 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-26 16:49:25.842144: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# parser = argparse.ArgumentParser(description="Convert video to BVH motion capture file")
# parser.add_argument("video_path", help="Path to input video file")
# parser.add_argument("--output", help="Path to output BVH file")
# parser.add_argument("--fps", type=int, default=30, help="Target frames per second for BVH file")

# args = parser.parse_args()

# video_path = args.video_path
# output_path = args.output

# if not output_path:
#     # Create output path if not specified
#     video_name = Path(video_path).stem
#     output_path = f"{video_name}.bvh"
# Convert <filename>.mp4 to <filename>.bvh with the preview window enabled
# and debug output disabled.
filename = "fight2"
process_video(f"{filename}.mp4", f"{filename}.bvh", visualize=True, debug=False)

Opening video file: fight2.mp4
Video info: 1280x720, 30.0 FPS, 43 frames
Initializing MediaPipe pose detection...


I0000 00:00:1740617367.792169   26298 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1740617367.794105   26397 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1740617367.859976   26375 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740617367.893540   26373 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740617367.914986   26377 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


First pose detected! Initializing skeleton...
Skeleton initialized

Processing complete: 43 frames with pose data out of 43 total frames
Time taken: 1.94 seconds (22.17 FPS)
Writing BVH file to fight2.bvh...
BVH file written successfully with 43 frames of motion data


True