In [5]:
import numpy as np
import pandas as pd
import cv2
import mediapipe as mp
from IPython.display import clear_output
import time


# Initial Research

MediaPipe has a z-axis output, which apparently estimates the pose in a 2m3 box 
wherer the origin is the center of the hip. As usual, the point of the algo is 
to determine the _relataive_ position of the key points, not their absolute
position or distance. MP does provide this as `world_coordinates`


# Mediapipe

Simple framework:
1. Capture the frame from the video feed.
2. Process the frame using the pose detection model to find keypoints.
3. Draw the keypoints directly onto the frame.
4. Display the frame with the drawn keypoints. 

Below is some code using MediaPipe to get the key metrics out of the pose estimation.
**MODEL COMPLEXITY:** The intended moden (lite, full, heavy) can be adjusted my changing the `model_complexity` kwarg when initializing the `pose` variable.

In [8]:
import cv2
import mediapipe as mp

# Initialize MediaPipe Pose.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5,
                    min_tracking_confidence=0.5, model_complexity=2)
mp_drawing = mp.solutions.drawing_utils

# Start capturing video from the webcam.
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Convert the frame to RGB.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Process the frame with MediaPipe Pose.
    results = pose.process(frame_rgb)

    # Draw the pose annotations on the frame.
    mp_drawing.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)

    # Display the frame.
    cv2.imshow('Mediapipe Feed', frame)

    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

# Release the webcam and destroy all OpenCV windows.
cap.release()
cv2.destroyAllWindows()
cv2.waitKey(1)


Downloading model to /Users/homemasaki/.pyenv/versions/3.10.6/envs/yoga-pose-detector/lib/python3.10/site-packages/mediapipe/modules/pose_landmark/pose_landmark_heavy.tflite


I0000 00:00:1707103413.135410       1 gl_context.cc:344] GL version: 2.1 (2.1 INTEL-22.1.29), renderer: Intel(R) Iris(TM) Plus Graphics 655


-1

Another implementation example, with some readouts. This is also mediapipe.

In [3]:
# Initialize stuff from mediapipe
mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose

cap = cv2.VideoCapture(0)

with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
    while cap.isOpened():
        ret, frame = cap.read()

        # Recolor image
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image.flags.writeable = False

        # Make detection
        results = pose.process(image)

        try:
            landmarks = results.pose_landmarks.landmark
            print(f"R_Sh:\n{landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER.value]}")
            print(f"L_Sh:\n{landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER.value]}")

            # Save values if torso Z values close to zero
            if np.max(np.abs([landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER.value].z,
                              landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER.value].z,
                              landmarks[mp_pose.PoseLandmark.RIGHT_HIP.value].z,
                              landmarks[mp_pose.PoseLandmark.LEFT_HIP.value].z])) > 0.5 and\
                np.min([landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER.value].visibility,
                        landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER.value].visibility,
                        landmarks[mp_pose.PoseLandmark.RIGHT_HIP.value].visibility,
                        landmarks[mp_pose.PoseLandmark.LEFT_HIP.value].visibility]) > 0.9:
                print("Can See Torso")

                # TODO Write Framework here to save dataframe of correct torso values.
                # The Z value threshold in the above lines need tweaking as well.

        except:
            pass

        # Recolor image back to BGR
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        # Show detections
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS,
                                  mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=2),
                                  mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2),)

        cv2.imshow("Mediapipe feed", image)

        if cv2.waitKey(10) & 0xFF == ord("q"):
            break

        clear_output(wait=True)
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)


R_Sh:
x: 0.30530035
y: 0.79772305
z: -0.3842694
visibility: 0.99893445

L_Sh:
x: 0.7562555
y: 0.8035553
z: -0.29845184
visibility: 0.9988245



## Experimenting with asynchronous processing of frames

Another method to run Mediapipe, as described here:
https://developers.google.com/mediapipe/solutions/vision/pose_landmarker/python#live-stream

Results and performance of each models (lite, full, heavy) are summarized here: https://storage.googleapis.com/mediapipe-assets/Model%20Card%20BlazePose%20GHUM%203D.pdf

**Issues with asynchronous processing:** Looks like asynchronous readouts are not suitable if we are doing a video frame overlay. Keep synchronous processing such that overlay is still present with the live feed. The main code difference is using a timestamp in ms and also calling `detect_async()` instead of `detect()`.

**From the oracle**
> If you're running things synchronously, especially for real-time or live feed scenarios with frameworks like MediaPipe, you typically process each frame one at a time in a loop. In such cases, you don't necessarily need to use a callback function. Instead, you can directly process each frame as you capture it, analyze it with the pose detection model, and immediately draw the keypoints or landmarks onto the frame before displaying it. This approach ensures minimal delay between capturing a frame, processing it, and displaying the results, making it suitable for real-time applications.

In [3]:
"""
Belwo is a playaround with asynchronous processing, which is a bit more complex
than running a simple live feed. This method is better if we do not have any
overlay on the video, but ignore this part since it is not quite critical
to run thing asynchronously. Code is kept in notebook for reference.
"""

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
import time

# Load model
model_path = "/Users/homemasaki/code/projects/fit_me/models/pose_landmarker_full.task"

# Drawing utility
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Create the task
BaseOptions = mp.tasks.BaseOptions
PoseLandmarker = mp.tasks.vision.PoseLandmarker
PoseLandmarkerOptions = mp.tasks.vision.PoseLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Create a pose landmarker instance with the live stream mode:

options = PoseLandmarkerOptions(
    base_options=BaseOptions(model_asset_path=model_path),
    running_mode=VisionRunningMode.LIVE_STREAM)

# Grab frame using OpenCV
cap = cv2.VideoCapture(0)

with PoseLandmarker.create_from_options(options) as landmarker:
    while cap.isOpened():

        # Grab frame using OpenCV. Assume frame is grabbed.
        _, frame = cap.read()

        # Convert frame to MediaPipe Image object
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame.tobytes(),
                            width=frame.shape[1], height=frame.shape[0])
        landmarker.detect(mp_image)

        #cv2.imshow("Mediapipe feed", frame)

        if cv2.waitKey(10) & 0xFF == ord("q"):
            break

cap.release()
cv2.destroyAllWindows()
cv2.waitKey(1)


ValueError: The vision task is in live stream mode, a user-defined result callback must be provided.

## YOLO/Ultralytics

YOLO v8 has a pose estimation, but does not include many key ponts (17).
Also, YOLO is more known for object detection, hence may not be the best for this application.

Table this model for now, can return to it if we have reason.

# Extracting poses from still images for NN model

Below is code which takes in a directory, and can output a df including all KP and extracted data.

In [9]:
import cv2
import mediapipe as mp
import pandas as pd

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, model_complexity=1,
                    min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Placeholder for collected data
data = []

# File paths here
image_files = ['path/to/image1.jpg', ...]

# Process each image
for image_file in image_files:
    image = cv2.imread(image_file)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pose.process(image_rgb)

    # Initialize a row with the image file name
    row = [image_file]

    if results.pose_landmarks:
        landmarks = results.pose_landmarks.landmark

        # Flatten all landmark data into the row
        for landmark in landmarks:
            row.extend([landmark.x, landmark.y, landmark.z, landmark.visibility, landmark.presence])
    else:
        # If no landmarks are detected, fill in with None or a placeholder value
        row.extend([None] * 33 * 5)

    # Append the row to the data list
    data.append(row)

# Define column names
columns = ['image_file']
for idx in range(1, 34):  # MediaPipe Pose has 33 landmarks, starting index at 1 for readability
    prefix = f'kp{idx}_'
    attributes = ['x', 'y', 'z', 'visibility', 'presence']
    columns += [f'{prefix}{attr}' for attr in attributes]

# Create DataFrame
df = pd.DataFrame(data, columns=columns)

# Optionally, save DataFrame to CSV
df.to_csv('pose_data.csv', index=False)


I0000 00:00:1707106777.192229       1 gl_context.cc:344] GL version: 2.1 (2.1 INTEL-22.1.29), renderer: Intel(R) Iris(TM) Plus Graphics 655
[ WARN:0@8995.056] global loadsave.cpp:248 findDecoder imread_('path/to/image1.jpg'): can't open/read file: check file path/integrity


error: OpenCV(4.9.0) /Users/runner/work/opencv-python/opencv-python/opencv/modules/imgproc/src/color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


: 