In [46]:
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from mediapipe.tasks.python.vision import PoseLandmarkerResult
from mediapipe import solutions as mp_solutions
from mediapipe.framework.formats import landmark_pb2

import cv2 as cv
import pandas as pd
import tensorflow as tf
import numpy as np

In [102]:
# Absolute Path to the Pose Model
# NOTE(review): this path is machine-specific; confirm whether a path relative to the
# notebook (like the '../data' and '../models' paths used elsewhere) works with MediaPipe.
modelPath = '/home/matt/Documents/projects/swingAnalysis/swing-analysis-prototyping/models/pose_landmarker_full.task'

# Set up MediaPipe base options (short aliases used by the later cells)
BaseOptions = mp.tasks.BaseOptions
PoseLandmarker = mp.tasks.vision.PoseLandmarker
PoseLandmarkerOptions = mp.tasks.vision.PoseLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Set up OpenCV Video Capture
VIDEO_NUMBER = 363
videoPath = f'../data/videos_160/{VIDEO_NUMBER}.mp4'
cap = cv.VideoCapture(videoPath)

# VideoCapture does not raise on a missing/unreadable file; without this check the
# failure only surfaces as a ZeroDivisionError in the frame-count arithmetic below.
if not cap.isOpened():
    raise IOError(f"Could not open video file '{videoPath}'")

# fps is the divisor used to turn a frame index into a millisecond timestamp later on,
# so a non-positive value has to be rejected here.
fps = cap.get(cv.CAP_PROP_FPS)
if fps <= 0:
    raise ValueError(f"Video '{videoPath}' reports an invalid frame rate: {fps}")

print(cap.get(cv.CAP_PROP_FRAME_COUNT))
print(4 / cap.get(cv.CAP_PROP_FRAME_COUNT) * 46)

# Running count of frames read so far; advanced by the inference loop
frame_index = 0

403.0
0.456575682382134


In [103]:
# Load the swing segmentation table, encode the categorical columns as integer
# codes, and discard the columns that are not used as model features.
# NOTE: read_pickle can execute arbitrary code; only load pickle files you trust.
df = (
    pd.read_pickle('../data/GolfDB_Filtered.pkl')
    .astype({'sex': 'category', 'club': 'category', 'view': 'category'})
    .assign(
        sex_codes=lambda frame: frame['sex'].cat.codes,
        club_codes=lambda frame: frame['club'].cat.codes,
        view_codes=lambda frame: frame['view'].cat.codes,
    )
    .drop(columns=['youtube_id', 'handedness', 'bbox', 'player', 'split'])
)
# Optional filter (currently disabled): df = df[df['view'] == 'face-on']

# Metadata for the video selected by VIDEO_NUMBER
video_row = df.loc[df['id'] == VIDEO_NUMBER]

In [104]:
# Landmarker configuration: VIDEO running mode, with the model loaded from modelPath
options = PoseLandmarkerOptions(
    running_mode=VisionRunningMode.VIDEO,
    base_options=BaseOptions(model_asset_path=modelPath),
)

In [105]:
# Methods for data augmentation
# Get angles for the following measurements
# 1) Left arm
# 2) Right arm
# 3) Shoulders relative to flat
# Will return the values as a NumPy array that can be appended to the landmark list
def get_angles(landmarks):
    """Compute three joint angles (in degrees) from the first detected pose.

    Args:
        landmarks: Sequence of poses; only ``landmarks[0]`` is read. Entries
            11-16 of that pose (left/right shoulder, elbow, wrist) must expose
            ``x``, ``y`` and ``z`` attributes.

    Returns:
        ``np.array([shoulder_angle, left_arm_angle, right_arm_angle])`` where
        the shoulder angle is measured at the right shoulder against a point
        sharing its ``y`` value, and each arm angle is measured at the elbow.
    """
    pose = landmarks[0]

    def xyz(index):
        # Pack one landmark's coordinates into a 3-vector
        point = pose[index]
        return np.array([point.x, point.y, point.z])

    l_shoulder, r_shoulder = xyz(11), xyz(12)
    l_elbow, r_elbow = xyz(13), xyz(14)
    l_wrist, r_wrist = xyz(15), xyz(16)

    # Left shoulder moved to the right shoulder's y value: the "flat" reference
    level_ref = np.array([l_shoulder[0], r_shoulder[1], l_shoulder[2]])

    return np.array([
        angle_between_points(l_shoulder, r_shoulder, level_ref),  # shoulders vs. flat
        angle_between_points(l_shoulder, l_elbow, l_wrist),       # left arm
        angle_between_points(r_shoulder, r_elbow, r_wrist),       # right arm
    ])
    
def angle_between_points(p1, p2, p3):
    """Return the angle, in degrees, formed at ``p2`` by the rays to ``p1`` and ``p3``.

    Args:
        p1: End point of the first ray (anything ``np.array`` accepts).
        p2: Vertex of the angle.
        p3: End point of the second ray.

    Returns:
        The angle in degrees, or ``0.0`` when either ray has zero length.
    """
    vertex = np.array(p2)
    ray_a = np.array(p1) - vertex
    ray_b = np.array(p3) - vertex

    len_a = np.linalg.norm(ray_a)
    len_b = np.linalg.norm(ray_b)

    # A ray whose end point coincides with the vertex has no direction
    if len_a == 0 or len_b == 0:
        return 0.0

    # Clamp into arccos' domain before converting to degrees
    cos_theta = np.clip(np.dot(ray_a, ray_b) / (len_a * len_b), -1.0, 1.0)
    return np.degrees(np.arccos(cos_theta))

# Create a function that holds the process for getting pose and adding it to a dataset
def process_frame_and_add(lm, frame_to_use, other_features, timestamp):
    """Run pose detection on one frame and build its flat feature vector.

    Args:
        lm: Landmarker object providing ``detect_for_video(image, timestamp)``.
        frame_to_use: Image array wrapped in an ``mp.Image`` declared as SRGB.
        other_features: Extra values appended after the landmark coordinates.
        timestamp: Timestamp passed straight through to ``detect_for_video``.

    Returns:
        A 1-D array laid out as
        ``[x, y, z per landmark of the first pose..., other_features..., angles...]``,
        or ``None`` when the result contains no pose landmarks.
    """
    # NOTE(review): frames read with OpenCV are normally BGR while the image is
    # declared SRGB here - confirm the training features were extracted the same
    # way before adding a colour conversion.
    detection = lm.detect_for_video(
        mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_to_use),
        timestamp,
    )

    poses = detection.pose_landmarks
    if not poses:
        # Nothing detected in this frame; callers must handle the missing vector
        return None

    # Landmark coordinates of the first pose, flattened to one row
    coords = np.array([[point.x, point.y, point.z] for point in poses[0]]).flatten()

    # Dataframe-derived features first, joint angles last
    with_features = np.append(coords, other_features)
    return np.append(with_features, get_angles(poses))

In [106]:
# Restore the trained Keras model that the inference loop queries with model.predict
model = tf.keras.models.load_model(filepath='../models/model_v0.keras')

In [107]:
# Run the Landmarker over the video, classify every frame with the Keras model and
# display the frame with the predicted class index drawn on it. Press 'q' to stop.

# The metadata features are identical for every frame, so build them once.
# An empty selection would silently shorten the feature vector fed to the model.
if video_row.empty:
    raise ValueError(f"No metadata row found for video id {VIDEO_NUMBER}")
other_features = [video_row['sex_codes'], video_row['club_codes'], video_row['view_codes']]

try:
    with PoseLandmarker.create_from_options(options) as landmarker:
        while cap.isOpened():
            ret, frame = cap.read()
            # if frame is read correctly ret is True
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            # Timestamp of this frame in milliseconds, derived from its index and the fps
            timestamp = int(1000 * frame_index / fps)
            frame_index += 1

            # Feature row for this frame; None when no pose was detected
            features = process_frame_and_add(landmarker, frame, other_features, timestamp)

            if features is None:
                # Skip the prediction instead of crashing on None.reshape
                label_text = 'no pose'
            else:
                X = features.reshape(1, -1)
                # First 99 columns are the landmark coordinates, the rest are the
                # metadata codes followed by the joint angles
                X_lmk = X[:, 0:99]
                X_features = X[:, 99:]
                X = {
                    "pose_input": X_lmk,
                    "eng_input": X_features,
                }

                # Pass the x data through the model
                y = model.predict(X)
                print(y)
                label = np.argmax(y)
                label_text = str(label)

            resized_frame = cv.resize(frame, None, fx=3.0, fy=3.0, interpolation=cv.INTER_LINEAR)
            cv.putText(resized_frame, label_text, (30, 50), cv.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2, cv.LINE_AA)
            cv.imshow('Pose Detection', resized_frame)

            if cv.waitKey(1) == ord('q'): # Change wait time to play video slower or faster
                break
finally:
    # Always free the capture and close the preview window, even if a frame fails
    cap.release()
    cv.destroyAllWindows()
    

I0000 00:00:1751395525.228579   19865 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1751395525.231165   76618 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.3-1ubuntu2), renderer: zink Vulkan 1.4(NVIDIA GeForce RTX 4070 (NVIDIA_PROPRIETARY))
W0000 00:00:1751395525.277829   76625 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1751395525.306624   76624 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316ms/step
[[9.9136180e-01 4.0184757e-05 1.9744556e-08 7.8181142e-07 5.3548973e-08
  8.5971467e-03 1.3230911e-08 3.3923985e-08]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[[9.9423426e-01 3.9338607e-05 2.0486219e-08 7.9232410e-07 6.8809790e-08
  5.7255141e-03 1.2059592e-08 3.5437409e-08]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[[9.9447340e-01 5.3562031e-05 2.5222670e-08 8.9153616e-07 9.2216887e-08
  5.4719434e-03 1.4128383e-08 3.8866748e-08]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[[9.9412245e-01 7.4084317e-05 3.9802490e-08 1.2648994e-06 1.6195791e-07
  5.8019119e-03 2.1779327e-08 4.5483272e-08]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[[9.9776375e-01 2.3277518e-05 5.0143725e-09 2.3997009e-07 2.0829315e-08
  2.2126709e-03 2.1193507e-09 4.3949622e-09]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0