In [1]:
import cv2
import mediapipe as mp
import numpy as np
from pickle import dump, load

In [7]:
import matplotlib.pyplot as plt

mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_holistic = mp.solutions.holistic

img_path = "mano.jpeg"
img = plt.imread(img_path)

In [8]:
with mp_holistic.Holistic(static_image_mode=True) as holistic:
    results = holistic.process(img)
    annotated_img = img.copy()

    mp_drawing.draw_landmarks(
        annotated_img,
        results.right_hand_landmarks,
        mp_hands.HAND_CONNECTIONS,
        landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style(),
        connection_drawing_spec=mp_drawing_styles.get_default_hand_connections_style()
    )

In [11]:
plt.imshow(annotated_img

array([[[211, 216, 210],
        [211, 216, 210],
        [211, 216, 210],
        ...,
        [195, 200, 196],
        [195, 200, 196],
        [195, 200, 196]],

       [[211, 216, 210],
        [211, 216, 210],
        [211, 216, 210],
        ...,
        [195, 200, 196],
        [195, 200, 196],
        [195, 200, 196]],

       [[211, 216, 210],
        [211, 216, 210],
        [211, 216, 210],
        ...,
        [195, 200, 196],
        [195, 200, 196],
        [195, 200, 196]],

       ...,

       [[ 94, 108, 109],
        [ 93, 107, 108],
        [ 92, 106, 107],
        ...,
        [188, 193, 171],
        [186, 194, 171],
        [187, 195, 172]],

       [[ 92, 106, 107],
        [ 91, 105, 106],
        [ 91, 105, 106],
        ...,
        [190, 195, 173],
        [188, 196, 173],
        [188, 196, 173]],

       [[ 90, 104, 105],
        [ 90, 104, 105],
        [ 90, 104, 105],
        ...,
        [190, 195, 173],
        [189, 197, 174],
        [189, 197, 174]]

In [2]:
def try_setting(letter, out_video_path, model_complexity, smooth_landmarks, refine_face_landmarks,
                min_detection_confidence, min_tracking_confidence):
    """
    Just for choosing good settings for detecting landmarks.
    """
    # Initialize MediaPipe Solutions
    mp_drawing = mp.solutions.drawing_utils
    mp_drawing_styles = mp.solutions.drawing_styles
    mp_holistic = mp.solutions.holistic
    
    # Upload the video
    video_path = "../Datos/Brutos/Letras_Sematos/%s.mp4" % letter
    cap = cv2.VideoCapture(video_path)

    # Check if the video is uploaded
    if not cap.isOpened():
        raise Exception("Problem uploading the video.")

    # Initialize output video
    out = cv2.VideoWriter(
        filename=out_video_path,
        fourcc=cv2.VideoWriter_fourcc(*'mp4v'),
        fps=cap.get(cv2.CAP_PROP_FPS),
        frameSize=(int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    )
    
    # Configure MediaPipe Holistic Landmaker
    with mp_holistic.Holistic(
        static_image_mode=False,
        model_complexity=model_complexity,
        smooth_landmarks=smooth_landmarks,
        enable_segmentation=False,
        refine_face_landmarks=refine_face_landmarks,
        min_detection_confidence=min_detection_confidence,
        min_tracking_confidence=min_tracking_confidence
    ) as holistic:
        while cap.isOpened():
            # Take a frame of the video
            ret, frame = cap.read()
            
            # Check if the frame is read
            if not ret:
                # If not, finish the process
                break

            # Detect landmarks (OpenCV works in BGR, while the landmaker in RGB)
            results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Create the frame where landmarks will be drawn
            annotated_frame = frame.copy()

            # Draw pose, left and right hands, and face landmarks on the image.
            mp_drawing.draw_landmarks(
                annotated_frame,
                results.pose_landmarks,
                mp_holistic.POSE_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style()
            )
            mp_drawing.draw_landmarks(
                annotated_frame,
                results.left_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style(),
                connection_drawing_spec=mp_drawing_styles.get_default_hand_connections_style()
            )
            mp_drawing.draw_landmarks(
                annotated_frame,
                results.right_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style(),
                connection_drawing_spec=mp_drawing_styles.get_default_hand_connections_style()
            )
            mp_drawing.draw_landmarks(
                annotated_frame,
                results.face_landmarks,
                mp_holistic.FACEMESH_TESSELATION,
                landmark_drawing_spec=None,
                connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style()
            )

            # Include the annotated frame in the output video
            out.write(annotated_frame)
    
    # Release resources
    out.release()
    cap.release()

In [3]:
options = [(0, 0.5), (0, 0.7), (0, 0.9), (1, 0.5), (1, 0.7), (1, 0.9), (2, 0.5), (2, 0.7), (2, 0.9)]
for i, (o1, o2) in enumerate(options):
    print("Option %s" % i)
    try_setting(
        letter="A",
        out_video_path="../Datos/Procesados/Pruebas_A/prueba%s_A.mp4" % i,
        model_complexity=o1,
        smooth_landmarks=True,
        refine_face_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=o2
    )

Option 0
Option 1
Option 2
Option 3
Option 4
Option 5
Option 6
Option 7
Option 8


Revisando visualmente los resultados, elijo (1, 0.9).

In [5]:
def generate_skeleton(video_path):
    # Initialize MediaPipe Holistic Landmarker
    mp_holistic = mp.solutions.holistic
    
    # Upload the video
    cap = cv2.VideoCapture(video_path)

    # Check if the video is uploaded
    if not cap.isOpened():
        raise Exception("Problem uploading the video.")

    # Extract info of the video
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Initialize results' list
    vid_results = []
    
    # Configure MediaPipe Holistic Landmaker
    with mp_holistic.Holistic(
        static_image_mode=False,
        model_complexity=1,
        smooth_landmarks=True,
        enable_segmentation=False,
        refine_face_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.9
    ) as holistic:
        while cap.isOpened():
            # Take a frame of the video
            ret, frame = cap.read()
            
            # Check if the frame is read
            if not ret:
                # If not, finish the process
                break

            # Detect landmarks (OpenCV works in BGR, while the landmaker in RGB)
            results = holistic.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Add results to the list
            vid_results.append(results)
    
    # Release resources
    cap.release()

    # Return info of the video and results' list
    return fps, frame_width, frame_height, vid_results

In [20]:
def obtain_collection_dict_landmarks(list_video_paths, list_video_abbreviations, result_path):
    """
    Given a list of paths to videos, a list of the same length of the abbreviations of each video
    and a path for storing the result, this function stores in the result_path a dictionary with
    the Holistic Landmarks of each video.
    The structure of the dictionary is the following:
      - The keys of the dict are the abbreviations of the videos.
      - Each key has a dictionary associated, whose keys are:
        - n_frames (int): number of frames of the video.
        - fps (int): frames per second of the video.
        - frame_width (int): the width of the frame in pixels.
        - frame_height (int): the height of the frame in pixels.
        - landmarks (dict):
          - face (ndarray): 3D array where each row is the face landmark list of a frame.
          - pose (ndarray): 3D array where each row is the pose landmark list of a frame.
          - pose_world (ndarray): 3D array where each row is the pose world landmark list of a frame.
          - left_hand (ndarray): 3D array where each row is the left hand landmark list of a frame.
          - right_hand (ndarray): 3D array where each row is the right hand landmark list of a frame.
    """
    total_results = {}
    for video_path, video_abbreviation in zip(list_video_paths, list_video_abbreviations):
        fps, frame_width, frame_height, vid_results = generate_skeleton(video_path)
        n_frames = len(vid_results)
        
        vid_dict = {
            "n_frames": n_frames,
            "fps": fps,
            "frame_width": frame_width,
            "frame_height": frame_height
        }
    
        face_array = np.empty((n_frames, 478, 3), dtype=np.float64)
        pose_array = np.empty((n_frames, 33, 4), dtype=np.float64)
        pose_world_array = pose_array.copy()
        left_hand_array = np.empty((n_frames, 21, 3), dtype=np.float64)
        right_hand_array = left_hand_array.copy()
    
        for i, results in enumerate(vid_results):
            face_array[i] = (list(map(lambda landmark :
                                     list(map(lambda pos : pos[1], landmark.ListFields())),
                                     results.face_landmarks.ListFields()[0][1]))
                             if results.face_landmarks else np.nan)
            
            pose_array[i] = (list(map(lambda landmark :
                                     list(map(lambda pos : pos[1], landmark.ListFields())),
                                     results.pose_landmarks.ListFields()[0][1]))
                             if results.pose_landmarks else np.nan)

            pose_world_array[i] = (list(map(lambda landmark :
                                     list(map(lambda pos : pos[1], landmark.ListFields())),
                                     results.pose_world_landmarks.ListFields()[0][1]))
                             if results.pose_landmarks else np.nan)
    
            left_hand_array[i] = (list(map(lambda landmark :
                                          list(map(lambda pos : pos[1], landmark.ListFields())),
                                          results.left_hand_landmarks.ListFields()[0][1]))
                                  if results.left_hand_landmarks else np.nan)
    
            right_hand_array[i] = (list(map(lambda landmark :
                                           list(map(lambda pos : pos[1], landmark.ListFields())),
                                           results.right_hand_landmarks.ListFields()[0][1]))
                                   if results.right_hand_landmarks else np.nan)
        
        landmarks = {
            "face": face_array,
            "pose": pose_array,
            "pose_world": pose_world_array,
            "left_hand": left_hand_array,
            "right_hand": right_hand_array
        }
    
        vid_dict["landmarks"] = landmarks
        
        total_results[video_abbreviation] = vid_dict
        
    with open(result_path, "wb") as f:
        dump(total_results, f)

In [21]:
alphabet = [
    'A', 'B', 'C', 'CH', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'LL', 'M',
    'N', 'N_', 'O', 'P', 'Q', 'R', 'RR', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
]
list_video_paths = ["../Datos/Brutos/Letras_Spread/%s.mp4" % letter for letter in alphabet]
result_path = "../Datos/Procesados/alphabet_landmarks_spread.pkl"

obtain_collection_dict_landmarks(list_video_paths, alphabet, result_path)