In [1]:
from crop import *
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import mediapipe as mp
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, LayerNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import load_model
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score




In [2]:
# First crop the videos
# Train/test split
# Preprocess the data
def extract_keypoints(results):
    """Flatten one frame's pose and hand landmarks into a single 1-D vector.

    Reads ``pose_landmarks``, ``left_hand_landmarks`` and
    ``right_hand_landmarks`` from ``results`` and keeps only the ``x`` and
    ``y`` attribute of every landmark. A group that is missing (falsy) is
    replaced by zeros: 33*2 values for the pose, 21*2 for each hand.

    Returns:
        np.ndarray: the concatenation ``[pose, left hand, right hand]``.
    """
    def _xy_or_zeros(group, n_points):
        # An undetected group contributes a zero block of n_points * 2 values.
        if not group:
            return np.zeros(n_points * 2)
        return np.array([[point.x, point.y] for point in group.landmark]).flatten()

    parts = (
        _xy_or_zeros(results.pose_landmarks, 33),
        _xy_or_zeros(results.left_hand_landmarks, 21),
        _xy_or_zeros(results.right_hand_landmarks, 21),
    )
    return np.concatenate(parts)

def extract_video_keypoints(results):
    """Build the keypoint array for a video, one entry per frame.

    Every element of ``results`` is passed to ``extract_keypoints`` and the
    returned vectors are collected, in order, into one NumPy array.
    """
    return np.array([extract_keypoints(frame_result) for frame_result in results])

In [11]:
# Short aliases for the MediaPipe solution modules; mp_holistic and mp_drawing
# are the ones the helper functions in this cell rely on.
_mp_solutions = mp.solutions
mp_pose = _mp_solutions.pose
mp_holistic = _mp_solutions.holistic
mp_drawing = _mp_solutions.drawing_utils
mp_drawing_styles = _mp_solutions.drawing_styles

def detect_landmarks(image_, model):
    """Run a MediaPipe model on a BGR frame without modifying the input.

    Args:
        image_: BGR image array (as produced by OpenCV). It is only read; in
            particular its ``writeable`` flag is left untouched.
        model: object exposing ``process(rgb_image)`` (e.g. a Holistic
            instance).

    Returns:
        tuple: ``(bgr_image, results)`` where ``bgr_image`` is a new, writeable
        BGR array obtained by converting the RGB working copy back, and
        ``results`` is whatever ``model.process`` returned.
    """
    rgb = cv2.cvtColor(image_, cv2.COLOR_BGR2RGB)
    # Mark the array that is actually handed to the model as read-only.
    # Setting the flag on `image_` before converting would only freeze the
    # caller's array, because cvtColor returns a fresh (writeable) one.
    rgb.flags.writeable = False
    results_ = model.process(rgb)
    # The conversion back allocates a new array, so callers can draw on it.
    image_ = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return image_, results_

def draw_landmarks(image_, results_):
    """Draw the pose and both hand skeletons onto ``image_`` in place.

    Args:
        image_: image array that ``mp_drawing.draw_landmarks`` draws on.
        results_: detection result exposing ``pose_landmarks``,
            ``right_hand_landmarks`` and ``left_hand_landmarks``.

    The three landmark groups are drawn in the order pose, right hand, left
    hand, all with the same landmark and connection styles.
    """
    landmark_spec = mp_drawing.DrawingSpec(
        color=(255, 0, 0), thickness=1, circle_radius=1)
    # Colour channels are 8-bit, so the middle channel is capped at 255
    # (256 is not a representable channel value).
    connection_spec = mp_drawing.DrawingSpec(
        color=(80, 255, 121), thickness=1, circle_radius=1)

    layers = (
        (results_.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (results_.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results_.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmarks, connections in layers:
        mp_drawing.draw_landmarks(
            image_, landmarks, connections, landmark_spec, connection_spec)



def pose_detection(frames):
    """Run MediaPipe Holistic over ``frames`` and return one result per frame.

    A single Holistic instance (detection and tracking confidence both 0.5)
    is opened for the whole sequence and closed before returning.
    """
    holistic_options = {
        "min_detection_confidence": 0.5,
        "min_tracking_confidence": 0.5,
    }
    with mp_holistic.Holistic(**holistic_options) as holistic:
        # detect_landmarks returns (image, results); only the results are kept.
        return [detect_landmarks(frame, holistic)[1] for frame in frames]

def show_landmarks(original_image,results):
    """Show a BGR image next to a copy of it with the landmarks drawn on.

    The left panel ("Original Image") shows ``original_image`` unchanged; the
    right panel ("Keypoints") shows a copy annotated by ``draw_landmarks``.
    """
    fig = plt.figure(figsize=(12, 9))

    def _panel(position, title, bgr_image):
        # matplotlib expects RGB, the images here are BGR.
        fig.add_subplot(1, 2, position)
        plt.title(title)
        plt.imshow(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))

    _panel(1, "Original Image", original_image)

    # Annotate a copy so the caller's image is not drawn on.
    annotated = original_image.copy()
    draw_landmarks(annotated, results)
    _panel(2, "Keypoints", annotated)

    plt.show()
    
def extract_video(path):
    """Read a video, resize each frame to 400x400 and detect landmarks on it.

    Args:
        path: location passed to ``cv2.VideoCapture``.

    Returns:
        tuple: ``(frames, detection_results)`` — two lists of equal length,
        holding the image returned by ``detect_landmarks`` for every frame and
        the matching detection result. Both are empty when the capture cannot
        be opened or yields no frames.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    detection_results = []
    try:
        with mp_holistic.Holistic(
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5) as holistic:
            while cap.isOpened():
                success, image = cap.read()
                if not success:
                    break
                image = cv2.resize(image, (400, 400))
                # Keep the result of this single pass instead of discarding it
                # and running the whole clip through a second Holistic pass.
                image, results = detect_landmarks(image, holistic)
                frames.append(image)
                detection_results.append(results)
    finally:
        # Always free the capture handle, even if detection fails mid-video.
        # No window is opened here, so no key polling or window teardown is
        # needed.
        cap.release()
    return frames, detection_results

In [22]:
# Class labels; elsewhere in the notebook they are indexed with the argmax of
# the model's prediction.
actions = [
    "acele",
    "acikmak",
    "afiyet olsun",
    "agabey",
    "agac",
    "agir",
    "aglamak",
    "aile",
    "akilli",
    "akilsiz",
]

# Load the saved Keras model from the working directory.
model = load_model("model_99acc.h5")

In [20]:
# Print the layer-by-layer summary of the loaded model (output shown below).
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 60, 32)            23424     
                                                                 
 lstm_7 (LSTM)               (None, 16)                3136      
                                                                 
 dense_6 (Dense)             (None, 32)                544       
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_7 (Dense)             (None, 10)                330       
                                                                 
Total params: 27434 (107.16 KB)
Trainable params: 27434 (107.16 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
# Live demo: classify the most recent window of webcam frames and overlay the
# recognised labels on the video feed. Press "q" in the window to stop.

# Only referenced by the (disabled) prob_viz overlay.
colors = [(245,117,16), (117,245,16), (16,117,245)]

# Window length fed to the model; the model summary reports 60 time steps.
SEQUENCE_LENGTH = 60

# 1. New detection variables
frames = []        # most recent cropped camera frames (at most SEQUENCE_LENGTH)
sequence = []      # keypoint vectors for those frames, oldest first
sentence = []      # most recent distinct predicted labels (at most 3)
threshold = 0.99   # minimum probability required to accept a prediction

# CAP_DSHOW selects the DirectShow capture backend (Windows).
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cap.set(cv2.CAP_PROP_FPS, 30)

try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            # Read feed; stop cleanly when the camera delivers no frame
            # instead of failing on `None[...]` below.
            ret, frame = cap.read()
            if not ret:
                break

            # Crop a 400x400 window. The original note gives the frame size as
            # 680x480 — NOTE(review): confirm the actual capture resolution.
            frame = frame[40:440, 140:540]

            # Make detections
            image, results = detect_landmarks(frame, holistic)
            frames.append(frame)

            # Draw landmarks
            draw_landmarks(image, results)

            # 2. Prediction logic: keep a sliding window of the latest frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-SEQUENCE_LENGTH:]
            frames = frames[-SEQUENCE_LENGTH:]

            if len(frames) == SEQUENCE_LENGTH:
                # verbose=0 keeps Keras from printing a progress bar per frame.
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))

                # 3. Viz logic: append a confident label unless it repeats the
                # one shown last, then keep only the three newest labels.
                if res[best] > threshold and (not sentence or actions[best] != sentence[-1]):
                    sentence.append(actions[best])
                sentence = sentence[-3:]

            cv2.rectangle(image, (0,0), (400, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Release the camera and close the window even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()









In [13]:
def optimize_keypoints_size(keypoints, size):
    """Resample ``keypoints`` along its first axis to exactly ``size`` entries.

    * Same length: the input array is returned as is.
    * Too long: ``size`` evenly spaced entries (first and last included when
      ``size`` > 1) are kept.
    * Too short: every entry is kept and ``size - len`` evenly spaced entries
      are duplicated next to their originals, so the order is preserved.

    Args:
        keypoints: NumPy array whose first axis indexes frames.
        size: desired number of frames.
    """
    n_frames = keypoints.shape[0]
    last_index = n_frames - 1

    if n_frames > size:
        # Down-sample: pick evenly spaced frames (fractions truncated to int).
        return keypoints[np.linspace(0, last_index, size, dtype=int)]

    if n_frames < size:
        # Up-sample: add duplicates of evenly spaced frames, then sort the
        # indices so each duplicate sits beside the frame it copies.
        duplicates = np.linspace(0, last_index, size - n_frames, dtype=int)
        order = np.sort(np.concatenate((np.arange(n_frames), duplicates), axis=0))
        return keypoints[order]

    return keypoints

def optimize_data(size, sequences=None):
    """Resample every keypoint sequence to ``size`` frames and stack them.

    Args:
        size: target number of frames per sequence.
        sequences: iterable of per-video keypoint arrays. When omitted, the
            notebook-level variable ``data`` is used, which keeps existing
            ``optimize_data(size)`` calls working; passing the sequences
            explicitly avoids depending on that hidden global.

    Returns:
        np.ndarray: the resampled sequences, in input order.
    """
    if sequences is None:
        # Legacy behaviour: fall back to the global defined elsewhere in the
        # notebook (raises NameError if it has not been created).
        sequences = data
    return np.array([optimize_keypoints_size(seq, size) for seq in sequences])

In [31]:
# Classify one recorded clip with the loaded model and look up its label.
frames, detection_results = extract_video("../demodata/livedemo/acele.webm")
keypoints = extract_video_keypoints(detection_results)

# Resample the clip to 60 frames, then add the batch axis for predict().
keypoints_optimized = optimize_keypoints_size(np.array(keypoints), 60)
y_pred = model.predict(np.expand_dims(keypoints_optimized, axis=0))

# Index of the highest-scoring class for the single clip in the batch.
y_predicted = y_pred.argmax(axis=1).tolist()
actions[y_predicted[0]]



'acele'