In [39]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [40]:
mp_holistic  = mp.solutions.holistic #holistic model
mp_drawing = mp.solutions.drawing_utils #utilidades de dibujo

In [41]:
def mediapipe_detection(image, model):
    # Convert from BGR to RGB
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Make a copy of the image to avoid modifying the original image
    input_image = np.copy(image_rgb)
    
    # Make the input image non-writable before processing
    input_image.flags.writeable = False
    
    # Perform detection
    results = model.process(input_image)
    
    # Make the input image writable again
    input_image.flags.writeable = True
    
    # Convert the image back from RGB to BGR
    image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
    
    return image, results



In [42]:
def draw_landmarks(image,results): #dibujar las marcas en la imagen
    mp_drawing.draw_landmarks(image,results.face_landmarks,mp_holistic.FACEMESH_TESSELATION) #dibuja las conecciones de la cara
    mp_drawing.draw_landmarks(image,results.pose_landmarks,mp_holistic.POSE_CONNECTIONS) #Dibuja las pose conecciones
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS) #dibuja las conecciones de la mano izquierda
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS) #dibuja las conecciones de la mano derecha 

    

In [43]:
def draw_styled_landmarks(image,results): #dibuja los landmarks refinados 
    mp_drawing.draw_landmarks(image,results.face_landmarks,mp_holistic.FACEMESH_TESSELATION,mp_drawing.DrawingSpec(color=(80,110,10),thickness=1,circle_radius=1),mp_drawing.DrawingSpec(color=(80,256,121),thickness=1,circle_radius=1)) #dibuja las conecciones de la cara
    mp_drawing.draw_landmarks(image,results.pose_landmarks,mp_holistic.POSE_CONNECTIONS,mp_drawing.DrawingSpec(color=(80,22,10),thickness=2,circle_radius=4),mp_drawing.DrawingSpec(color=(80,44,121),thickness=2,circle_radius=2)) #Dibuja las pose conecciones
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS,mp_drawing.DrawingSpec(color=(121,22,76),thickness=2,circle_radius=4),mp_drawing.DrawingSpec(color=(121,44,250),thickness=2,circle_radius=2)) #dibuja las conecciones de la mano izquierda
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS,mp_drawing.DrawingSpec(color=(245,117,66),thickness=2,circle_radius=4),mp_drawing.DrawingSpec(color=(245,66,230),thickness=2,circle_radius=2)) #dibuja las conecciones de la mano derecha 


In [44]:
#abre la webcam y va entre los frames
cap = cv2.VideoCapture(2)
#setteamos el modelo de mediapipe
with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
  while cap.isOpened():
    #lee el feed
    ret,frame = cap.read()


    #hace la deteccion
    image, results = mediapipe_detection(frame,holistic)
    print(results)
    #dibuja  los landmarks
    draw_styled_landmarks(image,results)
    #Lo muestra en la pantalla
    cv2.imshow('OpenCV Feed',image)

    #Rompe el loop si se apreta q
    if cv2.waitKey(10) & 0xFF == ord('q'):
      break
  cap.release()
  cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [45]:
pose = [] #array para gaursar los landmarks
for res in results.pose_landmarks.landmark:
    test = np.array([res.x,res.y,res.z,res.visibility])
    pose.append(test)

In [46]:
pose

[array([ 0.59339899,  0.59540832, -1.88038576,  0.99895346]),
 array([ 0.62240916,  0.50646269, -1.81809831,  0.99790913]),
 array([ 0.63756084,  0.50759542, -1.81766057,  0.99809468]),
 array([ 0.65291083,  0.51045889, -1.81846809,  0.99772668]),
 array([ 0.56543136,  0.50427902, -1.86967504,  0.99824429]),
 array([ 0.5396539 ,  0.50256807, -1.86884785,  0.99859196]),
 array([ 0.51355225,  0.50103343, -1.86920428,  0.99838567]),
 array([ 0.66374934,  0.5281505 , -1.21015763,  0.99789923]),
 array([ 0.45307964,  0.50505584, -1.43494558,  0.99897867]),
 array([ 0.60610074,  0.67569381, -1.63526988,  0.99907017]),
 array([ 0.54297447,  0.67400163, -1.69920635,  0.99920589]),
 array([ 0.77621001,  0.85037655, -0.64588368,  0.9982717 ]),
 array([ 0.24201123,  0.83098567, -1.0791316 ,  0.99880379]),
 array([ 0.84832108,  1.25947356, -0.02645181,  0.23341414]),
 array([ 0.03761551,  1.07284749, -0.96227521,  0.65580076]),
 array([ 0.88563222,  1.61219931, -0.02281112,  0.0265931 ]),
 array([

In [47]:
pose = np.array([[res.x,res.y,res.z,res.visibility]for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)#todos los landmarks en un solo array grande
face = np.array([[res.x,res.y,res.z]for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)#todos los landmarks en un solo array grande
lh = np.array([[res.x,res.y,res.z]for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)#todos los landmarks en un solo array grande
rh = np.array([[res.x,res.y,res.z]for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)#todos los landmarks en un solo array grande

In [48]:

def extract_keypoints(results):
    pose = np.array([[res.x,res.y,res.z,res.visibility]for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)#todos los landmarks en un solo array grande
    face = np.array([[res.x,res.y,res.z]for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)#todos los landmarks en un solo array grande
    lh = np.array([[res.x,res.y,res.z]for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)#todos los landmarks en un solo array grande
    rh = np.array([[res.x,res.y,res.z]for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)#todos los landmarks en un solo array grande
    return np.concatenate([pose,face,lh,rh])
    

In [49]:
result_test = extract_keypoints(results)

In [50]:
result_test

array([ 0.59339899,  0.59540832, -1.88038576, ...,  0.        ,
        0.        ,  0.        ])

In [51]:
np.save('0',result_test)

In [52]:
np.load('0.npy')

array([ 0.59339899,  0.59540832, -1.88038576, ...,  0.        ,
        0.        ,  0.        ])

In [53]:
extract_keypoints(results).shape

(1662,)

In [54]:
#path para la data exportada
DATA_PATH = os.path.join('MP_Data') # variable que guarda 

#Aciones detectadas
actions = np.array(['hello','thanks','iloveyou'])
no_sequences = 30 #Cantidad de videos que se recolectaran por accion
sequence_length = 30 #30 es la cantidad de frames de data que va a utilizar 
#lo que se hara para la info es recolectar 30 videos de 30 frames cada un por accion (son 3 acciones) por la cantidad de keypoints que se tienen que son 1662 (entre todas las  variables)

In [55]:
for action in actions: 
    for sequence in range(no_sequences):
        try: 
            os.makedirs(os.path.join(DATA_PATH,action,str(sequence)))
        except:
            pass

In [60]:
cap = cv2.VideoCapture(2)
#setteamos el modelo de mediapipe
with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
    for action in actions:
        for sequence in range(no_sequences):
            for frame_num in range(sequence_length):
    #lee el feed
                ret,frame = cap.read()

    #hace la deteccion
                image, results = mediapipe_detection(frame,holistic)
                print(results)
            #dibuja  los landmarks
                draw_styled_landmarks(image,results)
            
                if frame_num == 0 :
                    cv2.putText(image,'Sarting Collection',(120,200),cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),4,cv2.LINE_AA)# IMprime Starting Collection
                    cv2.putText(image,'Colelecting Frasmes for {} Video Number {}'.format(action,sequence),(15,12),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1,cv2.LINE_AA)
                    cv2.waitKey(2000)
                
                else :
                    cv2.putText(image,'Colelecting Frasmes for {} Video Number {}'.format(action,sequence),(15,12),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1,cv2.LINE_AA)
                
                #Exportamos los Keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH,action,str(sequence),str(frame_num))
                np.save(npy_path,keypoints)
                #Lo muestra en la pantalla
                cv2.imshow('OpenCV Feed',image)

                #Rompe el loop si se apreta q
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
cap.release()
cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [None]:
cap.release()
cv2.destroyAllWindows()

In [57]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [58]:
label_map = {label:num for num, label in enumerate(actions)} #esto loopea entre cada una de las palabras

In [None]:
label_map #se estructura en un array grande por accion, se termina creando 90 arrays con 30 fraames en cada array con 1662 valores que representan los keypoints

In [61]:
sequences, labels = [],[] # sequences representa la dara y labels representa los label
for action in actions: # se loopea entre las acciones y las sequencias
    for sequence in range(no_sequences):
        window = [] #windows representa todas las ventanas de video
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH,action,str(sequence),"{}.npy".format(frame_num))) # se cargan los diferentes frames desde las carpetas  
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [62]:
np.array(sequences).shape

(90, 30, 1662)

In [None]:
x = np.array(sequences)

In [None]:
x.shape

In [None]:
y = to_categorical(labels).astype(int)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.05)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [None]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir = log_dir)

In [None]:

model = Sequential() # Se crean layers para entrenar los modelos 
model.add(LSTM(64,return_sequences = True, activation='relu',input_shape = (30,1662)))
model.add(LSTM(128,return_sequences = True, activation='relu'))
model.add(LSTM(64,return_sequences = False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0],activation = 'softmax')) #softmas te da un conjunto de probabilidades que sumadas te suman 1  

In [None]:
res = [0.7,0.2,0.1]

In [None]:
actions[np.argmax(res)]

In [None]:
model.compile(optimizer = 'Adam', loss = 'categorical_crossentropy', metrics = ['categorical_accuracy'])

In [None]:

model.fit(x_train,y_train, epochs = 2000, callbacks = [tb_callback])

In [None]:
model.summary()

In [None]:
res = model.predict(x_test)

In [None]:
actions[np.argmax(res[4])]

In [None]:
actions[np.argmax(y_test[4])]

In [None]:
model.save('action.h5')

In [63]:
model.load_weights('action.h5')

In [64]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [65]:
yhat = model.predict(x_test)



In [66]:
ytrue = np.argmax(y_test, axis = 1).tolist()
yhat =  np.argmax(yhat,axis = 1).tolist()

In [67]:
multilabel_confusion_matrix(ytrue,yhat)

array([[[3, 1],
        [0, 1]],

       [[4, 0],
        [0, 1]],

       [[2, 0],
        [1, 2]]])

In [68]:
accuracy_score(ytrue,yhat)

0.8

*Real time Testing*

In [78]:
#Variables de deteccion
sequence = []
sentence = []
predictions = []
threshold = 0.5


#abre la webcam y va entre los frames
cap = cv2.VideoCapture(2)

#setteamos el modelo de mediapipe
with mp_holistic.Holistic(min_detection_confidence = 0.5, min_tracking_confidence = 0.5) as holistic:
  while cap.isOpened():
    #lee el feed
    ret,frame = cap.read()

    #hace la deteccion
    image, results = mediapipe_detection(frame,holistic)
    print(results)
    #dibuja  los landmarks
    draw_styled_landmarks(image,results)

    #Logica de prediccion
    keypoints = extract_keypoints(results)
    sequence.append(keypoints)
    #[-30:] son los ultimos 30 frames/ 30 set de keypoints
    sequence = sequence[-30:]
    
    #si la longitud de la sequencia tiene una longitud de 30 frames, recien ahi se hace la prediccion
    if len(sequence) == 30:
        res = model.predict(np.expand_dims(sequence,axis = 0))[0]
        print(actions[np.argmax(res)])
        predictions.append(np.argmax(res))
        
        
    #Visualizacion Logic  
    #if all(res[np.argmax(res)]) > threshold:   
    #if all(res[np.argmax(res)]) > threshold:
    if res[np.argmax(res)] > threshold:
      if len(sentence) > 0:
        if actions[np.argmax(res)] != sentence[-1]:
          sentence.append(actions[np.argmax(res)])
      else:
          sentence.append(actions[np.argmax(res)])
      
    if len(sentence) > 5:
      sentence = sentence[-5:]
    
    #Visualizar probabilidades
    #image = prob_viz(res,actions,image, colors)
    
    cv2.rectangle(image,(0,0),(640,40),(245,117,16),-1)
    cv2.putText(image,' '.join(sentence),(3,30),cv2.FONT_HERSHEY_SIMPLEX,1,(255,255,255),2,cv2.LINE_AA)
          
    
    #Lo muestra en la pantalla
    cv2.imshow('OpenCV Feed',image)

    #Rompe el loop si se apreta q
    if cv2.waitKey(10) & 0xFF == ord('q'):
      break
  cap.release()
  cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [None]:
model.predict(x_test[0])