In [5]:
import mediapipe as mp
from protobuf_to_dict import protobuf_to_dict
import numpy as np
# Short aliases for MediaPipe solution submodules.
# Only `mp_face_mesh` is referenced in the cells visible here; the two drawing
# aliases are presumably used elsewhere in the notebook (not confirmed).
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_face_mesh = mp.solutions.face_mesh

In [2]:
# Landmark index tables used to measure the vertical gap between the lips.
# The two lists are consumed pairwise: entry k of `upper_lip` is compared with
# entry k of `lower_lip`, so both lists must keep the same length and order.
# Each row of four runs from the outer lip contour to the inner one.
#
# FIX: the third entry of the `267, 302, ...` row used to be 208, which breaks
# the left/right mirrored pattern of the table (38 <-> 268, 41 <-> 271,
# 42 <-> 272) and is not a lip landmark. It is now 268.
# NOTE(review): detection thresholds were tuned while the typo was present;
# re-validate them on sample footage after this change.
upper_lip = [185, 184, 183, 191,
             40, 74, 42, 80,
             39, 73, 41, 81,
             37, 72, 38, 82,
             0, 11, 12, 13,
             267, 302, 268, 312,
             269, 303, 271, 311,
             270, 304, 272, 310,
             409, 408, 407, 415]

lower_lip = [146, 77, 96, 95,
             91, 90, 89, 88,
             181, 180, 179, 178,
             84, 85, 86, 87,
             17, 16, 15, 14,
             314, 315, 316, 317,
             405, 404, 403, 402,
             321, 320, 319, 318,
             375, 307, 325, 324]

In [19]:
# Feature flags; neither is read by the cells visible here
# (presumably consumed elsewhere in the notebook -- TODO confirm).
is_pca = is_sequential = False

# Number of frames kept in the rolling per-frame history; also the length of
# every weight vector derived from it.
seq_length = 30

In [20]:
# Build `weight_val`, the per-frame weights applied to the rolling history
# (one weight per frame, later frames weighted more heavily for the
# 'logspace' and 'linear' schemes; uniform otherwise).
weight_func = 'logspace'
if weight_func == 'logspace':
    # NOTE(review): the numerator spans exponents 0.1..1 while the normaliser
    # spans 0.1..10, so these weights do NOT sum to 1 (for seq_length = 30 the
    # total is about 0.009). Deliberately left unchanged: the decision
    # threshold used with `weight_val` is tuned to this scale.
    weight_val = (
        np.logspace(0.1, 1, num=seq_length, base=2.0)
        / np.sum(np.logspace(0.1, 10, num=seq_length, base=2.0))
    )
elif weight_func == 'linear':
    weight_val = np.arange(1, seq_length + 1) / np.sum(np.arange(1, seq_length + 1))
else:
    # FIX: this branch used to assign the misspelled name `weigh_val`, leaving
    # `weight_val` undefined for any other scheme. Uniform weights summing to 1.
    weight_val = np.ones(seq_length) / seq_length

In [7]:
# Display the computed weight vector for inspection.
weight_val

array([0.00022073, 0.00022553, 0.00023043, 0.00023544, 0.00024056,
       0.00024579, 0.00025114, 0.0002566 , 0.00026218, 0.00026788,
       0.0002737 , 0.00027965, 0.00028573, 0.00029195, 0.0002983 ,
       0.00030478, 0.00031141, 0.00031818, 0.0003251 , 0.00033217,
       0.00033939, 0.00034677, 0.00035431, 0.00036202, 0.00036989,
       0.00037793, 0.00038615, 0.00039455, 0.00040312, 0.00041189])

In [23]:
import cv2
import numpy as np


# ---------------------------------------------------------------------------
# Speaking detection on a video file.
#
# For every frame, FaceMesh landmarks are extracted for up to `max_num_faces`
# faces. The normalised vertical distance between paired upper/lower lip
# landmarks is summed into a per-frame "lips open" flag, which is pushed into
# a rolling history of `seq_length` frames. A weighted sum of that history
# decides 'speaking' vs 'not speaking'; the result is drawn on the frame and
# tallied in `conf_analysis`.
#
# Changes relative to the previous version of this cell:
#   * frames are converted BGR -> RGB *before* FaceMesh (which expects RGB),
#     and all drawing / display / recording happens on the BGR frame;
#   * each face slot has its own history row instead of sharing one buffer;
#   * the decision text colour follows the decision itself (the old colour
#     threshold of 0.010 could never be reached with the logspace weights);
#   * frames without a detected face are still shown / recorded, so the
#     'q' key keeps working and the recording does not drop frames;
#   * zero-range lip distances no longer produce NaN;
#   * capture, writer and windows are released even if an error occurs.
# ---------------------------------------------------------------------------
record_video = False
color_idx = [(81, 89, 2), (115, 71, 22)]  # label-box colour per face slot
confidence_weight = np.arange(1, seq_length + 1) / np.sum(np.arange(1, seq_length + 1))
max_num_faces = 2
refine_landmarks = True
min_detection_confidence = 0.5
min_tracking_confidence = 0.5

# Empirically chosen thresholds (values unchanged from the original cell).
lip_open_threshold = 10          # summed normalised lip distance => "lips open" frame
speaking_score_threshold = 0.006  # weighted history score => 'speaking'
cld_highlight_threshold = 5.0    # colour cue only for the CLD read-out

# Drawing constants (tuples are interpreted as BGR by OpenCV display/writer).
green = (0, 255, 0)
red = (0, 0, 255)
font = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 0.5
thickness = 1

# One rolling history row per face slot.
# NOTE(review): assumes the order of `multi_face_landmarks` stays stable
# between frames, so row k keeps following the same face -- TODO confirm.
sp_cls = np.zeros((max_num_faces, seq_length))

conf_analysis = {
    'speaking': {
        'count': 0,
        'conf': []
    },
    'not_speaking': {
        'count': 0,
        'conf': []
    }
}

cap = None
out = None
try:
    with mp_face_mesh.FaceMesh(
            max_num_faces=max_num_faces,
            refine_landmarks=refine_landmarks,
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence,
        ) as face_mesh:
        cap = cv2.VideoCapture('output.avi')

        # Check if the video source opened successfully.
        if not cap.isOpened():
            print("Error opening video stream or file")

        if record_video:
            size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
            out = cv2.VideoWriter('output_videos/is_speaking.avi',
                                  cv2.VideoWriter_fourcc(*'MJPG'),
                                  10, size)
            if not out.isOpened():
                # VideoWriter fails silently (e.g. missing output directory).
                print("Error opening video writer for output_videos/is_speaking.avi")

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break  # end of stream or read failure

            # FaceMesh expects RGB input; OpenCV delivers BGR.
            results = face_mesh.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Annotate the original BGR frame so imshow / VideoWriter show
            # correct colours.
            image = frame
            frame_width = image.shape[1]

            for face_idx, face_landmarks in enumerate(results.multi_face_landmarks or []):
                keypoints = protobuf_to_dict(face_landmarks)["landmark"]
                landmarks = np.array([[kp['x'], kp['y'], kp['z']] for kp in keypoints])

                # Horizontal pixel position of landmark 10; the label box is
                # centred on this column.
                anchor_x = int(landmarks[10][0] * frame_width)

                # Vertical distance between paired lip landmarks, scaled by
                # its own range.
                lip_distance = np.abs(landmarks[upper_lip, 1] - landmarks[lower_lip, 1])
                lip_range = lip_distance.max() - lip_distance.min()
                if lip_range > 0:
                    lip_distance = lip_distance / lip_range
                else:
                    # Degenerate case (all distances equal): avoid NaN/inf.
                    lip_distance = np.zeros_like(lip_distance)
                lip_distance_sum = lip_distance.sum()

                # Shift this face's history left and append the newest flag.
                history = np.roll(sp_cls[face_idx], -1)
                history[-1] = 1 if lip_distance_sum > lip_open_threshold else 0
                sp_cls[face_idx] = history

                speaking_score = np.sum(history * weight_val)
                is_speaking = speaking_score >= speaking_score_threshold
                decision_val = 'speaking' if is_speaking else 'not speaking'
                confidence = np.sum(history * confidence_weight)

                tally = conf_analysis['speaking' if is_speaking else 'not_speaking']
                tally['count'] += 1
                tally['conf'].append(confidence)

                # Filled background box for the text read-outs.
                cv2.rectangle(image, (anchor_x - 100, 30), (anchor_x + 100, 150),
                              color_idx[face_idx % len(color_idx)], thickness=-1)

                text_x = anchor_x - 85
                cld_color = green if lip_distance_sum > cld_highlight_threshold else red
                decision_color = green if is_speaking else red

                cv2.putText(image, f'CLD: {lip_distance_sum:.5f}', (text_x, 50),
                            font, fontScale, cld_color, thickness, cv2.LINE_AA)
                cv2.putText(image, f'CSD: {speaking_score:.5f}', (text_x, 80),
                            font, fontScale, decision_color, thickness, cv2.LINE_AA)
                cv2.putText(image, f'User is: {decision_val}', (text_x, 110),
                            font, fontScale, decision_color, thickness, cv2.LINE_AA)
                cv2.putText(image, f'Confidence is: {confidence*100:.2f}%', (text_x, 140),
                            font, fontScale, decision_color, thickness, cv2.LINE_AA)

            if out is not None:
                out.write(image)
            cv2.imshow('Frame', image)

            # Press Q on keyboard to exit.
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
finally:
    # Release resources even if processing raised.
    if cap is not None:
        cap.release()
    if out is not None:
        out.release()
    # Closes all the frames
    cv2.destroyAllWindows()

QObject::moveToThread: Current thread (0x55a93b39b370) is not the object's thread (0x55a93b66feb0).
Cannot move to target thread (0x55a93b39b370)

QObject::moveToThread: Current thread (0x55a93b39b370) is not the object's thread (0x55a93b66feb0).
Cannot move to target thread (0x55a93b39b370)

QObject::moveToThread: Current thread (0x55a93b39b370) is not the object's thread (0x55a93b66feb0).
Cannot move to target thread (0x55a93b39b370)

QObject::moveToThread: Current thread (0x55a93b39b370) is not the object's thread (0x55a93b66feb0).
Cannot move to target thread (0x55a93b39b370)

QObject::moveToThread: Current thread (0x55a93b39b370) is not the object's thread (0x55a93b66feb0).
Cannot move to target thread (0x55a93b39b370)

QObject::moveToThread: Current thread (0x55a93b39b370) is not the object's thread (0x55a93b66feb0).
Cannot move to target thread (0x55a93b39b370)

QObject::moveToThread: Current thread (0x55a93b39b370) is not the object's thread (0x55a93b66feb0).
Cannot move to tar