In [1]:
import mediapipe as mp
from protobuf_to_dict import protobuf_to_dict
import numpy as np
import pickle
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_face_mesh = mp.solutions.face_mesh

In [2]:
def data_to_numpy(data: list) -> np.array:
    arr = np.empty((len(data),len(data[0]), len(data[0][0])), np.float64)
    for i in range(len(data)):
        for j in range(len(data[0])):
            arr[i, j, :] = [data[i][j]['x'], data[i][j]['y'], data[i][j]['z']]
    return arr

def standardize_data(data, axis_, center=True, scale=True):
    shape_ = list(data.shape)
    shape_[axis_] = 1
        
    if center:
        data = data - data.mean(axis=axis_).reshape(*shape_)
    if scale:
        data = data - data.min(axis=axis_).reshape(*shape_)
        mm = data.max(axis=axis_) - data.min(axis=axis_)
        mm = mm.reshape(*shape_)
        data = data / (data.max(axis=axis_) - data.min(axis=axis_)).reshape(*shape_)

    return data

def to_sequential(data: np.ndarray, seq_length: int=10, axis: int=1) -> np.ndarray:
    '''Transforms to sequential data

        # sp_cls[0] = 1 if np.sum(np.abs(s[0, 13] - s[0, 14])) > 0.025 else 0
        # txt = 'speaking' if np.sum(sp_cls)/2 >= 0.5 else 'not speaking'
    Parameters
    ----------
    data: np.ndarray
        The data to be processed
    seq_length: int
        The total number of consequent samples that would be used
        to generate the sequential data
    axis: int
        The nex axis where the sequences would be put in

    Returns
    -------
        np.ndarray
        The transformed sequential data

    '''

    shape_ = tuple(val - seq_length + 1 if i == 0 else val for i, val in enumerate(data.shape))
    shape_ = shape_[:axis] + (seq_length,) + shape_[axis:]
    seq_data = np.zeros(shape_)
    for i in range(data.shape[0]-seq_length+1):
        seq_data[i, :, :, :] = data[i:i+seq_length, :, :]
    return seq_data

In [3]:
upper_lip = [185, 184, 183, 191, 
             40, 74, 42, 80,
             39, 73, 41, 81, 
             37, 72, 38, 82, 
             0, 11, 12, 13, 
             267, 302, 208, 312,
             269, 303, 271, 311,
             270, 304, 272, 310,
             409, 408, 407, 415]

lower_lip = [146, 77, 96, 95,
             91, 90, 89, 88,
             181, 180, 179, 178,
             84, 85, 86, 87,
             17, 16, 15, 14,
             314, 315, 316, 317,
             405, 404, 403, 402,
             321, 320, 319, 318,
             375, 307, 325, 324]

In [4]:
seq_length = 30
is_sequential = True
seq_length_speak = 20
is_pca = True

In [5]:
is_sequential

True

In [6]:
with open('clfs.pickle', 'rb') as f:
    clfs = pickle.load(f)

In [7]:
with open('pca.pickle', 'rb') as f:
    pca = pickle.load(f)

In [8]:
pca

PCA(n_components=150)

In [9]:
weight_func = 'logspace'
# weigh_val = None
if weight_func == 'logspace':
  weight_val = np.logspace(0, 4, num=seq_length_speak, base=2.0)
  weight_val /= weight_val.sum()
elif weight_func == 'linear':
    weight_val = np.arange(1, seq_length_speak+1)/np.sum(np.arange(1, seq_length_speak + 1))
else:
  weigh_val = np.ones(seq_length_speak)/np.sum(seq_length_speak)

In [10]:
weight_val.sum()

1.0

In [15]:
import cv2
import numpy as np
from time import time

clf = clfs['Linear SVM']
record_video = False
theta_CLD = 7
theta_CSD = 0.5
color_idx = [(0,0,0), (0,0,0)]
confidence_weight = np.arange(1, seq_length_speak+1)/np.sum(np.arange(1, seq_length_speak + 1))
sp_cls = np.zeros(seq_length_speak)
max_num_faces=2
refine_landmarks=True
min_detection_confidence=0.5
min_tracking_confidence=0.5
conf_analysis = {
  'speaking': {
    'count': 0,
    'conf': []
  },
    'not_speaking': {
    'count': 0,
    'conf': []
  }
}
with mp_face_mesh.FaceMesh(
        max_num_faces=max_num_faces,
        refine_landmarks=refine_landmarks,
        min_detection_confidence=min_detection_confidence,
        min_tracking_confidence=min_tracking_confidence,
    ) as face_mesh:
    cap = cv2.VideoCapture('1.avi')

    # Check if camera opened successfully
    if (cap.isOpened() == False):
      print("Error opening video stream or file")

    if record_video:
      # Read until video is completed
      frame_width = int(cap.get(3))
      frame_height = int(cap.get(4))
        
      size = (frame_width, frame_height)
      out = cv2.VideoWriter('output_videos/fin_demo_4.avi', 
                          cv2.VideoWriter_fourcc(*'MJPG'),
                          10, size)
    count1 = 0
    count2 = 0
    seq_frames1 = np.zeros((seq_length, 478, 3))
    seq_frames2 = np.zeros((seq_length, 478, 3))
    now = time()
    
    while(cap.isOpened()):

      # Capture frame-by-frame
      ret, frame = cap.read()
      # if time() - now < 15:
      #   continue
      if ret == True:
        # To improve performance, optionally mark the image as not writeable to
        # pass by reference.
        # image.flags.writeable = False
        image = frame
        # image = frame
        results = face_mesh.process(image)

        # Draw the face mesh annotations on the image.
        # image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if results is None and len(results.multi_face_landmarks) != 2:
          continue
        
        if results.multi_face_landmarks:
            for i, face_landmarks in enumerate(results.multi_face_landmarks):
                # add facelandmarks to the keypoints listimage
                # print(len(results.))
                keypoints = protobuf_to_dict(face_landmarks)["landmark"]


                s = np.array([[i['x'], i['y'], i['z']] for i in keypoints])
                anchorpoint = tuple([int(s[10][0]*cap.get(3)), int(s[10][1]*cap.get(4))])
                s = np.expand_dims(s, axis=0)
                if i == 0:
                  if is_sequential:
                    if count1 <= seq_length - 1:
                      seq_frames1[count1] = s
                      count1 += 1
                    else:
                      seq_frames1 = np.roll(seq_frames1, -1, axis=0)
                      seq_frames1[seq_length-1] = s
                  else:
                    inference_frame1 = s.reshape(1, -1)
                    
                if i == 1:
                  if is_sequential:
                    if count2 <= seq_length - 1:
                      seq_frames2[count2] = s
                      count2 += 1
                    else:
                      seq_frames2 = np.roll(seq_frames2, -1, axis=0)
                      seq_frames2[seq_length-1] = s
                  else:
                    inference_frame2 = s.reshape(1, -1)
                  
                if is_sequential:
                  if i == 0:
                    inference_frame1 = np.expand_dims(seq_frames1, axis=0)
                    inference_frame1 = standardize_data(inference_frame1, axis_=2)
                    inference_frame1 = inference_frame1.reshape(1, -1)
                  elif i == 1:
                    inference_frame2 = np.expand_dims(seq_frames2, axis=0)
                    inference_frame2 = standardize_data(inference_frame2, axis_=2)
                    inference_frame2 = inference_frame2.reshape(1, -1)

                  # print(inference_frame.shape)
                  # break
                else:
                  if i == 0:
                    inference_frame1 = standardize_data(inference_frame1, axis_=1)
                    inference_frame1 = inference_frame1.reshape(1, -1)
                    
                  elif i == 1:
                    inference_frame2 = standardize_data(inference_frame2, axis_=1)
                    inference_frame2 = inference_frame2.reshape(1, -1)
                try:
                  if np.isnan(inference_frame1).any():
                    continue
                  if np.isnan(inference_frame2).any():
                    continue
                except NameError:
                  continue 
                
                # fix this
                
                pred_text = "waiting"
                if count1 == seq_length:    
                  if i == 0:
                    inference_frame = pca.transform(inference_frame1) if is_pca else inference_frame1
                  elif i == 1:
                    inference_frame = pca.transform(inference_frame2) if is_pca else inference_frame2


                  y_pred = clf.predict(inference_frame)
                  pred_text = "facing" if y_pred == 1 else "not facing"


                sp_cls = np.roll(sp_cls, -1, axis=0)
                lip_distance = np.abs(s[0, upper_lip, 1] - s[0, lower_lip, 1])
                lip_distance = lip_distance/(lip_distance.max() - lip_distance.min())
                sp_cls[seq_length_speak - 1] = 1 if np.sum(lip_distance) > theta_CLD else 0
                decision_val = 'speaking' if np.sum(sp_cls * weight_val) >= theta_CSD else 'not speaking' # better result
                diff = np.sum(np.abs(s[0, upper_lip, 1] - s[0, lower_lip, 1]))
                
                cv2.rectangle(image, (anchorpoint[0]-100, 5), (anchorpoint[0]+100, 60), color_idx[i], thickness=-1)

                window_name = 'Image'

                # font
                font = cv2.FONT_HERSHEY_SIMPLEX
                # org
                org = (anchorpoint[0]-85, 25)
                
                # fontScale
                fontScale = 0.5
                
                # Blue color in BGR
                color = (255, 0, 0)
                
                # Line thickness of 2 px
                thickness = 1
                confidence = np.sum(sp_cls * weight_val)
                # decision_val = 'speaking' if confidence >= 0.5 else 'not speaking' # better result

                confidence = confidence if decision_val == "speaking" else 1 - confidence
                
                if decision_val == 'speaking':
                    conf_analysis['speaking']['count'] += 1
                    conf_analysis['speaking']['conf'].append(confidence)
                else:
                    conf_analysis['not_speaking']['count'] += 1
                    conf_analysis['not_speaking']['conf'].append(confidence)
                
                # Using cv2.putText() method
                # image = cv2.putText(image, label_text[int(y_pred)], org, font, fontScale, color, thickness, cv2.LINE_AA)
                color_ = (0,255,0) if decision_val == "speaking" else (0,0,255)
                # image = cv2.putText(image, f'CLD: {lip_distance.sum():.5f}', org, font, fontScale, (255, 0, 0), thickness, cv2.LINE_AA)
                # color_ = (0,255,0) if np.sum(sp_cls * weight_val) > 0.010 else (0,0,255)
                # image = cv2.putText(image, f'CSD: {np.sum(sp_cls * weight_val):.5f}', (anchorpoint[0]-85, 45), font, fontScale, (255, 0, 0), thickness, cv2.LINE_AA)
                image = cv2.putText(image, f'User is: {decision_val}', (anchorpoint[0]-85, 25), font, fontScale, color_, thickness, cv2.LINE_AA)
                # image = cv2.putText(image, f'Confidence is: {confidence*100:.2f}%', (anchorpoint[0]-85, 85), font, fontScale, (255, 0, 0), thickness, cv2.LINE_AA)
                image = cv2.putText(image, f'User is: {pred_text}', (anchorpoint[0]-85, 50), font, fontScale, (0,255,0) if pred_text == "facing" else (0,0,255), thickness, cv2.LINE_AA)

        else:
          continue
      
        if record_video:
          out.write(image)
        cv2.imshow('Frame',image)

        # Press Q on keyboard to  exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
          break

      # Break the loop
      else: 
        break

    # When everything done, release the video capture object
    cap.release()
    if record_video:
      out.release()

# Closes all the frames
cv2.destroyAllWindows()

  data = data / (data.max(axis=axis_) - data.min(axis=axis_)).reshape(*shape_)


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''