In [1]:
import mediapipe as mp
import cv2
import scipy.io as sio
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from math import cos, sin
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

In [2]:
dir_path = 'AFLW2000/'
paths = os.listdir(dir_path)
images_paths = [x for x in paths if x.split('.')[1] == 'jpg']
mats_paths = [x for x in paths if x.split('.')[1] == 'mat']

In [3]:
# Create The DataFrame
raw_df = pd.DataFrame(columns=list(range(1, (468*2)+1)) + ['Yaw', 'Pitch', 'Roll'])
# FaceMesh has two real-time deep neural network models that work together: 
# 1st --> Detector Model that operates on the full image and computes face locations
# 2nd --> Face Landmark Model that operates on those locations and predicts the approximate 3D surface via regression
i = 0
faceModule = mp.solutions.face_mesh
for img_path, mat_path in zip(images_paths, mats_paths):
    lst = []
    with faceModule.FaceMesh(static_image_mode=True) as face:
        # Load the image
        image = cv2.imread(dir_path + img_path)
        # Process the image to extract the points (468 point) --> Landmarks ---- len=1
        # Which is a list of 1 item maybe a data structure contains all landmarks
        results = face.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        if results.multi_face_landmarks != None:
            # Loop over faces in the image
            for face in results.multi_face_landmarks:
                # Loop over lanmarks in every face
                for landmark in face.landmark:
                    # Every landmark has x and y
                    x = landmark.x
                    y = landmark.y
                    lst.append(x)
                    lst.append(y)

                # Load the mat file
                lst = lst + list(sio.loadmat(dir_path+mat_path)['Pose_Para'][0][:3])

                raw_df.loc[i] = lst
                i += 1

# Check loading data into DataFrame
# print(raw_df.shape)
# raw_df.head(3)

(1853, 939)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,930,931,932,933,934,935,936,Yaw,Pitch,Roll
0,0.485651,0.686891,0.489194,0.639722,0.489803,0.647257,0.47973,0.562529,0.490558,0.621214,...,0.492813,0.541429,0.500371,0.665669,0.471336,0.676091,0.464005,-0.399231,0.018227,0.085676
1,0.441502,0.642091,0.417273,0.592251,0.448839,0.610546,0.435994,0.545069,0.415757,0.576966,...,0.511724,0.500658,0.514913,0.584451,0.500244,0.59406,0.495387,0.470065,1.189533,0.300959
2,0.317797,0.79919,0.321328,0.776498,0.318316,0.782066,0.311064,0.745973,0.321459,0.767816,...,0.720868,0.328639,0.723342,0.373404,0.713071,0.377643,0.709798,-0.18465,0.881137,-0.236852


In [5]:
def capture_image():
    cam = cv2.VideoCapture(0)
    while True:
        _, img = cam.read()
        cv2.imshow("cam",  img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            cv2.imwrite(filename='saved_img.jpg', img=img)
            break

    cam.release()
    cv2.destroyAllWindows()
    img = cv2.imread('saved_img.jpg')
    return img

In [58]:
def get_XY_landmarks(image):
    # FaceMesh has two real-time deep neural network models that work together: (BlazePose)
    # 1st --> Detector Model that operates on the full image and computes face locations
    # 2nd --> Face Landmark Model that operates on those locations and predicts the approximate 3D surface via regression
    faceModule = mp.solutions.face_mesh
    mp_face_detection = mp.solutions.face_detection

    lst = []
    nose = None

    # To get location of Nose keypoint
    with mp_face_detection.FaceDetection(min_detection_confidence = 0.6) as face_detector:
        results = face_detector.process(image)
        if results.detections != None:
            for face in results.detections:
                landmarks = face.location_data.relative_keypoints
                nose = (int(landmarks[2].x * image.shape[1]), int(landmarks[2].y * image.shape[0]))

    # To get all landmarks
    with faceModule.FaceMesh(static_image_mode=True) as face:
        # Process the image to extract the points (468 point) --> Landmarks ---- len=1
        # Which is a list of 1 item maybe a data structure contains all landmarks
        results = face.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        # results = face.process(image)
        if results.multi_face_landmarks != None:
            # Loop over faces in the image
            for face in results.multi_face_landmarks:
                # Loop over lanmarks in every face
                for landmark in face.landmark:
                    # Every landmark has x and y
                    x = landmark.x
                    y = landmark.y
                    lst.append(x)
                    lst.append(y)

    return (np.array([lst]), nose)

In [18]:
def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size = 100):
    
    yaw = -yaw
    if tdx != None and tdy != None:
        tdx = tdx
        tdy = tdy
    else:
        height, width = img.shape[:2]
        tdx = width / 2
        tdy = height / 2

    # X-Axis pointing to right. drawn in red
    x1 = size * (cos(yaw) * cos(roll)) + tdx
    y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy

    # Y-Axis | drawn in green
    #        v
    x2 = size * (-cos(yaw) * sin(roll)) + tdx
    y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy

    # Z-Axis (out of the screen) drawn in blue
    x3 = size * (sin(yaw)) + tdx
    y3 = size * (-cos(yaw) * sin(pitch)) + tdy

    cv2.line(img, (int(tdx), int(tdy)), (int(x1),int(y1)),(0,0,255),3)
    cv2.line(img, (int(tdx), int(tdy)), (int(x2),int(y2)),(0,255,0),3)
    cv2.line(img, (int(tdx), int(tdy)), (int(x3),int(y3)),(255,0,0),2)

    return img

In [19]:
features = raw_df.drop(['Yaw', 'Pitch', 'Roll'], axis=1)
labels = raw_df[['Yaw', 'Pitch', 'Roll']]
features_train, features_validation, labels_train, labels_validation = train_test_split(features, labels, test_size=0.15, random_state=42)

In [20]:
svr = MultiOutputRegressor(SVR(kernel='rbf', gamma=0.1)).fit(features_train, labels_train)
print('Training Score:\n', svr.score(features_validation, labels_validation))

Training Score:
 0.8321881418782495


# Test on Camera (Image)

In [74]:
# captured_img = capture_image()
# landmarks, nose = get_XY_landmarks(captured_img)
# pred = svr.predict(landmarks)
# out = draw_axis(img=captured_img, yaw=pred[0][0], pitch=pred[0][1], roll=pred[0][2], tdx=nose[0], tdy=nose[1])

# cv2.namedWindow('finalImg', cv2.WINDOW_NORMAL)
# cv2.imshow('finalImg', out)
# cv2.waitKey(5000)
# cv2.destroyAllWindows()

# Test on Camera (Video - Real-Time)

In [103]:
mp_drawing = mp.solutions.drawing_utils
cam = cv2.VideoCapture(0)
out = cv2.VideoWriter('output.mp4', -1, 20.0, (640,480))

while cam.isOpened:
    _, frame = cam.read()
    try:
        landmarks, nose = get_XY_landmarks(frame)
        pred = svr.predict(landmarks)
        frame = draw_axis(img=frame, yaw=pred[0][0], pitch=pred[0][1], roll=pred[0][2], tdx=nose[0], tdy=nose[1])

        out.write(frame)

        cv2.namedWindow('Head Pose Estimation', cv2.WND_PROP_FULLSCREEN)
        cv2.imshow('Head Pose Estimation', frame)

    except Exception as error:
        print('Error:\n', error)
        
    if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    
cam.release()
out.release()
cv2.destroyAllWindows()

Error:
 'NoneType' object is not subscriptable
Error:
 Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by SVR.
Error:
 'NoneType' object is not subscriptable
Error:
 Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by SVR.
Error:
 'NoneType' object is not subscriptable
Error:
 Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by SVR.
Error:
 Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by SVR.
Error:
 Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by SVR.
Error:
 'NoneType' object is not subscriptable
Error:
 Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by SVR.
Error:
 Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by SVR.


In [99]:
cam.release()