## Head Pose Estimation Project

This project uses MediaPipe Face Mesh and Support Vector Regression (SVR) to estimate head pose (pitch, yaw, roll) from facial landmarks.

In [None]:
# Install required packages using: pip install -r requirements.txt
# Or install manually: pip install mediapipe opencv-python numpy scipy scikit-learn

## Import libraries

In [None]:
import numpy as np
import cv2
import math
import glob
import os
import scipy.io as sio
from math import cos, sin
import mediapipe
import warnings
warnings.filterwarnings('ignore')

# Libraries for model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_absolute_error

# Configuration
DATA_DIR = 'AFLW2000'  # Directory containing the dataset


## Loading the Data

**Note:** Download the AFLW2000-3D dataset from:
http://www.cbsr.ia.ac.cn/users/xiangyuzhu/projects/3DDFA/Database/AFLW2000-3D.zip

Extract it to the project directory. The dataset should be in the `AFLW2000` folder.

In [None]:
# Download the dataset manually or use the following code:
# import urllib.request
# url = 'http://www.cbsr.ia.ac.cn/users/xiangyuzhu/projects/3DDFA/Database/AFLW2000-3D.zip'
# urllib.request.urlretrieve(url, 'AFLW2000-3D.zip')
# 
# Then extract: import zipfile
# with zipfile.ZipFile('AFLW2000-3D.zip', 'r') as zip_ref:
#     zip_ref.extractall('.')

In [None]:
# Report whether the dataset directory is present before any loading is attempted
if os.path.exists(DATA_DIR):
    print(f"Data directory found: {DATA_DIR}")
else:
    print(f"Warning: {DATA_DIR} directory not found. Please download and extract the dataset.")

## Read the images & extract the features from the face

In [None]:
# Collect every JPEG in the dataset directory. glob returns paths in arbitrary
# order, so sort them: the seeded train/test split performed on these rows is
# only reproducible if the rows are always built in the same order.
images_dir = sorted(glob.glob(f'{DATA_DIR}/*.jpg'))
print(f"Found {len(images_dir)} images")

# One entry per detected face: the centred landmark coordinates and the
# matching pose label.
features_data = []
labels_data = []
faceModule = mediapipe.solutions.face_mesh

with faceModule.FaceMesh(static_image_mode=True) as faces:
    for img_path in images_dir:
        # cv2.imread returns None (it does not raise) when the file cannot be decoded
        image = cv2.imread(img_path)
        if image is None:
            print(f"Warning: Could not read image {img_path}")
            continue

        # The label lives next to the image, same base name with a .mat extension
        label_path = os.path.splitext(img_path)[0] + '.mat'
        if not os.path.exists(label_path):
            print(f"Warning: Label file not found: {label_path}")
            continue

        # First three entries of the first Pose_Para row; they are used
        # as (pitch, yaw, roll) when the labels are split per angle.
        label = sio.loadmat(label_path)['Pose_Para'][0][:3]

        # FaceMesh expects RGB input, OpenCV loads images as BGR
        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if results.multi_face_landmarks is None:
            continue

        for face in results.multi_face_landmarks:
            # (n_landmarks, 2) array of the x, y coordinates reported by FaceMesh
            coords = np.array([(lm.x, lm.y) for lm in face.landmark])
            # Express every landmark relative to landmark 0 so the features
            # do not depend on where the face sits in the frame
            x_center = coords[:, 0] - coords[0, 0]
            y_center = coords[:, 1] - coords[0, 1]
            # Feature layout: all x offsets followed by all y offsets
            features_data.append(np.hstack([x_center, y_center]))
            labels_data.append(label)

# Convert the lists to NumPy arrays
features = np.array(features_data)
labels = np.array(labels_data)

print(f"Extracted features from {len(features)} faces")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")


In [8]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Label columns are ordered (pitch, yaw, roll); give each angle its own target vector
y_train_pitch, y_train_yaw, y_train_roll = (y_train[:, col] for col in range(3))
y_test_pitch, y_test_yaw, y_test_roll = (y_test[:, col] for col in range(3))


In [9]:
# One independent SVR (default hyper-parameters) per pose angle, fitted on the
# training split. fit() returns the estimator itself, so it can be chained.
pitch_model = SVR().fit(X_train, y_train_pitch)
yaw_model = SVR().fit(X_train, y_train_yaw)
roll_model = SVR().fit(X_train, y_train_roll)

# Predictions of each fitted model for the held-out test split
y_pred_pitch, y_pred_yaw, y_pred_roll = (
    fitted.predict(X_test) for fitted in (pitch_model, yaw_model, roll_model)
)


In [11]:
# 5-fold cross-validation of each angle's regressor on the full data set.
# Every result is stored under its own key and summarised, so no result is
# overwritten before it can be inspected.
cv_scoring = ["neg_mean_absolute_error", "r2"]
cv_scores = {}
for angle_name, label_col, angle_model in (
    ("pitch", 0, pitch_model),
    ("yaw", 1, yaw_model),
    ("roll", 2, roll_model),
):
    cv_scores[angle_name] = cross_validate(
        angle_model, features, labels[:, label_col], cv=5, scoring=cv_scoring
    )
    # With several scorers, cross_validate reports each one as "test_<scorer name>".
    # The MAE scorer is negated by scikit-learn, so flip the sign back.
    mean_mae = -cv_scores[angle_name]["test_neg_mean_absolute_error"].mean()
    mean_r2 = cv_scores[angle_name]["test_r2"].mean()
    print(f"{angle_name}: mean MAE = {mean_mae:.4f}, mean R2 = {mean_r2:.4f}")

# Backward compatibility: `score` has always ended up holding the roll results
score = cv_scores["roll"]


In [12]:
# Function to draw the pitch, yaw and roll axes on an image
def draw_axis(img, pitch, yaw, roll, tdx=None, tdy=None, size=100):
    """Draw the three head-pose axes on ``img`` and return it.

    Args:
        img: Image to draw on (modified in place by ``cv2.line``).
        pitch, yaw, roll: Pose angles in radians (they are fed to
            ``math.cos``/``math.sin``). Plain numbers and size-1 arrays,
            such as the result of ``model.predict`` on one sample, are
            both accepted.
        tdx, tdy: Pixel coordinates of the axes' origin. If either is
            ``None`` the centre of the image is used.
        size: Length of each drawn axis in pixels.

    Returns:
        The same ``img`` object with the axes drawn on it.

    Raises:
        ValueError: If an angle holds more than one element.
    """
    # math.cos/math.sin need real scalars; relying on NumPy's implicit
    # conversion of 1-element arrays is deprecated, so convert explicitly.
    pitch = np.asarray(pitch, dtype=float).item()
    yaw = -np.asarray(yaw, dtype=float).item()  # yaw sign is flipped for drawing
    roll = np.asarray(roll, dtype=float).item()

    if tdx is None or tdy is None:
        height, width = img.shape[:2]
        tdx = width / 2
        tdy = height / 2

    # X-Axis pointing to right. drawn in red
    x1 = size * (cos(yaw) * cos(roll)) + tdx
    y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy

    # Y-Axis | drawn in green
    #        v
    x2 = size * (-cos(yaw) * sin(roll)) + tdx
    y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy

    # Z-Axis (out of the screen) drawn in blue
    x3 = size * (sin(yaw)) + tdx
    y3 = size * (-cos(yaw) * sin(pitch)) + tdy

    origin = (int(tdx), int(tdy))
    # Colours are BGR tuples, as OpenCV expects; each entry is (end point, colour, thickness)
    for end_point, colour, thickness in (
        ((int(x1), int(y1)), (0, 0, 255), 3),
        ((int(x2), int(y2)), (0, 255, 0), 3),
        ((int(x3), int(y3)), (255, 0, 0), 2),
    ):
        cv2.line(img, origin, end_point, colour, thickness)

    return img

In [None]:
# Example usage:
# frames = ['image1.jpg', 'image2.jpg', 'image3.jpg']
# processed_images = draw_images(frames)

def draw_images(frames):
    """Estimate the head pose in each frame and draw the pose axes on it.

    Args:
        frames: Iterable whose items are either image file paths (``str``)
            or already-decoded BGR images (e.g. frames read from a video).

    Returns:
        list: The annotated images, in input order. Only frames in which
        FaceMesh detected a face are included; unreadable paths and frames
        without a detection are skipped. Images passed in as arrays are
        drawn on in place.
    """
    annotated = []
    face_module = mediapipe.solutions.face_mesh
    with face_module.FaceMesh(static_image_mode=True) as faces:
        for img in frames:
            # Paths are loaded from disk; anything else is taken to be an image already
            image = cv2.imread(img) if isinstance(img, str) else img
            if image is None:
                # cv2.imread signals failure by returning None rather than raising
                print(f"Warning: Could not read image {img}")
                continue

            # FaceMesh expects RGB input, OpenCV images are BGR
            results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            if results.multi_face_landmarks is None:
                continue

            img_height, img_width = image.shape[:2]
            for face in results.multi_face_landmarks:
                # (n_landmarks, 2) array of the x, y coordinates reported by FaceMesh
                coords = np.array([(lm.x, lm.y) for lm in face.landmark])
                # Same feature layout as used for training: offsets from
                # landmark 0, all x values followed by all y values
                x_center = coords[:, 0] - coords[0, 0]
                y_center = coords[:, 1] - coords[0, 1]
                sample = np.hstack([x_center, y_center]).reshape(1, -1)

                # predict() returns a 1-element array for one sample; unwrap to a scalar
                pitch = float(pitch_model.predict(sample)[0])
                yaw = float(yaw_model.predict(sample)[0])
                roll = float(roll_model.predict(sample)[0])

                # Landmark coordinates are fractions of the image size, so
                # scale landmark 1 back to pixels to anchor the axes there
                draw_axis(
                    image,
                    pitch,
                    yaw,
                    roll,
                    float(coords[1, 0]) * img_width,
                    float(coords[1, 1]) * img_height,
                )
            # The frame is returned once, however many faces were drawn on it
            annotated.append(image)
    return annotated


## Testing the model on an image


## Testing the model on a video

In [None]:
# Configuration for video processing
INPUT_VIDEO = 'input_video.mp4'  # Change this to your input video path
OUTPUT_VIDEO = 'output.mp4'      # Output video path
BATCH_SIZE = 64                  # Max frames held in memory per draw_images call

if not os.path.exists(INPUT_VIDEO):
    print(f"Error: Input video not found: {INPUT_VIDEO}")
    print("Please update INPUT_VIDEO variable with the correct path.")
else:
    cap = cv2.VideoCapture(INPUT_VIDEO)
    out = None
    saved = False
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video: {INPUT_VIDEO}")
        else:
            # Mirror the input's frame rate and size in the output file
            frame_rate = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, frame_rate, (width, height))

            frames_read = 0
            frames_written = 0
            batch = []
            while True:
                try:
                    # Read the next frame from the video
                    ret, frame = cap.read()
                    if ret:
                        # Mirror the frame horizontally before processing
                        batch.append(cv2.flip(frame, 1))
                        frames_read += 1
                except cv2.error as error:
                    # Best effort: report the problem and keep what was read so far
                    print(error)
                    ret = False

                # Process in bounded batches instead of buffering the whole
                # video: decoded frames are large, and batching also avoids
                # calling draw_images once per single frame.
                if batch and (not ret or len(batch) >= BATCH_SIZE):
                    # draw_images returns only the frames where a face was
                    # found, so fewer frames may be written than were read
                    for processed in draw_images(batch):
                        out.write(processed)
                        frames_written += 1
                    batch = []

                if not ret:
                    break

            print(f"Processed {frames_read} frames, wrote {frames_written} annotated frames")
            saved = True
    finally:
        # Always release the capture and writer, even if processing failed
        cap.release()
        if out is not None:
            out.release()

    if saved:
        print(f"Output video saved to: {OUTPUT_VIDEO}")