# Notebook 3: MediaPipe + RealSense Depth-Based Gaze Estimation (Real-Time)
This notebook uses MediaPipe to detect facial and iris landmarks and combines them with RealSense depth data to estimate a 3D gaze vector in real time.
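Before running, install the Python packages this notebook imports (`pip install mediapipe pyrealsense2 opencv-python numpy`) and make sure the Intel RealSense SDK is set up with a RealSense camera connected.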

In [None]:
import pyrealsense2 as rs
import mediapipe as mp
import numpy as np
import cv2
import os


In [None]:
# Face Mesh configured for a live video stream tracking a single face.
# refine_landmarks=True is needed for the iris landmarks (indices 468 and up)
# that the gaze loop reads.
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)


In [None]:
# Configure and start the RealSense pipeline with a depth stream (z16) and a
# color stream (bgr8), both at 640x480 and 30 FPS.
pipeline = rs.pipeline()
config = rs.config()
config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
profile = pipeline.start(config)

# Frames are aligned to the color stream before use, so landmark pixel
# coordinates from the color image can index the depth frame directly.
align_to = rs.stream.color
align = rs.align(align_to)
# Color-stream intrinsics, used by deproject() for the pixel-to-3D conversion.
intrinsics = profile.get_stream(rs.stream.color).as_video_stream_profile().get_intrinsics()


In [None]:
def deproject(x, y, depth):
    """Convert a pixel coordinate and its depth into a 3D point.

    Args:
        x: Pixel column.
        y: Pixel row.
        depth: Depth value at that pixel.

    Returns:
        The result of ``rs.rs2_deproject_pixel_to_point`` for pixel ``[x, y]``
        using the module-level ``intrinsics``.
    """
    pixel = [x, y]
    return rs.rs2_deproject_pixel_to_point(intrinsics, pixel, depth)

# Length, in pixels, of the arrow drawn for the (unit-length) gaze vector.
GAZE_ARROW_LENGTH_PX = 100


def _landmark_to_3d(landmark, depth_frame, width, height):
    """Resolve a normalized face landmark to a pixel and a 3D point.

    Args:
        landmark: MediaPipe landmark with normalized ``x``/``y`` attributes.
        depth_frame: Depth frame aligned to the color image.
        width: Color image width in pixels.
        height: Color image height in pixels.

    Returns:
        ``((x_px, y_px), point_3d)`` where ``point_3d`` is a NumPy array, or
        ``None`` when the landmark falls outside the image or the depth
        reading at that pixel is not positive.
    """
    x_px = int(landmark.x * width)
    y_px = int(landmark.y * height)

    # Landmarks can land outside the image when the face is partly out of
    # frame; querying depth at an out-of-range pixel raises, which would
    # otherwise abort the whole capture loop.
    if not (0 <= x_px < width and 0 <= y_px < height):
        return None

    depth = depth_frame.get_distance(x_px, y_px)
    if not depth > 0:
        # No usable depth reading at this pixel.
        return None

    return (x_px, y_px), np.array(deproject(x_px, y_px, depth))


print("Press 'q' to quit.")
try:
    while True:
        frames = pipeline.wait_for_frames()
        aligned_frames = align.process(frames)
        depth_frame = aligned_frames.get_depth_frame()
        color_frame = aligned_frames.get_color_frame()

        if not depth_frame or not color_frame:
            continue

        color_image = np.asanyarray(color_frame.get_data())
        # MediaPipe expects RGB input; the RealSense color stream is BGR.
        rgb_image = cv2.cvtColor(color_image, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(rgb_image)

        if results.multi_face_landmarks:
            face_landmarks = results.multi_face_landmarks[0]
            h, w, _ = color_image.shape

            # Iris center (landmark 468) and a second reference landmark (474).
            # NOTE(review): in MediaPipe's refined mesh, indices 468-472 and
            # 473-477 appear to describe the two different irises, so 474
            # likely lies on the *other* eye and this vector is closer to an
            # inter-eye direction than a true gaze. Confirm the intended
            # reference landmark before relying on the result.
            iris = _landmark_to_3d(face_landmarks.landmark[468], depth_frame, w, h)
            ref = None
            if iris is not None:
                ref = _landmark_to_3d(face_landmarks.landmark[474], depth_frame, w, h)

            if iris is not None and ref is not None:
                (x_px, y_px), iris_3D = iris
                _, ref_3D = ref

                gaze_vector = iris_3D - ref_3D
                norm = np.linalg.norm(gaze_vector)

                # Coincident points give a zero-length vector; normalizing it
                # would produce NaN and crash the int() conversions below.
                if norm > 0:
                    gaze_vector = gaze_vector / norm

                    # Visualize 2D projection of 3D gaze
                    pt1 = (x_px, y_px)
                    pt2 = (
                        int(x_px + gaze_vector[0] * GAZE_ARROW_LENGTH_PX),
                        int(y_px + gaze_vector[1] * GAZE_ARROW_LENGTH_PX),
                    )
                    cv2.arrowedLine(color_image, pt1, pt2, (0, 255, 0), 2)

        cv2.imshow("MediaPipe + RealSense Gaze", color_image)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

finally:
    # Close the preview windows even if stopping the pipeline fails.
    try:
        pipeline.stop()
    finally:
        cv2.destroyAllWindows()
