In [1]:
import os
import cv2
import mediapipe as mp
import numpy as np 
import matplotlib.pyplot as plt
from keras_facenet import FaceNet

In [2]:
# Initialize MediaPipe Face Mesh
mp_face_mesh = mp.solutions.face_mesh

# 3D face model points
MODEL_POINTS = np.array([
    (0.0, 0.0, 0.0),         # Nose tip
    (0.0, -330.0, -65.0),    # Chin
    (-225.0, 170.0, -135.0), # Left eye left corner
    (225.0, 170.0, -135.0),  # Right eye right corner
    (-150.0, -150.0, -125.0),# Left mouth corner
    (150.0, -150.0, -125.0)  # Right mouth corner
], dtype=np.float64)

In [3]:
LANDMARK_INDICES = [1, 152, 33, 263, 61, 291]  # Corresponding 2D landmarks
def get_camera_matrix(width, height):
    focal_length = width
    center = (width / 2, height / 2)
    return np.array([
        [focal_length, 0, center[0]],
        [0, focal_length, center[1]],
        [0, 0, 1]
    ], dtype=np.float64)

In [4]:
# Initialize FaceNet model
facenet_model = FaceNet()
embedding_file = "embeddings.npz"
known_faces_dir = "known_faces"
threshold = 0.3

# Globals to hold known face embeddings and names
known_embeddings = []
known_names = []




In [5]:
def get_euler_angles(model_points, image_points, camera_matrix):
    dist_coeffs = np.zeros((4, 1))  # No lens distortion
    success, rotation_vector, translation_vector = cv2.solvePnP(
        model_points, image_points, camera_matrix, dist_coeffs
    )
    if not success:
        return None

    rotation_matrix, _ = cv2.Rodrigues(rotation_vector)
    proj_matrix = np.hstack((rotation_matrix, translation_vector))
    _, _, _, _, _, _, euler_angles = cv2.decomposeProjectionMatrix(proj_matrix)

    pitch, yaw, roll = [angle[0] for angle in euler_angles]

    # Normalize pitch to center around 0
    if pitch > 90: pitch -= 180
    elif pitch < -90: pitch += 180

    return pitch, yaw, roll

In [6]:
def classify_direction(pitch, yaw, pitch_thresh=25, yaw_thresh=15):
    if abs(yaw) < yaw_thresh and abs(pitch) < pitch_thresh:
        return "Forward"
    if yaw <= -yaw_thresh:
        return "Looking Left"
    if yaw >= yaw_thresh:
        return "Looking Right"
    if pitch <= -pitch_thresh:
        return "Looking Up"
    if pitch >= pitch_thresh:
        return "Looking Down"
    return "Unknown"

In [7]:
def extract_landmarks(face_landmarks, img_w, img_h):
    image_points = []
    all_landmarks = []
    for idx, lm in enumerate(face_landmarks.landmark):
        x, y = int(lm.x * img_w), int(lm.y * img_h)
        all_landmarks.append((x, y))

    for idx in LANDMARK_INDICES:
        x, y = int(face_landmarks.landmark[idx].x * img_w), int(face_landmarks.landmark[idx].y * img_h)
        image_points.append((x, y))
    return np.array(image_points, dtype=np.float64), all_landmarks

In [None]:
def preprocess_face(face):
    os.makedirs("faces", exist_ok=True)  # Create folder if not exists

    # Generate a unique filename based on timestamp or count
    import time
    # filename = f"faces/face_{int(time.time() * 1000)}.jpg"
    # cv2.imwrite(filename, face)  # Save original BGR face crop
    face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)  # Convert to RGB (FaceNet expects this)
    face = cv2.resize(face, (160, 160))
    face = face.astype("float32") / 255.0
    return np.expand_dims(face, axis=0)


In [9]:


# Initialize FaceNet model
facenet_model = FaceNet()

# Initialize MediaPipe face detection
mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.5)

# Build known face embeddings
def build_known_faces():

    print("üì¶ Building embeddings for known faces...")

    for person_name in os.listdir("known_faces"):
        person_dir = os.path.join("known_faces", person_name)
        if not os.path.isdir(person_dir):
            continue

        for image_name in os.listdir(person_dir):
            image_path = os.path.join(person_dir, image_name)
            print(f"üì∑ Processing: {image_path}")
            image = cv2.imread(image_path)

            if image is None:
                print(f"‚ùå Couldn't read image: {image_path}")
                continue

            face_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            result = face_detection.process(face_rgb)

            if result.detections:
                for detection in result.detections:
                    bboxC = detection.location_data.relative_bounding_box
                    h, w, _ = image.shape
                    x = int(bboxC.xmin * w)
                    y = int(bboxC.ymin * h)
                    w_box = int(bboxC.width * w)
                    h_box = int(bboxC.height * h)
                    face_crop = image[y:y+h_box, x:x+w_box]

                    if face_crop.size == 0:
                        print("‚ùå Cropped face is empty, skipping.")
                        continue

                    face_preprocessed = preprocess_face(face_crop)
                    embedding = facenet_model.model.predict(face_preprocessed)[0]  # ‚¨ÖÔ∏è FIXED LINE
                    known_embeddings.append(embedding)
                    known_names.append(person_name)
                    print(f"‚úÖ Added {person_name}")

    print(f"\n‚úÖ Total faces embedded: {len(known_names)}")
    np.savez("embeddings.npz", embeddings=known_embeddings, names=known_names)
    return known_embeddings, known_names


In [10]:
def load_known_faces():
    global known_embeddings, known_names  # ‚úÖ Add this line

    if os.path.exists(embedding_file):
        print("Loading embeddings from file...")
        data = np.load(embedding_file, allow_pickle=True)
        known_embeddings = data["embeddings"]
        known_names = data["names"].tolist()
    else:
        known_embeddings, known_names = build_known_faces()


In [11]:
from scipy.spatial.distance import cosine

def recognize_face(embedding, position):
    min_dist = float("inf")
    identity = "Unknown"

    for idx, known_embedding in enumerate(known_embeddings):
        dist = cosine(embedding, known_embedding)  # Lower = more similar
        # print(f"Distance to {known_names[idx]}: {dist:.4f}")
        if dist < min_dist:
            min_dist = dist
            identity = known_names[idx] + '(' + position + ')'

    if min_dist > threshold:
        identity = "Unknown"

    return identity


In [12]:
def draw_annotations(frame, direction, position, all_landmarks, yaw, pitch):
    xs = [x for x, y in all_landmarks]
    ys = [y for x, y in all_landmarks]
    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)

    # label = f"{direction}, y:{int(abs(yaw)):.2f}, p:{int(abs(pitch)):.2f}"
    # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
    # cv2.putText(frame, label, (x_min, y_min - 10),
    #             cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
    
    # Only recognize if the person is looking forward
    if direction == "Forward":
        face_crop = frame[y_min:y_max, x_min:x_max]
        if face_crop.size > 0:
            face_preprocessed = preprocess_face(face_crop)
            face_embedding = facenet_model.model.predict(face_preprocessed)[0]
            identity = recognize_face(face_embedding, position)
        else:
            identity = "Face not found"
    else:
        identity = "Not looking"
    
    # Draw annotations
    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
    label = f"{identity}"
    cv2.putText(frame, label, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)


In [13]:
def get_clock_direction(x, img_width):
    relative_pos = x / img_width

    if relative_pos < 0.15:
        return "9 o'clock"
    elif relative_pos < 0.30:
        return "10 o'clock"
    elif relative_pos < 0.45:
        return "11 o'clock"
    elif relative_pos < 0.55:
        return "12 o'clock"
    elif relative_pos < 0.70:
        return "1 o'clock"
    elif relative_pos < 0.85:
        return "2 o'clock"
    else:
        return "3 o'clock"


In [14]:
def get_face_center_x(landmarks):
    xs = [pt[0] for pt in landmarks]
    return sum(xs) / len(xs)


In [18]:
def main():
    load_known_faces()
    facenet_model = FaceNet()
    cap = cv2.VideoCapture('./test3.mp4')
    fps = cap.get(cv2.CAP_PROP_FPS) or 30  # Fallback in case of 0
    delay = int(1000 / fps)
    # delay = 1
    
    with mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=5,
                               min_detection_confidence=0.5, min_tracking_confidence=0.5) as face_mesh:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            img_h, img_w = frame.shape[:2]
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = face_mesh.process(rgb_frame)

            if results.multi_face_landmarks:
                for face_landmarks in results.multi_face_landmarks:
                    image_points, all_landmarks = extract_landmarks(face_landmarks, img_w, img_h)
                    camera_matrix = get_camera_matrix(img_w, img_h)
                    angles = get_euler_angles(MODEL_POINTS, image_points, camera_matrix)
                    if angles is None:
                        continue
                    pitch, yaw, roll = angles
                    direction = classify_direction(pitch, yaw)
                    center_x = get_face_center_x(all_landmarks)
                    position = get_clock_direction(center_x, img_w)
                    draw_annotations(frame, direction, position, all_landmarks, yaw, pitch)

            scale_factor = 0.5
            display_frame = cv2.resize(frame, (int(img_w * scale_factor), int(img_h * scale_factor)))
            cv2.imshow("Video", display_frame)

            if cv2.waitKey(delay) & 0xFF == ord("q"):
                break

    cap.release()
    cv2.destroyAllWindows()

main()


Loading embeddings from file...
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [26]:
import cv2
import numpy as np
from keras_facenet import FaceNet

def preprocess_face(face):
    face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
    face = cv2.resize(face, (160, 160))
    face = face.astype("float32") / 255.0
    return np.expand_dims(face, axis=0)

# Load images
img1 = cv2.imread("known_faces/nahin/1.jpg")
img2 = cv2.imread("known_faces/elon/1.jpg")

input1 = preprocess_face(img1)
input2 = preprocess_face(img2)

print("Input1 sum:", np.sum(input1))
print("Input2 sum:", np.sum(input2))

# Initialize model once
facenet_model = FaceNet()

# Use raw model.predict() instead of .embeddings() to avoid caching issues
emb1 = facenet_model.model.predict(input1)[0]
emb2 = facenet_model.model.predict(input2)[0]

print("Embedding 1 first 10:", emb1[:10])
print("Embedding 2 first 10:", emb2[:10])
print("Distance between embeddings:", np.linalg.norm(emb1 - emb2))


Input1 sum: 27632.178
Input2 sum: 47494.445
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 56ms/step
Embedding 1 first 10: [ 0.01105747  0.02691554 -0.01947343 -0.01546158 -0.03590532 -0.00074519
  0.00959834  0.03275811  0.03332273 -0.0075451 ]
Embedding 2 first 10: [ 0.06260297 -0.01816018  0.05560359 -0.06730364  0.06026611  0.06765513
 -0.01969156  0.0125491   0.00032403  0.0750424 ]
Distance between embeddings: 1.4861053


In [None]:
[ 0.01040817  0.01323742 -0.08136161  0.03703693 -0.00583532  0.09873082
 -0.02493108  0.06667083 -0.03647001 -0.02634569 -0.01645712  0.07746425
  0.02978942 -0.03657433 -0.0085121  -0.02543744  0.01909423  0.03340082
 -0.01135216 -0.12970872 -0.09350711 -0.00186395  0.07484037 -0.04299356
  0.03251214 -0.01524376 -0.01608279 -0.04873962 -0.04360052  0.02108771
 -0.00943475 -0.00025414  0.00208789 -0.02367694 -0.03761331  0.06066559
  0.04307509 -0.01791337 -0.11657034  0.02633476  0.01289774  0.03899858
 -0.00975658 -0.02284099  0.03940051  0.00723536  0.03981997  0.08939075
 -0.10278585 -0.0819013 ]

[ 0.01040817  0.01323742 -0.08136161  0.03703693 -0.00583532  0.09873082
 -0.02493108  0.06667083 -0.03647001 -0.02634569 -0.01645712  0.07746425
  0.02978942 -0.03657433 -0.0085121  -0.02543744  0.01909423  0.03340082
 -0.01135216 -0.12970872 -0.09350711 -0.00186395  0.07484037 -0.04299356
  0.03251214 -0.01524376 -0.01608279 -0.04873962 -0.04360052  0.02108771
 -0.00943475 -0.00025414  0.00208789 -0.02367694 -0.03761331  0.06066559
  0.04307509 -0.01791337 -0.11657034  0.02633476  0.01289774  0.03899858
 -0.00975658 -0.02284099  0.03940051  0.00723536  0.03981997  0.08939075
 -0.10278585 -0.0819013 ]