# Top-down face recognition with DeepFace

This notebook implements a top-down solution for face recognition on a given batch of images. Why top-down? Because it starts from a source image of a known person and retrieves similar images from the batch. The pipeline looks like this:

<img src="./assets/img_2.jpg" alt="drawing" width="800"/>

Why is this approach not great? Because we need to define who we want to look for, instead of just getting faces that are similar to each other based on their vector similarity. Moreover, the approach stops working well as soon as the face is not perfectly visible, and it is very prone to mistakes caused by image quality, color profile, and other aspects. What we want instead is an approach that judges human faces based on their similarity, groups them into distinct clusters, and simply returns the clusters, without outright recognizing the subject.

## Detect and crop faces to reduce noise for facial recognition

In [None]:
# Installs the notebook's dependencies; the extra index URL points pip at the CPU-only PyTorch wheels.
# carvekit is a dedicated module for background removal: https://github.com/Ir1d/image-background-remove-tool
%pip install deepface matplotlib pillow tf-keras seaborn torch pandas opencv-python carvekit --extra-index-url https://download.pytorch.org/whl/cpu

In [None]:
import cv2
from deepface import DeepFace
from PIL import Image

import os
import pandas as pd
from multiprocessing.pool import ThreadPool
import shutil

In [None]:
class InputImage:
    """An image read from disk with OpenCV that remembers the path it came from."""

    def __init__(self, img_name):
        """Read the file at ``img_name`` and keep both the pixels and the path.

        The result of ``cv2.imread`` is exposed as ``self.img``; callers in this
        notebook check it for ``None`` to detect files that could not be loaded.
        """
        self.__name = img_name
        self.img = cv2.imread(img_name)

    def __str__(self):
        """Return the path this image was loaded from."""
        return self.__name

In [None]:
# YuNet face detector. Download the ONNX weights from:
# https://github.com/opencv/opencv_zoo/blob/main/models/face_detection_yunet/face_detection_yunet_2023mar.onnx
yunet_model_path = "./utils/face_detection_yunet_2023mar.onnx"
# (0, 0) is only a placeholder input size: detect_and_crop_faces calls setInputSize for every image.
detector = cv2.FaceDetectorYN.create(yunet_model_path, "", (0, 0))

## Iterates over the images in a given folder, detects faces and performs a crop on the face

In [None]:
# Folder whose files are scanned for faces; the crops are written to its "detected-faces" subfolder
input_folder = '../../data/images/nytimes/'

In [None]:
def convert_img_format_to_jpg(image_path):
    """Make sure a JPEG version of the image at ``image_path`` exists.

    When Pillow reports the file's format as JPEG nothing is written. Otherwise
    the image is converted to RGB and saved as JPEG next to the original, with
    the extension replaced by ``.jpg``; the original file is left in place.

    Returns the path of the JPEG file (``image_path`` itself when no conversion
    was needed).
    """
    with Image.open(image_path) as source:
        # Already a JPEG: nothing to do
        if source.format == 'JPEG':
            return image_path

        stem, _ = os.path.splitext(image_path)
        target_path = stem + '.jpg'
        source.convert('RGB').save(target_path, 'JPEG')
        print(f"Converted {image_path} to {target_path}")
        return target_path

In [None]:
def detect_and_crop_faces(image_path, detector, enlargement_factor=2, output_dir=None):
    """Detect the faces in one image and save an enlarged crop of each of them.

    Args:
        image_path: Path of the image to analyse.
        detector: Face detector exposing ``setInputSize`` and ``detect`` (the
            notebook passes a ``cv2.FaceDetectorYN``).
        enlargement_factor: Size of the crop relative to the detected bounding
            box; 2 doubles the box's width and height around its centre.
        output_dir: Folder the crops are written to. Defaults to the
            ``detected-faces`` subfolder of the global ``input_folder``. The
            folder is created when it does not exist yet.

    Returns:
        None. Every crop is written as ``<image name>_<face index><ext>`` and is
        additionally converted to JPEG when it is not one already.
    """
    # Read input image
    loaded_img = InputImage(image_path)
    print(loaded_img)
    img = loaded_img.img

    # Check if the image was loaded correctly
    if img is None:
        print(f"Error: Unable to load image at {image_path}")
        return

    if output_dir is None:
        output_dir = f"{input_folder}/detected-faces"
    # Without the folder cv2.imwrite fails silently and the JPEG conversion raises afterwards
    os.makedirs(output_dir, exist_ok=True)

    height, width = img.shape[:2]
    detector.setInputSize((width, height))
    _, faces = detector.detect(img)

    # Nothing to crop when no face was detected
    if faces is None:
        return

    name, ext = os.path.splitext(os.path.basename(str(loaded_img)))
    # Share of the box size that is added on each side of the detected box
    margin = (enlargement_factor - 1) / 2

    for index, face in enumerate(faces):
        # Available parameters: x1, y1, w, h, x_re, y_re, x_le, y_le, x_nt, y_nt, x_rcm, y_rcm, x_lcm, y_lcm
        x, y, w, h = face[:4]
        x = int(x)
        y = int(y)

        # Enlarge the box and clamp it to the image borders
        y1 = max(0, int(y - h * margin))
        y2 = min(height, int(y + h * (1 + margin)))
        x1 = max(0, int(x - w * margin))
        x2 = min(width, int(x + w * (1 + margin)))
        facecrop = img[y1:y2, x1:x2]

        # A box that lies outside the image leaves nothing to save
        if facecrop.size == 0:
            continue

        # The face index keeps several faces from the same image from overwriting each other
        face_filename = os.path.join(output_dir, f"{name}_{index}{ext}")
        if not cv2.imwrite(face_filename, facecrop):
            print(f"Error: Unable to write face crop to {face_filename}")
            continue
        convert_img_format_to_jpg(face_filename)


In [None]:
# Run face detection and cropping on every regular file directly inside input_folder
# (sub-directories are skipped).
candidate_paths = (os.path.join(input_folder, entry) for entry in os.listdir(input_folder))
for file_path in candidate_paths:
    if not os.path.isfile(file_path):
        continue
    detect_and_crop_faces(file_path, detector)

## Removes crops that are too small

In [None]:
# Crops covering fewer pixels than this are deleted (12100 = 110 x 110)
MIN_FACE_AREA = 12100
img_dir = f"{input_folder}/detected-faces/"

for filename in os.listdir(img_dir):
    filepath = os.path.join(img_dir, filename)
    # The folder can also contain sub-folders (the "recognized" folder is created in here further down)
    if not os.path.isfile(filepath):
        continue
    try:
        with Image.open(filepath) as im:
            x, y = im.size
    except OSError:
        # Not an image Pillow can read (PIL.UnidentifiedImageError is an OSError): leave the file alone
        continue
    totalsize = x * y
    if totalsize < MIN_FACE_AREA:
        os.remove(filepath)

## Face recognition

Let's assume we have a folder at `input_data/nyt/source-img/` that contains portraits of known people, like this one:

<img src="./assets/beyonce_knowles.jpg" alt="drawing" width="200"/>

In [None]:
# The helpers below check whether an image is broken and cannot be opened. If that is the case, the image is deleted.
# IMAGE_EXT lists the file-name endings that are treated as images (matched with str.endswith).
IMAGE_EXT = ('.jpg', '.jpeg', '.png', '.gif')

def check_image(image_path):
    """Delete the file at ``image_path`` if Pillow cannot open it as an image.

    Prints whether the image was kept or deleted. Returns None.
    """
    try:
        # The context manager closes the file handle that Image.open would otherwise leave open
        with Image.open(image_path):
            pass
    except Exception:
        # Deliberately broad: any failure to open counts as "broken". Unlike a bare
        # ``except`` it lets KeyboardInterrupt/SystemExit through, so interrupting
        # the notebook can no longer delete a healthy image.
        os.remove(image_path)
        print(f'Image deleted: {image_path}')
    else:
        print(f'Image is OK: {image_path}')

def delete_broken_images(root_dir):
    """Run ``check_image`` on every image file found under ``root_dir``, recursively.

    Files are selected by their name ending (``IMAGE_EXT``) and checked on a pool
    of 10 threads. Returns None.
    """
    image_paths = [
        os.path.join(subdir, file)
        for subdir, _dirs, files in os.walk(root_dir)
        for file in files
        if file.endswith(IMAGE_EXT)
    ]
    if not image_paths:
        return

    # map() hands all paths to the workers at once and blocks until every check is done
    # (calling apply_async(...).get() per file waited for each task before submitting the next).
    # The with-block shuts the pool down afterwards.
    with ThreadPool(processes=10) as pool:
        pool.map(check_image, image_paths)

We read the images.

In [None]:
# Portraits of the people that need to be found: every entry of source_folder
# whose name ends with one of the image extensions
source_folder = 'input_data/nyt/source-img/'
source_images = list(filter(lambda entry: entry.endswith(IMAGE_EXT), os.listdir(source_folder)))
print(source_images)

We set up the interface for our model. We use DeepFace to load Facenet512. The `find_faces` function takes two parameters: `source_img`, the portrait of the person who needs to be found, and `db_path`, a folder with all the images that need to be analyzed.

In [None]:
def find_faces(source_img, db_path):
    """Look up the face shown in ``source_img`` among the images stored in ``db_path``.

    Runs ``DeepFace.find`` with the Facenet512 model and detection enforcement
    switched off, prints the length of the first returned result, and returns
    the complete ``DeepFace.find`` output unchanged.
    """
    search_options = {
        "img_path": source_img,
        "db_path": db_path,
        "enforce_detection": False,
        "model_name": "Facenet512",
    }
    matches = DeepFace.find(**search_options)
    first_result = matches[0]
    print(f'found images {len(first_result)}')
    return matches

In [None]:
# Collects one result frame per source image; the frames are concatenated into a single table afterwards
appended_data = []

In [None]:
 
# Search the detected faces once per source portrait and record which portrait each match belongs to
for image in source_images:
    print(f'Checking {image}')
    # Here we run find_faces; only the first entry of its result is used
    data = find_faces(f'{source_folder}/{image}', f"{input_folder}/detected-faces/")
    # Tag the frame even when it is empty, so the 'source' column always exists
    # after concatenation (it used to be missing when no lookup produced a match)
    data[0]['source'] = image
    appended_data.append(data[0])

In [None]:
# Stack the per-source result frames row-wise into one table and preview its first five rows
all_data = pd.concat(appended_data)
all_data.head()

In [None]:
# Strip the image extension so 'source' holds just the portrait's base name.
# The pattern is anchored to the end of the name and passed with an explicit regex=True:
# a plain '.jpg' is interpreted as a regex by older pandas (where '.' matches any character),
# would also remove a '.jpg' in the middle of a name, and misses the other extensions in IMAGE_EXT.
all_data['source'] = all_data['source'].str.replace(r'\.(?:jpe?g|png|gif)$', '', regex=True)
# sample() raises when asked for more rows than exist, so cap the preview size
all_data.sample(min(5, len(all_data)))

In [None]:
# Number of result rows per source image
source_counts = all_data['source'].value_counts()
print(source_counts)

In [None]:
# Folder that receives a copy of every file listed in the results (created on first use)
politicians_folder = f"{input_folder}/detected-faces/recognized"
os.makedirs(politicians_folder, exist_ok=True)

# The 'identity' column holds the path of each matched file; copy them one by one
matched_paths = all_data['identity'].tolist()
for image_path in matched_paths:
    shutil.copy(image_path, politicians_folder)

In [None]:
# Persist the combined results as a CSV file inside the input image folder
all_data.to_csv("../../data/images/nytimes/recognized_faces_nyt.csv")