In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2
from matplotlib import pyplot as plt
from mtcnn import MTCNN

In [None]:
dataset_url = "./dataset/"

# Running totals accumulated over the whole directory tree.
folder_count = 0
total_file_count = 0

# os.walk yields one (root, dirs, files) triple per directory, starting with
# dataset_url itself, so each sub-folder is counted exactly once through `dirs`.
for root, dirs, files in os.walk(dataset_url):
  folder_file_count = len(files)
  folder_count += len(dirs)
  total_file_count += folder_file_count

  # normpath drops the trailing slash of dataset_url; without it basename()
  # returns '' and the top-level directory is reported with an empty name.
  folder_name = os.path.basename(os.path.normpath(root))
  print(f"Folder: {folder_name}, Files: {folder_file_count}")

print(f"\nNumber of folders: {folder_count}")
print(f"Total number of files: {total_file_count}")

In [None]:
dataset_url = './dataset'

# Every sub-directory of the dataset is treated as one labelled folder.
all_folders = [
    entry for entry in os.listdir(dataset_url)
    if os.path.isdir(os.path.join(dataset_url, entry))
]

# random.sample raises ValueError when asked for more items than exist, so the
# request is capped at the number of folders actually present.
selected_folders = random.sample(all_folders, min(10, len(all_folders)))

# squeeze=False keeps `axes` two-dimensional; ravel() walks it row by row,
# which matches the original axes[i // 5, i % 5] placement.
fig, axes = plt.subplots(2, 5, figsize=(10, 5), squeeze=False)
cells = axes.ravel()
for ax in cells:
    ax.axis('off')  # cells that receive no image stay blank instead of showing ticks

for ax, folder in zip(cells, selected_folders):
    folder_path = os.path.join(dataset_url, folder)
    all_images = [img for img in os.listdir(folder_path) if img.endswith('.jpg')]
    if not all_images:
        # random.choice would raise IndexError on an empty list.
        print(f"No .jpg images found in {folder_path}; skipping.")
        continue

    selected_image = random.choice(all_images)
    image_path = os.path.join(folder_path, selected_image)

    img = mpimg.imread(image_path)
    ax.imshow(img)

    # The label is the folder name without its first five characters.
    label = folder[5:]
    ax.set_title(label)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.patches as patches
from mtcnn import MTCNN

def detect_display_faces(image_path, detector=None):
  """Detect faces in one image with MTCNN and show them as red boxes.

  Args:
    image_path: Path of the image file to analyse.
    detector: Optional object exposing ``detect_faces(rgb_image)``. When
      omitted a new ``MTCNN()`` is built for this call; pass an existing
      detector to avoid rebuilding it for every image.

  Raises:
    FileNotFoundError: If OpenCV cannot read ``image_path``.
  """
  img = cv2.imread(image_path)
  if img is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")
  img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

  if detector is None:
    detector = MTCNN()

  # Each detection is a dict whose 'box' entry unpacks as x, y, width, height.
  faces = detector.detect_faces(img_rgb)

  fig, ax = plt.subplots()
  ax.imshow(img_rgb)
  for face in faces:
    x, y, w, h = face['box']
    ax.add_patch(
        patches.Rectangle((x, y), w, h, linewidth=2, edgecolor='r', facecolor='none')
    )
  ax.axis('off')
  plt.show()

dataset_url = './dataset'

# Only sub-directories of the dataset are candidates.
all_folders = [
    entry for entry in os.listdir(dataset_url)
    if os.path.isdir(os.path.join(dataset_url, entry))
]

# One random folder, then one random .jpg inside it.
selected_folder = random.choice(all_folders)
folder_path = os.path.join(dataset_url, selected_folder)
all_images = [name for name in os.listdir(folder_path) if name.endswith('.jpg')]
selected_image = random.choice(all_images)
image_path = os.path.join(folder_path, selected_image)

# Run MTCNN on the chosen picture and plot the detections.
detect_display_faces(image_path)

In [None]:
import os
import shutil

train_dataset_url = './train-dataset'

# On a fresh run the directory does not exist yet and os.listdir would raise
# FileNotFoundError, so it is only emptied when it is actually there.
if os.path.isdir(train_dataset_url):
  for item in os.listdir(train_dataset_url):
    item_path = os.path.join(train_dataset_url, item)
    if os.path.isfile(item_path) or os.path.islink(item_path):
      # Plain files and symlinks are unlinked; links are never followed.
      os.unlink(item_path)
    elif os.path.isdir(item_path):
      shutil.rmtree(item_path)
  print("Contents of the working directory cleared.")
else:
  print(f"{train_dataset_url} does not exist; nothing to clear.")


In [None]:
import shutil

# Source directory
source_directory = './dataset'

# Destination directory
destination_directory = './train-dataset'

# Upper bound on the number of images copied per label.
images_per_label = 85

# Create the train directory if it does not exist
train_directory = os.path.join(destination_directory, 'train')
os.makedirs(train_directory, exist_ok=True)

# Get a list of all folders in the source directory
all_folders = [
    folder for folder in os.listdir(source_directory)
    if os.path.isdir(os.path.join(source_directory, folder))
]

for folder in all_folders:
    folder_path = os.path.join(source_directory, folder)

    # Get a list of all images in the folder
    all_images = [img for img in os.listdir(folder_path) if img.endswith('.jpg')]
    if not all_images:
        # Nothing to copy; avoid leaving an empty label directory behind.
        continue

    # Create a label for the folder (drop the first 5 characters)
    label = folder[5:]
    label_directory = os.path.join(train_directory, label)
    os.makedirs(label_directory, exist_ok=True)

    # Shuffle so the copied subset is a random selection
    random.shuffle(all_images)

    # Slicing copes with folders that hold fewer than images_per_label files;
    # indexing 0..84 directly raised IndexError for such folders.
    for image_name in all_images[:images_per_label]:
        image_path = os.path.join(folder_path, image_name)
        destination_path = os.path.join(label_directory, image_name)
        # Files are copied, not moved: the source dataset stays intact.
        shutil.copy(image_path, destination_path)

print("Data splitting completed.")

In [None]:
from matplotlib.patches import Rectangle
from mtcnn.mtcnn import MTCNN

# Directory paths
train_directory = './train-dataset/train'

# Get a list of all folders in the train directory
all_folders = [
    folder for folder in os.listdir(train_directory)
    if os.path.isdir(os.path.join(train_directory, folder))
]

# Randomly select two folders
selected_folders = random.sample(all_folders, 2)

# One detector instance is shared by every image below
detector = MTCNN()

for folder in selected_folders:
  folder_path = os.path.join(train_directory, folder)

  # Randomly select one .jpg image of this folder
  all_images = [img for img in os.listdir(folder_path) if img.endswith('.jpg')]
  selected_image = random.choice(all_images)
  image_path = os.path.join(folder_path, selected_image)

  # OpenCV loads BGR; convert once and use the RGB version everywhere
  image = cv2.imread(image_path)
  image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

  # Detect on the RGB image (the raw BGR array was passed here before, which
  # feeds the detector swapped colour channels)
  faces = detector.detect_faces(image_rgb)

  # Left: untouched image. Right: same image with boxes and keypoints.
  fig, (ax_original, ax_detected) = plt.subplots(1, 2, figsize=(8, 3))
  ax_original.imshow(image_rgb)
  ax_original.set_title('Original Image')

  ax_detected.imshow(image_rgb)
  for face in faces:
    x, y, width, height = face['box']
    ax_detected.add_patch(Rectangle((x, y), width, height, fill=False, color='red'))

    # Mark and name every keypoint reported for this face
    for key, value in face['keypoints'].items():
      ax_detected.scatter(value[0], value[1], s=30, color='blue', marker='o')
      ax_detected.text(value[0] + 5, value[1], key, color='blue')

  # Title, show and metadata belong inside the loop so that every selected
  # folder gets them, not only the last one.
  ax_detected.set_title('Detected Faces with Keypoints')
  plt.show()

  # Display metadata of detected faces
  print(f"Metadata of detected faces in {folder}/{selected_image}:")
  for i, face in enumerate(faces):
    print(f"Face {i + 1}:")
    print(f"   Confidence: {face['confidence']:.2f}")
    print(f"   Bounding Box: {face['box']}")
    print(f"   Keypoints: {face['keypoints']}")
    print()


In [None]:
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
from PIL import Image

def process_image(image_path, face_detector, face_embedder, label):
  """Detect faces in one image, display them and print their embeddings.

  Args:
    image_path: Path of the image file to read.
    face_detector: Object whose ``detect(rgb_image)`` returns ``(boxes, probs)``.
      Boxes are interpreted as ``[x1, y1, x2, y2]`` corner coordinates, the
      format facenet_pytorch's MTCNN returns.
    face_embedder: Callable mapping a batched face tensor to an embedding tensor.
    label: Text shown in the figure title.

  Raises:
    FileNotFoundError: If OpenCV cannot read ``image_path``.
  """
  image = cv2.imread(image_path)
  if image is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")
  image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  boxes, probs = face_detector.detect(image_rgb)

  if boxes is None:
    print(f"No faces detected in {image_path}")
    return

  # Draw on a copy so the crops embedded below come from clean pixels and do
  # not contain the red outline.
  annotated = image_rgb.copy()
  for x1, y1, x2, y2 in boxes:
    # The box already holds both corners; adding x1/y1 to x2/y2 (treating them
    # as width/height) placed the far corner off the face.
    cv2.rectangle(annotated, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 1)

  plt.title(f"Label: {label}")
  plt.imshow(Image.fromarray(annotated))
  plt.axis('off')
  plt.show()

  # NOTE(review): the crops are fed to the embedder without any explicit
  # standardisation step — confirm this matches what the model expects.
  transformed_faces = [face_embedder(extract_face(image_rgb, box).unsqueeze(0)) for box in boxes]

  # Convert PyTorch tensors to NumPy arrays for printing
  transformed_faces_np = [
      transformed_face.squeeze().detach().numpy() for transformed_face in transformed_faces
  ]
  for i, embeddings in enumerate(transformed_faces_np):
    print(f"Embeddings for Detected Face {i + 1}:", embeddings)

# Directory paths
train_directory = './train-dataset/train/'

# Draw one random folder and then one random file from inside it.
selected_folder = random.choice(os.listdir(train_directory))
selected_folder_path = os.path.join(train_directory, selected_folder)
folder_files = os.listdir(selected_folder_path)
file = random.choice(folder_files)
file_path = os.path.join(selected_folder_path, file)

# Models: InceptionResnetV1 in eval mode for embeddings, MTCNN with adjusted
# parameters for detection.
face_embedder = InceptionResnetV1(pretrained='vggface2').eval()
face_detector = MTCNN(margin=20, post_process=False, select_largest=False)

# Detect faces, compute embeddings and display the result for the chosen file.
process_image(file_path, face_detector, face_embedder, label=selected_folder)

In [None]:
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
from PIL import Image
from sklearn.metrics.pairwise import euclidean_distances

def process_images(folder_path1, file1, folder_path2, file2, face_detector, face_embedder):
  """Show the detected faces of two images and print their embedding distance.

  Each image is displayed next to a copy with its detected faces outlined.
  The Euclidean distance is computed between the embeddings of the first
  detected face of each image.

  Args:
    folder_path1: Folder of the first image; its base name is used as label.
    file1: File name of the first image inside ``folder_path1``.
    folder_path2: Folder of the second image; its base name is used as label.
    file2: File name of the second image inside ``folder_path2``.
    face_detector: Object whose ``detect(rgb_image)`` returns ``(boxes, probs)``.
      Boxes are interpreted as ``[x1, y1, x2, y2]`` corner coordinates, the
      format facenet_pytorch's MTCNN returns.
    face_embedder: Callable mapping a batched face tensor to an embedding tensor.

  Raises:
    FileNotFoundError: If OpenCV cannot read one of the images.
  """
  sources = [(folder_path1, file1), (folder_path2, file2)]
  first_faces = []  # one (rgb_image, first_box_or_None) pair per image

  for index, (folder_path, file_name) in enumerate(sources, start=1):
    image_path = os.path.join(folder_path, file_name)
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
      # cv2.imread reports failure by returning None rather than raising.
      raise FileNotFoundError(f"Could not read image: {image_path}")
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    # detect() returns None instead of an empty array when nothing is found.
    boxes, _ = face_detector.detect(image_rgb)
    if boxes is None:
      boxes = []

    label = os.path.basename(folder_path)

    plt.subplot(1, 2, 1)
    plt.imshow(Image.fromarray(image_rgb))
    plt.title(f"Original Image {index}\nLabel: {label}")

    # Outline faces on a copy so the crops embedded later stay unmarked.
    annotated = image_rgb.copy()
    for x1, y1, x2, y2 in boxes:
      # Boxes carry both corners; they are not (x, y, width, height).
      cv2.rectangle(annotated, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)

    plt.subplot(1, 2, 2)
    plt.imshow(Image.fromarray(annotated))
    plt.title(f"Detected Faces {index}")
    plt.axis('off')

    plt.show()

    first_faces.append((image_rgb, boxes[0] if len(boxes) > 0 else None))

  if any(box is None for _, box in first_faces):
    print("No face detected in at least one image; cannot compare embeddings.")
    return

  # Only the first face of each image takes part in the comparison.
  embeddings = [
      face_embedder(extract_face(image_rgb, box).unsqueeze(0)).squeeze().detach().numpy()
      for image_rgb, box in first_faces
  ]
  distance = euclidean_distances(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]

  # Display the distance between the two embeddings
  print(f"Distance between embeddings: {distance:.4f}")

train_directory = './train-dataset/train/'

# Two distinct random folders, and one random file out of each.
selected_folders = random.sample(os.listdir(train_directory), 2)
first_folder_path = os.path.join(train_directory, selected_folders[0])
second_folder_path = os.path.join(train_directory, selected_folders[1])
file1 = random.choice(os.listdir(first_folder_path))
file2 = random.choice(os.listdir(second_folder_path))

# MTCNN handles detection, InceptionResnetV1 (eval mode) the embeddings.
face_detector = MTCNN(keep_all=True)
face_embedder = InceptionResnetV1(pretrained='vggface2').eval()

# Detect faces, compute embeddings and print the distance between the images.
process_images(first_folder_path, file1, second_folder_path, file2, face_detector, face_embedder)

In [None]:
from facenet_pytorch import MTCNN, InceptionResnetV1, extract_face
from PIL import Image
from sklearn.metrics.pairwise import euclidean_distances

# Function to process images, detect faces, and calculate embeddings
def process_images(folder_path, face_detector, face_embedder):
  """Compare two random images of one folder by the distance of their face embeddings.

  Two distinct files are drawn from ``folder_path``. Each is displayed next to
  a copy with its detected faces outlined, and the Euclidean distance between
  the embeddings of the first detected face of each image is printed.

  Args:
    folder_path: Folder to draw the two images from; its base name is the label.
    face_detector: Object whose ``detect(rgb_image)`` returns ``(boxes, probs)``.
      Boxes are interpreted as ``[x1, y1, x2, y2]`` corner coordinates, the
      format facenet_pytorch's MTCNN returns.
    face_embedder: Callable mapping a batched face tensor to an embedding tensor.

  Raises:
    FileNotFoundError: If OpenCV cannot read one of the images.
    ValueError: From ``random.sample`` if the folder holds fewer than two files.
  """
  # Randomly pick two files from the selected folder
  files = random.sample(os.listdir(folder_path), 2)

  # Get the label from the folder name
  label = os.path.basename(folder_path)

  first_faces = []  # one (rgb_image, first_box_or_None) pair per image

  for index, file_name in enumerate(files, start=1):
    image_path = os.path.join(folder_path, file_name)
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
      # cv2.imread reports failure by returning None rather than raising.
      raise FileNotFoundError(f"Could not read image: {image_path}")
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    # detect() returns None instead of an empty array when nothing is found.
    boxes, _ = face_detector.detect(image_rgb)
    if boxes is None:
      boxes = []

    plt.subplot(1, 2, 1)
    plt.imshow(Image.fromarray(image_rgb))
    plt.title(f"Original Image {index}\nLabel: {label}")

    # Outline faces on a copy so the crops embedded later stay unmarked.
    annotated = image_rgb.copy()
    for x1, y1, x2, y2 in boxes:
      # Boxes carry both corners; they are not (x, y, width, height).
      cv2.rectangle(annotated, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0), 2)

    plt.subplot(1, 2, 2)
    plt.imshow(Image.fromarray(annotated))
    plt.title(f"Detected Faces {index}")
    plt.axis('off')

    plt.show()

    first_faces.append((image_rgb, boxes[0] if len(boxes) > 0 else None))

  if any(box is None for _, box in first_faces):
    print("No face detected in at least one image; cannot compare embeddings.")
    return

  # Only the first face of each image takes part in the comparison.
  embeddings = [
      face_embedder(extract_face(image_rgb, box).unsqueeze(0)).squeeze().detach().numpy()
      for image_rgb, box in first_faces
  ]
  distance = euclidean_distances(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]

  # Display the distance between the two embeddings
  print(f"Distance between embeddings: {distance:.4f}")

# Directory paths
train_directory = './train-dataset/train/'

# A single random folder supplies both images of the comparison.
selected_folder = random.choice(os.listdir(train_directory))
selected_folder_path = os.path.join(train_directory, selected_folder)

# Embedding network (eval mode) and face detector.
face_embedder = InceptionResnetV1(pretrained='vggface2').eval()
face_detector = MTCNN(keep_all=True)

# Detect faces, compute embeddings and print the distance.
process_images(selected_folder_path, face_detector, face_embedder)

## Preprocessing

In [None]:
import os
import shutil

# Set the directory path
directory_path = './train_detected_faces'

# The directory is only created by a later cell, so on a fresh run it is
# missing and os.listdir would raise FileNotFoundError; guard against that.
if os.path.isdir(directory_path):
  # Remove all files and subdirectories in the directory
  for item in os.listdir(directory_path):
    item_path = os.path.join(directory_path, item)
    if os.path.isfile(item_path) or os.path.islink(item_path):
      # Plain files and symlinks are unlinked; links are never followed.
      os.unlink(item_path)
    elif os.path.isdir(item_path):
      shutil.rmtree(item_path)
  print("Contents of train_detected_face cleared.")
else:
  print(f"{directory_path} does not exist; nothing to clear.")

In [None]:
import os
import cv2
import numpy as np

data_directory = './train-dataset/train'
output_directory = './train_detected_faces/'

def detect_faces_and_save(image_paths, output_directory):
  """Crop the first Haar-cascade face of every image, resize it and save it.

  For each readable image in which at least one face is found, the first
  detection is cropped, resized to 224x224 and written to
  ``<output_directory>/<parent folder of the image>/detected_face_<i>.jpg``,
  where ``i`` is the position of the image in ``image_paths``.

  Args:
    image_paths: Sequence of image file paths.
    output_directory: Root directory for the cropped faces.

  Returns:
    ``np.array`` built from the list of resized face crops, in processing order.
  """
  # Loading the cascade does not depend on the image, so do it once instead
  # of once per file. This is OpenCV's Haar cascade frontal-face detector.
  face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

  processed_images = []
  images_without_faces = 0
  unreadable_images = 0

  for i, image_path in enumerate(image_paths):
    img = cv2.imread(image_path)
    if img is None:
      # cv2.imread returns None for files it cannot decode; skip them rather
      # than failing inside cvtColor.
      unreadable_images += 1
    else:
      gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
      faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
      if len(faces) > 0:
        # Only the first detection is used, whatever its size.
        x, y, w, h = faces[0]
        resized_face = cv2.resize(img[y:y+h, x:x+w], (224, 224))

        # Parent directory name of the image; os.path works with any path
        # separator, unlike splitting on '/'.
        folder_name = os.path.basename(os.path.dirname(image_path))
        output_folder = os.path.join(output_directory, folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Save the detected face with the same folder structure
        output_path = os.path.join(output_folder, f"detected_face_{i}.jpg")
        cv2.imwrite(output_path, resized_face)

        processed_images.append(resized_face)
      else:
        images_without_faces += 1

    if i % 50 == 0:
      print(f"{i}/{len(image_paths)} images processed", end='\r', flush=True)

  print(f"\nImages without faces detected/Total images: {images_without_faces}/{len(image_paths)}")
  if unreadable_images:
    print(f"Unreadable files skipped: {unreadable_images}")
  return np.array(processed_images)

# Make sure the root output directory is present before anything is written.
os.makedirs(output_directory, exist_ok=True)

# Collect the path of every file found in each sub-directory of the data
# directory, folder by folder.
candidate_dirs = [os.path.join(data_directory, name) for name in os.listdir(data_directory)]
all_image_paths = [
    os.path.join(candidate_dir, image_name)
    for candidate_dir in candidate_dirs
    if os.path.isdir(candidate_dir)
    for image_name in os.listdir(candidate_dir)
]

X_all_processed = detect_faces_and_save(all_image_paths, output_directory)

print("face detection complete")

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np

# Replace this with the path to your detected faces directory
output_directory = './train_detected_faces'

# Get a list of subdirectories (folders) in the output directory
folders = [f for f in os.listdir(output_directory) if os.path.isdir(os.path.join(output_directory, f))]

# The figure is a grid_size x grid_size grid: one row per folder, one column per image.
grid_size = 5

# Sampling without replacement raises ValueError when more folders are
# requested than exist, so cap the request.
selected_folders = np.random.choice(folders, size=min(grid_size, len(folders)), replace=False)

# Set up the subplot; every cell starts hidden so rows/columns that receive
# no image stay blank instead of showing empty axes.
fig, axes = plt.subplots(nrows=grid_size, ncols=grid_size, figsize=(15, 15))
for ax in axes.ravel():
  ax.axis('off')

for i, folder_name in enumerate(selected_folders):
  folder_path = os.path.join(output_directory, folder_name)
  image_paths = [
      os.path.join(folder_path, image_name)
      for image_name in os.listdir(folder_path)[:grid_size]
  ]

  for j, image_path in enumerate(image_paths):
    # Read the image using OpenCV; None means the file could not be decoded
    img = cv2.imread(image_path)
    if img is None:
      continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Display the image
    axes[i, j].imshow(img)
    axes[i, j].set_title(f'{folder_name} - {j+1}')

plt.tight_layout()
plt.show()

In [None]:
import os

output_directory = './train_detected_faces'

# Sub-directories of the output directory, one per label.
all_folders = [
    entry for entry in os.listdir(output_directory)
    if os.path.isdir(os.path.join(output_directory, entry))
]

# Report how many entries each folder holds.
for folder in all_folders:
  file_count = len(os.listdir(os.path.join(output_directory, folder)))
  print(f"Folder: {folder}, File Count: {file_count}")

In [None]:
from facenet_pytorch import InceptionResnetV1
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
import torch

def process_images(face1_path, face2_path, face_embedder):
  """Show two pre-cropped face images and print the distance of their embeddings.

  Args:
    face1_path: Path of the first face image; its parent folder name is the label.
    face2_path: Path of the second face image; its parent folder name is the label.
    face_embedder: Callable mapping a batched float image tensor to an
      embedding tensor.

  Raises:
    FileNotFoundError: If OpenCV cannot read one of the images.
  """
  face_paths = (face1_path, face2_path)

  # Read the pre-detected face images (OpenCV loads BGR, convert to RGB)
  faces_rgb = []
  for face_path in face_paths:
    face_bgr = cv2.imread(face_path)
    if face_bgr is None:
      # cv2.imread reports failure by returning None rather than raising.
      raise FileNotFoundError(f"Could not read image: {face_path}")
    faces_rgb.append(cv2.cvtColor(face_bgr, cv2.COLOR_BGR2RGB))

  # Display both faces side by side, labelled with their folder names
  for position, (face_path, face_rgb) in enumerate(zip(face_paths, faces_rgb), start=1):
    label = os.path.basename(os.path.dirname(face_path))
    plt.subplot(1, 2, position)
    plt.imshow(Image.fromarray(face_rgb))
    plt.title(f"Original Face {position}\nLabel: {label}")
    plt.axis('off')

  plt.show()

  # Inference only: no_grad avoids building an autograd graph.
  embeddings = []
  with torch.no_grad():
    for face_rgb in faces_rgb:
      # Height-width-channel array -> channel-first batch of one, scaled by 1/255
      face_tensor = torch.from_numpy(face_rgb.transpose(2, 0, 1)).unsqueeze(0).float() / 255.0
      embeddings.append(face_embedder(face_tensor).detach().numpy())

  # Calculate distance between the two embeddings
  distance = euclidean_distances(embeddings[0].reshape(1, -1), embeddings[1].reshape(1, -1))[0][0]

  # Display the distance between the two embeddings
  print(f"Distance between embeddings: {distance:.4f}")

# Directory paths
train_detected_faces_directory = './train_detected_faces/'

# Two distinct random folders ...
selected_folders = random.sample(os.listdir(train_detected_faces_directory), 2)
first_folder_path = os.path.join(train_detected_faces_directory, selected_folders[0])
second_folder_path = os.path.join(train_detected_faces_directory, selected_folders[1])

# ... and one random file out of each of them.
file1 = random.choice(os.listdir(first_folder_path))
file2 = random.choice(os.listdir(second_folder_path))

# Full paths of the two chosen face crops.
face1_path = os.path.join(first_folder_path, file1)
face2_path = os.path.join(second_folder_path, file2)

# Embedding network in eval mode.
face_embedder = InceptionResnetV1(pretrained='vggface2').eval()

# Show both faces and print the distance between their embeddings.
process_images(face1_path, face2_path, face_embedder)

## Extracting embeddings

In [None]:
import shutil
import os

folder_path = './extracted_embeddings'

# Remove the whole embeddings tree when present; otherwise just report that
# there is nothing to delete.
if not os.path.exists(folder_path):
  print(f"The folder {folder_path} does not exist.")
else:
  shutil.rmtree(folder_path)
  print(f"The folder {folder_path} has been deleted.")

In [None]:
from facenet_pytorch import InceptionResnetV1
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

# Function to extract embeddings from a single folder with data augmentation
def extract_embeddings_from_folder(folder_path, face_embedder, device, output_directory):
  """Embed every image of a folder, plus one randomly augmented copy of each.

  For each file two ``.npy`` files are written to ``output_directory``:
  ``<label>_<stem>_embedding.npy`` for the image as stored and
  ``<label>_<stem>_augmented_embedding.npy`` for a randomly flipped,
  colour-jittered and rotated version. Files that fail to process are
  reported on stdout and skipped.

  Args:
    folder_path: Folder holding the images; its base name becomes the label.
    face_embedder: Callable mapping a batched image tensor to an embedding tensor.
    device: Device the input tensors are moved to before embedding.
    output_directory: Existing directory that receives the ``.npy`` files.

  Returns:
    Tuple ``(label, embeddings)`` where ``embeddings`` maps the image file name
    to its embedding and ``<stem>_augmented_embedding.npy`` to the embedding of
    the augmented copy.
  """
  import torch  # function-scope import: only needed for no_grad below

  embeddings = {}
  # The augmentation is random and not seeded here, so the augmented
  # embeddings differ between runs.
  data_transform = transforms.Compose([
      transforms.RandomHorizontalFlip(),
      transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
      transforms.RandomRotation(10),
      transforms.ToTensor()
  ])
  to_tensor = transforms.ToTensor()
  label = os.path.basename(folder_path)  # Get the label from the folder name

  for image_name in tqdm(os.listdir(folder_path), desc=f"Processing {folder_path}"):
    image_path = os.path.join(folder_path, image_name)
    stem = os.path.splitext(image_name)[0]
    try:
      # The context manager closes the file handle; convert('RGB') loads the
      # pixels and guarantees three channels for grayscale or RGBA inputs.
      with Image.open(image_path) as handle:
        img = handle.convert('RGB')

      # Inference only: no_grad avoids building an autograd graph.
      with torch.no_grad():
        img_tensor = to_tensor(img).unsqueeze(0).float().to(device)
        embedding = face_embedder(img_tensor).squeeze().detach().cpu().numpy()
      embeddings[image_name] = embedding

      # Save the embedding for the original image
      output_emb_path = os.path.join(output_directory, f"{label}_{stem}_embedding.npy")
      np.save(output_emb_path, embedding)

      # Embed one randomly augmented copy of the same image
      with torch.no_grad():
        img_tensor_augmented = data_transform(img).unsqueeze(0).float().to(device)
        embedding_augmented = face_embedder(img_tensor_augmented).squeeze().detach().cpu().numpy()
      embeddings[f"{stem}_augmented_embedding.npy"] = embedding_augmented

      output_emb_path_augmented = os.path.join(output_directory, f"{label}_{stem}_augmented_embedding.npy")
      np.save(output_emb_path_augmented, embedding_augmented)
    except Exception as e:
      # Deliberate best effort: one bad file must not abort the whole folder.
      print(f"Error processing {image_name}: {str(e)}")

  return label, embeddings

input_directory = './train_detected_faces'
output_directory = './extracted_embeddings'

# Embedding network in eval mode, placed on the chosen device.
device = 'cpu'
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
face_embedder = InceptionResnetV1(pretrained='vggface2').eval().to(device)

# Each entry of the input directory gets a sub-folder of the same name in the
# output directory, which receives that entry's embeddings.
for folder_name in os.listdir(input_directory):
  label_output_directory = os.path.join(output_directory, folder_name)
  os.makedirs(label_output_directory, exist_ok=True)

  folder_path = os.path.join(input_directory, folder_name)
  label, embeddings = extract_embeddings_from_folder(
      folder_path, face_embedder, device, label_output_directory
  )

print("Embeddings extraction complete.")


In [None]:
import os

# Directory path
output_directory = './extracted_embeddings'

# Pair every entry of the output directory with the number of files inside it.
folders_and_counts = []
for entry in os.listdir(output_directory):
    entry_files = os.listdir(os.path.join(output_directory, entry))
    folders_and_counts.append((entry, len(entry_files)))

# Print results
for folder, count in folders_and_counts:
    print(f"Folder: {folder}, Number of Files: {count}")

In [None]:
import shutil

folder_path = './extracted_embeddingstest'

# Remove the whole test-embeddings tree when present; otherwise just report
# that there is nothing to delete.
if not os.path.exists(folder_path):
    print(f"The folder {folder_path} does not exist.")
else:
    shutil.rmtree(folder_path)
    print(f"The folder {folder_path} has been deleted.")

In [None]:
import shutil

source_folder = "./extracted_embeddings"
destination_folder = "./extracted_embeddingstest"

# Number of non-augmented embeddings held out per label.
files_per_label = 2

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Iterate through each folder in the source directory
for folder_name in os.listdir(source_folder):
  folder_path = os.path.join(source_folder, folder_name)

  # Only directories hold embeddings
  if not os.path.isdir(folder_path):
    continue

  # Create the corresponding folder in the destination directory
  destination_folder_path = os.path.join(destination_folder, folder_name)
  os.makedirs(destination_folder_path, exist_ok=True)

  # os.listdir returns names in arbitrary order; sorting makes the choice of
  # held-out files the same on every run.
  originals = [
      file_name for file_name in sorted(os.listdir(folder_path))
      if "augmented" not in file_name
  ]

  # NOTE(review): the "augmented" embedding derived from each held-out image
  # stays in the source folder and therefore still contributes to that label's
  # average — confirm whether it should be excluded from training as well.
  for file_name in originals[:files_per_label]:
    source_file_path = os.path.join(folder_path, file_name)
    destination_file_path = os.path.join(destination_folder_path, file_name)

    # Move the file (not copy)
    shutil.move(source_file_path, destination_file_path)

print("Files moved successfully.")

## Recognizing faces using distances between embeddings

In [None]:
# Function to calculate average embedding for each label
def calculate_average_embeddings(base_folder):
    """Average the saved embeddings of every label folder.

    Args:
        base_folder: Directory with one sub-directory per label, each holding
            ``.npy`` embedding files.

    Returns:
        dict mapping each label (sub-directory name) to the element-wise mean
        of its embeddings. Labels without any ``.npy`` file are left out.
    """
    average_embeddings = {}

    for label_folder in os.listdir(base_folder):
        label_path = os.path.join(base_folder, label_folder)
        if not os.path.isdir(label_path):
            continue

        # Only .npy files are embeddings; anything else would make np.load fail.
        label_embeddings = [
            np.load(os.path.join(label_path, file_name))
            for file_name in os.listdir(label_path)
            if file_name.endswith('.npy')
        ]

        # np.mean of an empty list is NaN, which would poison every distance
        # computed against this label, so such labels are skipped.
        if not label_embeddings:
            continue

        average_embeddings[label_folder] = np.mean(label_embeddings, axis=0)

    return average_embeddings

# Function to recognize faces based on existing embeddings
def recognize_faces(test_folder, average_embeddings):
    """Assign each test embedding the label of the nearest average embedding.

    Args:
        test_folder: Directory with one sub-directory per true label, each
            holding ``.npy`` embedding files.
        average_embeddings: dict mapping label to its average embedding.

    Returns:
        List of ``(file_name, actual_label, predicted_label)`` tuples, where
        ``actual_label`` is the sub-directory name and ``predicted_label`` the
        label whose average has the smallest Euclidean distance.

    Raises:
        ValueError: If a test embedding is found but ``average_embeddings`` is empty.
    """
    predictions = []

    for label_folder in os.listdir(test_folder):
        label_path = os.path.join(test_folder, label_folder)
        if not os.path.isdir(label_path):
            continue

        for file_name in os.listdir(label_path):
            # Only .npy files are embeddings; anything else would make np.load fail.
            if not file_name.endswith('.npy'):
                continue
            test_embedding = np.load(os.path.join(label_path, file_name))

            if not average_embeddings:
                # min() over an empty dict raises an uninformative ValueError.
                raise ValueError("average_embeddings is empty; cannot predict a label.")

            # Euclidean distance to every label's average embedding
            distances = {
                label: np.linalg.norm(test_embedding - avg_embedding)
                for label, avg_embedding in average_embeddings.items()
            }

            # Predict the label with the minimum distance
            predicted_label = min(distances, key=distances.get)
            predictions.append((file_name, label_folder, predicted_label))

    return predictions

# folder paths
test_folder = './extracted_embeddingstest'
base_folder = './extracted_embeddings'

# One average embedding per label, computed from the base folder.
average_embeddings = calculate_average_embeddings(base_folder)

# Nearest-average prediction for every embedding in the test folder.
predictions = recognize_faces(test_folder, average_embeddings)

# Report each test file with its actual and predicted label.
for prediction in predictions:
    file_name, actual_label, predicted_label = prediction
    print(f"{file_name}: Actual Label - {actual_label}, Predicted Label - {predicted_label}")

In [None]:
from PIL import Image

# Function to calculate average embedding for each label
def calculate_average_embeddings(base_folder):
    """Average the saved embeddings of every label folder.

    Args:
        base_folder: Directory with one sub-directory per label, each holding
            ``.npy`` embedding files.

    Returns:
        dict mapping each label (sub-directory name) to the element-wise mean
        of its embeddings. Labels without any ``.npy`` file are left out.
    """
    average_embeddings = {}

    for label_folder in os.listdir(base_folder):
        label_path = os.path.join(base_folder, label_folder)
        if not os.path.isdir(label_path):
            continue

        # Only .npy files are embeddings; anything else would make np.load fail.
        label_embeddings = [
            np.load(os.path.join(label_path, file_name))
            for file_name in os.listdir(label_path)
            if file_name.endswith('.npy')
        ]

        # np.mean of an empty list is NaN, which would poison every distance
        # computed against this label, so such labels are skipped.
        if not label_embeddings:
            continue

        average_embeddings[label_folder] = np.mean(label_embeddings, axis=0)

    return average_embeddings

# Function to recognize faces based on existing embeddings
def recognize_faces(test_folder, average_embeddings):
    """Assign each test embedding the label of the nearest average embedding.

    Args:
        test_folder: Directory with one sub-directory per true label, each
            holding ``.npy`` embedding files.
        average_embeddings: dict mapping label to its average embedding.

    Returns:
        List of ``(file_name, actual_label, predicted_label)`` tuples, where
        ``actual_label`` is the sub-directory name and ``predicted_label`` the
        label whose average has the smallest Euclidean distance.

    Raises:
        ValueError: If a test embedding is found but ``average_embeddings`` is empty.
    """
    predictions = []

    for label_folder in os.listdir(test_folder):
        label_path = os.path.join(test_folder, label_folder)
        if not os.path.isdir(label_path):
            continue

        for file_name in os.listdir(label_path):
            # Only .npy files are embeddings; anything else would make np.load fail.
            if not file_name.endswith('.npy'):
                continue
            test_embedding = np.load(os.path.join(label_path, file_name))

            if not average_embeddings:
                # min() over an empty dict raises an uninformative ValueError.
                raise ValueError("average_embeddings is empty; cannot predict a label.")

            # Euclidean distance to every label's average embedding
            distances = {
                label: np.linalg.norm(test_embedding - avg_embedding)
                for label, avg_embedding in average_embeddings.items()
            }

            # Predict the label with the minimum distance
            predicted_label = min(distances, key=distances.get)
            predictions.append((file_name, label_folder, predicted_label))

    return predictions

import random

# Function to display images with actual and predicted labels
def _face_identifier(file_name):
    """Extract the face identifier from an embedding file name.

    The identifier is the text following ``detected_face_`` up to the next
    underscore (or the end of the name), with the file extension removed so a
    name such as ``x_detected_face_12.npy`` yields ``12`` rather than
    ``12.npy``. If the marker is absent the whole extension-less name is
    returned.
    """
    stem = os.path.splitext(file_name)[0]
    marker = "detected_face_"
    marker_pos = stem.find(marker)
    if marker_pos == -1:
        return stem
    remainder = stem[marker_pos + len(marker):]
    return remainder.split("_", 1)[0]


def display_random_images(predictions, image_folder, num_images=40, max_cols=5):
    """Show a random sample of face images titled with actual/predicted labels.

    Args:
        predictions: list of ``(file_name, actual_label, predicted_label)``
            tuples, e.g. the output of ``recognize_faces``.
        image_folder: Root folder of the face images; each image is looked up
            at ``<image_folder>/<actual_label>/detected_face_<identifier>.jpg``.
        num_images: Maximum number of predictions to sample and display.
        max_cols: Maximum number of images per row (values below 1 are
            treated as 1).

    Returns:
        None. The figure is displayed with ``plt.show()``. Images that cannot
        be found are reported with a printed message and their cell is left
        blank.
    """
    selected_predictions = random.sample(predictions, min(num_images, len(predictions)))
    if not selected_predictions:
        print("No predictions to display.")
        return

    # Size the grid from what was actually sampled, not from the requested
    # maximum, so no rows of empty axes are drawn.
    num_shown = len(selected_predictions)
    num_cols = min(max(1, max_cols), num_shown)
    num_rows = (num_shown + num_cols - 1) // num_cols

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so the [row, col] indexing below is always valid.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 2 * num_rows), squeeze=False)
    fig.subplots_adjust(hspace=0.5)  # Adjust the vertical spacing

    # Hide ticks/frames everywhere up front; unused or skipped cells stay blank
    for ax in axes.flat:
        ax.axis('off')

    for i, (file_name, actual_label, predicted_label) in enumerate(selected_predictions):
        identifier = _face_identifier(file_name)

        # Construct the file path for the corresponding image
        image_name = f"detected_face_{identifier}.jpg"
        image_path = os.path.join(image_folder, actual_label, image_name)

        ax = axes[i // num_cols, i % num_cols]
        try:
            image = Image.open(image_path)
        except FileNotFoundError:
            print(f"Image not found for {identifier}. Skipping to the next one.")
            continue

        ax.imshow(image)
        ax.set_title(f"Actual: {actual_label}\nPredicted: {predicted_label}")

    plt.show()

# folder paths
base_folder = './extracted_embeddings'
test_folder = './extracted_embeddingstest'
# NOTE(review): predictions are built from the test embeddings, yet images are
# looked up under the train faces folder - confirm this is the intended root.
image_folder = './train_detected_faces'

# Load existing average embeddings
average_embeddings = calculate_average_embeddings(base_folder)

# Recognize faces in the test folder
predictions = recognize_faces(test_folder, average_embeddings)

# Display only after `predictions` has been computed in this cell, so the cell
# does not depend on a value left over from an earlier run.
display_random_images(predictions, image_folder, num_images=40)

In [None]:
def load_test_embeddings(test_folder):
    """Load every saved test embedding together with its file name and label.

    Args:
        test_folder: Directory with one sub-directory per label, each holding
            embeddings saved as ``.npy`` files.

    Returns:
        list of ``(file_name, label_folder, embedding)`` tuples, where
        ``label_folder`` is the name of the sub-directory the file was found in
        and ``embedding`` is the array returned by ``np.load``.
    """
    test_embeddings = []

    for label_folder in os.listdir(test_folder):
        label_path = os.path.join(test_folder, label_folder)
        if not os.path.isdir(label_path):
            continue

        for file_name in os.listdir(label_path):
            # Skip anything that is not a saved embedding (stray files,
            # checkpoint directories); np.load would fail on those.
            if not file_name.endswith(".npy"):
                continue

            test_embedding = np.load(os.path.join(label_path, file_name))
            test_embeddings.append((file_name, label_folder, test_embedding))

    return test_embeddings

# folder paths
base_folder = './extracted_embeddings'
test_folder = './extracted_embeddingstest'

# Load existing average embeddings
average_embeddings = calculate_average_embeddings(base_folder)

# Load test embeddings
test_embeddings = load_test_embeddings(test_folder)

# Randomly pick (up to) two test embeddings. random.sample raises ValueError
# when asked for more items than exist, so cap the request at what is available.
random_test_embeddings = random.sample(test_embeddings, min(2, len(test_embeddings)))

# Calculate and print distances between the random test embeddings and all average embeddings
for file_name, label, test_embedding in random_test_embeddings:
    print(f"\nDistances for {file_name} ({label}) against Average Embeddings:")
    for avg_label, avg_embedding in average_embeddings.items():
        distance = np.linalg.norm(test_embedding - avg_embedding)
        print(f"Distance to {avg_label}: {distance}")

RECOGNIZING FACES - Classification model - Multilayered Neural Network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from json import JSONEncoder
import json

# Disable CUDA
# NOTE(review): this replaces torch.cuda.is_available for the whole kernel
# session, not just this cell - confirm that is intended.
torch.cuda.is_available = lambda : False

# Set the device to CPU
device = torch.device("cpu")

# Load embeddings and labels: each sub-folder name of data_dir is used as the
# label of every .npy embedding found inside it.
data_dir = "./extracted_embeddings"
embeddings = []
labels = []

for label_folder in os.listdir(data_dir):
    label_path = os.path.join(data_dir, label_folder)
    if not os.path.isdir(label_path):
        continue

    for file_name in os.listdir(label_path):
        if file_name.endswith(".npy"):
            embeddings.append(np.load(os.path.join(label_path, file_name)))
            labels.append(label_folder)

if not embeddings:
    raise ValueError(f"No .npy embeddings found under {data_dir!r}")

# Convert data to PyTorch tensors. Stacking into a single ndarray first avoids
# the slow element-by-element path torch takes for a list of ndarrays.
X = torch.tensor(np.array(embeddings), dtype=torch.float32)
labels = np.array(labels)

# Use LabelEncoder to encode string labels into integers
label_encoder = LabelEncoder()
y = torch.tensor(label_encoder.fit_transform(labels), dtype=torch.long)

# Split the data into 80% training, 10% validation, and 10% test
# (the 20% hold-out is split in half; fixed random_state keeps it repeatable)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define an updated model with more layers
class UpdatedModel(nn.Module):
    """Fully connected classifier: input -> 256 -> 128 -> 64 -> num_classes.

    Every hidden linear layer is followed by a ReLU; the final layer returns
    raw, un-normalised scores (one per class).
    """

    def __init__(self, input_size, num_classes):
        """Create the four linear layers and the three ReLU activations.

        Args:
            input_size: Number of features in each input row.
            num_classes: Number of output scores produced by the last layer.
        """
        super().__init__()

        # Hidden stage 1
        self.fc1 = nn.Linear(input_size, 256)
        self.relu1 = nn.ReLU()

        # Hidden stage 2
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()

        # Hidden stage 3
        self.fc3 = nn.Linear(128, 64)
        self.relu3 = nn.ReLU()

        # Output layer (no activation)
        self.fc4 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Run ``x`` through the three hidden stages and the output layer."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        hidden = self.relu3(self.fc3(hidden))
        return self.fc4(hidden)

from sklearn.metrics import classification_report

# Initialize the model
input_size = X_train.shape[1]
# NOTE(review): y_train is a tensor; iterating it yields 0-d tensors that hash
# by identity, so this appears to evaluate to len(y_train) (the sample count)
# rather than the number of distinct classes - check the value printed below.
# Left unchanged because the shape of the saved ./weights checkpoint depends on
# it: switch to len(label_encoder.classes_) together with every piece of code
# that rebuilds UpdatedModel to load ./weights.
num_classes = len(set(y_train))
print(input_size, num_classes)
model = UpdatedModel(input_size, num_classes).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: plain mini-batch passes over X_train in its stored order
num_epochs = 20
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i in range(0, len(X_train), batch_size):
        # Distinct names so the module-level `labels` array is not overwritten
        batch_inputs = X_train[i:i+batch_size].to(device)
        batch_labels = y_train[i:i+batch_size].to(device)

        optimizer.zero_grad()

        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the shorter final batch does not skew the mean
        running_loss += loss.item() * batch_inputs.size(0)

    # Report the mean loss over the whole epoch, not just the last mini-batch
    train_loss = running_loss / len(X_train)

    # Validation
    model.eval()
    with torch.no_grad():
        val_inputs = X_val.to(device)
        val_labels = y_val.to(device)

        val_outputs = model(val_inputs)
        val_loss = criterion(val_outputs, val_labels)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss.item():.4f}")

# Evaluation on the test data
model.eval()
with torch.no_grad():
    test_inputs = X_test.to(device)
    test_labels = y_test.to(device)

    test_outputs = model(test_inputs)
    test_loss = criterion(test_outputs, test_labels)

predicted_indices = torch.argmax(test_outputs, dim=1)
test_accuracy = (predicted_indices == test_labels).float().mean().item()
print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy:.4f}")

# Decode the predicted labels using inverse_transform
predicted_labels = label_encoder.inverse_transform(predicted_indices.cpu().numpy())

# Decode the true labels using inverse_transform
true_labels = label_encoder.inverse_transform(y_test.cpu().numpy())

# Display classification report
print("Classification Report: ")
print(classification_report(true_labels, predicted_labels))

# Persist only the parameters; loading requires rebuilding UpdatedModel with
# the same input_size and num_classes.
torch.save(model.state_dict(), "./weights")

In [None]:
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Rebuild the same dataset and split as the training cell (same folder, same
# random_state) so X_test / y_test line up with what the model was tested on.
data_dir = "./extracted_embeddings"
embeddings = []
labels = []

for label_folder in os.listdir(data_dir):
    label_path = os.path.join(data_dir, label_folder)
    if not os.path.isdir(label_path):
        continue

    # The folder name is used as the label of every .npy embedding inside it
    for file_name in os.listdir(label_path):
        if file_name.endswith(".npy"):
            embeddings.append(np.load(os.path.join(label_path, file_name)))
            labels.append(label_folder)

# Stacking into one ndarray first avoids the slow path torch takes for a list
# of ndarrays.
X = torch.tensor(np.array(embeddings), dtype=torch.float32)
labels = np.array(labels)

label_encoder = LabelEncoder()
y = torch.tensor(label_encoder.fit_transform(labels), dtype=torch.long)

# Split the data into 80% training, 10% validation, and 10% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print("y_test", y_test)

input_size = X_train.shape[1]

device = torch.device("cpu")

# Size the output layer from the checkpoint itself (rows of the last linear
# layer's weight) so the rebuilt architecture always matches the saved weights.
state_dict = torch.load('./weights', map_location=device)
num_classes = state_dict["fc4.weight"].shape[0]

model = UpdatedModel(input_size, num_classes).to(device)
model.load_state_dict(state_dict)
model.eval()

# Inference only: no_grad avoids building an autograd graph for these passes
test_inputs = X_test.to(device)
with torch.no_grad():
    test_outputs = model(test_inputs)
print("test_outputs", test_outputs)

# Single-sample prediction: keep the batch dimension by slicing with 0:1
with torch.no_grad():
    test_outputs = model(test_inputs[0:1])
print("test_outputs", test_outputs)
predicted_labels = label_encoder.inverse_transform(torch.argmax(test_outputs, dim=1).cpu().numpy())
print("predicted_labels", predicted_labels, len(predicted_labels))