# Now, we will process the images using transformations to get more data for each celebrity. We ultimately want 300 augmented images for each celebrity.

## Packages needed:

pip install opencv-python

pip install numpy

pip install tensorflow

pip install scipy

pip install Pillow==10.2.0

pip install facenet-pytorch


## Installing LibGL library:

### On Debian/Ubuntu-based Linux systems, run:

sudo apt-get update

sudo apt-get install -y libgl1-mesa-glx

### On CentOS/RHEL-based systems, run:

sudo yum install mesa-libGL

In [1]:
import os
import shutil
import cv2  # OpenCV for face detection
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img

# Dataset locations
base_path = "/workspace/DS4002Project3/DATA/celebrities"  # Source images, one sub-folder per celebrity
extra_output_path = "/workspace/DS4002Project3/DATA/celebrities_extra"  # Receives the augmentation output
all_output_path = "/workspace/DS4002Project3/DATA/celebrities_all"  # Receives the final dataset

# Haar cascade XML consumed by the OpenCV face detector
haar_cascade_path = "/workspace/DS4002Project3/DATA/haarcascade_frontalface_default.xml"  # Update this path if needed

# Every run starts from empty output folders: drop whatever a previous run
# left behind, then recreate the folder.
for output_dir in (extra_output_path, all_output_path):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

# Random transformations applied to each source image. The ranges are kept
# small so that the face is not pushed out of the frame.
augmentation_settings = {
    "rotation_range": 10,       # Reduced rotation range
    "width_shift_range": 0.1,   # Small horizontal shift
    "height_shift_range": 0.1,  # Small vertical shift
    "shear_range": 0.2,         # Moderate shear
    "zoom_range": 0.1,          # Slight zoom
    "horizontal_flip": True,    # Mirror images for extra variation
    "fill_mode": 'nearest',     # Fill pixels exposed by a transformation with the nearest value
}
datagen = ImageDataGenerator(**augmentation_settings)

# Cascade classifier, loaded on the first detect_face() call and reused
# afterwards so the XML file is not re-parsed for every image.
_face_cascade = None


def detect_face(image_path):
    """Return True if OpenCV finds a reasonably sized face in the image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True when at least one detection is wider and taller than 50 pixels;
        False otherwise, including when the file cannot be read as an image.

    Raises:
        RuntimeError: If the Haar cascade at ``haar_cascade_path`` cannot be
            loaded.
    """
    global _face_cascade
    if _face_cascade is None:
        cascade = cv2.CascadeClassifier(haar_cascade_path)
        if cascade.empty():
            # Fail with a clear message instead of an opaque OpenCV assertion
            # later in detectMultiScale.
            raise RuntimeError(f"Could not load Haar cascade from {haar_cascade_path}")
        _face_cascade = cascade

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable or non-image file by returning
        # None; treat that as "no face" rather than crashing in cvtColor.
        return False

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = _face_cascade.detectMultiScale(gray, 1.1, 4)

    # Only count detections with a reasonable size.
    return any(w > 50 and h > 50 for (_x, _y, w, h) in faces)

# Per-celebrity pipeline:
#   1. save each original (as NNN.jpg) and generate augmented versions of it
#      in the "celebrities_extra" folder;
#   2. put the originals into "celebrities_all" under the same NNN.jpg names;
#   3. copy up to 300 face-detected augmented images into "celebrities_all",
#      numbered from 101.
AUGMENTATIONS_PER_IMAGE = 5
AUGMENTED_TARGET = 300
FIRST_AUGMENTED_INDEX = 101
EXPECTED_TOTAL = 400
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

for celeb in sorted(os.listdir(base_path)):
    celeb_folder = os.path.join(base_path, celeb)
    if not os.path.isdir(celeb_folder):
        # Stray files next to the celebrity folders would otherwise make
        # os.listdir(celeb_folder) fail.
        continue

    save_folder = os.path.join(extra_output_path, celeb)
    final_save_folder = os.path.join(all_output_path, celeb)
    os.makedirs(save_folder, exist_ok=True)
    os.makedirs(final_save_folder, exist_ok=True)

    # Source images, matched case-insensitively and processed in a stable order
    images = sorted(
        img for img in os.listdir(celeb_folder)
        if img.lower().endswith(IMAGE_EXTENSIONS)
    )

    # Names of the originals written to save_folder, so they can be told
    # apart from the augmented files later on.
    original_names = set()

    for image_index, image_name in enumerate(images, start=1):
        img_path = os.path.join(celeb_folder, image_name)
        img = load_img(img_path)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape)  # Reshape to (1, height, width, channels)

        # Save the original image next to its augmented versions
        original_image_name = f"{image_index:03}.jpg"
        img.save(os.path.join(save_folder, original_image_name))
        original_names.add(original_image_name)

        # Put the original into the final folder under the same sequential
        # name. This does not depend on the source file name being numeric.
        final_image_path = os.path.join(final_save_folder, original_image_name)
        if image_name.lower().endswith(('.jpg', '.jpeg')):
            shutil.copy2(img_path, final_image_path)  # keep the JPEG bytes untouched
        else:
            img.save(final_image_path)  # re-encode so the content matches the .jpg name

        # Generate the augmented versions. The per-image prefix keeps files
        # derived from different source images from overwriting each other,
        # since the generator otherwise only adds a short random suffix.
        flow = datagen.flow(
            x,
            batch_size=1,
            save_to_dir=save_folder,
            save_prefix=f"aug{image_index:03}",
            save_format='jpg',
        )
        for augmented_count, _batch in enumerate(flow, start=1):
            if augmented_count >= AUGMENTATIONS_PER_IMAGE:
                break

    # Run face detection on the augmented images only. The originals share
    # the folder and sort first, so without this filter they would be copied
    # a second time as "augmented" images.
    face_detected_images = []
    for filename in sorted(os.listdir(save_folder)):
        if filename in original_names:
            continue
        file_path = os.path.join(save_folder, filename)
        if detect_face(file_path):
            face_detected_images.append(file_path)

    print(f"{celeb} - Found {len(face_detected_images)} augmented images with detected faces.")

    # With fewer than 300 valid images we just use whatever we have
    if len(face_detected_images) < AUGMENTED_TARGET:
        print(f"Warning: {celeb} only has {len(face_detected_images)} face-detected augmented images.")

    # Copy the first 300 images with detected faces to the "celebrities_all" folder
    for idx, image_path in enumerate(face_detected_images[:AUGMENTED_TARGET]):
        new_image_name = f"{FIRST_AUGMENTED_INDEX + idx:03}.jpg"  # Naming from 101 to 400
        new_image_path = os.path.join(final_save_folder, new_image_name)
        shutil.copy2(image_path, new_image_path)

    # Report folders that did not end up with exactly 400 images
    total_images = len(os.listdir(final_save_folder))
    if total_images != EXPECTED_TOTAL:
        print(f"Warning: {celeb} folder has {total_images} images, expected 400.")

print("Image augmentation and organization completed.")

ModuleNotFoundError: No module named 'cv2'

### SZA's folder was short some images. Let's correct this by implementing a separate for-loop to populate images for the SZA folder in celebrities_all:

In [None]:
import os
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from facenet_pytorch import MTCNN
from PIL import Image

# Folders used for the SZA-only rerun
base_path = "/workspace/DS4002Project3/DATA/celebrities/SZA"  # Original SZA image folder
extra_path = "/workspace/DS4002Project3/DATA/celebrities_extra/SZA"  # Path for augmented images
final_path = "/workspace/DS4002Project3/DATA/celebrities_all/SZA"  # Path for final 400 images

# MTCNN face detector (keep_all=True), used by the face check in this cell
mtcnn = MTCNN(keep_all=True)

def clear_directory(path):
    """Leave *path* as an existing, empty directory.

    Anything already present at *path* is removed together with its contents
    before the directory is created again.
    """
    stale = os.path.exists(path)
    if stale:
        # Drop the folder along with everything beneath it
        shutil.rmtree(path)
    # Bring the folder back, now empty
    os.makedirs(path, exist_ok=True)

# Clear out existing contents in the directories
clear_directory(extra_path)
clear_directory(final_path)

# Original images, matched case-insensitively and kept in one stable order
# that both loops below share.
original_images = sorted(
    img for img in os.listdir(base_path)
    if img.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Copy original images to the final folder as 001.jpg, 002.jpg, ...
for idx, image_name in enumerate(original_images):
    img_path = os.path.join(base_path, image_name)
    new_img_name = f"{idx + 1:03}.jpg"  # Format with leading zeros
    new_img_path = os.path.join(final_path, new_img_name)
    if image_name.lower().endswith(('.jpg', '.jpeg')):
        shutil.copy(img_path, new_img_path)
    else:
        # A PNG copied byte-for-byte would end up as PNG data under a .jpg
        # name; re-encode it so the content matches the extension.
        load_img(img_path).save(new_img_path)

# Data augmentation settings for this rerun. These are NOT the values used
# in the first pass: rotation, shift and zoom ranges are wider and the shear
# range is smaller.
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Generate augmented images
for idx, image_name in enumerate(original_images):
    img_path = os.path.join(base_path, image_name)
    img = load_img(img_path)
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)  # Reshape to (1, height, width, channels)

    # Generate 8 versions per original image. The per-image prefix keeps
    # files derived from different source images from overwriting each other,
    # since the generator otherwise only adds a short random suffix.
    flow = datagen.flow(
        x,
        batch_size=1,
        save_to_dir=extra_path,
        save_prefix=f"aug{idx + 1:03}",
        save_format='jpg',
    )
    for augmented_count, _batch in enumerate(flow, start=1):
        if augmented_count >= 8:
            break

def contains_face(image_path):
    """Return True if the MTCNN detector reports a face in the image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True when the detector returns a result, False when it returns None
        (which this check treats as "no face found").
    """
    # Close the file handle as soon as the pixels are loaded; this function
    # is called for hundreds of files in a row. convert() also normalises
    # grayscale/RGBA inputs to three channels.
    with Image.open(image_path) as img:
        rgb_image = img.convert("RGB")
    faces = mtcnn(rgb_image)
    return faces is not None

# Collect the first 300 augmented images in which a face is detected
saved_count = 0
for img_name in sorted(os.listdir(extra_path)):
    if saved_count >= 300:
        break  # target reached; no need to scan the remaining files
    if not img_name.endswith('.jpg'):
        continue

    img_path = os.path.join(extra_path, img_name)
    if contains_face(img_path):
        new_img_name = f"{saved_count + 101:03}.jpg"  # Start numbering from 101 (101.jpg, 102.jpg, ..., 400.jpg)
        new_img_path = os.path.join(final_path, new_img_name)
        # Copy the file as-is: decoding and re-saving the JPEG would lose
        # quality and leave a file handle open.
        shutil.copy2(img_path, new_img_path)
        saved_count += 1

# Make a shortfall visible instead of silently producing a smaller folder
if saved_count < 300:
    print(f"Warning: only {saved_count} augmented images with detected faces were found, expected 300.")

# Confirm the number of saved images
print(f"Saved {len(original_images)} original images and {saved_count} augmented images with detected faces in {final_path}.")

Saved 100 original images and 300 augmented images with detected faces in /workspace/DS4002Project3/DATA/celebrities_all/SZA.
