In [1]:
import os
import cv2
import hashlib
from mtcnn import MTCNN
from tqdm import tqdm  # For progress bars
from PIL import Image
import numpy as np

In [2]:
# Dataset layout on disk: <dataset_base_path>/<split>/<class>/<image file>
dataset_base_path = 'FER2013_ORIGINAL'
splits = ['train', 'test']
classes = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']

# Shared MTCNN face detector used by the face-filtering step.
#   min_face_size   -- minimum face size to detect
#   steps_threshold -- confidence thresholds for P-Net, R-Net, O-Net
# NOTE(review): earlier notes described these values as "lowered" for the
# 48x48 images; they look identical to the mtcnn (<1.0) defaults -- confirm.
detector = MTCNN(
    min_face_size=20,
    scale_factor=0.709,
    steps_threshold=[0.6, 0.7, 0.7],
)

In [3]:
# Step 1: Remove Images Without Faces
def remove_images_without_faces(remove_on_error=True):
    """Delete, in place, every dataset image in which no face is detected.

    Walks ``dataset_base_path/<split>/<emotion>`` for every entry of the
    module-level ``splits`` and ``classes`` lists and runs the module-level
    MTCNN ``detector`` on each ``.jpg``/``.png`` file.  A file is deleted when
    OpenCV cannot decode it or when the detector returns no faces.

    WARNING: this is destructive -- files are removed from disk, not moved.
    Work on a copy of the dataset if the originals matter.

    Args:
        remove_on_error: When True (default, the historical behaviour) an
            image whose processing raises an exception is deleted as well.
            Pass False to keep such files and only report the error.

    Returns:
        None.  Progress and a final count of deleted files are printed.
    """

    def _try_remove(path):
        """Delete ``path``; return True on success, False if the OS refused."""
        try:
            os.remove(path)
        except OSError as err:
            # A locked / already-deleted / read-only file must not abort a
            # scan that takes many minutes; report it and move on.
            print(f"Could not remove {path}: {err}")
            return False
        return True

    print("Removing images without faces...")
    removed_count = 0

    for split in splits:
        for emotion in classes:
            image_dir = os.path.join(dataset_base_path, split, emotion)
            if not os.path.exists(image_dir):
                print(f"Directory {image_dir} does not exist, skipping...")
                continue

            print(f"Processing {split}/{emotion}...")
            image_files = [f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png'))]

            for img_name in tqdm(image_files, desc=f"Checking faces in {split}/{emotion}"):
                img_path = os.path.join(image_dir, img_name)

                # Decide first, delete afterwards: keeping os.remove out of the
                # try-block means a failed deletion is never mistaken for a
                # processing error (and never retried inside the handler).
                removal_reason = None
                try:
                    # Read image in grayscale
                    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                    if img is None:
                        removal_reason = f"Failed to load {img_path}"
                    else:
                        # Convert grayscale to RGB by duplicating channels
                        img_rgb = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)

                        # An empty detection list means no face was found
                        if not detector.detect_faces(img_rgb):
                            removal_reason = f"No face detected in {img_path}"
                except Exception as e:
                    if remove_on_error:
                        removal_reason = f"Error processing {img_path}: {e}"
                    else:
                        print(f"Error processing {img_path}: {e}, keeping...")

                if removal_reason is not None:
                    print(f"{removal_reason}, removing...")
                    if _try_remove(img_path):
                        removed_count += 1

    print(f"Removed {removed_count} image(s) in total.")


In [4]:
# Step 2: Remove Duplicate Images Using SHA-256 (Exact Duplicates)
def remove_exact_duplicates(remove_on_error=True):
    """Delete, in place, byte-identical copies of dataset images.

    Every ``.jpg``/``.png`` file under ``dataset_base_path/<split>/<emotion>``
    (for the module-level ``splits`` and ``classes``) is hashed with SHA-256
    over its raw file bytes.  For each group of files sharing a digest the
    first path encountered is kept and all others are deleted.

    Directories are visited in ``splits`` x ``classes`` order and file names
    are sorted within each directory, so the surviving copy is deterministic.
    Note that duplicates are matched across splits and across emotion labels:
    when the same bytes appear under two labels, the copy whose class comes
    first in ``classes`` wins, regardless of which label is correct.

    WARNING: this is destructive -- files are removed from disk, not moved.

    Args:
        remove_on_error: When True (default, the historical behaviour) a file
            that cannot be read/hashed is deleted.  Pass False to keep such
            files and only report the error.

    Returns:
        None.  Progress and a final count of deleted duplicates are printed.
    """

    def _try_remove(path):
        """Delete ``path``; return True on success, False if the OS refused."""
        try:
            os.remove(path)
        except OSError as err:
            # One undeletable file must not abort the run half-way through.
            print(f"Could not remove {path}: {err}")
            return False
        return True

    print("\nRemoving exact duplicates using SHA-256...")
    # Maps hex digest -> list of image paths with that content, in visit order
    hash_dict = {}

    # Collect all images across splits and classes
    for split in splits:
        for emotion in classes:
            image_dir = os.path.join(dataset_base_path, split, emotion)
            if not os.path.exists(image_dir):
                continue

            # os.listdir order is unspecified; sort so "first occurrence" is stable
            image_files = sorted(
                f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png'))
            )

            for img_name in tqdm(image_files, desc=f"Hashing {split}/{emotion}"):
                img_path = os.path.join(image_dir, img_name)
                try:
                    # Hash the raw file bytes (not decoded pixels)
                    with open(img_path, 'rb') as f:
                        img_hash = hashlib.sha256(f.read()).hexdigest()
                except Exception as e:
                    if remove_on_error:
                        print(f"Error hashing {img_path}: {e}, removing...")
                        _try_remove(img_path)
                    else:
                        print(f"Error hashing {img_path}: {e}, keeping...")
                    continue

                hash_dict.setdefault(img_hash, []).append(img_path)

    # Remove duplicates (keep the first occurrence)
    removed_count = 0
    for img_hash, img_paths in hash_dict.items():
        if len(img_paths) > 1:
            print(f"Found {len(img_paths)} duplicates for hash {img_hash}:")
            kept_path, *extra_paths = img_paths
            print(f"  Keeping: {kept_path}")
            for extra_path in extra_paths:
                print(f"  Removing: {extra_path}")
                if _try_remove(extra_path):
                    removed_count += 1

    print(f"Removed {removed_count} duplicate file(s) in total.")


In [5]:
# Run the face filter over every split/class directory.
# Destructive: files are deleted from disk in place. It is also slow -- the
# recorded run below took roughly 45-50 minutes in total.
remove_images_without_faces()

Removing images without faces...
Processing train/angry...


Checking faces in train/angry: 100%|██████████| 3350/3350 [08:13<00:00,  6.79it/s]


Processing train/disgust...


Checking faces in train/disgust: 100%|██████████| 280/280 [00:45<00:00,  6.14it/s]


Processing train/fear...


Checking faces in train/fear: 100%|██████████| 3072/3072 [06:49<00:00,  7.50it/s]


Processing train/happy...


Checking faces in train/happy: 100%|██████████| 6248/6248 [09:01<00:00, 11.53it/s]


Processing train/sad...


Checking faces in train/sad: 100%|██████████| 3653/3653 [05:18<00:00, 11.48it/s]


Processing train/surprise...


Checking faces in train/surprise: 100%|██████████| 2244/2244 [03:04<00:00, 12.16it/s]


Processing train/neutral...


Checking faces in train/neutral: 100%|██████████| 4317/4317 [06:16<00:00, 11.46it/s]


Processing test/angry...


Checking faces in test/angry: 100%|██████████| 788/788 [01:08<00:00, 11.52it/s]


Processing test/disgust...


Checking faces in test/disgust: 100%|██████████| 58/58 [00:05<00:00, 10.87it/s]


Processing test/fear...


Checking faces in test/fear: 100%|██████████| 732/732 [01:01<00:00, 11.89it/s]


Processing test/happy...


Checking faces in test/happy: 100%|██████████| 1528/1528 [02:16<00:00, 11.16it/s]


Processing test/sad...


Checking faces in test/sad: 100%|██████████| 966/966 [01:29<00:00, 10.79it/s]


Processing test/surprise...


Checking faces in test/surprise: 100%|██████████| 472/472 [00:40<00:00, 11.78it/s]


Processing test/neutral...


Checking faces in test/neutral: 100%|██████████| 1040/1040 [01:32<00:00, 11.25it/s]


In [6]:
# Remove byte-identical copies across all splits and classes.
# Destructive: duplicates are deleted from disk in place (the first path
# encountered is kept). Hashing took only a few seconds in the recorded run.
remove_exact_duplicates()


Removing exact duplicates using SHA-256...


Hashing train/angry: 100%|██████████| 3350/3350 [00:00<00:00, 6580.65it/s]
Hashing train/disgust: 100%|██████████| 280/280 [00:00<00:00, 6745.89it/s]
Hashing train/fear: 100%|██████████| 3072/3072 [00:00<00:00, 3850.51it/s]
Hashing train/happy: 100%|██████████| 6248/6248 [00:01<00:00, 5059.31it/s]
Hashing train/sad: 100%|██████████| 3653/3653 [00:00<00:00, 5405.72it/s]
Hashing train/surprise: 100%|██████████| 2244/2244 [00:00<00:00, 6305.51it/s]
Hashing train/neutral: 100%|██████████| 4317/4317 [00:00<00:00, 5274.96it/s]
Hashing test/angry: 100%|██████████| 788/788 [00:00<00:00, 7589.72it/s]
Hashing test/disgust: 100%|██████████| 58/58 [00:00<00:00, 3654.12it/s]
Hashing test/fear: 100%|██████████| 732/732 [00:00<00:00, 7257.64it/s]
Hashing test/happy: 100%|██████████| 1528/1528 [00:00<00:00, 5781.54it/s]
Hashing test/sad: 100%|██████████| 966/966 [00:00<00:00, 5268.70it/s]
Hashing test/surprise: 100%|██████████| 472/472 [00:00<00:00, 5906.45it/s]
Hashing test/neutral: 100%|██████████| 

Found 2 duplicates for hash 12d4436d827d07e5ab80dd8fe95d1f2ca9475c75c1b93eede5cdcff6d395df48:
  Keeping: FER2013_ORIGINAL\train\angry\Training_10334355.jpg
  Removing: FER2013_ORIGINAL\train\angry\Training_95310035.jpg
Found 2 duplicates for hash 95eef42d8688868f7d73c45368d7f91456dcd3c6c939226c3779bf86cf7e5ba7:
  Keeping: FER2013_ORIGINAL\train\angry\Training_10857340.jpg
  Removing: FER2013_ORIGINAL\train\sad\Training_33162240.jpg
Found 3 duplicates for hash a1551c60a2e46419b5234e791587918c2aeaeeb7d2914310244418a2b9b82c63:
  Keeping: FER2013_ORIGINAL\train\angry\Training_11036720.jpg
  Removing: FER2013_ORIGINAL\train\sad\Training_4450363.jpg
  Removing: FER2013_ORIGINAL\test\sad\PublicTest_86086159.jpg
Found 2 duplicates for hash f3c27e81e8d32a5042a676bcc3395c24eaf5de7701281a7f31db18c513ba949a:
  Keeping: FER2013_ORIGINAL\train\angry\Training_11478843.jpg
  Removing: FER2013_ORIGINAL\train\angry\Training_36215331.jpg
Found 3 duplicates for hash 72897b92e15adaf11e7cc5f07b89d50b10e30bb

In [7]:
# Report how many image files remain in each split/class directory.
for split in splits:
    for emotion in classes:
        image_dir = os.path.join(dataset_base_path, split, emotion)
        if not os.path.exists(image_dir):
            # Missing directories are simply left out of the report
            continue
        num_images = sum(
            1 for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png'))
        )
        print(f"{split}/{emotion}: {num_images} images")

train/angry: 3214 images
train/disgust: 280 images
train/fear: 3068 images
train/happy: 6122 images
train/sad: 3556 images
train/surprise: 2242 images
train/neutral: 4314 images
test/angry: 722 images
test/disgust: 57 images
test/fear: 730 images
test/happy: 1469 images
test/sad: 923 images
test/surprise: 470 images
test/neutral: 1039 images
