# Human Face Exploratory Data Analysis
A web scraped dataset of human faces suggested for image processing models
See more at: https://www.kaggle.com/datasets/ashwingupta3012/human-faces/data

In [1]:
import os
import cv2
import matplotlib.pyplot as plt

## Data Loading

In [2]:
dataset_path = '../data/raw/Humans/'

# List all files in the dataset directory
all_files = os.listdir(dataset_path)
image_files = [f for f in all_files if f.endswith(('.jpg', '.png'))]

# Load and display 5 sample images
sample_images = []
for image_file in image_files[:5]:
    image_path = os.path.join(dataset_path, image_file)
    image = cv2.imread(image_path)
    sample_images.append(image)

for img in sample_images:
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()

FileNotFoundError: [WinError 3] The system cannot find the path specified: '../data/raw/Humans/'

## Data Visualization

In [None]:
# My path to the haarcascades directory
haarcascades_path = '/home/john/miniconda3/envs/profme2/share/opencv4/haarcascades/'

face_cascade = cv2.CascadeClassifier(haarcascades_path + 'haarcascade_frontalface_default.xml')

for img in sample_images:
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x+w, y+h), (255, 0, 0), 2)
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()


## Face Size and Position Analysis

In [1]:
face_sizes = []

for img in sample_images:
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    for (x, y, w, h) in faces:
        face_sizes.append((w, h))

# Plotting the distribution of face widths and heights
face_widths = [w for w, h in face_sizes]
face_heights = [h for w, h in face_sizes]

plt.hist(face_widths, bins=20, color='blue', alpha=0.7, label='Width')
plt.hist(face_heights, bins=20, color='red', alpha=0.7, label='Height')
plt.xlabel('Pixels')
plt.ylabel('Number of Faces')
plt.legend()
plt.show()

NameError: name 'sample_images' is not defined

## Data Quality and Diversity

Checking for Duplicates

In [None]:
import hashlib

def dhash(image, hash_size=8):
    resized = cv2.resize(image, (hash_size + 1, hash_size))
    diff = resized[:, 1:] > resized[:, :-1]
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])

hashes = {}
duplicates = []

for image_file in image_files:
    image_path = os.path.join(dataset_path, image_file)
    image = cv2.imread(image_path, 0)  # Load in grayscale
    h = dhash(image)
    if h in hashes:
        duplicates.append((image_path, hashes[h]))
    else:
        hashes[h] = image_path

print(f"Found {len(duplicates)} duplicate images.")


In [None]:
# Veryify that the images are indeed duplicates by displaying them
for dup in duplicates[:5]:
    img1 = cv2.imread(dup[0])
    img2 = cv2.imread(dup[1])
    plt.imshow(cv2.cvtColor(img1, cv2.COLOR_BGR2RGB))
    plt.show()
    plt.imshow(cv2.cvtColor(img2, cv2.COLOR_BGR2RGB))
    plt.show()


In [None]:
# Remove the duplicate images
for dup in duplicates:
    os.remove(dup[0])

In [None]:
# Viewing how many images are left
all_files = os.listdir(dataset_path)
print(len(all_files))

In [None]:
# Update image files list
image_files = [f for f in all_files if f.endswith(('.jpg', '.png'))]

Checking for Low-Quality Images

In [None]:
def variance_of_laplacian(image):
    return cv2.Laplacian(image, cv2.CV_64F).var()

blurry_images = []

for image_file in image_files:
    image_path = os.path.join(dataset_path, image_file)
    image = cv2.imread(image_path, 0)  # Load in grayscale
    if variance_of_laplacian(image) < 100:  # Threshold can be adjusted
        blurry_images.append(image_path)

print(f"Found {len(blurry_images)} blurry images.")

In [None]:
# Viewing the blurry images
for image_path in blurry_images[:5]:
    image = cv2.imread(image_path)
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.show()

They're not too blurry, I think it's fine to leave them in.