In [1]:
import pandas as pd
import os
import cv2
from tqdm import tqdm
import random
import shutil
import time

# Read the per-image quality metrics and the target age distribution
df = pd.read_csv("csv/processed_image_quality_metrics.csv")
aus_pop_dist = pd.read_csv("csv/australian_2021rescale%.csv")

# Keep only the rows that are not flagged as low quality
df = df.loc[df["low_quality"].ne(1)]

# The resampled dataset is sized to match the filtered dataset
total_images_needed = len(df)

# Map each age label to the number of usable images it currently has
age_counts = df['label'].value_counts().to_dict()

# Per-age target: the age's percentage share of the total, truncated to an int
required = (aus_pop_dist['Rescaled %'] / 100) * total_images_needed
aus_pop_dist['required_images'] = required.astype(int)

# Per-age current count; ages missing from the image table count as 0
aus_pop_dist['current_images'] = aus_pop_dist['Age'].map(age_counts).fillna(0).astype(int)

# Positive gap -> images must be added; negative gap -> images must be dropped
aus_pop_dist['image_difference'] = aus_pop_dist['required_images'] - aus_pop_dist['current_images']


In [2]:
import pandas as pd
import os
import cv2
from tqdm import tqdm
import random
import shutil
import time

# Source folder (one sub-folder per age) and destination for the resampled copy
dataset_path = "/home/mill/Desktop/face-recognition/processed_dataset"
resampled_dataset_path = "/home/mill/Desktop/face-recognition/resampled_dataset"

# Ensure the resampled dataset directory exists.
# exist_ok=True replaces the separate os.path.exists() check, which could race
# with another process creating the directory between the check and the call.
os.makedirs(resampled_dataset_path, exist_ok=True)

# Function to perform data augmentation
def augment_image(image):
    """Return a randomly augmented version of ``image``.

    Exactly one of two candidate transformations is applied, picked at random:

    * a rotation about the image centre by an angle drawn uniformly from
      [-15, 15] degrees, with the output keeping the input width and height;
    * a flip using a flip code drawn from (-1, 0, 1), passed to ``cv2.flip``.

    Args:
        image: Image array whose first two dimensions are (rows, cols).
            Both 2-D (single channel) and 3-D (multi channel) arrays are
            accepted.

    Returns:
        The array produced by ``cv2.warpAffine`` or ``cv2.flip``.
    """
    # Only height and width are needed; slicing instead of unpacking three
    # values keeps 2-D (grayscale) inputs from raising ValueError.
    rows, cols = image.shape[:2]

    # Random rotation around the image centre, scale factor 1
    angle = random.uniform(-15, 15)
    rotation_matrix = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)

    # Random flip code for cv2.flip
    flip_type = random.choice([-1, 0, 1])

    transformations = [
        lambda img: cv2.warpAffine(img, rotation_matrix, (cols, rows)),
        lambda img: cv2.flip(img, flip_type),
    ]

    # Apply a single randomly chosen transformation
    transformation = random.choice(transformations)
    return transformation(image)

def _copy_images(image_names, src_dir, dst_dir, desc):
    """Copy every file named in image_names from src_dir to dst_dir, with a progress bar."""
    for name in tqdm(image_names, desc=desc):
        shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))


# Process each age group based on the image difference.
# NOTE: files are written into whatever already exists in the output folders;
# re-running this cell without clearing them first accumulates extra files.
for _, row in aus_pop_dist.iterrows():
    age = int(row['Age'])  # Ensure age is an integer
    image_difference = int(row['image_difference'])  # Ensure image_difference is an integer
    required_images = int(row['required_images'])  # Ensure required_images is an integer

    age_folder = os.path.join(dataset_path, str(age))
    resampled_age_folder = os.path.join(resampled_dataset_path, str(age))

    # Ensure the resampled subdirectory exists
    os.makedirs(resampled_age_folder, exist_ok=True)

    # Get only the good quality images
    age_images = df[df['label'] == age]['image_name'].tolist()

    if image_difference < 0:
        # Down-sample by randomly selecting images
        sampled_images = random.sample(age_images, required_images)
        _copy_images(sampled_images, age_folder, resampled_age_folder,
                     f'Down-sampling age {age}')
        continue

    # image_difference >= 0: every original is kept. This also covers the
    # exact-match case (difference == 0), which previously copied nothing and
    # left the age folder empty.
    _copy_images(age_images, age_folder, resampled_age_folder,
                 f'Copying original images for age {age}')

    if image_difference == 0:
        continue

    if not age_images:
        # random.choice() on an empty list would raise IndexError
        print(f"Age {age}: no source images available, cannot up-sample.")
        continue

    # Up-sample by augmenting images.
    # Candidates that fail to load are removed so that another source is tried
    # instead of silently producing fewer images than required.
    readable_images = list(age_images)
    for i in tqdm(range(image_difference), desc=f'Up-sampling age {age}'):
        image = None
        while readable_images and image is None:
            img = random.choice(readable_images)
            image = cv2.imread(os.path.join(age_folder, img))
            if image is None:
                readable_images.remove(img)

        if image is None:
            print(f"Age {age}: no readable source images left, "
                  f"stopped after {i} of {image_difference} augmented images.")
            break

        augmented_image = augment_image(image)

        # Create a unique new image name. The loop index is unique within this
        # age folder for the current run, so two augmentations can never
        # overwrite each other (timestamp + random number could collide).
        timestamp = int(time.time() * 1000)
        new_image_name = f"aug_{timestamp}_{i}_{img}"
        new_image_path = os.path.join(resampled_age_folder, new_image_name)

        # Save the augmented image; cv2.imwrite reports failure via its return value
        if not cv2.imwrite(new_image_path, augmented_image):
            raise OSError(f"Failed to write augmented image '{new_image_path}'")

print(f"Resampling completed. The resampled dataset is saved in '{resampled_dataset_path}'.")



Down-sampling age 20: 100%|██████████| 602/602 [00:00<00:00, 30790.08it/s]
Down-sampling age 21: 100%|██████████| 602/602 [00:00<00:00, 31701.62it/s]
Down-sampling age 22: 100%|██████████| 602/602 [00:00<00:00, 30232.29it/s]
Copying original images for age 23: 100%|██████████| 68/68 [00:00<00:00, 28797.73it/s]
Up-sampling age 23: 100%|██████████| 534/534 [00:00<00:00, 4497.40it/s]
Copying original images for age 24: 100%|██████████| 157/157 [00:00<00:00, 30936.10it/s]
Up-sampling age 24: 100%|██████████| 445/445 [00:00<00:00, 4690.85it/s]
Down-sampling age 25: 100%|██████████| 678/678 [00:00<00:00, 32248.88it/s]
Copying original images for age 26: 100%|██████████| 571/571 [00:00<00:00, 32043.29it/s]
Up-sampling age 26: 100%|██████████| 107/107 [00:00<00:00, 4632.58it/s]
Down-sampling age 27: 100%|██████████| 678/678 [00:00<00:00, 29762.09it/s]
Down-sampling age 28: 100%|██████████| 678/678 [00:00<00:00, 32275.96it/s]
Down-sampling age 29: 100%|██████████| 678/678 [00:00<00:00, 31946.37

Resampling completed. The resampled dataset is saved in '/home/mill/Desktop/face-recognition/resampled_dataset'.





In [13]:
94+534

628

In [16]:
aus_pop_dist

Unnamed: 0,Age,Rescaled %,required_images,current_images,image_difference
0,20,3.080685,602,868,-266
1,21,3.080685,602,1109,-507
2,22,3.080685,602,1130,-528
3,23,3.080685,602,68,534
4,24,3.080685,602,157,445
5,25,3.471883,678,707,-29
6,26,3.471883,678,571,107
7,27,3.471883,678,680,-2
8,28,3.471883,678,701,-23
9,29,3.471883,678,907,-229


In [3]:
# Function to count images in a directory
def count_images(directory, extensions=('.jpg', '.jpeg', '.png')):
    """Count image files under ``directory``, including all sub-directories.

    Args:
        directory: Root folder to walk. A path that does not exist yields 0,
            because ``os.walk`` produces nothing for it.
        extensions: File-name suffixes that identify an image. Matching is
            case-insensitive, so ``photo.JPG`` counts as ``.jpg``.

    Returns:
        Number of files whose name ends with one of ``extensions``.
    """
    # str.endswith accepts a tuple; lower-casing both sides makes the match
    # independent of how the extension is capitalised on disk.
    suffixes = tuple(ext.lower() for ext in extensions)
    return sum(
        1
        for _, _, files in os.walk(directory)
        for file in files
        if file.lower().endswith(suffixes)
    )

# Compare the on-disk image count of every age folder against its target
verification_results = []

for _, row in aus_pop_dist.iterrows():
    # Cast the row values to plain integers before using them
    age, required_images = int(row['Age']), int(row['required_images'])

    age_folder = os.path.join(resampled_dataset_path, str(age))

    # A folder that does not exist contributes zero images
    current_images = count_images(age_folder) if os.path.exists(age_folder) else 0

    status = 'Correct' if current_images == required_images else 'Incorrect'
    verification_results.append({
        'Age': age,
        'Required Images': required_images,
        'Current Images': current_images,
        'Status': status,
    })

# One row per age: target, actual count and match status
verification_df = pd.DataFrame(verification_results)

# Show the table in the cell output
print(verification_df)

# Persist the table next to the notebook
verification_df.to_csv('resampled_dataset_verification.csv', index=False)

print("Verification completed. The results are saved in 'resampled_dataset_verification.csv'.")


    Age  Required Images  Current Images   Status
0    20              602             602  Correct
1    21              602             602  Correct
2    22              602             602  Correct
3    23              602             602  Correct
4    24              602             602  Correct
5    25              678             678  Correct
6    26              678             678  Correct
7    27              678             678  Correct
8    28              678             678  Correct
9    29              678             678  Correct
10   30              707             707  Correct
11   31              707             707  Correct
12   32              707             707  Correct
13   33              707             707  Correct
14   34              707             707  Correct
15   35              698             698  Correct
16   36              698             698  Correct
17   37              698             698  Correct
18   38              698             698  Correct
