In [2]:
import os
from collections import defaultdict
import pandas as pd

def analyze_folder(folder_path):
    """
    Analyzes the composition of images in a given folder.
    Returns dictionaries containing counts and percentages for race, gender, and age groups.
    """
    race_count = defaultdict(int)
    gender_count = defaultdict(int)
    age_groups = defaultdict(int)
    total_files = 0
    
    # Define race mapping
    race_mapping = {
        '0': 'White',
        '1': 'Black',
        '2': 'Asian',
        '3': 'Indian',
        '4': 'Others'
    }
    
    gender_mapping = {
        '0': 'Male',
        '1': 'Female'
    }
    
    # Process each file in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg'):
            total_files += 1
            components = filename.split('_')
            if len(components) >= 3:
                age = int(components[0])
                gender = components[1]
                race = components[2]
                
                # Count race
                race_count[race_mapping[race]] += 1
                
                # Count gender
                gender_count[gender_mapping[gender]] += 1
                
                # Group ages
                if age < 18:
                    age_groups['Under 18'] += 1
                elif age < 30:
                    age_groups['18-29'] += 1
                elif age < 50:
                    age_groups['30-49'] += 1
                else:
                    age_groups['50+'] += 1
    
    # Calculate percentages
    race_percentages = {k: (v/total_files)*100 for k, v in race_count.items()}
    gender_percentages = {k: (v/total_files)*100 for k, v in gender_count.items()}
    age_percentages = {k: (v/total_files)*100 for k, v in age_groups.items()}
    
    return {
        'total': total_files,
        'race': race_percentages,
        'gender': gender_percentages,
        'age': age_percentages
    }

def main():
    # WSL path format
    base_path = "/mnt/c/Users/Luigi/Documents/Fall 2024/COSC 4394/Project/UTKFace/weighted_split_dataset"
    folders = ['train', 'test', 'validation']
    
    # Store results for each folder
    all_results = {}
    total_race_count = defaultdict(int)
    grand_total = 0
    
    # Analyze each folder
    for folder in folders:
        folder_path = os.path.join(base_path, folder)
        if os.path.exists(folder_path):
            results = analyze_folder(folder_path)
            all_results[folder] = results
            
            # Add to total race count for overall statistics
            for race, percentage in results['race'].items():
                count = (percentage * results['total']) / 100
                total_race_count[race] += count
                grand_total += count
    
    # Print results for each folder
    for folder, results in all_results.items():
        print(f"\n{folder.upper()} Folder Statistics (Total Images: {results['total']})")
        print("-" * 50)
        
        print("\nRace Distribution:")
        for race, percentage in results['race'].items():
            print(f"{race}: {percentage:.2f}%")
            
        print("\nGender Distribution:")
        for gender, percentage in results['gender'].items():
            print(f"{gender}: {percentage:.2f}%")
            
        print("\nAge Distribution:")
        for age_group, percentage in results['age'].items():
            print(f"{age_group}: {percentage:.2f}%")
    
    # Print overall race composition
    print("\nOVERALL DATASET RACE COMPOSITION")
    print("-" * 50)
    for race, count in total_race_count.items():
        percentage = (count / grand_total) * 100
        print(f"{race}: {percentage:.2f}%")

if __name__ == "__main__":
    main()


TRAIN Folder Statistics (Total Images: 17027)
--------------------------------------------------

Race Distribution:
White: 42.04%
Asian: 15.14%
Black: 18.82%
Indian: 16.73%
Others: 7.27%

Gender Distribution:
Male: 52.23%
Female: 47.77%

Age Distribution:
50+: 20.76%
Under 18: 19.46%
18-29: 31.91%
30-49: 27.87%

TEST Folder Statistics (Total Images: 3546)
--------------------------------------------------

Race Distribution:
White: 43.26%
Indian: 16.67%
Others: 6.68%
Black: 19.15%
Asian: 14.24%

Gender Distribution:
Female: 49.04%
Male: 50.96%

Age Distribution:
50+: 21.26%
Under 18: 17.37%
18-29: 33.11%
30-49: 28.26%

VALIDATION Folder Statistics (Total Images: 3529)
--------------------------------------------------

Race Distribution:
Asian: 14.25%
White: 43.30%
Black: 19.13%
Others: 6.66%
Indian: 16.66%

Gender Distribution:
Female: 46.73%
Male: 53.27%

Age Distribution:
50+: 21.73%
Under 18: 16.95%
18-29: 31.74%
30-49: 29.58%

OVERALL DATASET RACE COMPOSITION
-------------------