In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Function to categorize age with only ranges
def categorize_age(age):
    if 0 <= age <= 1:
        return '0-1'
    elif 1 < age <= 3:
        return '1-3'
    elif 3 < age <= 5:
        return '3-5'
    elif 6 <= age <= 12:
        return '6-12'
    elif 13 <= age <= 19:
        return '13-19'
    elif 20 <= age <= 24:
        return '20-24'
    elif 25 <= age <= 64:
        return '25-64'
    else:
        return '65+'

# Function to load data from filenames (unchanged)
def load_data(directories):
    data = []
    for directory in directories:
        for filename in os.listdir(directory):
            if filename.endswith('.jpg'):
                try:
                    age, gender, race, _ = filename.split('_')
                    data.append({
                        'age': int(age),
                        'gender': 'Male' if gender == '0' else 'Female',
                        'race': ['White', 'Black', 'Asian', 'Indian', 'Others'][int(race)],
                        'age_category': categorize_age(int(age))
                    })
                except ValueError:
                    print(f"Skipping file due to unexpected format: {filename}")
    return pd.DataFrame(data)

# Function to add percentage labels to bars
def add_percentage_labels(ax, rects):
    for rect in rects:
        width = abs(rect.get_width())
        ax.text(rect.get_x() + rect.get_width(), rect.get_y() + rect.get_height()/2.,
                f'{width:.1f}%',
                ha='left' if rect.get_width() >= 0 else 'right', va='center')

# Function to save individual plots
def save_plot(fig, ax, filename):
    fig.tight_layout()
    fig.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close(fig)

# Specify the paths to your dataset parts
base_path = r'C:\Users\Luigi\Documents\Fall 2024\COSC 4394\Project\UTKFace'
dataset_paths = [
    os.path.join(base_path, 'part1'),
    os.path.join(base_path, 'part2'),
    os.path.join(base_path, 'part3')
]

# Load the data from all parts
data = load_data(dataset_paths)

# Box Plot of Age by Gender
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='gender', y='age', data=data, ax=ax, palette={'Male': 'skyblue', 'Female': 'pink'})
ax.set_title('Age Distribution by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Age')
save_plot(fig, ax, 'UTKFace_boxplot_age_by_gender.png')

# Print box plot statistics
for gender in ['Male', 'Female']:
    gender_data = data[data['gender'] == gender]['age']
    q1, median, q3 = gender_data.quantile([0.25, 0.5, 0.75])
    iqr = q3 - q1
    lower_whisker = gender_data[gender_data >= q1 - 1.5 * iqr].min()
    upper_whisker = gender_data[gender_data <= q3 + 1.5 * iqr].max()
    print(f"\n{gender} Age Statistics:")
    print(f"Lower Whisker: {lower_whisker}")
    print(f"Q1: {q1}")
    print(f"Median: {median}")
    print(f"Q3: {q3}")
    print(f"Upper Whisker: {upper_whisker}")

# Gender Distribution Graph
fig, ax = plt.subplots(figsize=(8, 6))
gender_counts = data['gender'].value_counts()
gender_percentages = gender_counts / len(data) * 100
sns.barplot(x=gender_percentages.index, y=gender_percentages.values, ax=ax, palette={'Male': 'skyblue', 'Female': 'pink'})
ax.set_title('Gender Distribution')
ax.set_xlabel('Gender')
ax.set_ylabel('Percentage')
for i, v in enumerate(gender_percentages):
    ax.text(i, v, f'{v:.1f}%', ha='center', va='bottom')
save_plot(fig, ax, 'UTKFace_gender_distribution.png')

# Age Distribution by Gender
fig, ax = plt.subplots(figsize=(12, 10))

# Prepare data
age_gender_counts = data.groupby(['age_category', 'gender']).size().unstack()
age_gender_percentages = age_gender_counts.div(age_gender_counts.sum(axis=1), axis=0) * 100

# Sort the index to match the order in the example
sorted_index = ['65+', '25-64', '20-24', '13-19', '6-12', '3-5', '1-3', '0-1']
age_gender_percentages = age_gender_percentages.reindex(sorted_index)

# Plot
male_bars = ax.barh(age_gender_percentages.index, -age_gender_percentages['Male'], 
                    align='center', color='skyblue', label='Male')
female_bars = ax.barh(age_gender_percentages.index, age_gender_percentages['Female'], 
                      align='center', color='pink', label='Female')

# Customize plot
ax.set_xlabel('Percent of Population')
ax.set_title('Age Distribution by Gender')
ax.legend(loc='lower right')

# Remove top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add percentage labels
add_percentage_labels(ax, male_bars)
add_percentage_labels(ax, female_bars)

# Adjust x-axis
ax.set_xlim(-70, 70)
ax.set_xticks(range(-60, 61, 20))
ax.set_xticklabels([str(abs(x)) for x in range(-60, 61, 20)])

# Add vertical line at 0
ax.axvline(x=0, color='black', linewidth=0.5)

save_plot(fig, ax, 'UTKFace_age_distribution_by_gender.png')

# Print some summary statistics
print("\nAge Category Distribution:")
print(data['age_category'].value_counts())
print("\nGender Distribution:")
print(data['gender'].value_counts())
print("\nRace Distribution:")
print(data['race'].value_counts())

# Additional statistics
print("\nTotal number of images:", len(data))
print("\nAge statistics:")
print(data['age'].describe())

# Save the combined data to a CSV file for further analysis if needed
data.to_csv('UTKFace_combined_data.csv', index=False)
print("\nCombined data saved to 'UTKFace_combined_data.csv'")
print("\nIndividual graph PNGs have been saved in the current directory.")

Skipping file due to unexpected format: 61_1_20170109142408075.jpg
Skipping file due to unexpected format: 61_3_20170109150557335.jpg
Skipping file due to unexpected format: 39_1_20170116174525125.jpg



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='gender', y='age', data=data, ax=ax, palette={'Male': 'skyblue', 'Female': 'pink'})



Male Age Statistics:
Lower Whisker: 1
Q1: 25.0
Median: 34.0
Q3: 50.0
Upper Whisker: 87

Female Age Statistics:
Lower Whisker: 1
Q1: 21.0
Median: 26.0
Q3: 37.0
Upper Whisker: 61



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=gender_percentages.index, y=gender_percentages.values, ax=ax, palette={'Male': 'skyblue', 'Female': 'pink'})



Age Category Distribution:
age_category
25-64    14890
20-24     2317
65+       2007
0-1       1282
13-19     1200
6-12      1076
1-3        839
3-5        492
Name: count, dtype: int64

Gender Distribution:
gender
Male      12581
Female    11522
Name: count, dtype: int64

Race Distribution:
race
White     10222
Black      4558
Indian     4027
Asian      3586
Others     1710
Name: count, dtype: int64

Total number of images: 24103

Age statistics:
count    24103.000000
mean        33.042609
std         20.138943
min          1.000000
25%         23.000000
50%         29.000000
75%         45.000000
max        116.000000
Name: age, dtype: float64

Combined data saved to 'UTKFace_combined_data.csv'

Individual graph PNGs have been saved in the current directory.
