# Evaluating our final models

We evaluate our models on a random sample of 10,000 images (around half of the UTKFace dataset), reporting gender classification accuracy and age mean absolute error (MAE).

In [1]:
# Imports
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, mean_absolute_error
from tensorflow.keras.models import load_model

# Configs
# The dataset location can be overridden with the UTKFACE_DIR environment
# variable so the notebook is not tied to one machine; the default keeps the
# original path.
DATASET_PATH = os.environ.get(
    "UTKFACE_DIR",
    "/Users/s16teen/Downloads/Work/UOB/SEM2/ML2/age_gender_estimation_CNN/Evaluation/UTKFace",
)
IMAGE_SIZE = (128, 128)
SAMPLE_SIZE = 10000
SEED = 42  # fixed seed so the same evaluation subset is drawn on every run

# Load Filenames
# os.listdir returns entries in arbitrary order, so sort before sampling;
# otherwise the seeded draw would still not be reproducible.
all_image_files = sorted(
    file for file in os.listdir(DATASET_PATH)
    if file.lower().endswith('.jpg')
)
if not all_image_files:
    raise ValueError(f"No .jpg files found in {DATASET_PATH}")

# Sample a subset for evaluation
# Sampling without replacement fails when asked for more items than exist,
# so cap the request at the number of files actually available.
sample_count = min(SAMPLE_SIZE, len(all_image_files))
if sample_count < SAMPLE_SIZE:
    print(f"Only {sample_count} images available; evaluating on all of them.")
all_image_files = np.random.default_rng(SEED).choice(
    all_image_files, sample_count, replace=False
).tolist()

In [None]:
# Load images and labels
def load_images_and_labels(dataset_path, filenames, target_size=(128, 128)):
    """Load images from disk and parse age/gender labels from their filenames.

    Each filename is expected to start with ``<age>_<gender>_`` where both
    fields are integers. Files whose name does not match, or whose image data
    cannot be decoded, are skipped entirely so that the three returned arrays
    always stay aligned index-for-index.

    Args:
        dataset_path: Directory that contains the image files.
        filenames: Iterable of file names relative to ``dataset_path``.
        target_size: Size handed to ``cv2.resize`` (OpenCV reads this as
            ``(width, height)``).

    Returns:
        Tuple ``(images, age_labels, gender_labels)`` of NumPy arrays with one
        entry per successfully loaded file. Pixel values are divided by 255.0.
    """
    print("Loading image data and extracting labels...")
    images, age_labels, gender_labels = [], [], []
    skipped = 0

    for file in filenames:
        # Parse BOTH labels before appending anything: appending the age first
        # and then failing on the gender would leave the label lists misaligned.
        # Doing this before decoding also avoids work on files we would discard.
        try:
            age, gender = file.split('_')[:2]
            age, gender = int(age), int(gender)
        except ValueError:
            skipped += 1
            continue

        img = cv2.imread(os.path.join(dataset_path, file))
        if img is None:
            # cv2.imread signals a missing/corrupt file by returning None
            # rather than raising, which would otherwise crash cv2.resize.
            skipped += 1
            continue

        # NOTE(review): cv2.imread yields BGR channel order — confirm the models
        # were trained on BGR input as well.
        img = cv2.resize(img, target_size)
        images.append(img / 255.0)
        age_labels.append(age)
        gender_labels.append(gender)

    if skipped:
        print(f"Skipped {skipped} file(s) with unparseable names or unreadable image data.")

    return np.array(images), np.array(age_labels), np.array(gender_labels)

# Read the sampled files into aligned arrays of pixels, ages and genders
images, age_labels, gender_labels = load_images_and_labels(
    dataset_path=DATASET_PATH,
    filenames=all_image_files,
    target_size=IMAGE_SIZE,
)

Loading image data and extracting labels...


In [4]:
# Restore both trained models from the .keras files in the working directory
model_paths = {'A': 'age_gender_A.keras', 'B': 'age_gender_B.keras'}
modelA = load_model(model_paths['A'])
modelB = load_model(model_paths['B'])

# Run each model over the full evaluation set
predA = modelA.predict(images)
predB = modelB.predict(images)

# Helper function to convert gender output to class label
def binary_to_class(preds, threshold=0.5):
    """Map each prediction score to a 0/1 class label.

    Args:
        preds: Iterable of scores; each element is compared against
            ``threshold`` on its own.
        threshold: Cut-off at or above which a score is labelled 1.
            Defaults to 0.5.

    Returns:
        List of ints (0 or 1), one per element of ``preds``.
    """
    return [1 if p >= threshold else 0 for p in preds]


I0000 00:00:1741949597.359701 7609559 service.cc:148] XLA service 0x17e010630 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1741949597.359748 7609559 service.cc:156]   StreamExecutor device (0): Host, Default Version
2025-03-14 10:53:17.383633: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m  5/313[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 29ms/step  

I0000 00:00:1741949597.576601 7609559 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 641ms/step


In [None]:
# Score model A: threshold the gender output, then compute accuracy and age MAE
# NOTE(review): assumes output 0 is the gender score and output 1 the age
# estimate for both models — confirm against the model definitions.
gender_predA = binary_to_class(predA[0])
p1_acc, p1_mae = (
    accuracy_score(gender_labels, gender_predA),
    mean_absolute_error(age_labels, predA[1]),
)

# Score model B the same way
gender_predB = binary_to_class(predB[0])
p2_acc, p2_mae = (
    accuracy_score(gender_labels, gender_predB),
    mean_absolute_error(age_labels, predB[1]),
)

# Report both models in the same layout
for header, mae, acc in (
    ("Model A:", p1_mae, p1_acc),
    ("\nModel B:", p2_mae, p2_acc),
):
    print(header)
    print(f"Age MAE: {mae:.2f}")
    print(f"Gender Accuracy: {acc:.2%}")

In [None]:
# Side-by-side comparison of the two models:
# gender accuracy on the left, age MAE on the right
fig, (acc_ax, mae_ax) = plt.subplots(1, 2, figsize=(10, 4))
model_names = ['Model A', 'Model B']

# Left panel — accuracy is a fraction, so pin the y-axis to [0, 1]
acc_ax.bar(model_names, [p1_acc, p2_acc], color=['skyblue', 'lightgreen'])
acc_ax.set_title("Gender Accuracy")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_ylim(0, 1)
acc_ax.grid(axis='y')

# Right panel — MAE keeps matplotlib's automatic y-range
mae_ax.bar(model_names, [p1_mae, p2_mae], color=['coral', 'orchid'])
mae_ax.set_title("Age MAE")
mae_ax.set_ylabel("Mean Absolute Error")
mae_ax.grid(axis='y')

fig.tight_layout()
plt.show()