# Evaluating our final models

We evaluate our models on roughly half of the UTKFace dataset (10,000 randomly sampled images), measuring gender classification accuracy and age mean absolute error (MAE).

In [None]:
# Imports
import os
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, mean_absolute_error
from tensorflow.keras.models import load_model
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Configs
# The dataset location can be overridden with the UTKFACE_PATH environment variable so the
# notebook is not tied to a single machine; the original local path remains the default.
DATASET_PATH = os.environ.get(
    "UTKFACE_PATH",
    "/Users/s16teen/Downloads/Work/UOB/SEM2/ML2/age_gender_estimation_CNN/Evaluation/UTKFace",
)
IMAGE_SIZE = (128, 128)
SAMPLE_SIZE = 10000
SEED = 42  # fixed seed so the same evaluation subset is drawn on every run

# Load Filenames
# os.listdir returns entries in arbitrary order; sorting makes the seeded sample reproducible.
all_image_files = sorted(
    file for file in os.listdir(DATASET_PATH)
    if file.lower().endswith('.jpg')
)
if not all_image_files:
    raise FileNotFoundError(f"No .jpg files found in {DATASET_PATH!r}")

# Sample a subset for evaluation
# Sampling without replacement fails when asking for more items than exist,
# so the request is clamped to the number of available images.
rng = np.random.default_rng(SEED)
n_eval = min(SAMPLE_SIZE, len(all_image_files))
all_image_files = rng.choice(all_image_files, n_eval, replace=False).tolist()

In [5]:
# Load images and labels
def load_images_and_labels(dataset_path, filenames, target_size=(128, 128)):
    """Load images from disk and parse age/gender labels out of their filenames.

    Each filename is split on ``_``; the first two fields are parsed as the
    integer age and integer gender label. Files whose name does not yield two
    integer fields, or whose image cannot be decoded, are skipped entirely so
    that the three returned arrays always stay index-aligned.

    Args:
        dataset_path: Directory containing the image files.
        filenames: Iterable of file names (relative to ``dataset_path``).
        target_size: Size passed to ``cv2.resize`` (OpenCV's ``dsize``,
            i.e. ``(width, height)``).

    Returns:
        Tuple ``(images, age_labels, gender_labels)`` of NumPy arrays. Images
        are resized to ``target_size`` and scaled by ``1 / 255.0``.
    """
    print("Loading image data and extracting labels...")
    images, age_labels, gender_labels = [], [], []

    for file in filenames:
        # Parse BOTH labels before touching any output list: appending the age
        # before the gender is validated would leave the lists misaligned when
        # only the gender field is malformed. Doing this first also avoids
        # decoding images that would be discarded anyway.
        try:
            age, gender = file.split('_')[:2]
            age, gender = int(age), int(gender)
        except ValueError:
            continue

        img = cv2.imread(os.path.join(dataset_path, file))
        if img is None:
            # cv2.imread signals a missing/corrupt file by returning None
            # rather than raising; skip it instead of crashing in cv2.resize.
            continue

        # NOTE(review): OpenCV decodes to BGR channel order; confirm this
        # matches the preprocessing the models were trained with.
        img = cv2.resize(img, target_size)

        images.append(img / 255.0)
        age_labels.append(age)
        gender_labels.append(gender)

    return np.array(images), np.array(age_labels), np.array(gender_labels)

# Read the sampled files into arrays (images plus their age and gender labels)
images, age_labels, gender_labels = load_images_and_labels(
    dataset_path=DATASET_PATH,
    filenames=all_image_files,
    target_size=IMAGE_SIZE,
)

Loading image data and extracting labels...


In [6]:
# Restore both trained models from their saved .keras files
modelA, modelB = (
    load_model(model_file)
    for model_file in ('age_gender_A.keras', 'age_gender_B.keras')
)

# Run inference on the same evaluation images with each model.
# The metrics cell reads index 0 of each result as the gender output
# and index 1 as the age output.
predA, predB = modelA.predict(images), modelB.predict(images)

# Helper function to convert gender output to class label
def binary_to_class(preds):
    return [1 if p >= 0.5 else 0 for p in preds]


I0000 00:00:1741950445.464807 7625086 service.cc:148] XLA service 0x30aef6080 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1741950445.464969 7625086 service.cc:156]   StreamExecutor device (0): Host, Default Version
2025-03-14 11:07:25.491800: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m  7/313[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 27ms/step 

I0000 00:00:1741950445.701539 7625086 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 636ms/step


In [7]:
# Turn each model's gender output (index 0) into hard 0/1 class labels
gender_predA = binary_to_class(predA[0])
gender_predB = binary_to_class(predB[0])

# Model A Metrics: gender accuracy and age MAE (age output is index 1)
p1_acc, p1_mae = (
    accuracy_score(gender_labels, gender_predA),
    mean_absolute_error(age_labels, predA[1]),
)

# Model B Metrics: same two measures on the same labels
p2_acc, p2_mae = (
    accuracy_score(gender_labels, gender_predB),
    mean_absolute_error(age_labels, predB[1]),
)

# Display Results: one three-line report per model
for heading, mae, acc in (
    ("Model A:", p1_mae, p1_acc),
    ("\nModel B:", p2_mae, p2_acc),
):
    print(heading)
    print(f"Age MAE: {mae:.2f}")
    print(f"Gender Accuracy: {acc:.2%}")

Model A:
Age MAE: 6.10
Gender Accuracy: 89.55%

Model B:
Age MAE: 6.37
Gender Accuracy: 88.09%


In [18]:
# Shared bar categories and colours for both panels
model_names = ['Model A', 'Model B']
bar_colors = ['#5da5da', '#60bd68']

# One entry per panel: (column, subplot title, y-axis title, bar heights, bar labels)
panels = [
    (
        1,
        "Gender Accuracy (More is better)",
        "Accuracy",
        [p1_acc, p2_acc],
        [f"{p1_acc:.1%}", f"{p2_acc:.1%}"],
    ),
    (
        2,
        "Age MAE (Less is better)",
        "Mean Absolute Error",
        [p1_mae, p2_mae],
        [f"{p1_mae:.2f}", f"{p2_mae:.2f}"],
    ),
]

# Side-by-side layout: a single row with one column per metric
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=tuple(panel[1] for panel in panels),
)

# Draw one bar trace per metric in its own column
for column, _, _, heights, labels in panels:
    bars = go.Bar(
        x=list(model_names),
        y=heights,
        marker_color=list(bar_colors),
        text=labels,
        textposition='auto',
    )
    fig.add_trace(bars, row=1, col=column)

# Fixed figure size; the legend is hidden because the x-axis already names the models
fig.update_layout(height=500, width=1000, showlegend=False)

# Label each y-axis and use a light grid
for column, _, axis_title, _, _ in panels:
    fig.update_yaxes(title_text=axis_title, row=1, col=column, gridcolor='lightgray')

# Show the plot
fig.show()