In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import cv2
import seaborn as sns
from PIL import Image
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3, EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import utils
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array
import math
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

We define some parameters that can be changed in order to have different behaviours in the code.

In [2]:
#Parameters
# Directory that holds the UTKFace image files (age and gender are parsed from the file names).
DIR_UTKFace="../input/utkface-new/UTKFace"
# Side length, in pixels, of the square images used for previews and model input.
IMG_SIZE = 224 

# Set MODE to "SMALL_DATASET" to work on a reduced dataset for quicker test runs;
# otherwise leave it as "NORMAL_DATASET".
MODE="NORMAL_DATASET"

# Fraction of the whole dataset used for training.
TRAIN_SPLIT = 0.8  
# Fraction of the remaining (non-training) rows used for validation; the rest is the test set.
VALID_TEST_SPLIT = 0.8

# Pretrained weights passed to the EfficientNet constructor.
# NOTE(review): the original comment also listed 'noisy-student' — confirm the constructor accepts it.
WEIGHTS = 'imagenet'

# Number of samples per batch yielded by the data generators.
BATCH_SIZE = 32

# Study of the dataset

## Import data

We get the data from the *DIR_UTKFace* directory and create a dataframe with three attributes: age, gender and file_name. First we define some helper functions: one that extracts a person's information from a picture's file name, and others that load and preview the images.

In [3]:
# Gender labels indexed by the numeric code found in the file names: 0 -> 'M', 1 -> 'F'
GENDER_MAP=['M','F']

#Get age and gender from image file name
def get_info_from_image(image_file_name):
    """Return ``(age, gender)`` parsed from an image file name.

    The name is split on underscores; the first field is read as the age and
    the second as the numeric gender code, both converted to ``int``.
    """
    fields = image_file_name.split('_')
    return int(fields[0]), int(fields[1])

#get image from filename and directory
def get_image(file_name):
    """Load an image file as an RGB array resized to IMG_SIZE x IMG_SIZE.

    Args:
        file_name: path of the image file to read.

    Returns:
        The image as an array in RGB channel order.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file.
    """
    image = cv2.imread(file_name)
    # cv2.imread reports a missing/unreadable file by returning None rather than
    # raising, which would otherwise surface as an obscure error in cvtColor.
    if image is None:
        raise FileNotFoundError("Image could not be read: " + str(file_name))
    # OpenCV loads images as BGR; convert so matplotlib shows the right colours
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return cv2.resize(image, (IMG_SIZE, IMG_SIZE))
  
#Preview image with gender and age from dataframe
def preview_random_images(df, with_predictions, with_gender=False, with_age=False):
    """Plot up to 25 randomly sampled images of ``df`` on a 5x5 grid.

    Each image is titled with its real labels and, optionally, the predicted ones.

    Args:
        df: dataframe with a 'file_name' column plus the 'gender' / 'age'
            columns that are displayed (and 'gender_pred' / 'age_pred' when
            predictions are shown).
        with_predictions: when true, append the predicted values to the title.
        with_gender: show the gender (real and, if requested, predicted).
        with_age: show the age (real and, if requested, predicted).
            When neither flag is set, both real gender and real age are shown.
    """
    n_samples = min(25, len(df))
    df_sample = df.sample(n_samples)

    def _value(column, position):
        # Positional lookup inside the sampled rows
        return np.array(df_sample[column])[position]

    # With no flag set, the real part of the title falls back to showing both fields
    show_real_gender = with_gender or not with_age
    show_real_age = with_age or not with_gender

    plt.figure(figsize=(20,20))
    for i in range(len(df_sample.index)):
        plt.subplot(5,5,i+1)
        plt.axis('off')
        plt.imshow(get_image(_value('file_name', i)))

        pieces = ["Real: "]
        if show_real_gender:
            pieces.append(GENDER_MAP[_value('gender', i)])
        if show_real_age:
            pieces.append("(" + str(_value('age', i)) + ")")

        if with_predictions:
            pieces.append(" Pred: ")
            if with_gender:
                pieces.append(GENDER_MAP[_value('gender_pred', i)])
            if with_age:
                pieces.append(" (" + str(round(_value('age_pred', i))) + ")")

        plt.title("".join(pieces))


We list all the files in DIR_UTKFace directory. For each of these files, we get the information and we construct the dataframe df

In [4]:
# os.listdir returns entries in arbitrary order; sorting gives the dataframe a
# stable row order so that index-based splits can be reproduced between runs.
files_names = sorted(os.listdir(DIR_UTKFace))

all_ages = []
all_genders = []
all_file_names = []

for file_name in files_names:

    #check that all images are jpg format
    if ".jpg" not in file_name:
        raise RuntimeError("Format not expected: "+ file_name)

    # Labels are encoded in the file name itself
    age, gender = get_info_from_image(file_name)

    all_ages.append(age)
    all_genders.append(gender)
    all_file_names.append(os.path.join(DIR_UTKFace, file_name))

# One row per image: parsed labels plus the full path of the file
dataset_images = {'age': all_ages,
                  'gender': all_genders,
                  'file_name': all_file_names}

df = pd.DataFrame(data=dataset_images)

In [5]:
# In SMALL_DATASET mode keep only the first 100 rows and set a short epoch
# budget, so that test runs finish quickly.
if MODE == "SMALL_DATASET":
    EPOCHS = 3
    df = df.iloc[:100]

## Preview data

Now that we have the dataframe, we can explore the first rows

In [6]:
# Display the first five rows of the dataframe
df.head()

We can preview some images with their age and gender.

In [7]:
# Random sample of images titled with their real gender and age (no predictions)
preview_random_images(df, False)

## Cleaning data
We check if the dataframe contains nulls in order to treat them

In [8]:
#age checks
# Every age must be an integer value; fail on the first one that is not.
# (Checking against np.int64 only would reject valid integers on platforms
# where pandas picks a different integer width.)
bad_ages = [age for age in df['age'] if not isinstance(age, (int, np.integer))]
if bad_ages:
    raise RuntimeError("Age not expected: "+ str(bad_ages[0]))
df_age_nan  = df[df['age'].isna()]
# len() counts the rows; Series.count() skips NaN and would always report 0 here
print("Total rows with NaN in age field: " + str(len(df_age_nan)))


#gender checks
# Only the codes 0 (male) and 1 (female) are valid
bad_genders = df.loc[~df['gender'].isin([0, 1]), 'gender']
if not bad_genders.empty:
    # str() is required: concatenating the raw number raised TypeError instead
    raise RuntimeError("Gender not expected: "+ str(bad_genders.iloc[0]))
df_gender_nan = df[df['gender'].isna()]
print("Total rows with NaN in gender field: " + str(len(df_gender_nan)))


#file_name checks
# na=False makes a missing file name count as a failed check
bad_file_names = df.loc[~df['file_name'].str.contains(".jpg", regex=False, na=False), 'file_name']
if not bad_file_names.empty:
    raise RuntimeError("Format not expected: "+ str(bad_file_names.iloc[0]))
df_file_name_nan = df[df['file_name'].isna()]
print("Total rows with null value in file_name field: " + str(len(df_file_name_nan)))

## Data exploration

We describe the dataframe to get more information about it.

### Exploration for gender

In [9]:
# Summary statistics of the numeric gender codes (0 = male, 1 = female)
df['gender'].describe()

In [10]:
num_images = df.gender.count()
# Boolean masks give 0 (instead of a KeyError from get_group) when a gender is
# absent, e.g. in a reduced dataset
num_images_male_gender = (df.gender == 0).sum()
num_images_female_gender = (df.gender == 1).sum()
percentage_male_gender = (num_images_male_gender * 100)/num_images
percentage_female_gender = (num_images_female_gender * 100)/num_images
print("MALE gender: " + str(round(percentage_male_gender)) + "% (" + str(num_images_male_gender) + " images)")
print("FEMALE gender: " + str(round(percentage_female_gender)) + "% (" + str(num_images_female_gender) + " images)")

# value_counts() orders by frequency, so the counts are re-ordered by gender
# code to guarantee that every slice gets the matching GENDER_MAP label
gender_counts = df.gender.value_counts().reindex(list(range(len(GENDER_MAP))), fill_value=0)
gender_counts.plot(kind='pie', labels=GENDER_MAP)
plt.title('Gender distribution')

### Exploration for age

In [11]:
# Summary statistics of the age column
df['age'].describe()

In [12]:
# Highest age in the dataset; used below as the number of histogram bins and
# later by the age data generator to normalise the age labels
maxAge = df['age'].max()
print("Older person has " + str(maxAge) + " years old") 

age_hist_ax = sns.histplot(data=df, x='age', kde=True, bins=maxAge)
age_hist_ax.set_title('Age Distribution')

In [13]:
# Each curve carries its own label so the legend cannot drift from the plotting
# order; boolean masks avoid the KeyError get_group raises for an absent gender.
sns.kdeplot(data=df, x="age", label="All")
sns.kdeplot(data=df[df['gender'] == 0], x="age", label="Male")
sns.kdeplot(data=df[df['gender'] == 1], x="age", label="Female")
plt.legend()
plt.title("Age distribution by gender")

In [14]:
# Left panel: age per gender code; right panel: age over the whole dataset
_, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
ax1, ax2 = axes
sns.boxplot(data=df, x='gender', y='age', ax=ax1)
sns.boxplot(data=df, y='age', ax=ax2)

As we can see in the boxplots, the oldest people are outliers. Since the model will not have enough examples to learn these ages, we can expect worse predictions for them.

# Generating the model for gender


In [16]:
def plot_hist(hist, metric="binary_accuracy", label="accuracy"):
    """Plot the training and validation curves of one metric from a Keras history.

    Args:
        hist: object with a ``history`` dict, as returned by ``model.fit``.
        metric: key of the training metric; the validation curve is read from
            ``"val_" + metric``.
        label: text used in the plot title and on the y axis.
    """
    plt.plot(hist.history[metric])
    plt.plot(hist.history["val_" + metric])
    plt.title("model " + label)
    plt.ylabel(label)
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

In [17]:
# Augmentation applied to training images: random brightness change and
# random horizontal flip
dataGenerator = ImageDataGenerator(
    brightness_range=[0.8,1.2],
    horizontal_flip=True
)

#preview of images generated by the data generator 
img = get_image(df['file_name'].iloc[0])
data = img_to_array(img)
# flow() expects a batch, so add a leading batch dimension
samples = np.expand_dims(data, 0)

# `datagen` is the name read by get_image_from_generator
datagen = dataGenerator
it = datagen.flow(samples, batch_size=1)

plt.figure(figsize=(20,20))
for i in range(20):
    plt.subplot(10,10,i+1)
    plt.axis('off')
    # The builtin next() works with any iterator, whereas the .next() method is
    # an implementation detail of the Keras iterator class
    batch = next(it)
    image = batch[0].astype('uint8')
    plt.imshow(image)

In [18]:
def get_image_from_generator(image_array):
    """Return one randomly augmented copy of ``image_array`` as a uint8 array.

    The image is fed through the module-level ``datagen`` as a batch of one.
    Because the result is cast to uint8, the input should hold pixel values on
    the 0-255 scale; values in [0, 1] would be truncated to 0 or 1.
    """
    batch_of_one = np.expand_dims(image_array, 0)
    # Builtin next() instead of the iterator's .next() method, which is an
    # implementation detail of the Keras iterator class
    augmented = next(datagen.flow(batch_of_one, batch_size=1))
    return augmented[0].astype('uint8')

def get_data_generator_gender(df, indices, is_train, is_test):
    """Yield ``(images, genders)`` batches for the gender model.

    Images are loaded from ``df['file_name']``, converted to RGB, resized to
    IMG_SIZE x IMG_SIZE and returned as float32 arrays with pixel values in
    [0, 255]: Keras EfficientNet models include their own input rescaling and
    expect that range. The same scale is used for training and evaluation.

    Args:
        df: dataframe with 'file_name' and 'gender' columns.
        indices: positional row indices of ``df`` to iterate over, in order.
        is_train: when true, each image is randomly augmented.
        is_test: when true, make a single pass over ``indices`` (including a
            final partial batch) and stop; otherwise loop forever.

    Yields:
        Tuple of an image batch array and the matching gender label array.
    """
    images = []
    genders = []
    while True:
        for i in indices:
            r = df.iloc[i]
            file, gender = r['file_name'], r['gender']
            # Context manager closes the file handle once the pixels are copied
            with Image.open(file) as raw:
                im = np.asarray(raw.convert('RGB').resize((IMG_SIZE, IMG_SIZE)), dtype=np.float32)
            if is_train:
                # Augment on the 0-255 scale; the helper returns uint8
                im = get_image_from_generator(im).astype(np.float32)
            images.append(im)
            genders.append(gender)
            if len(images) >= BATCH_SIZE:
                yield np.asarray(images), np.asarray(genders)
                images = []
                genders = []
        if is_test:
            # Emit the remaining samples so that every index gets a prediction
            if images:
                yield np.asarray(images), np.asarray(genders)
            break

In [19]:
# Fixed seed so that the train/validation/test split is identical on every run
SPLIT_SEED = 42
p = np.random.RandomState(SPLIT_SEED).permutation(len(df))
train_len = int(len(df) * TRAIN_SPLIT)

train_idx = p[:train_len]
valid_test_idx = p[train_len:]

# The held-out part is divided again: VALID_TEST_SPLIT of it becomes the
# validation set and the remainder the test set
valid_len = int((len(df)-train_len) * VALID_TEST_SPLIT)
valid_idx, test_idx = valid_test_idx[:valid_len], valid_test_idx[valid_len:]

print("Lenght of original dataset: " + "(" + str(len(df)) + ")")
print("Lenght of train dataset: " + "(" + str(len(train_idx)) + ")")
print("Lenght of validation dataset: " + "(" + str(len(valid_idx)) + ")")
print("Lenght of test dataset: " + "(" + str(len(test_idx)) + ")")

In [20]:
inputs = Input(shape=(IMG_SIZE, IMG_SIZE, 3)) 
x = inputs
# Pretrained EfficientNetB0 backbone without its classification head
base_model = EfficientNetB0(weights=WEIGHTS, include_top=False, input_tensor = x)

# Freeze the pretrained weights: only the new head is trained at this stage
# (with the high learning rate below); the top layers are unfrozen afterwards
# by unfreeze_model for fine-tuning at a lower learning rate.
base_model.trainable = False

# New classification head on top of the backbone features
x = layers.GlobalAveragePooling2D(name="avg_pool")(base_model.output)
x = layers.BatchNormalization()(x)

top_dropout_rate = 0.2
x = layers.Dropout(top_dropout_rate, name="top_dropout")(x)
# Single sigmoid unit: probability of gender code 1
gender_output = Dense(units=1, activation='sigmoid', name='gender_output')(x)

model = Model(inputs=inputs, outputs=gender_output,  name="EfficientNet")
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
model.compile(optimizer=optimizer,   loss='binary_crossentropy', metrics=['binary_accuracy'])

In [21]:
#Get steps based on lenght of a dataframe and the BATCH
def get_steps(lenght_samples, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``lenght_samples`` items.

    A trailing partial batch counts as one extra step.
    """
    full_batches, leftover = divmod(lenght_samples, batch_size)
    return full_batches + 1 if leftover > 0 else full_batches

    
# Number of batches per epoch for the training and validation sets
steps_train, steps_valid = (get_steps(len(idx), BATCH_SIZE) for idx in (train_idx, valid_idx))

In [22]:
# Endless generators: augmented batches for training, plain ones for validation
train_gen = get_data_generator_gender(df, train_idx, True, False)
valid_gen = get_data_generator_gender(df, valid_idx, False, False)

# Use the reduced epoch budget in SMALL_DATASET mode (EPOCHS is only defined
# in that mode)
epochs = EPOCHS if MODE == "SMALL_DATASET" else 25
# Model.fit accepts Python generators directly; fit_generator is deprecated
history = model.fit(train_gen,
                    steps_per_epoch=steps_train,
                    epochs=epochs,
                    validation_data=valid_gen,
                    validation_steps=steps_valid)
plot_hist(history)

In [None]:
def unfreeze_model(model, loss='binary_crossentropy', metrics=('binary_accuracy',)):
    """Make the top 20 layers of ``model`` trainable and recompile it for fine-tuning.

    BatchNormalization layers among those 20 are left untouched. The model is
    recompiled with Adam at a learning rate of 1e-4.

    Args:
        model: Keras model to modify in place.
        loss: loss passed to ``model.compile``.
        metrics: metric names passed to ``model.compile``.
    """
    # We unfreeze the top 20 layers while leaving BatchNorm layers frozen
    for layer in model.layers[-20:]:
        if not isinstance(layer, layers.BatchNormalization):
            layer.trainable = True

    # Changes to `trainable` only take effect after compiling again
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(optimizer=optimizer, loss=loss, metrics=list(metrics))


unfreeze_model(model)

# Use the reduced epoch budget in SMALL_DATASET mode (EPOCHS is only defined
# in that mode)
epochs = EPOCHS if MODE == "SMALL_DATASET" else 10  # @param {type: "slider", min:8, max:50}
# Model.fit accepts Python generators directly; fit_generator is deprecated
hist = model.fit(train_gen,
                 steps_per_epoch=steps_train,
                 epochs=epochs,
                 validation_data=valid_gen,
                 validation_steps=steps_valid)
plot_hist(hist)

In [None]:
#classification report for train dataset
from sklearn.metrics import classification_report

steps_train = get_steps(len(train_idx), BATCH_SIZE)

# A test-mode generator stops after one pass, so a fresh one is built for
# predict() and another for evaluate()
train_gen = get_data_generator_gender(df, train_idx, False, True)
gender_predictions = model.predict(train_gen, steps=steps_train)
train_gen = get_data_generator_gender(df, train_idx, False, True)
gender_evaluated = model.evaluate(train_gen, steps=steps_train)

# Threshold the sigmoid outputs at 0.5 to obtain the predicted gender codes
gender_predictions_binary = [0 if pred[0] < 0.5 else 1 for pred in gender_predictions]

# Real labels for as many samples as were predicted, in generator order
gender_true = [df.iloc[train_idx[pos]]['gender']
               for pos in range(len(gender_predictions_binary))]

print("\nClassification report for gender (train dataset)")
print(classification_report(gender_true, gender_predictions_binary))

In [None]:
#classification report for validation dataset
from sklearn.metrics import classification_report

steps_valid = get_steps(len(valid_idx), BATCH_SIZE)

# A test-mode generator stops after one pass, so a fresh one is built for
# predict() and another for evaluate()
valid_gen = get_data_generator_gender(df, valid_idx, False, True)
gender_predictions = model.predict(valid_gen, steps=steps_valid)
valid_gen = get_data_generator_gender(df, valid_idx, False, True)
gender_evaluated = model.evaluate(valid_gen, steps=steps_valid)

# Threshold the sigmoid outputs at 0.5 to obtain the predicted gender codes
gender_predictions_binary = [0 if pred[0] < 0.5 else 1 for pred in gender_predictions]

# Real labels for as many samples as were predicted, in generator order
gender_true = [df.iloc[valid_idx[pos]]['gender']
               for pos in range(len(gender_predictions_binary))]

print("\nClassification report for gender (validation dataset)")
print(classification_report(gender_true, gender_predictions_binary))

In [None]:
#classification report for test dataset
from sklearn.metrics import classification_report

steps_test = get_steps(len(test_idx), BATCH_SIZE)

# A test-mode generator stops after one pass, so a fresh one is built for
# predict() and another for evaluate()
test_gen = get_data_generator_gender(df, test_idx, False, True)
gender_predictions = model.predict(test_gen, steps=steps_test)
test_gen = get_data_generator_gender(df, test_idx, False, True)
gender_evaluated = model.evaluate(test_gen, steps=steps_test)

# Threshold the sigmoid outputs at 0.5 to obtain the predicted gender codes
gender_predictions_binary = [0 if pred[0] < 0.5 else 1 for pred in gender_predictions]

# Real labels for as many samples as were predicted, in generator order
gender_true = [df.iloc[test_idx[pos]]['gender']
               for pos in range(len(gender_predictions_binary))]

# The header previously said "validation dataset" although this is the test set
print("\nClassification report for gender (test dataset)")
print(classification_report(gender_true, gender_predictions_binary))

In [None]:
#preview images with prediction
# Assumes `gender_predictions` holds the test-set predictions: each one is
# paired with the row at the same position of test_idx.
genders_real, file_names, genders_pred = [], [], []
for position, prediction in enumerate(gender_predictions):
    row = df.iloc[test_idx[position]]
    genders_real.append(row['gender'])
    file_names.append(row['file_name'])
    # Same 0.5 threshold as in the classification reports
    genders_pred.append(0 if prediction[0] < 0.5 else 1)

dataset_images_pred = {'gender': genders_real,
                       'gender_pred': genders_pred,
                       'file_name': file_names}

df_pred = pd.DataFrame(data=dataset_images_pred)
preview_random_images(df_pred, True, with_gender=True)

# Model for age

In [23]:
def plot_hist(hist, metric="mae", label="mae"):
    """Plot the training and validation curves of one metric from a Keras history.

    Args:
        hist: object with a ``history`` dict, as returned by ``model.fit``.
        metric: key of the training metric; the validation curve is read from
            ``"val_" + metric``.
        label: text used in the plot title and on the y axis.
    """
    plt.plot(hist.history[metric])
    plt.plot(hist.history["val_" + metric])
    plt.title("model " + label)
    plt.ylabel(label)
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()


def get_data_generator_age(df, indices, is_train, is_test):
    """Yield ``(images, ages)`` batches for the age model.

    Images are loaded from ``df['file_name']``, converted to RGB, resized to
    IMG_SIZE x IMG_SIZE and returned as float32 arrays with pixel values in
    [0, 255]: Keras EfficientNet models include their own input rescaling and
    expect that range. The same scale is used for training and evaluation.
    Ages are divided by the module-level ``maxAge``.

    Args:
        df: dataframe with 'file_name' and 'age' columns.
        indices: positional row indices of ``df`` to iterate over, in order.
        is_train: when true, each image is randomly augmented.
        is_test: when true, make a single pass over ``indices`` (including a
            final partial batch) and stop; otherwise loop forever.

    Yields:
        Tuple of an image batch array and the matching normalised age array.
    """
    images = []
    ages = []
    while True:
        for i in indices:
            r = df.iloc[i]
            file, age = r['file_name'], r['age']
            # Context manager closes the file handle once the pixels are copied
            with Image.open(file) as raw:
                im = np.asarray(raw.convert('RGB').resize((IMG_SIZE, IMG_SIZE)), dtype=np.float32)
            if is_train:
                # Augment on the 0-255 scale; the helper returns uint8
                im = get_image_from_generator(im).astype(np.float32)
            images.append(im)
            # Normalise the target by the largest age in the dataset
            ages.append(age / maxAge)
            if len(images) >= BATCH_SIZE:
                yield np.asarray(images), np.asarray(ages)
                images = []
                ages = []
        if is_test:
            # Emit the remaining samples so that every index gets a prediction
            if images:
                yield np.asarray(images), np.asarray(ages)
            break

In [24]:
inputs = Input(shape=(IMG_SIZE, IMG_SIZE, 3)) 
x = inputs
# Pretrained EfficientNetB0 backbone without its classification head
base_model = EfficientNetB0(weights=WEIGHTS, include_top=False, input_tensor = x)

# Freeze the pretrained weights: only the new head is trained at this stage
# (with the high learning rate below); the top layers are unfrozen afterwards
# by unfreeze_model for fine-tuning at a lower learning rate.
base_model.trainable = False

# New regression head on top of the backbone features
x = layers.GlobalAveragePooling2D(name="avg_pool")(base_model.output)
x = layers.BatchNormalization()(x)

top_dropout_rate = 0.2
x = layers.Dropout(top_dropout_rate, name="top_dropout")(x)
# Single linear unit predicting the age normalised by maxAge
age_output = Dense(units=1, activation='linear', name='age_output')(x)

model = Model(inputs=inputs, outputs=age_output,  name="EfficientNet")
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
model.compile(optimizer=optimizer,   loss='mean_squared_error', metrics=['mae'])

In [25]:
# Endless generators: augmented batches for training, plain ones for validation
train_gen = get_data_generator_age(df, train_idx, True, False)
valid_gen = get_data_generator_age(df, valid_idx, False, False)

# Use the reduced epoch budget in SMALL_DATASET mode (EPOCHS is only defined
# in that mode)
epochs = EPOCHS if MODE == "SMALL_DATASET" else 25
# Model.fit accepts Python generators directly; fit_generator is deprecated
history = model.fit(train_gen,
                    steps_per_epoch=steps_train,
                    epochs=epochs,
                    validation_data=valid_gen,
                    validation_steps=steps_valid)
plot_hist(history)

In [28]:
def unfreeze_model(model, loss='mean_squared_error', metrics=('mae',)):
    """Make the top 20 layers of ``model`` trainable and recompile it for fine-tuning.

    BatchNormalization layers among those 20 are left untouched. The model is
    recompiled with Adam at a learning rate of 1e-4.

    Args:
        model: Keras model to modify in place.
        loss: loss passed to ``model.compile``.
        metrics: metric names passed to ``model.compile``.
    """
    # We unfreeze the top 20 layers while leaving BatchNorm layers frozen
    for layer in model.layers[-20:]:
        if not isinstance(layer, layers.BatchNormalization):
            layer.trainable = True

    # Changes to `trainable` only take effect after compiling again
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(optimizer=optimizer, loss=loss, metrics=list(metrics))


unfreeze_model(model)

# Use the reduced epoch budget in SMALL_DATASET mode (EPOCHS is only defined
# in that mode)
epochs = EPOCHS if MODE == "SMALL_DATASET" else 10  # @param {type: "slider", min:8, max:50}
# Model.fit accepts Python generators directly; fit_generator is deprecated
hist = model.fit(train_gen,
                 steps_per_epoch=steps_train,
                 epochs=epochs,
                 validation_data=valid_gen,
                 validation_steps=steps_valid)
plot_hist(hist)

In [30]:
# Number of batches needed to cover the test set
steps_test = get_steps(len(test_idx), BATCH_SIZE)

# A test-mode generator stops after one pass, so a fresh one is built for
# predict() and another for evaluate()
test_gen = get_data_generator_age(df, test_idx, is_train=False, is_test=True)
age_predictions_norm = model.predict(test_gen, steps=steps_test)

test_gen = get_data_generator_age(df, test_idx, is_train=False, is_test=True)
age_predictions_evaluate = model.evaluate(test_gen, steps=steps_test)

In [32]:
# Undo the age / maxAge normalisation applied by get_data_generator_age so the
# predictions are expressed in years
age_predictions = age_predictions_norm*maxAge

In [37]:
# Pair every prediction with the test row at the same position of test_idx
ages_real, file_names_age, ages_pred = [], [], []
for position, prediction in enumerate(age_predictions):
    row = df.iloc[test_idx[position]]
    ages_real.append(row['age'])
    file_names_age.append(row['file_name'])
    # The model has a single output unit, so each prediction holds one value
    ages_pred.append(prediction[0])

dataset_images_pred = {'age': ages_real,
                       'age_pred': ages_pred,
                       'file_name': file_names_age}

df_pred = pd.DataFrame(data=dataset_images_pred)

In [38]:
# Random sample of test images titled with the real age and the predicted age
preview_random_images(df_pred, True, with_age=True)