# Face classifier

## Set up

In [None]:
# Stdlib imports
from pathlib import Path
import os

# 3rd party imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score 
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, f1_score
from sklearn.metrics import r2_score

from IPython.display import display

# Local imports
from facecls import fcaux

## Configurations

In [None]:
def set_seeds(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy and TensorFlow with ``seed`` and
    sets the ``PYTHONHASHSEED`` and ``TF_DETERMINISTIC_OPS`` environment
    variables.

    Parameters
    ----------
    seed : int, optional
        Value handed to every generator (default 42).
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })


# Global seed, reused below for the train/validation/test splits.
seed = 42
set_seeds(seed)

In [None]:
# Name of the column in `data` that the model predicts. It also determines the
# output folder name and whether the regression or the classification code
# path is taken in the cells below.
target = "gender"

In [None]:
# Every run gets its own folder "alexnet_<target>_<NNN>" below models_dir.
models_dir = Path(f"results/models/{target.title()}Classifier/")

# The run id is whatever follows the LAST underscore of the folder name.
# Parsing the folder name (not the whole path) from the right keeps this
# correct for targets that themselves contain underscores (e.g. "age_cat");
# entries without a numeric id are ignored instead of resetting the counter.
existing_ids = [
    int(entry.name.rsplit("_", 1)[-1])
    for entry in models_dir.glob('alexnet*')
    if entry.name.rsplit("_", 1)[-1].isdecimal()
]
last_model_id = max(existing_ids, default=0)

print("Last model id:", last_model_id)

new_model_id = last_model_id + 1
file_suffix = f"alexnet_{target}_{str(new_model_id).zfill(3)}"
new_model_dir = models_dir / file_suffix
print(f"Creating folder \"{new_model_dir}\"...")
new_model_dir.mkdir(parents=True, exist_ok=True)

## Load data

In [None]:
# Load the preprocessed data set. Later cells read the `target` column as well
# as the "age", "ethnicity", "gender" and "age_decades" columns from it.
data = pd.read_csv("data/age_gender_preproc.csv")

## Challenger model: CNN classifier

In [None]:
# First convert every row of `data` with fcaux.pxlstring2pxlvec, then convert
# each resulting vector with fcaux.pxlvec2pxlarray (presumably pixel string ->
# flat pixel vector -> 2-D image array; verify against facecls.fcaux).
n_rows = data.shape[0]
full_img_vec_list = np.array(
    [fcaux.pxlstring2pxlvec(data, row) for row in range(n_rows)]
)
full_img_array_list = np.array(
    [fcaux.pxlvec2pxlarray(pixel_vec) for pixel_vec in full_img_vec_list]
)

### Data split

In [None]:
# Positional index of every image; the splits below operate on these indices.
indeces = list(range(full_img_array_list.shape[0]))

In [None]:
# Targets containing "age" (except "age_cat") are handled as regression targets
# and split purely at random; every other target is split with stratification
# on its label column.
is_regression = "age" in target and target != "age_cat"
labels = data[target].values

# 80 % train+validation / 20 % test ...
idx_train, idx_test = train_test_split(
    indeces,
    test_size=0.2,
    stratify=None if is_regression else labels,
    random_state=seed,
)

# ... then 10 % of the remaining samples become the validation set.
idx_train, idx_val = train_test_split(
    idx_train,
    test_size=0.1,
    stratify=None if is_regression else labels[idx_train],
    random_state=seed,
)

X_train, y_train = full_img_array_list[idx_train], labels[idx_train]
X_val, y_val = full_img_array_list[idx_val], labels[idx_val]
X_test, y_test = full_img_array_list[idx_test], labels[idx_test]

# Attributes of the test samples, kept for the per-group analysis.
test_attrs = data[["age", "ethnicity", "gender"]].iloc[idx_test]

In [None]:
# Report the number of samples in each split.
for label, subset in (("#training:", X_train),
                      ("#validation:", X_val),
                      ("#test:", X_test)):
    print(label, len(subset))

In [None]:
# Persist the row indices of each split so the run can be reproduced.
# The three splits have different lengths; building the table from nullable
# integer Series lets pandas fill the shorter columns with <NA>. The index
# lists themselves are left untouched (no NaN padding), so they remain valid
# integer indexers for later cells.
idx_df = pd.DataFrame({
    "train_idx": pd.Series(idx_train, dtype="Int64"),
    "val_idx": pd.Series(idx_val, dtype="Int64"),
    "test_idx": pd.Series(idx_test, dtype="Int64"),
})

idx_df.to_csv(new_model_dir / f"data_set_indices__{file_suffix}.csv", index=False)

### Data preprocessing

In [None]:
# Append a trailing axis of length 1 (the model input shape is (48, 48, 1)).
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]
X_test = X_test[..., np.newaxis]

In [None]:
# Cast the image tensors to float32 and divide them by 255
# (presumably 8-bit grey values being mapped to [0, 1] -- TODO confirm).
X_train = X_train.astype("float32")
X_val = X_val.astype("float32")
X_test = X_test.astype("float32")
X_train /= 255
X_val /= 255
X_test /= 255

# NOTE(review): `age_diff` is not defined anywhere in the visible notebook, so
# this branch raises NameError as soon as target == "age_cat". The other cells
# treat "age_cat" as a categorical target, so rescaling its labels looks
# unintended; presumably this was meant for the regression "age" target --
# confirm before enabling that target.
if target == "age_cat":
    y_train /= age_diff
    y_val /= age_diff
    y_test /= age_diff

In [None]:
# Classification targets: count the classes and one-hot encode the training
# and validation labels (y_test keeps its original integer labels).
# Regression targets ("age*" except "age_cat") are marked by num_classes = 0.
if target == "age_cat" or "age" not in target:
    num_classes = data[target].nunique()
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_val = keras.utils.to_categorical(y_val, num_classes)
else:
    num_classes = 0

print("num_classes =", num_classes)

### Building the CNN model (small AlexNet-style architecture)

In [None]:
def create_alex_net_model(num_classes):
    """Build and compile the convolutional network used in this notebook.

    The network takes 48x48 single-channel images and applies three
    convolutions (16, 32 and 64 filters, each followed by ReLU) with a 2x2
    max-pooling step after the second and the third one, followed by two
    128-unit ReLU dense layers and a task-specific output layer.

    Parameters
    ----------
    num_classes : int
        Selects output layer, loss and metrics:

        * ``2``: ``num_classes`` sigmoid units, binary cross-entropy, accuracy
        * ``> 2``: ``num_classes`` softmax units, categorical cross-entropy,
          accuracy
        * anything else: one linear unit, mean squared error, with R2, MSE
          and MAPE as metrics

    Returns
    -------
    The Keras ``Model``, compiled with an Adam optimizer in its default
    configuration.
    """
    # Decide on the task-specific pieces first.
    if num_classes == 2:
        out_units, out_activation = num_classes, "sigmoid"
        loss = "binary_crossentropy"
        metrics_list = ["accuracy"]
    elif num_classes > 2:
        out_units, out_activation = num_classes, "softmax"
        loss = "categorical_crossentropy"
        metrics_list = ["accuracy"]
    else:
        out_units, out_activation = 1, "linear"
        loss = "mean_squared_error"
        metrics_list = ["r2_score", "mean_squared_error", "mean_absolute_percentage_error"]

    # Convolutional feature extractor.
    input_img = Input(shape=(48, 48, 1))
    x = Conv2D(16, (3, 3), padding="same", strides=(1, 1))(input_img)
    x = Activation("relu")(x)
    x = Conv2D(32, (5, 5), strides=(1, 1))(x)
    x = Activation("relu")(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Conv2D(64, (5, 5), strides=(1, 1))(x)
    x = Activation("relu")(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    # Fully connected part and output layer.
    x = Flatten()(x)
    x = Dense(128, activation="relu")(x)
    x = Dense(128, activation="relu")(x)
    out = Dense(out_units, activation=out_activation)(x)

    model = Model(input_img, out)
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=loss,
                  metrics=metrics_list)
    return model

In [None]:
# Re-seed right before building the network (set_seeds() defaults to 42, the
# same value as `seed`), then create the compiled model for this target.
set_seeds()
alex_net = create_alex_net_model(num_classes)

In [None]:
# Print the layer-by-layer summary of the model.
alex_net.summary()

### Train model until overfitting

In [None]:
# Deliberately long first run; its validation-loss curve is used afterwards to
# choose the number of epochs for the refit.
n_epochs = 10
an_history = alex_net.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=n_epochs,
    shuffle=True,
)

In [None]:
# Store the model from the first run in this run's folder; the file name
# records the number of epochs it was trained for.
alex_net.save(new_model_dir / f'{file_suffix}__nepochs{n_epochs}.keras')

In [None]:
# Export the per-epoch training history of the first run as CSV.
history = alex_net.history.history

# Every entry holds one value per epoch, so the first one gives the epoch count.
n_epochs = len(next(iter(history.values())))
history["epoch"] = list(range(1, n_epochs + 1))

# The metric names depend on the compiled model, so collect them dynamically
# and put the epoch column first.
other_columns = [name for name in history if name != "epoch"]

history_df = pd.DataFrame(history, columns=["epoch", *other_columns])
file_name = f'history__{file_suffix}__nepochs{n_epochs}.csv'
history_df.to_csv(new_model_dir / file_name, index=False)

In [None]:
# Training vs. validation curves of the first run: loss on top, accuracy below.
fig, axs = plt.subplots(2, 1, figsize=(5, 4), sharex=True)
curves = an_history.history

for ax, (key, y_label) in zip(axs, (("loss", "Loss"), ("accuracy", "Accuracy"))):
    ax.plot(curves[key], label="training")
    ax.plot(curves[f"val_{key}"], label="validation")
    ax.grid(True)
    ax.set_xticks(range(n_epochs))
    ax.set_ylabel(y_label)
    ax.legend(loc="best")

axs[0].set_title(f"AlexNet ({target.title()})")
axs[1].set_xlabel("Epochs")

plt.savefig(new_model_dir / f"loss_curve__{file_suffix}__nepochs{n_epochs}.png",
            bbox_inches='tight')
plt.show()

### Refit model with optimal number of epochs

Remark: the optimal number of epochs is the number of training epochs after which the validation loss reaches its minimum.

In [None]:
# Refit a fresh model from the same seed, stopping at the epoch with the
# lowest validation loss of the first run.
set_seeds()

# np.argmin returns the 0-based position of the best epoch, while `epochs`
# expects a count: the minimum at position k is reached after k + 1 epochs.
# (Without the + 1 a minimum in the first epoch would request zero epochs.)
n_epochs = int(np.argmin(an_history.history["val_loss"])) + 1
print(f"Optimal epoch: #{n_epochs}.")
alex_net_refit = create_alex_net_model(num_classes)

refit_history = alex_net_refit.fit(X_train, y_train,
                                   epochs=n_epochs,
                                   batch_size=32,
                                   validation_data=(X_val, y_val),
                                   shuffle=True
                                  )

alex_net_refit.save(new_model_dir / f'{file_suffix}__refit_nepochs{n_epochs}.keras')

In [None]:
# Collect the per-epoch history of the refit run in a data frame.
refit_history = alex_net_refit.history.history
n_epochs = len(next(iter(refit_history.values())))

# The epoch column belongs to the refit history itself. Writing it into the
# first run's `history` dict would leave this frame with an all-NaN "epoch"
# column and give `history` an epoch list of the wrong length.
refit_history["epoch"] = list(range(1, n_epochs + 1))

# The metric names depend on the compiled model, so collect them dynamically.
other_columns = [name for name in refit_history if name != "epoch"]

refit_history_df = pd.DataFrame(refit_history, columns=["epoch"] + other_columns)
file_name = f'history__{file_suffix}__refit__nepochs{n_epochs}.csv'
# The CSV export of the refit history is currently disabled.
#refit_history_df.to_csv(new_model_dir / file_name, index=False)

In [None]:
# Training vs. validation curves of the refit run: loss on top, accuracy below.
fig, axs = plt.subplots(2, 1, figsize=(5, 4), sharex=True)
epoch_ticks = range(n_epochs)

panels = (
    (axs[0], "loss", "val_loss", "Loss"),
    (axs[1], "accuracy", "val_accuracy", "Accuracy"),
)
for ax, train_key, val_key, y_label in panels:
    ax.plot(refit_history[train_key], label="training")
    ax.plot(refit_history[val_key], label="validation")
    ax.grid(True)
    ax.set_xticks(epoch_ticks)
    ax.set_ylabel(y_label)
    ax.legend(loc="best")

axs[0].set_title(f"AlexNet ({target.title()})")
axs[1].set_xlabel("Epochs")

plt.savefig(new_model_dir / f"loss_curve__{file_suffix}__refit_nepochs{n_epochs}.png",
            bbox_inches='tight')
plt.show()

In [None]:
# Model outputs for every split, plus the predicted class (index of the
# largest output per sample).
# NOTE(review): the predictions come from `alex_net` (the first, long run),
# not from `alex_net_refit` -- confirm which model is meant to be evaluated.
y_prob_train, y_prob_val, y_prob_test = (
    alex_net.predict(features) for features in (X_train, X_val, X_test)
)

y_pred_train = np.argmax(y_prob_train, axis=1)
y_pred_val = np.argmax(y_prob_val, axis=1)
y_pred_test = np.argmax(y_prob_test, axis=1)

In [None]:
def _binary_scores(y_true, y_pred, y_prob):
    """Return accuracy, balanced accuracy, ROC AUC and F1 for one split.

    ``y_true`` and ``y_pred`` are class labels; ROC AUC is computed from the
    second column of the model output ``y_prob``.
    """
    return {"accuracy": accuracy_score(y_true, y_pred),
            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
            "roc_auc": roc_auc_score(y_true, y_prob[:, 1]),
            "F1": f1_score(y_true, y_pred)}


# Training and validation labels are one-hot encoded at this point, so turn
# them back into class indices; y_test still holds the original labels.
y_train_labels = np.array([np.argmax(row) for row in y_train])
y_val_labels = np.array([np.argmax(row) for row in y_val])

train_metrics = _binary_scores(y_train_labels, y_pred_train, y_prob_train)
val_metrics = _binary_scores(y_val_labels, y_pred_val, y_prob_val)
test_metrics = _binary_scores(y_test, y_pred_test, y_prob_test)

metrics_df = pd.DataFrame({"train": train_metrics,
                           "val": val_metrics,
                           "test": test_metrics})

display(metrics_df)
metrics_df.to_csv(new_model_dir / f"metrics__{file_suffix}.csv")

In [None]:
# ROC curve of the test set, computed from the second output column, and its
# export as CSV.
fpr, tpr, thr = roc_curve(y_test, y_prob_test[:, 1])
roc_points = pd.DataFrame({"FPR": fpr, "TPR": tpr})
roc_points.to_csv(new_model_dir / f"fpr_vs_tpr__{file_suffix}.csv")

In [None]:
# Plot the test-set ROC curve against the diagonal; the title shows the AUC
# rounded to four decimals.
test_auc = np.round(roc_auc_score(y_test, y_prob_test[:, 1]), 4)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], ls="--", c="k")
ax.grid(True)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title(f"{target.title()} -- ROC AUC: {test_auc}")
plt.tight_layout()
plt.savefig(new_model_dir / f"roc_curve__{file_suffix}.png",
            bbox_inches='tight')
plt.show()

In [None]:
# Per-class precision/recall/F1 report for the test set, stored as CSV.
# The file holds CSV text, so it gets a ".csv" extension (it used to be written
# to a ".png" name with a typo in "classification").
cls_report = pd.DataFrame(classification_report(y_test, y_pred_test, output_dict=True))
cls_report.to_csv(new_model_dir / f"classification_report__{file_suffix}.csv")

### Analysis

In [None]:
# Rows of the test split, renumbered from 0 so that their index lines up with
# the rows of X_test / y_test. idx_test may have been padded with NaN when the
# split indices were exported, so those entries are dropped first.
valid_test_idx = [i for i in idx_test if not pd.isna(i)]
test_data = data.iloc[valid_test_idx].reset_index(drop=True)

In [None]:
# Map every ethnicity value found in the test set to the positions of its rows.
ethnicity_groups = {
    ethn_idx: list(test_data[test_data["ethnicity"] == ethn_idx].index)
    for ethn_idx in test_data["ethnicity"].unique()
}

In [None]:
# Accuracy and ROC AUC of the test predictions, separately per ethnicity group.
# Iterating over the groups actually present in the test set (instead of a
# hard-coded range(5)) avoids a KeyError for missing groups and covers data
# sets with a different number of groups.
# NOTE(review): predictions come from `alex_net`, and ROC AUC is computed from
# the hard class predictions rather than from the output scores -- confirm
# that both are intended.
performance_by_ethnicity = dict()
for ethn in sorted(ethnicity_groups):
    rows = ethnicity_groups[ethn]
    y_true_ethn = y_test[rows]
    y_prob_ethn = alex_net.predict(X_test[rows])
    y_pred_ethn = np.argmax(y_prob_ethn, axis=1)
    acc = accuracy_score(y_true_ethn, y_pred_ethn)
    try:
        rocauc = roc_auc_score(y_true_ethn, y_pred_ethn)
    except ValueError:
        # ROC AUC is undefined when a group contains a single class only.
        rocauc = np.nan

    performance_by_ethnicity[ethn] = {"accuracy": acc,
                                      "ROC_AUC": rocauc}

# Convert to data frame for easier plotting
performance_by_ethnicity_df = pd.DataFrame(performance_by_ethnicity).transpose()

In [None]:
# Map every "age_decades" value found in the test set to the positions of its
# rows.
age_groups = {
    age_idx: list(test_data[test_data["age_decades"] == age_idx].index)
    for age_idx in test_data["age_decades"].unique()
}

In [None]:
# Accuracy and ROC AUC of the test predictions, separately per age decade.
# NOTE(review): predictions come from `alex_net`, and ROC AUC is computed from
# the hard class predictions -- confirm that both are intended.
performance_by_age = dict()
for age_idx in test_data["age_decades"].unique():
    rows = age_groups[age_idx]
    y_true_age = y_test[rows]
    y_prob_age = alex_net.predict(X_test[rows])
    y_pred_age = np.argmax(y_prob_age, axis=1)
    acc = accuracy_score(y_true_age, y_pred_age)
    try:
        rocauc = roc_auc_score(y_true_age, y_pred_age)
    except ValueError:
        # Raised e.g. when the group contains a single class only.
        rocauc = np.nan

    performance_by_age[age_idx] = {"accuracy": acc, "ROC_AUC": rocauc}

# One row per age decade, in ascending order.
sorted_columns = sorted(performance_by_age)
performance_by_age_df = pd.DataFrame(performance_by_age,
                                     columns=sorted_columns).transpose()

In [None]:
# Side-by-side bar charts of the per-group metrics: ethnicity on the left,
# age decade on the right, sharing the y axis.
# NOTE(review): with a single row of axes, "hspace" (vertical spacing) has no
# visible effect; "wspace" may have been intended -- confirm.
fig, axs = plt.subplots(1,2, figsize=(10,4), sharey=True, gridspec_kw = {"hspace": 0.07})
ax = axs[0]
performance_by_ethnicity_df.plot(kind="bar", ax = ax, grid=True, legend=False)
ax.set_xlabel("Ethnicity")
ax.set_ylabel("Metric")

ax = axs[1]
performance_by_age_df.plot(kind="bar", ax = ax, grid=True)
ax.set_xlabel("Age (decade)")
# One shared legend, placed below and between the two panels.
ax.legend(loc="center", bbox_to_anchor=(-0.15,-0.2), ncol=2)

fig.suptitle("Classification performance by ethnicity and age group")

# Explicit ".png" extension, consistent with every other figure saved by this
# notebook (previously the format was left to matplotlib's default).
plt.savefig(new_model_dir / f"cls_performance_analysis__{file_suffix}.png",
            bbox_inches='tight')
plt.show()