# Face classifier - Baseline Model

## Setup

In [None]:
# Stdlib imports
from pathlib import Path

# 3rd party imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss, accuracy_score, classification_report, roc_curve, roc_auc_score

# Local imports
from facecls import fcaux

In [None]:
# Single seed shared by every random number generator used in this notebook,
# so that repeated runs produce the same splits and model initialisation.
seed = 42
for seed_rng in (np.random.seed, random.seed):
    seed_rng(seed)

In [None]:
# Name of the data-set column used as the prediction label. It also drives the
# choice of split strategy and the naming of the output folder and files.
target: str = "gender"

In [None]:
# Root folder holding one sub-folder per training run for the current target.
models_dir = Path(f"results/models/{target.title()}Classifier/")

# Run folders are named "mlp_<target>_<NNN>". The id is parsed from the END of
# the folder NAME only: splitting the whole path on "_" picks the wrong field
# as soon as the target (e.g. "age_cat") or a parent directory contains an
# underscore, which previously reset the id to 0 and reused run 001.
existing_ids = []
for folder in models_dir.glob("mlp*"):
    id_part = folder.name.rsplit("_", 1)[-1]
    # Skip stray files and folders that do not end in a numeric run id.
    if folder.is_dir() and id_part.isdecimal():
        existing_ids.append(int(id_part))

# No previous run (or a missing models_dir) yields id 0.
last_model_id = max(existing_ids, default=0)

print("Last model id:", last_model_id)

new_model_id = last_model_id + 1
file_suffix = f"mlp_{target}_{new_model_id:03d}"
new_model_dir = models_dir / file_suffix
print(f"Creating folder \"{new_model_dir}\"...")
new_model_dir.mkdir(parents=True, exist_ok=True)

## Load data

In [None]:
# Read the pre-processed data set; later cells use its "age", "ethnicity" and
# "gender" columns plus the column named by `target`.
data_csv = "data/age_gender_preproc.csv"
data = pd.read_csv(data_csv)

## Baseline model: Multilayer perceptron classifier

In [None]:
# Convert every row of `data` into a pixel vector with the project helper and
# stack the results into one array (one entry per row, in row order).
row_numbers = range(len(data))
full_img_vec_list = np.array(
    [fcaux.pxlstring2pxlvec(data, row_number) for row_number in row_numbers]
)

### Data split

In [None]:
# Positional indices 0..N-1 of all samples, kept as a plain Python list so the
# train/validation/test split can be expressed (and stored) as index lists.
n_samples = len(full_img_vec_list)
indeces = [position for position in range(n_samples)]

In [None]:
# Split the sample indices into train / validation / test.
# Targets containing "age" (except the categorical "age_cat") are split without
# stratification and hold out 10% of the training part for validation; every
# other target is split stratified on its labels and holds out 20%.
target_values = data[target].values
plain_split = "age" in target and target != "age_cat"

idx_train, idx_test = train_test_split(
    indeces,
    test_size=0.2,
    stratify=None if plain_split else target_values,
    random_state=seed,
)

# The right-hand side still sees the train indices of the first split.
idx_train, idx_val = train_test_split(
    idx_train,
    test_size=0.1 if plain_split else 0.2,
    stratify=None if plain_split else target_values[idx_train],
    random_state=seed,
)

# Features and labels for each partition.
X_train, y_train = full_img_vec_list[idx_train], target_values[idx_train]
X_val, y_val = full_img_vec_list[idx_val], target_values[idx_val]
X_test, y_test = full_img_vec_list[idx_test], target_values[idx_test]

# Attributes of the test samples, kept alongside the test predictions.
test_attrs = data[["age", "ethnicity", "gender"]].iloc[idx_test]

In [None]:
# Persist which sample indices went into each split so the run can be reproduced.
# The three splits have different lengths. Wrapping each one in a
# nullable-integer Series lets pandas align them on a common row index and fill
# the shorter columns with <NA>, without padding (and thereby corrupting) the
# idx_val / idx_test lists in place.
idx_df = pd.DataFrame({"train_idx": pd.Series(idx_train, dtype="Int64"),
                       "val_idx": pd.Series(idx_val, dtype="Int64"),
                       "test_idx": pd.Series(idx_test, dtype="Int64")})

idx_df.to_csv(new_model_dir / f"data_set_indices__{file_suffix}.csv", index=False)

In [None]:
# Baseline network: one hidden layer with 1024 units, trained epoch by epoch
# so that loss and accuracy can be recorded after every pass.
# NOTE(review): max_iter / warm_start presumably have no effect when training
# exclusively through partial_fit - confirm before relying on them.
mlp = MLPClassifier(hidden_layer_sizes = (1024,),
                           activation='relu', 
                           solver='adam', 
                           tol = 0.01,
                           batch_size = 32,
                           learning_rate='adaptive',
                           verbose = True,
                           shuffle = True,
                           max_iter=1,
                           warm_start=True,
                           random_state = seed)

# Per-epoch learning-curve history.
training_losses = []
validation_losses = []
training_accuracies = []
validation_accuracies = []

# Every label value of the full data set. Computed once (it does not change
# between epochs) and also handed to log_loss so that the loss is well defined
# even if a split happens to miss one of the classes.
all_classes = np.unique(data[target].values)

n_epochs = 20
for epoch in range(n_epochs):
    # Fit the model for one epoch
    mlp.partial_fit(X_train, y_train, classes=all_classes)

    # Calculate the training loss
    train_prob = mlp.predict_proba(X_train)
    train_loss = log_loss(y_train, train_prob, labels=all_classes)
    training_losses.append(train_loss)

    # Calculate the validation loss
    val_prob = mlp.predict_proba(X_val)
    val_loss = log_loss(y_val, val_prob, labels=all_classes)
    validation_losses.append(val_loss)

    # Calculate the training accuracy
    train_pred = mlp.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_pred)
    training_accuracies.append(train_accuracy)

    # Calculate the validation accuracy
    val_pred = mlp.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_pred)
    validation_accuracies.append(val_accuracy)

    # Report all four metrics (validation accuracy used to be computed but
    # left out of the log line).
    print(f'Epoch {epoch + 1}/{n_epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f} - Training Accuracy: {train_accuracy:.4f} - Validation Accuracy: {val_accuracy:.4f}')


In [None]:
# Learning curves of the baseline MLP: loss (top) and accuracy (bottom) per
# epoch for the training and validation sets.
# One tick per recorded epoch. Derived from the history length instead of a
# hard-coded 20, so the axis stays correct when the number of epochs changes.
epoch_ticks = range(len(training_losses))

fig, axs = plt.subplots(2,1, figsize=(5,4), sharex=True)
ax = axs[0]
ax.plot(training_losses, label="training")
ax.plot(validation_losses, label="validation")
ax.grid(True)
ax.set_xticks(epoch_ticks)
ax.set_ylabel("Loss")
ax.set_title("1 hidden layer with 1024 neurons")
ax.legend(loc="best")

ax = axs[1]
ax.plot(training_accuracies, label="training")
ax.plot(validation_accuracies, label="validation")
ax.grid(True)
ax.set_xticks(epoch_ticks)
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend(loc="best")

# Save before show(): the figure may be cleared once it has been displayed.
plt.savefig("results/loss_curve_1x1024_mlp.png")
plt.show()

In [None]:
# Evaluate the trained baseline on the held-out test set: per-class
# probabilities and the corresponding hard class predictions.
y_prob, y_pred = mlp.predict_proba(X_test), mlp.predict(X_test)

In [None]:
# ROC operating points of the baseline, scored with the second probability
# column (assumes a two-class target - TODO confirm for non-binary targets).
second_class_prob = y_prob[:, 1]
fpr, tpr, thr = roc_curve(y_test, second_class_prob)

In [None]:
# Draw the ROC curve against the diagonal reference line and show the
# rounded area under the curve in the title.
auc_rounded = np.round(roc_auc_score(y_test, y_prob[:, 1]), 4)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], ls="--", c="k")
ax.grid(True)
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title(f"ROC AUC: {auc_rounded}")
plt.show()

In [None]:
# Store the ROC operating points (with the default row index) for later use.
roc_table = pd.DataFrame({"FPR": fpr, "TPR": tpr})
roc_table.to_csv("results/mlp_roc.csv")

In [None]:
# Tabulate the classification report of the baseline test predictions;
# the table is the cell's displayed result.
baseline_report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(baseline_report)

In [None]:
# Search space of the first randomized hyper-parameter search.
# Single-layer candidates are written as one-element tuples: "(32)" is just the
# integer 32, which was inconsistent with the tuple-valued entries next to it
# and with how layer sizes are specified elsewhere in this notebook.
parameters = {'hidden_layer_sizes': [(32,), (64,), (128,), (256,), (512,),
                                     (32, 32), (64, 64), (128, 128), (256, 128), (512, 256),
                                     ],
              'alpha': [0.001, 0.01, 0.1],
              'max_iter': [10, 50, 100],
              'learning_rate_init': [0.001, 0.01, 0.1],
              'batch_size': [16, 32]}

# Settings shared by all candidates; the searched parameters override the rest.
base_model = MLPClassifier(activation='relu',
                           solver='adam',
                           tol=0.01,
                           learning_rate='adaptive',
                           verbose=True,
                           shuffle=True,
                           n_iter_no_change=5,
                           random_state=seed)

# 5 sampled parameter settings, each evaluated with 3-fold cross-validation.
cv_model = RandomizedSearchCV(estimator=base_model,
                              param_distributions=parameters,
                              cv=3,
                              random_state=seed,
                              verbose=True,
                              n_iter=5)

cv_model.fit(X_train, y_train)

In [None]:
# Training loss history of the best estimator found by the first search.
best_loss_history = cv_model.best_estimator_.loss_curve_
plt.plot(best_loss_history)

In [None]:
# Score of the best parameter setting found by the first randomized search
# (displayed as the cell result).
cv_model.best_score_

In [None]:
# One-row table with the winning hyper-parameters of the first search. Values
# are stringified so that tuple-valued entries fit into a single CSV cell.
base_best_params = pd.DataFrame({k: str(v) for k,v in cv_model.best_params_.items()}, index=[0])
# Written below "results/" like every other artefact of this notebook; the
# former "../results/" pointed outside the working directory used for the
# data, model and figure paths.
base_best_params.to_csv("results/base_model__best_params.csv", index=False)

In [None]:
# Hard class predictions of the first search's best estimator on the test set
# (replaces the baseline's y_pred).
best_search_mlp = cv_model.best_estimator_
y_pred = best_search_mlp.predict(X_test)

In [None]:
# Classification report of the first search's best estimator on the test set,
# shown as a table (the cell's displayed result).
search_report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(search_report)

In [None]:
# Exploiting universal approximation theorem (UAT) for wide, shallow networks
wide_shallow = [(2**n,) for n in range(4,15)]

# Exploiting universal approximation theorem (UAT) for narrow, deep networks
narrow_deep  = [tuple(i*[64]) for i in range(2,5)]

parameters = {'hidden_layer_sizes': wide_shallow + narrow_deep,
              'alpha': [0.001, 0.01, 0.1], 
              'max_iter': [10, 50, 100], 
              'learning_rate_init':[0.001, 0.01, 0.1],
              'batch_size': [16, 32]}

base_model = MLPClassifier(activation='relu', 
                           solver='adam', 
                           tol = 0.01,
                           learning_rate='adaptive',
                           verbose = True,
                           shuffle = True,
                           n_iter_no_change = 5,
                           random_state = seed)

cv_model2 = RandomizedSearchCV(estimator=base_model, 
                              param_distributions=parameters,
                              cv = 3,
                              random_state = seed,
                              verbose = True,
                              n_iter = 10)

cv_model2.fit(X_train, y_train)

In [None]:
# One-row table with the winning hyper-parameters of the second search and the
# score they reached. Values are stringified so that tuple-valued entries fit
# into a single CSV cell.
base_best_params2 = pd.DataFrame({k: str(v) for k,v in cv_model2.best_params_.items()}, index=[0])
base_best_params2["score"] = cv_model2.best_score_
# Written below "results/" like every other artefact of this notebook; the
# former "../results/" pointed outside the working directory used for the
# data, model and figure paths.
base_best_params2.to_csv("results/base_model2__best_params.csv", index=False)

In [None]:
# Training loss of the second search's best estimator, with epochs numbered
# from 1.
loss_history = cv_model2.best_estimator_.loss_curve_
epoch_numbers = range(1, len(loss_history) + 1)

fig, ax = plt.subplots()
ax.plot(epoch_numbers, loss_history)
ax.set_title("Loss curve")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss value")
plt.show()

In [None]:
# Hard class predictions of the second search's best estimator on the test set
# (replaces the previous y_pred).
best_search2_mlp = cv_model2.best_estimator_
y_pred = best_search2_mlp.predict(X_test)

In [None]:
# Fraction of test samples classified correctly by the current y_pred
# (displayed as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# ROC AUC of the second search's best estimator on the test set.
# The score must be continuous (here the second probability column, as in the
# baseline evaluation); feeding hard class predictions collapses the curve to
# a single threshold and misstates the AUC.
# NOTE(review): assumes a two-class target - confirm for non-binary targets.
roc_auc_score(y_test, cv_model2.best_estimator_.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve of the second search's best estimator on the test set.
# Uses predicted probabilities (second column, as in the baseline evaluation)
# rather than hard class predictions, which would yield only one operating
# point between the two corners.
# NOTE(review): assumes a two-class target - confirm for non-binary targets.
test_scores = cv_model2.best_estimator_.predict_proba(X_test)[:, 1]
fpr, tpr, thr = roc_curve(y_test, test_scores)

plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], ls=":", c="k")