# Face classifier - Baseline Model

## Set up

In [None]:
# Stdlib imports
from pathlib import Path
from datetime import datetime as dt

# 3rd party imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score 
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, f1_score

# Local imports
from facecls import fcaux

In [None]:
# One fixed seed for every random number generator used in this notebook, so
# that data splits and model fits can be reproduced run-to-run.
seed = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(seed)

In [None]:
# Run configuration:
#   target     -- column of the data table that the classifier predicts
#   model_type -- short tag used as prefix for the output folder and file names
target, model_type = "gender", "logreg"

In [None]:
# Every run is stored in its own numbered folder:
#   results/models/<Target>Classifier/<model_type>_<target>_<NNN>/
models_dir = Path(f"results/models/{target.title()}Classifier/")


def _parse_run_id(folder):
    """Return the numeric id at the end of *folder*'s name, or None if absent.

    Only the last "_"-separated field of the folder *name* is inspected, so
    underscores in the parent path, the model type or the target (for example
    "age_cat") cannot shift the position of the id.
    """
    try:
        return int(folder.name.rsplit("_", 1)[-1])
    except ValueError:
        return None


# Collect the ids of all existing run folders for this model type and target;
# stray files and folders without a numeric suffix are ignored.
_found_ids = (
    _parse_run_id(folder)
    for folder in models_dir.glob(f"{model_type}_{target}_*")
    if folder.is_dir()
)
# default=0 covers the "no previous run" case without relying on an exception.
last_model_id = max((run_id for run_id in _found_ids if run_id is not None),
                    default=0)

print("Last model id:", last_model_id)

new_model_id = last_model_id + 1
file_suffix = f"{model_type}_{target}_{new_model_id:03d}"
new_model_dir = models_dir / file_suffix
print(f"Creating folder \"{new_model_dir}\"...")
new_model_dir.mkdir(parents=True, exist_ok=True)

## Load data

In [None]:
# Read the preprocessed data table; the path is relative to the working directory.
data_path = Path("data") / "age_gender_preproc.csv"
data = pd.read_csv(data_path)

## Baseline model 1: Logistic Regression

In [None]:
# Convert every row of `data` into a pixel vector via the project helper and
# stack the results into one array (row i of the array <-> row i of `data`).
n_images = data.shape[0]
pixel_vectors = [fcaux.pxlstring2pxlvec(data, row) for row in range(n_images)]
full_img_vec_list = np.array(pixel_vectors)

#### Data split

In [None]:
# Positional row indices 0..N-1; these are what gets split into train/val/test.
indeces = [*range(len(full_img_vec_list))]

In [None]:
# Split the row indices into train / validation / test:
#   20% of all rows -> test, then 10% of the remainder -> validation.
# Targets whose name contains "age" (other than "age_cat") are split without
# stratification -- presumably because they are continuous; every other target
# is stratified on its label so class proportions are kept in each split.
labels = data[target].values
unstratified = "age" in target and target != "age_cat"

# stratify=None is train_test_split's default, i.e. a plain random split.
idx_train, idx_test = train_test_split(
    indeces,
    test_size=0.2,
    stratify=None if unstratified else labels,
    random_state=seed,
)
idx_train, idx_val = train_test_split(
    idx_train,
    test_size=0.1,
    stratify=None if unstratified else labels[idx_train],
    random_state=seed,
)

# Materialise features and labels for each split.
X_train, y_train = full_img_vec_list[idx_train], labels[idx_train]
X_val, y_val = full_img_vec_list[idx_val], labels[idx_val]
X_test, y_test = full_img_vec_list[idx_test], labels[idx_test]

# Attributes of the test rows, kept alongside the test split.
test_attrs = data[["age", "ethnicity", "gender"]].iloc[idx_test]

In [None]:
# Report how many samples ended up in each split.
for split_label, split_X in (("#training:", X_train),
                             ("#validation:", X_val),
                             ("#test:", X_test)):
    print(split_label, len(split_X))

In [None]:
# Persist which row indices went into which split so the run can be reproduced.
# The three index lists have different lengths, so they are padded with NaN to
# a common length before being placed side by side in one table.


def _padded_copy(indices, length):
    """Return a copy of *indices* right-padded with NaN up to *length* items.

    A copy is returned on purpose: the original index lists stay free of NaN
    entries and remain usable for indexing after this cell has run.
    """
    return list(indices) + [np.nan] * (length - len(indices))


# Pad to the longest split rather than assuming the training split is longest.
n_rows = max(len(idx_train), len(idx_val), len(idx_test))

# The nullable "Int64" dtype keeps the indices integral despite the NaN padding.
idx_df = pd.DataFrame({"train_idx": _padded_copy(idx_train, n_rows),
                       "val_idx": _padded_copy(idx_val, n_rows),
                       "test_idx": _padded_copy(idx_test, n_rows)}, dtype="Int64")

idx_df.to_csv(new_model_dir / f"data_set_indices__{file_suffix}.csv", index=False)

In [None]:
# Baseline: logistic regression on the raw pixel vectors, default hyperparameters.
model = LogisticRegression(random_state=seed,
                           n_jobs=-1,
                           verbose=True
                           )

# Time the fit on the training split.
start = dt.now()
model.fit(X_train, y_train)
elapsed = dt.now() - start
# `elapsed` is a timedelta; convert it so the "s" suffix really denotes seconds.
print(f"Elapsed: {elapsed.total_seconds():.2f}s")

In [None]:
# Tune a random forest with a randomized search over tree count and tree depth.
# NOTE(review): this rebinds `model`, discarding the LogisticRegression fitted
# earlier, while `model_type` (and therefore the output folder name) still says
# "logreg" -- confirm which model this run is meant to record.
model = RandomForestClassifier(random_state=seed,
                               n_jobs=-1,
                               verbose=True
                               )

# Search space: 6 x 5 = 30 combinations.
param_dist = {"n_estimators": [10, 50, 100, 200, 500, 1000],
              "max_depth": [1, 2, 3, 4, 5]
              }

# 10 of the combinations are sampled (n_iter); each is scored with 5-fold CV.
rand_search = RandomizedSearchCV(model,
                                 param_distributions=param_dist,
                                 cv=5,
                                 n_iter=10,
                                 random_state=seed,
                                 n_jobs=-1,
                                 verbose=True
                                 )

# Time the whole search on the training split.
start = dt.now()
rand_search.fit(X_train, y_train)
elapsed = dt.now() - start
# `elapsed` is a timedelta; convert it so the "s" suffix really denotes seconds.
print(f"Elapsed: {elapsed.total_seconds():.2f}s")

In [None]:
# Continue with the best estimator found by the randomized search; from here on
# `model` is the tuned RandomForestClassifier.
# NOTE(review): presumably refit on all of X_train (scikit-learn's default
# refit=True) -- confirm.
model = rand_search.best_estimator_

In [None]:
# Class-probability estimates for the test split; column 1 is what the ROC
# computation below uses as the score.
y_prob_test = model.predict_proba(X_test)

In [None]:
# ROC curve on the test split, scored with the probability in column 1.
# Only the FPR/TPR pairs are written to disk; the thresholds stay in `thr`.
test_scores = y_prob_test[:, 1]
fpr, tpr, thr = roc_curve(y_test, test_scores)

roc_table = pd.DataFrame({"FPR": fpr, "TPR": tpr})
roc_table.to_csv(new_model_dir / f"fpr_vs_tpr__{file_suffix}.csv")

In [None]:
# Plot the test-set ROC curve against the diagonal and save it as a PNG in the
# run folder; the AUC (rounded to 4 decimals) goes into the title.
test_auc = np.round(roc_auc_score(y_test, y_prob_test[:, 1]), 4)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], linestyle="--", color="k")  # reference diagonal
ax.grid(True)
ax.set(xlabel="False Positive Rate",
       ylabel="True Positive Rate",
       title=f"ROC AUC: {test_auc}")
fig.savefig(new_model_dir / f"roc_curve__{file_suffix}.png",
            bbox_inches="tight")
plt.show()

In [None]:
# Hard label predictions and class probabilities for all three splits, in the
# order train, validation, test.
_splits = (X_train, X_val, X_test)

y_pred_train, y_pred_val, y_pred_test = (model.predict(X) for X in _splits)

y_proba_train, y_proba_val, y_proba_test = (model.predict_proba(X) for X in _splits)

In [None]:
# Per-class precision / recall / F1 on the test split, stored as a CSV table.
# NOTE(review): the file name below contains a typo ("classificationo"); it is
# left untouched because other code may look the file up under this exact name.
report_dict = classification_report(y_test, y_pred_test, output_dict=True)
cls_report = pd.DataFrame(report_dict)
cls_report.to_csv(new_model_dir / f"classificationo_report__{file_suffix}.csv")

In [None]:
def _score_split(y_true, y_pred, y_proba):
    """Return accuracy, balanced accuracy, ROC AUC and F1 for one data split.

    ROC AUC is computed from column 1 of *y_proba*; the other metrics use the
    hard predictions in *y_pred*.
    """
    return {"accuracy": accuracy_score(y_true, y_pred),
            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
            "roc_auc": roc_auc_score(y_true, y_proba[:, 1]),
            "F1": f1_score(y_true, y_pred)}


# Same four metrics for every split, so over-/under-fitting shows up side by side.
train_metrics = _score_split(y_train, y_pred_train, y_proba_train)
val_metrics = _score_split(y_val, y_pred_val, y_proba_val)
test_metrics = _score_split(y_test, y_pred_test, y_proba_test)

# One column per split, one row per metric.
metrics_df = pd.DataFrame({"train": train_metrics,
                           "val": val_metrics,
                           "test": test_metrics})

display(metrics_df)
metrics_df.to_csv(new_model_dir / f"metrics__{file_suffix}.csv")

In [None]:
# Store the winning hyperparameters of the randomized search as a one-row table.
# Values are stringified so every parameter ends up as plain text in the CSV.
best_params_as_text = {name: str(value)
                       for name, value in rand_search.best_params_.items()}
base_best_params = pd.DataFrame(best_params_as_text, index=[0])
base_best_params.to_csv(new_model_dir / f"best_params__{file_suffix}.csv", index=False)