# Face classifier - Exploratory Data Analysis

## Set up

In [None]:
# 3rd party imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Local imports
from facecls import fcaux

## Load data

In [None]:
# Read the raw dataset into a DataFrame. The path is relative, so this
# presumably expects the notebook to be run from the project root (TODO confirm).
data = pd.read_csv("data/age_gender.csv")

## EDA

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

In [None]:
# Preview the first rows to see which columns are available.
data.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values early.
data.info()

In [None]:
# Plot the distribution of the three labels (age, ethnicity, gender) side by
# side and save the figure to disk before showing it.
sns.set_style("whitegrid")

fig, axs = plt.subplots(1, 3, figsize=(12, 4))

# Age is treated as numeric: histogram with 30 bins over the range 0-120.
sns.histplot(data=data, x="age", bins=30, binrange=(0, 120), ax=axs[0])
axs[0].set_title("Age distribution")

# Ethnicity and gender are categorical codes: one bar per category.
categorical_panels = (
    ("ethnicity", "Ethnicity distribution"),
    ("gender", "Gender distribution"),
)
for panel_ax, (column, panel_title) in zip(axs[1:], categorical_panels):
    sns.countplot(data=data, x=column, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.savefig("results/label_distributions.png")
plt.show()

In [None]:
# Show a 3x5 grid of example faces: five consecutive rows of the dataset
# starting at positions 0, 2000 and 4000 (one starting position per grid row).
fig, axs = plt.subplots(3, 5, figsize=(10, 6))

for row_no, ax_row in enumerate(axs):
    for col_no, ax in enumerate(ax_row):
        img_idx = row_no * 2000 + col_no
        # The fcaux helpers turn the stored pixel string of row img_idx into
        # an array that imshow can render.
        pxl_vec = fcaux.pxlstring2pxlvec(data, img_idx)
        img = fcaux.pxlvec2pxlarray(pxl_vec)
        ax.imshow(img, interpolation="nearest", cmap="gray")
        ax.axis("off")
        ax.set_title(f"Image #{img_idx}")

fig.suptitle("Example images")
plt.show()

In [None]:
# Draw 15 random rows; the fixed random_state makes the draw reproducible.
sample_imgs = data.sample(n=15, random_state=42)

In [None]:
# Display the sampled rows (labels and original index values).
sample_imgs

In [None]:
# Show the 15 randomly sampled faces in a 3x5 grid and save the figure.
#
# Ethnicity label codes:
# 0 = Caucasian
# 1 = Black
# 2 = Asian
# 3 = Indian
# 4 = Latino

fig, axs = plt.subplots(3, 5, figsize=(10, 6))

# axs.flat walks the grid row by row, so pairing it with the sample's index
# gives each subplot exactly one sampled row. The same index label is used
# for both the image lookup and the title, so the title always names the
# image that is actually drawn (previously the title used i*3+j while the
# image used i*5+j, mislabelling most of the second and third rows).
for ax, img_idx in zip(axs.flat, sample_imgs.index):
    img = fcaux.pxlvec2pxlarray(fcaux.pxlstring2pxlvec(sample_imgs, img_idx))
    ax.imshow(img, interpolation="nearest", cmap="gray")
    ax.axis("off")
    ax.set_title(f"Image #{img_idx}")

fig.suptitle("Example images")
plt.savefig("results/random_face_images.png")
plt.show()

In [None]:
# Exploratory cell: try out a transformation of the age feature to see
# whether its histogram ends up closer to a normal distribution.
trf1 = data["age"] ** 0.5
# Reciprocal of the centred square root.
trf2 = 1 / (trf1 - trf1.mean())
data["age_trf"] = trf2 * (1 + np.exp(-(trf2 ** 2)))

# Raw ages on the left, transformed ages (in red) on the right.
fig, axs = plt.subplots(1, 2, figsize=(7, 3))
raw_ax, trf_ax = axs
data["age"].hist(ax=raw_ax, bins=50, alpha=0.5)
data["age_trf"].hist(ax=trf_ax, bins=50, alpha=0.5, color="red")
plt.show()

In [None]:
# Bucket ages into bins of width age_diff and store each bin's lower bound in
# "age_cat". Bins cover [0, 130); ages outside that range are left unassigned.
age_diff = 10
ages = data["age"]
for lower in range(0, 130, age_diff):
    in_bin = (ages >= lower) & (ages < lower + age_diff)
    data.loc[in_bin, "age_cat"] = lower

In [None]:
# Give the binned-age column its final name (modifies the DataFrame in place).
data.rename(columns={"age_cat": "age_decades"}, inplace=True)

In [None]:
# Keep only the columns that go into the preprocessed file, in this order;
# any other column is dropped.
preproc_columns = [
    "age",
    "age_decades",
    "ethnicity",
    "gender",
    "img_name",
    "pixels",
]
data = data[preproc_columns]

In [None]:
# Write the preprocessed table to disk; index=False leaves the row index out
# of the CSV.
data.to_csv("data/age_gender_preproc.csv", index=False)