### Imports

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.preprocessing.image import load_img

# Read Data

In [2]:
# Load every tab-separated annotation file (*.txt) from the dataset folder and
# stack them into a single frame with a fresh 0..n-1 index.
relative_path = os.path.join(os.getcwd(), 'data', 'AdienceBenchmarkGenderAndAgeClassification')

# os.listdir() returns entries in arbitrary order; sorting keeps the row order of
# `raw_data` (and therefore positional lookups and the seeded train/test split)
# the same from run to run and from machine to machine.
txt_data = [
    pd.read_csv(os.path.join(relative_path, file_name), sep='\t')
    for file_name in sorted(os.listdir(relative_path))
    if file_name.endswith('.txt')
]

raw_data = pd.concat(txt_data, ignore_index=True)

In [3]:
# Build the absolute path of the face image that belongs to every annotation row.
faces_root = os.path.join(relative_path, 'faces')

raw_data['img_folder_path'] = [
    os.path.join(faces_root, user_id) for user_id in raw_data['user_id']
]
raw_data['img_name'] = [
    'coarse_tilt_aligned_face.' + str(face_id) + '.' + original_image
    for face_id, original_image in zip(raw_data['face_id'], raw_data['original_image'])
]
raw_data['img_absolute_path'] = [
    os.path.join(folder, name)
    for folder, name in zip(raw_data['img_folder_path'], raw_data['img_name'])
]

In [None]:
# Sanity check: display one face image to confirm that the constructed paths resolve.
# A single positional lookup is used for all three fields so they are guaranteed
# to come from the same row.
sample_row = raw_data.iloc[5]
img_name = 'coarse_tilt_aligned_face.' + str(sample_row.face_id) + '.' + sample_row.original_image
img_path = os.path.join(relative_path, 'faces', sample_row.user_id, img_name)
img = load_img(img_path)
plt.imshow(img)
plt.show()

# Preprocessing dataset


1. Check whether the data contains missing values

In [4]:
# Count the missing values in each column.
raw_data.isna().sum()

user_id                 0
original_image          0
face_id                 0
age                     0
gender                779
x                       0
y                       0
dx                      0
dy                      0
tilt_ang                0
fiducial_yaw_angle      0
fiducial_score          0
img_folder_path         0
img_name                0
img_absolute_path       0
dtype: int64

In [5]:
# Discard the rows that have no gender label.
raw_data = raw_data.dropna(subset=['gender'])

In [6]:
# Gender codes: f = female, m = male, u = unidentified.
raw_data['gender'].value_counts()

f    9372
m    8120
u    1099
Name: gender, dtype: int64

In [7]:
# Remove the rows whose gender is 'u' (unidentified).
raw_data = raw_data.loc[raw_data['gender'] != 'u']

In [8]:
# Inspect the second prediction target: age.
raw_data['age'].unique()

array(['(25, 32)', '(38, 43)', '(4, 6)', '(60, 100)', '(15, 20)',
       '(48, 53)', '(8, 12)', '(0, 2)', '(38, 48)', '35', '3', '55', '58',
       '22', '13', '45', '36', '23', '(38, 42)', 'None', '(8, 23)',
       '(27, 32)', '57', '2', '29', '34', '42', '46'], dtype=object)

In [9]:
# Remove the rows whose age is the literal string 'None' (unknown).
raw_data = raw_data.loc[raw_data['age'] != 'None']

In [10]:
# Show the column dtypes and non-null counts that remain after filtering.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17452 entries, 0 to 19345
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             17452 non-null  object
 1   original_image      17452 non-null  object
 2   face_id             17452 non-null  int64 
 3   age                 17452 non-null  object
 4   gender              17452 non-null  object
 5   x                   17452 non-null  int64 
 6   y                   17452 non-null  int64 
 7   dx                  17452 non-null  int64 
 8   dy                  17452 non-null  int64 
 9   tilt_ang            17452 non-null  int64 
 10  fiducial_yaw_angle  17452 non-null  int64 
 11  fiducial_score      17452 non-null  int64 
 12  img_folder_path     17452 non-null  object
 13  img_name            17452 non-null  object
 14  img_absolute_path   17452 non-null  object
dtypes: int64(8), object(7)
memory usage: 2.1+ MB


In [11]:
# Keep only the columns needed to train and test the models.
model_columns = ['age', 'gender', 'x', 'y', 'dx', 'dy', 'img_absolute_path']
data = raw_data[model_columns]

In [12]:
# Encode the gender labels as integers: 'f' -> 0, 'm' -> 1.
# An explicit map + cast yields an int64 column directly, instead of relying on
# DataFrame.replace silently downcasting an object column (deprecated in recent
# pandas). The cast also fails loudly if an unexpected label is still present.
data = data.assign(gender=data['gender'].map({'f': 0, 'm': 1}).astype('int64'))

In [13]:
# List the distinct raw age labels that still have to be encoded.
data['age'].unique()

array(['(25, 32)', '(38, 43)', '(4, 6)', '(60, 100)', '(15, 20)',
       '(48, 53)', '(8, 12)', '(0, 2)', '(38, 48)', '35', '3', '55', '58',
       '22', '13', '45', '36', '23', '(38, 42)', '(8, 23)', '(27, 32)',
       '57', '2', '29', '34', '42', '46'], dtype=object)

In [14]:
# Map every raw `age` label (interval strings and stray single ages) onto one of
# the eight age buckets used as classification targets. Entries are grouped by
# target bucket so that each assignment is easy to audit.
age_mapping = [
    # 0-2
    ('(0, 2)', '0-2'), ('2', '0-2'), ('3', '0-2'),
    # 4-6
    ('(4, 6)', '4-6'),
    # 8-13
    ('(8, 12)', '8-13'), ('13', '8-13'),
    # 15-20
    ('(15, 20)', '15-20'), ('(8, 23)', '15-20'), ('22', '15-20'),
    # 25-32
    ('(25, 32)', '25-32'), ('(27, 32)', '25-32'), ('23', '25-32'),
    ('29', '25-32'), ('32', '25-32'), ('34', '25-32'),
    # 38-43 ('42' lies inside this interval; it was previously assigned to 48-53)
    ('(38, 43)', '38-43'), ('(38, 42)', '38-43'), ('35', '38-43'),
    ('36', '38-43'), ('42', '38-43'), ('45', '38-43'),
    # 48-53
    ('(48, 53)', '48-53'), ('(38, 48)', '48-53'), ('46', '48-53'),
    ('55', '48-53'), ('56', '48-53'),
    # 60+
    ('(60, 100)', '60+'), ('57', '60+'), ('58', '60+'),
]

# Lookup table: raw label -> bucket name.
age_mapping_dict = dict(age_mapping)

In [15]:
# Encode the age buckets as integer class ids, ordered from youngest to oldest.
age_dict =  {'0-2': 0, '4-6': 1, '8-13': 2, '15-20': 3, '25-32': 4, '38-43': 5, '48-53': 6, '60+': 7}

# Compose the two lookups (raw label -> bucket name -> class id) once, so the
# column can be encoded with a single vectorised map instead of a row-wise apply.
age_to_class = {raw_label: age_dict[bucket] for raw_label, bucket in age_mapping_dict.items()}

# Keep the fail-loud behaviour of a plain dict lookup: an unmapped label is an error.
unmapped_labels = set(data['age'].unique()) - set(age_to_class)
if unmapped_labels:
    raise KeyError(f"No age bucket defined for labels: {sorted(map(str, unmapped_labels))}")

# Replacing the whole column with an explicit int64 cast guarantees a numeric
# dtype; assigning through `.loc[:, 'age']` can leave the column as object dtype.
data = data.assign(age=data['age'].map(age_to_class).astype('int64'))

In [16]:
# Inspect the column dtypes after encoding age and gender.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17452 entries, 0 to 19345
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                17452 non-null  int64 
 1   gender             17452 non-null  int64 
 2   x                  17452 non-null  int64 
 3   y                  17452 non-null  int64 
 4   dx                 17452 non-null  int64 
 5   dy                 17452 non-null  int64 
 6   img_absolute_path  17452 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.1+ MB


# Train Test Split

Split the data into training and test sets. In our case we split the absolute paths to the image files rather than the images themselves. After the split, the images are preprocessed for the deep learning model.

In [18]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# The features are image paths (the pixels are loaded later); the target is the
# encoded gender column.
X_gender, y_gender = data['img_absolute_path'], data['gender']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# NOTE: `y_train_gedner` (sic) keeps its spelling because the training cell uses it.
(X_train_gender, X_test_gender,
 y_train_gedner, y_test_gender) = train_test_split(
    X_gender,
    y_gender,
    test_size=0.2,
    random_state=42,
)

# Image Preprocessing

TODO: describe why the input data is handled this way. The reason is how the images are laid out in the dataset.

In [40]:
from keras.preprocessing.image import img_to_array, load_img, smart_resize

def resize_array_img(img_path):
    """Load the image stored at `img_path`, resize it to 227x227 and return it as an array."""
    loaded_image = load_img(img_path)
    resized_image = smart_resize(loaded_image, (227, 227))
    return img_to_array(resized_image)

def load_image_batch(img_paths):
    """Load and resize every image in `img_paths` into one array.

    Args:
        img_paths: Iterable of image file paths.

    Returns:
        An array whose first axis indexes the images in the order given.
    """
    img_paths = list(img_paths)
    if not img_paths:
        return np.empty((0, 227, 227, 3), dtype='float32')

    # Pre-allocate the output and fill it image by image so that only one copy of
    # the pixel data is held in memory (roughly 0.6 MB per 227x227x3 image,
    # assuming img_to_array returns float32).
    first_image = resize_array_img(img_path=img_paths[0])
    batch = np.empty((len(img_paths),) + first_image.shape, dtype=first_image.dtype)
    batch[0] = first_image
    for position, img_path in enumerate(img_paths[1:], start=1):
        batch[position] = resize_array_img(img_path=img_path)
    return batch


# Load ALL images of each split. The previous loops stopped after the first image
# (leftover debugging `break`), which left one image for thousands of labels, and
# they produced Python lists rather than the arrays that model.fit expects.
train_images = load_image_batch(X_train_gender)
test_images = load_image_batch(X_test_gender)



# Gender Model

Function that builds the model

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense, LayerNormalization

def create_gender_model(input_shape=(227, 227, 3), num_classes=2):
    """Build the (uncompiled) convolutional classifier used for gender prediction.

    The network consists of three Conv2D -> MaxPooling2D -> LayerNormalization
    stages, followed by two 512-unit dense layers with dropout and a softmax
    output layer.

    Args:
        input_shape: Shape of a single input image as (height, width, channels).
            Defaults to (227, 227, 3).
        num_classes: Number of units in the softmax output layer. Defaults to 2.

    Returns:
        An uncompiled `Sequential` model.
    """
    return Sequential([
        # Convolutional feature extractor.
        Conv2D(input_shape=input_shape, filters=96, kernel_size=(7, 7), strides=4, padding='valid', activation='relu'),
        MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
        LayerNormalization(),
        Conv2D(filters=256, kernel_size=(5, 5), strides=1, padding='same', activation='relu'),
        MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
        LayerNormalization(),
        Conv2D(filters=256, kernel_size=(3, 3), strides=1, padding='same', activation='relu'),
        MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
        LayerNormalization(),
        Flatten(),
        # Fully connected classifier head.
        Dense(units=512, activation='relu'),
        Dropout(rate=0.25),
        Dense(units=512, activation='relu'),
        Dropout(rate=0.25),
        Dense(units=num_classes, activation='softmax'),
    ])

Early Stopping

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has not improved for 5 consecutive
# epochs, and roll the model back to the weights of its best epoch rather than
# keeping the weights of the last (worse) one.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

Model creation and summary

In [45]:
# Instantiate the gender classifier and print its layer-by-layer summary.
gender_model = create_gender_model()
gender_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 56, 56, 96)        14208     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 28, 28, 96)       0         
 )                                                               
                                                                 
 layer_normalization (LayerN  (None, 28, 28, 96)       192       
 ormalization)                                                   
                                                                 
 conv2d_1 (Conv2D)           (None, 28, 28, 256)       614656    
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 14, 14, 256)      0         
 2D)                                                             
                                                        

In [None]:
# The gender targets are integer class ids (0 = female, 1 = male), not one-hot
# vectors, so the sparse variants of the loss and the accuracy metric are needed
# to match them against the 2-unit softmax output.
gender_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["sparse_categorical_accuracy"],
)

In [None]:
# Train the gender classifier, holding out 10% of the training data for
# validation (the early-stopping callback monitors the validation loss).
gender_model.fit(
    x=train_images,
    y=y_train_gedner,
    batch_size=32,
    epochs=100,
    validation_split=0.1,
    callbacks=[early_stopping],
)