<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup-and-Overview" data-toc-modified-id="Setup-and-Overview-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup and Overview</a></span></li><li><span><a href="#Crop-Faces-from-Images" data-toc-modified-id="Crop-Faces-from-Images-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Crop Faces from Images</a></span><ul class="toc-item"><li><span><a href="#Remove-faulty-images" data-toc-modified-id="Remove-faulty-images-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Remove faulty images</a></span></li></ul></li><li><span><a href="#Train-Test-Split-and-Save-Images" data-toc-modified-id="Train-Test-Split-and-Save-Images-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train Test Split and Save Images</a></span></li><li><span><a href="#Export-data-and-labels-as-arrays" data-toc-modified-id="Export-data-and-labels-as-arrays-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Export data and labels as arrays</a></span><ul class="toc-item"><li><span><a href="#Training-Data" data-toc-modified-id="Training-Data-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Training Data</a></span></li><li><span><a href="#Test-Data" data-toc-modified-id="Test-Data-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Test Data</a></span></li></ul></li></ul></div>

# Setup and Overview

In [1]:
# load requirements

import pandas as pd
import cv2
import matplotlib.pyplot as plt
from face_detector import YoloDetector
import numpy as np
from PIL import Image
from numpy import asarray, load, savez_compressed
import time
from sklearn.model_selection import train_test_split
import warnings

In [2]:
# Root folder for all inputs and outputs of this notebook.
base_path = "path/"

# Read the attribute table that belongs to the images.
df = pd.read_csv(f"{base_path}labels_inmates_complete_scored.csv")

# Number of rows in the attribute table.
df.shape[0]

60563

In [3]:
# Preview the first five rows of the attribute table.
df.head(n=5)

Unnamed: 0,id,weight,height,sex,hair,race,age,BMI,bmi_class,health_score
0,A00147,83.91452,170.18,1,Brown,White,34,28.974775,Overweight,479
1,A00220,70.30676,185.42,1,Black,Black,59,20.449558,Healthy weight,501
2,A00360,75.749864,175.26,1,Gray or Partially Gray,White,42,24.661316,Healthy weight,511
3,A00367,111.13004,182.88,1,Black,Black,63,33.227605,Class 1 Obesity,449
4,A01054,75.296272,170.18,1,Salt and Pepper,Black,34,25.998988,Overweight,494


# Crop Faces from Images

**Do not run this section if the arrays already exist.** Cropping takes more than 100 minutes for 60,000 images.

In [4]:
# Collect the image ids of the dataframe in a plain Python list.
idx = df["id"].tolist()

# Number of image ids collected.
len(idx)

60563

In [5]:
# Look at the first five image ids.
idx[0:5]

['A00147', 'A00220', 'A00360', 'A00367', 'A01054']

In [None]:
# Face detector used to crop the faces; it runs on the CPU.
# Re-run this cell if it prints a warning (a second run may be needed
# until no warning is shown).
model = YoloDetector(device="cpu")

In [7]:
# define function that crops the face from every image and also returns the ids of faulty images


# crop the face from each image; images without a usable face are collected separately
def crop_face(img_idx):
    """Crop the first detected face from every image and resize it to 256 x 256.

    Uses the notebook-level ``model`` (face detector) and ``base_path``.

    Parameters
    ----------
    img_idx : list of str
        Image ids. Each image is read from
        ``base_path + "front/front/" + image_id + ".jpg"``.

    Returns
    -------
    images : numpy.ndarray
        All successfully cropped and resized faces as one array.
    img_ids : list
        Ids of the successfully cropped images, in the same order as ``images``.
    no_face : list
        Ids of the images for which no face could be cropped.
    """
    # output containers
    images = []
    img_ids = []
    no_face = []
    total = len(img_idx)

    # Silence the "--img-size" warnings only while this function runs.
    # catch_warnings() restores the caller's warning filters on exit, whereas
    # warnings.resetwarnings() would wipe every filter that was configured before.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*--img-size.*")

        for i, image_id in enumerate(img_idx):

            # Read the input image; the context manager closes the file handle.
            # Converting to RGB gives every crop three channels so that all crops
            # can be stacked into a single array at the end.
            with Image.open(base_path + "front/front/" + image_id + ".jpg") as img_file:
                orgimg = np.array(img_file.convert("RGB"))

            try:
                bboxes = model.predict(orgimg)[0]
                # first bounding box returned for this image
                # (raises IndexError when the detector returned no box)
                box = bboxes[0][0]
                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
                # crop image
                c_img = orgimg[y1:y2, x1:x2]
                # A degenerate box yields an empty crop; report it as a faulty
                # image instead of handing it to cv2.resize.
                if c_img.size == 0:
                    raise ValueError("empty crop")
                # resize image to 256 * 256
                c_img = cv2.resize(c_img, (256, 256))
                # remember the id of the successfully cropped image
                img_ids.append(image_id)
                # collect the crop to later export all crops as one np.array
                images.append(c_img)

            except IndexError:
                print("Not Face detected, check image! ID: " + str(image_id))
                no_face.append(image_id)

            except ValueError:
                print("Error with Image, check image! ID: " + str(image_id))
                no_face.append(image_id)

            # print progress every 100 images
            if i % 100 == 0:
                print(i, "/", total)

    # return all lists created in the function (images are returned as np.array)
    return asarray(images), img_ids, no_face

In [None]:
# Crop all images (only the list of image ids is needed) and time the run.
start = time.time()
faces, img_ids, no_face = crop_face(idx)
elapsed_time = time.time() - start

# Split the elapsed seconds into whole minutes and remaining seconds.
minutes, seconds = divmod(elapsed_time, 60)
print(f"Elapsed time: {int(minutes)} minutes and {round(seconds, 2)} seconds")

In [9]:
# Write the array with all cropped faces to a compressed .npz file.
savez_compressed(f"{base_path}data_arrays/faces_256.npz", faces)

In [10]:
# Print the shape of the first cropped face (prints nothing if there are no faces).
for first_face in faces[:1]:
    print(first_face.shape)

(256, 256, 3)


## Remove faulty images

In [13]:
# remove data of faulty images from metadata

print("Size before cleaning:", len(df))

# Filter out all faulty ids with a single boolean mask. Dropping them one by
# one in a loop scans and copies the whole dataframe once per faulty id.
df = df[~df['id'].isin(no_face)]

print("Size after cleaning:", len(df))

Size before cleaning: 60563
Size after cleaning: 59852


In [14]:
# Store the cleaned metadata as CSV (without the dataframe index).
df.to_csv(f"{base_path}labels/labels_inmates_complete_cleaned.csv", index=False)

In [15]:
# Final shape of the face array; the first entry should equal the
# "Size after cleaning" printed in the previous section.
np.shape(faces)

(59852, 256, 256, 3)

# Train Test Split and Save Images

In [16]:
# Train/test split of the face images together with their labels.

# The complete (cleaned) metadata table serves as the label frame.
y = df

# 80 % training / 20 % test, stratified on the combination of BMI class and sex.
train_images, test_images, train_labels, test_labels = train_test_split(
    faces,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y[['bmi_class', 'sex']],
)

# Report the shape of every split.
for caption, split_part in (
    ('Train images shape:', train_images),
    ('Train labels shape:', train_labels),
    ('Test images shape:', test_images),
    ('Test labels shape:', test_labels),
):
    print(caption, split_part.shape)

# Show the label rows of the test split.
test_labels


Train images shape: (47881, 256, 256, 3)
Train labels shape: (47881, 10)
Test images shape: (11971, 256, 256, 3)
Test labels shape: (11971, 10)


Unnamed: 0,id,weight,height,sex,hair,race,age,BMI,bmi_class,health_score
17444,M10305,108.862080,180.34,1,Black,Black,27,33.472835,Class 1 Obesity,455
45399,R89181,87.543256,170.18,0,Brown,Black,28,30.227739,Class 1 Obesity,517
41862,R60277,111.130040,177.80,1,Black,Black,29,35.153450,Class 2 Obesity,445
50047,X57499,90.718400,180.34,1,Black,Black,51,27.894029,Overweight,497
11383,K61421,77.110640,170.18,1,Black,Black,36,26.625469,Overweight,493
...,...,...,...,...,...,...,...,...,...,...
3574,B33552,70.760352,177.80,1,Black,Black,20,22.383421,Healthy weight,497
58457,Y22831,78.471416,185.42,1,Black,Black,28,22.824346,Healthy weight,512
30011,M55074,81.646560,175.26,1,Black,White,20,26.581059,Overweight,481
42031,R61244,84.368112,172.72,1,Black,Black,24,28.280890,Overweight,474


In [17]:
# Write the label frames of both splits to CSV (without the dataframe index).
train_labels.to_csv(f"{base_path}labels/train_labels.csv", index=False)
test_labels.to_csv(f"{base_path}labels/test_labels.csv", index=False)

# Export data and labels as arrays

## Training Data

In [18]:
# Pull the three training targets out of the label frame as numpy arrays.
bmi_train, bmi_class_train, hs_train = (
    train_labels[column].to_numpy()
    for column in ('BMI', 'bmi_class', 'health_score')
)

# Each array should hold one value per training image.
print("BMI training set: ", bmi_train.shape)
print("BMI Class training set: ", bmi_class_train.shape)
print("HS training set: ", hs_train.shape)

BMI training set:  (47881,)
BMI Class training set:  (47881,)
HS training set:  (47881,)


In [19]:
# Save the training images and the training targets as compressed .npz files.
for file_name, array in (
    ('data_arrays/train_images.npz', train_images),
    ('data_arrays/bmi_train.npz', bmi_train),
    ('data_arrays/bmi_class_train.npz', bmi_class_train),
    ('data_arrays/hs_train.npz', hs_train),
):
    savez_compressed(base_path + file_name, array)

## Test Data

In [20]:
# Pull the three test targets out of the label frame as numpy arrays.
bmi_test, bmi_class_test, hs_test = (
    test_labels[column].to_numpy()
    for column in ('BMI', 'bmi_class', 'health_score')
)

# Each array should hold one value per test image.
print("BMI test set: ", bmi_test.shape)
print("BMI Class test set: ", bmi_class_test.shape)
print("HS test set: ", hs_test.shape)

BMI test set:  (11971,)
BMI Class test set:  (11971,)
HS test set:  (11971,)


In [21]:
# Save the test images and the test targets as compressed .npz files.
for file_name, array in (
    ('data_arrays/test_images.npz', test_images),
    ('data_arrays/bmi_test.npz', bmi_test),
    ('data_arrays/bmi_class_test.npz', bmi_class_test),
    ('data_arrays/hs_test.npz', hs_test),
):
    savez_compressed(base_path + file_name, array)