# New structure dataset
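This notebook restructures our dataset: the UTKFace images are split into training, validation and test folders, each containing one sub-folder per age.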

<br><br><br>
# Libraries

In [195]:
# General libraries
import os
import pickle
import shutil
import sys

import numpy as np
import pandas as pd

# Show the full content of every cell (the image paths are long).
# `None` means "no limit"; the former sentinel -1 is deprecated since
# pandas 1.0 and is no longer accepted by newer pandas releases.
pd.set_option('display.max_colwidth', None)


<br><br><br>
# Global variables

In [196]:
# Both folders live in '<current working directory>/datasets':
#   original_dataset_path -> the dataset as it is read by this notebook
#   new_directory         -> the restructured dataset written by this notebook
original_dataset_path, new_directory = (
    os.path.join(os.getcwd(), 'datasets', folder_name)
    for folder_name in ('UTKface_inthewild', 'UTKface-new-structure')
)

<br><br><br>
# Functions

In [209]:
def extract_all_files(path):
    """
    Collect the paths of all '.jpg' files found anywhere below 'path'.

    The directory tree is walked top-down, sub-folders included. The result
    is a list with one joined path (folder + file name) per image; it is
    empty when no '.jpg' file is found.
    """

    jpg_paths = []

    for folder, _subfolders, file_names in os.walk(path, topdown=True):
        for file_name in file_names:
            full_path = os.path.join(folder, file_name)

            # Only keep the images; the check is case-sensitive.
            if full_path.endswith('.jpg'):
                jpg_paths.append(full_path)

    return jpg_paths



def create_data_frame_with_all_information_of_files(all_files):
    """
    Build a dataframe with the labels that are encoded in each file name.

    The file names are expected to look like
    '<age>_<gender>_<race>_<datetime>.jpg'. File names that do not consist
    of exactly four '_'-separated parts are skipped.

    Parameters
    ----------
    all_files : list of str
        Paths of the image files.

    Returns
    -------
    pandas.DataFrame
        Columns 'age', 'gender', 'race', 'datetime' and 'path' (all strings).
        Every row keeps the position its path had in 'all_files' as index
        label, so skipped files leave gaps in the index.
    """
    columns = ['age', 'gender', 'race', 'datetime', 'path']
    positions, records = [], []

    for position, path in enumerate(all_files):
        # Only the file name may be split on '_': folder names such as
        # 'UTKface_inthewild' contain underscores as well. Both '\\' and '/'
        # are accepted as separator so the parsing works on every OS.
        file_name = os.path.basename(path.replace('\\', '/'))
        parts = file_name.split('_')

        if len(parts) != 4:
            continue

        age, gender, race, timestamp = parts
        # The last part still carries the extension ('...390.jpg').
        records.append((age, gender, race, timestamp.split('.')[0], path))
        positions.append(position)

    return pd.DataFrame(records, columns=columns, index=positions)

<br><br><br>
# Create dataframe

In [210]:
# Collect the paths of all images in our original dataset folder. That folder
# contains 3 subfolders, each holding a part of the dataset.
all_files = extract_all_files(path=original_dataset_path)
print('Lenght of the collected files from our path is: ', len(all_files))

Lenght of the collected files from our path is:  24106


In [211]:
# Parse the labels out of the image paths into one dataframe and cache that
# dataframe on disk, so it can be reloaded later without parsing all the
# paths again.
df = create_data_frame_with_all_information_of_files(all_files=all_files)

display(df.head())


# Cache the dataframe as a pickle file
with open('df.pkl', mode='wb') as pickle_file:
    pickle.dump(df, pickle_file)

Unnamed: 0,age,gender,race,datetime,path
0,100,1,0,20170110183726390,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\100_1_0_20170110183726390.jpg
1,100,1,2,20170105174847679,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\100_1_2_20170105174847679.jpg
2,100,1,2,20170110182836729,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\100_1_2_20170110182836729.jpg
3,101,1,2,20170105174739309,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\101_1_2_20170105174739309.jpg
4,10,0,0,20161220222308131,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\10_0_0_20161220222308131.jpg


In [212]:
# Reload the cached dataframe to check that the pickle round trip works.
with open('df.pkl', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

display(len(df))
df.head()

24103

Unnamed: 0,age,gender,race,datetime,path
0,100,1,0,20170110183726390,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\100_1_0_20170110183726390.jpg
1,100,1,2,20170105174847679,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\100_1_2_20170105174847679.jpg
2,100,1,2,20170110182836729,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\100_1_2_20170110182836729.jpg
3,101,1,2,20170105174739309,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\101_1_2_20170105174739309.jpg
4,10,0,0,20161220222308131,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface_inthewild\part1\10_0_0_20161220222308131.jpg


<br><br><br>
# Create new folder structure

In [203]:
%%time

# Create folders for training, validation and testing
TRAINING_VALIDATION_TESTING_FOLDERS = [
    os.path.join(new_directory, folder)
    for folder in ('training', 'validation', 'test')
]


# os.makedirs(..., exist_ok=True) also creates a missing 'new_directory' and
# does not fail when a folder is already there, so this cell can be re-run.
for new_folder in TRAINING_VALIDATION_TESTING_FOLDERS:
    try:
        os.makedirs(new_folder, exist_ok=True)
    except OSError as error:
        print("Creation of the directory %s failed    \n\n" % new_folder)
        # Show the reason as well (e.g. missing permissions).
        print(error)
    else:
        print("Successfully created the directory %s  \n\n" % new_folder)

Successfully created the directory C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\training  


Successfully created the directory C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\validation  


Successfully created the directory C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\test  


Wall time: 3 ms


In [204]:
%%time

# Creating all categories as folders: every split folder (training,
# validation, test) gets one sub-folder per category '0' ... '116'.
CATEGORIES = list(range(0, 117))
TRAINING_CATEGORIES, VALIDATION_CATEGORIES, TEST_CATEGORIES = [
    [os.path.join(split_folder, str(category)) for category in CATEGORIES]
    for split_folder in TRAINING_VALIDATION_TESTING_FOLDERS
]


for split_name, category_folders in (
    ('TRAINING_CATEGORIES', TRAINING_CATEGORIES),
    ('VALIDATION_CATEGORIES', VALIDATION_CATEGORIES),
    ('TEST_CATEGORIES', TEST_CATEGORIES),
):
    for new_folder in category_folders:
        try:
            # exist_ok=True: folders left over from an earlier run are fine,
            # which keeps this cell re-runnable.
            os.makedirs(new_folder, exist_ok=True)
        except OSError:
            print("Creation of the directory %s failed    \n\n" % new_folder)

    print('Finished with %s' % split_name)

Finished with TRAINING_CATEGORIES
Finished with VALIDATION_CATEGORIES
Finished with TEST_CATEGORIES
Wall time: 334 ms


<br><br><br>
# Split data

In [205]:
# Split the dataframe into train (~70%), validation (~20%) and test (~10%).
# Every row gets one uniform draw from [0, 1); the interval the draw falls in
# decides the subset. The fixed seed makes the split reproducible as long as
# the rows of 'df' are in the same order.
SPLIT_SEED = 42
probs = np.random.RandomState(SPLIT_SEED).rand(len(df))
training_mask = probs < 0.7
validation_mask = (probs >= 0.7) & (probs < 0.9)
test_mask = probs >= 0.9


# .copy() gives every subset its own dataframe, so adding columns to it later
# does not write into a slice of 'df' (SettingWithCopyWarning).
df_training = df[training_mask].copy()
df_validation = df[validation_mask].copy()
df_test = df[test_mask].copy()

# The three masks partition [0, 1), so every row must land in exactly one subset.
assert len(df_training) + len(df_validation) + len(df_test) == len(df)

print('Length training:   ', len(df_training))
print('Length validation: ', len(df_validation))
print('Length test:       ', len(df_test))

Length training:    16824
Length validation:  4869
Length test:        2410


In [206]:
# Sanity check of the split: no image path may end up in more than one subset.
# Every line below should therefore print False (True means a shared path).
training_paths = set(df_training['path'])
validation_paths = set(df_validation['path'])
test_paths = set(df_test['path'])

print('Overlapping paths in training and validation? Answer: ', not training_paths.isdisjoint(validation_paths))
print('Overlapping paths in training and test? Answer:       ', not training_paths.isdisjoint(test_paths))
print('Overlapping paths in validation and test? Answer:     ', not validation_paths.isdisjoint(test_paths))

Overlapping paths in training and validation? Answer:  False
Overlapping paths in training and test? Answer:        False
Overlapping paths in validation and test? Answer:      False


<br><br><br>
# Copy images to the new structure

In [213]:
TRAINING_PATH   = TRAINING_VALIDATION_TESTING_FOLDERS[0]
VALIDATION_PATH = TRAINING_VALIDATION_TESTING_FOLDERS[1]
TEST_PATH       = TRAINING_VALIDATION_TESTING_FOLDERS[2]


def _with_new_path(split_df, split_path):
    """
    Return a copy of 'split_df' with an extra 'new-path' column that holds
    the destination folder of every image: '<split_path>/<age>'.
    """
    new_paths = split_df['age'].apply(lambda age: os.path.join(split_path, str(age)))

    # .assign() builds a new dataframe instead of writing into 'split_df',
    # which may be a slice of another dataframe (SettingWithCopyWarning).
    return split_df.assign(**{'new-path': new_paths})


df_training   = _with_new_path(df_training, TRAINING_PATH)
df_validation = _with_new_path(df_validation, VALIDATION_PATH)
df_test       = _with_new_path(df_test, TEST_PATH)

In [233]:
# Combine the results to one dataframe
def correct_category(path):
    """
    Return the subset ('training', 'validation' or 'test') a path belongs to.

    The path is split into its folder names and the last one that equals a
    subset name wins. Comparing whole folder names (instead of searching for
    the word anywhere in the path) keeps an unrelated parent folder such as
    'deep-learning-training' from being mistaken for a subset. Both '\\' and
    '/' are accepted as separator.

    Returns None when no folder in the path is named after a subset.
    """
    for part in reversed(path.replace('\\', '/').split('/')):
        if part in ('training', 'validation', 'test'):
            return part

    return None
    
# Stack the three subsets again and restore the original row order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df_training, df_validation, df_test]).sort_index()
df['category'] = df['new-path'].apply(correct_category)

# NOTE: from here on 'path' holds the destination folder of the image
# ('new-path'), no longer the path of the original image file.
df['path'] = df['new-path']
df = df.drop(['new-path'], axis=1)
df = df[['age', 'gender', 'race', 'datetime', 'category', 'path']]

display(df.head())


# Pickle the result (this overwrites the 'df.pkl' written earlier)
with open('df.pkl', 'wb') as output_file:
    pickle.dump(df, output_file)

Unnamed: 0,age,gender,race,datetime,category,path
0,100,1,0,20170110183726390,training,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\training\100
1,100,1,2,20170105174847679,training,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\training\100
2,100,1,2,20170110182836729,training,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\training\100
3,101,1,2,20170105174739309,training,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\training\101
4,10,0,0,20161220222308131,training,C:\Users\Paulo\Documents\GitHub\deep-learning-age-estimation\notebooks\datasets\UTKface-new-structure\training\10


In [217]:
%%time 

# Copy every image into the folder stored in 'new-path', first the training
# data, then the validation data and finally the test data. shutil.copy
# duplicates the files: the originals stay where they are.
for split_df in (df_training, df_validation, df_test):
    for source, destination in zip(split_df['path'], split_df['new-path']):
        shutil.copy(source, destination)

Wall time: 2min 6s


In [218]:
# Collect all images below the new directory and count them; comparing this
# number with the length of the dataframe shows whether every image was copied.
all_files = extract_all_files(path=new_directory)
print('Lenght of the collected files from our path is: ', len(all_files))

Lenght of the collected files from our path is:  24103
