# Directories organiser
This notebook shows how I set up the directory structure required by the `ImageDataGenerator`s used to train the model behind my painting-style recognition [app](http://www.nicolascontreras.tech/portfolio/painting_reco) (cf. the modeling notebook).

# 1. Imports

In [1]:
import numpy as np
import pandas as pd
import os

from tqdm.notebook import tqdm
tqdm.pandas()

from glob import glob

  from pandas import Panel


In [2]:
def print_shape(df):
    """Print the number of rows and columns of ``df``.

    Parameters
    ----------
    df : object exposing a two-element (or longer) ``shape``
        In this notebook, a ``pandas.DataFrame``.

    The row count is printed with a thousands separator and the column
    count is followed by a blank line.
    """
    shape = df.shape
    print(f'Number of rows:    {shape[0]:,}')
    print(f'Number of columns: {shape[1]}\n')

# 2. Set up the directory structure

In [3]:
# Root folder that holds the csv file and the image folders
DATA_DIR = 'data'

# Fraction of each style's images that is sent to the training folder
train_size = .8

# Source folder (all images) and the two destination folders
IMG_DIR, TRAIN_DIR, VALID_DIR = (
    os.path.join(DATA_DIR, sub_dir) for sub_dir in ('img', 'train', 'valid')
)

# Create the destination folders the first time the notebook is run
for split_dir in (TRAIN_DIR, VALID_DIR):
    if not os.path.isdir(split_dir):
        os.mkdir(split_dir)

# File names (folder stripped) of the jpg images currently in the source folder
filenames = [os.path.basename(path) for path in glob(os.path.join(IMG_DIR, '*.jpg'))]

# csv file that is read into a DataFrame in the next section
filepath = os.path.join(DATA_DIR, 'all_data_info.csv')

# 3. Select a subset of images

In [37]:
# Images larger than this (in MB) are dropped from the dataset
MAX_SIZE_MB = 2

# Read csv into a pandas.DataFrame
df = pd.read_csv(filepath)
print('Raw DataFrame shape')
print_shape(df)

# Keep only the rows whose image is present in the source folder.
# `.copy()` makes the result an independent frame, so the column
# assignments below cannot raise pandas' SettingWithCopyWarning.
print('\nFiltered DataFrame shape')
df = df.loc[df['new_filename'].isin(filenames)].copy()
print_shape(df)

# Add path & size columns (size is expressed in MB)
df['path'] = df['new_filename'].apply(lambda filename: os.path.join(IMG_DIR, filename))
df['size'] = df['path'].apply(lambda path: os.path.getsize(path) / 1024 ** 2)

# Only keep images with size <= MAX_SIZE_MB
print('\nFinal DataFrame shape')
df = df.loc[df['size'] <= MAX_SIZE_MB, :].copy()
print_shape(df)

# Remove "(text)" from style.
# The pattern is a regular expression, so `regex=True` is passed explicitly:
# since pandas 2.0 the default is `regex=False`, which would look for the
# pattern as literal text and leave every style unchanged.
df['style'] = df['style'].str.replace(r'\s\(.+\)', '', regex=True)

Raw DataFrame shape
Number of rows:    103,250
Number of columns: 12


Filtered DataFrame shape
Number of rows:    79,433
Number of columns: 12


Final DataFrame shape
Number of rows:    76,317
Number of columns: 14



# 4. Distribute images between train and validation directories

In [38]:
# Keep at most MAX_PER_STYLE paintings by style, chosen at random,
# for each of the main styles
MIN_PER_STYLE = 100    # styles with fewer images than this are ignored
MAX_PER_STYLE = 1000   # bigger styles are randomly trimmed down to this many images

## Count images / style
style_counts = df['style'].value_counts()

## Keep only styles with at least MIN_PER_STYLE images
styles = style_counts[style_counts >= MIN_PER_STYLE].index.tolist()

for style in tqdm(styles):

    ## Shuffle the style's file names and keep at most MAX_PER_STYLE of them.
    ## - A dedicated name is used (not `filenames`) so the list of source-folder
    ##   file names built in section 2 is not overwritten; the filtering cell of
    ##   section 3 reads that list and must stay re-runnable.
    ## - Permutation + slice never asks for more items than exist, unlike
    ##   `np.random.choice(..., 1000, replace=False)`, which raises ValueError
    ##   when a style has >= 1000 rows but fewer than 1000 unique file names.
    style_filenames = df.loc[df['style'] == style, 'new_filename'].unique()
    style_filenames = np.random.permutation(style_filenames)[:MAX_PER_STYLE]

    ## Split the shuffled file names: first `train_size` share -> train, rest -> valid
    cut = round(len(style_filenames) * train_size)
    splits = [
        (TRAIN_DIR, style_filenames[:cut]),
        (VALID_DIR, style_filenames[cut:]),
    ]

    ## Split filenames between valid and train dirs & subdirs (e.g. train/Impressionism/).
    ## This file structure will be useful to set a tensorflow pipeline.
    for split_dir, split_filenames in splits:

        ## Create the style-specific directory (whitespace in the style -> "_")
        style_dir = os.path.join(split_dir, '_'.join(style.split()))
        os.makedirs(style_dir, exist_ok=True)

        ## Move selected images to their destination directory
        for filename in split_filenames:
            src = os.path.join(IMG_DIR, filename)
            dst = os.path.join(style_dir, filename)
            os.rename(src, dst)

print(f"Number of training images         = {len(glob(os.path.join(TRAIN_DIR, '*', '*.jpg'))):,}")
print(f"Number of validation images       = {len(glob(os.path.join(VALID_DIR, '*', '*.jpg'))):,}")
print(f"Number of images in source folder = {len(glob(os.path.join(IMG_DIR, '*.jpg'))):,}")

HBox(children=(FloatProgress(value=0.0, max=69.0), HTML(value='')))


Number of training images         = 27,598
Number of validation images       = 6,902
Number of images in source folder = 44,933


# 5. Put selected images back into their original directory
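⚠️ Run this cell only after the model has been trained. It moves every image from the train and validation directories back to the source directory, so that section 4 can be re-run to generate a new train set / validation set pair and re-train the model.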

In [8]:
# Move every train & valid image back to the source folder.
# os.path.basename extracts the file name using the platform's own path
# separator; splitting on a hard-coded '/' returns the whole path on Windows,
# where glob yields backslash-separated paths.
for split_dir in (TRAIN_DIR, VALID_DIR):
    for src in glob(os.path.join(split_dir, '*', '*.jpg')):
        dst = os.path.join(IMG_DIR, os.path.basename(src))
        os.rename(src, dst)

print(f"Number of training images         = {len(glob(os.path.join(TRAIN_DIR, '*', '*.jpg'))):,}")
print(f"Number of validation images       = {len(glob(os.path.join(VALID_DIR, '*', '*.jpg'))):,}")
print(f"Number of images in source folder = {len(glob(os.path.join(IMG_DIR, '*.jpg'))):,}")

Number of training images         = 0
Number of validation images       = 0
Number of images in source folder = 79,433
