# Move Data To Folders

This notebook takes the [food-101](https://www.vision.ee.ethz.ch/datasets_extra/food-101/) dataset and divides up the data as expected for the sandwich or not sandwich problem.
It treats every image inside a folder whose name is listed in `sandwich_dir_names` as a sandwich, and every other image as not a sandwich.

The script will read these images, and divide them up into train/validation/test sets. 

It expects the food-101 dataset to be extracted into the ../data folder.

Start by importing and defining some constants.

In [1]:
import shutil
import os
import random

# Food-101 class folders whose images are labelled "sandwich"; images from every
# other class folder are labelled "not sandwich". The names are matched as
# substrings of each directory path when the files are collected.
sandwich_dir_names = [
    'croque_madame',  # was misspelled 'croque_madam'; only matched by substring luck
    'hamburger',
    'lobster_roll_sandwich',
    'pulled_pork_sandwich',
    'club_sandwich',
    'grilled_cheese_sandwich',
    'hot_dog',
    'tacos',
]

# Seed the global `random` generator so the random.shuffle calls further down
# draw the same sequence on every fresh run.
random.seed(42)

Define the output directories, and create any that do not exist yet.

In [2]:
# Destination folders for the copied images: one per split (train/val/test)
# for each of the two classes, all under ../data/images.
output_directories = [
    '../data/images/train/sandwich',
    '../data/images/val/sandwich',
    '../data/images/test/sandwich',
    '../data/images/train/not_sandwich',
    '../data/images/val/not_sandwich',
    '../data/images/test/not_sandwich',
]

# Create only the folders that are missing. The generator is lazy, so each
# existence check still happens immediately before its own makedirs call.
for missing_directory in (path for path in output_directories
                          if not os.path.isdir(path)):
    os.makedirs(missing_directory)

Divide all of the filenames into "sandwich" or "not sandwich" classes

In [3]:
def collect_image_files(root_dir, sandwich_names):
    """Walk `root_dir` and split the image files found into two lists.

    Parameters
    ----------
    root_dir : str
        Directory to search recursively.
    sandwich_names : iterable of str
        Directory-name fragments; a directory whose path contains any of them
        contributes its images to the "sandwich" list.

    Returns
    -------
    (sandwich, not_sandwich) : tuple of lists
        Each list holds (directory_path, filename) tuples.

    Directories named 'train', 'val' or 'test' are not descended into, so the
    output folders this notebook writes to are never re-read as input on a
    second run. Only files with an image extension are kept, which keeps
    metadata, README/license files and archives out of the "not sandwich"
    class. Directory and file names are visited in sorted order so the result
    does not depend on the filesystem's listing order.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    split_dir_names = ('train', 'val', 'test')

    sandwich = []
    not_sandwich = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Assigning to dirnames[:] prunes the walk in place (top-down mode)
        # and fixes the order in which sub-directories are visited.
        dirnames[:] = sorted(name for name in dirnames
                             if name not in split_dir_names)

        images = [
            (dirpath, filename)
            for filename in sorted(filenames)
            # Skip hidden files (e.g. '._x.jpg' resource forks) and non-images.
            if not filename.startswith('.')
            and filename.lower().endswith(image_extensions)
        ]

        if any(name in dirpath for name in sandwich_names):
            sandwich += images
        else:
            not_sandwich += images
    return sandwich, not_sandwich


sandwich_files, not_sandwich_files = collect_image_files('../data', sandwich_dir_names)

Create the train/val/test indexes. The i-th element of these indexes will determine the set membership of the i-th image. 0 => train, 1 => validation, 2 => test.

In [None]:
train_split = 0.8
val_split = 0.1


def make_split_labels(count, train_fraction, val_fraction):
    """Return a list of `count` split labels in train/val/test order.

    The first int(count * train_fraction) entries are 0 (train), the entries
    up to int(count * (train_fraction + val_fraction)) are 1 (validation), and
    the remainder are 2 (test). The list is NOT shuffled; the caller does that.
    """
    train_end = int(count * train_fraction)
    val_end = int(count * (train_fraction + val_fraction))
    # A negative repeat count yields an empty list, matching an empty range().
    return [0] * train_end + [1] * (val_end - train_end) + [2] * (count - val_end)


sandwich_train_index = make_split_labels(len(sandwich_files), train_split, val_split)

# Use the sandwich count for the not-sandwich class so the dataset is balanced,
# but never ask for more labels than there are not-sandwich files: the copy
# step indexes not_sandwich_files by label position and would otherwise fail.
not_sandwich_count = min(len(sandwich_files), len(not_sandwich_files))
not_sandwich_train_index = make_split_labels(not_sandwich_count, train_split, val_split)

# Shuffling the labels assigns each file to a random split. The not-sandwich
# files are shuffled too, so the subset that gets a label is a random sample
# rather than just the first files found.
# NOTE: these shuffles consume the global random state and reorder
# not_sandwich_files in place, so re-running this cell without re-running the
# seeding and file-collection cells produces a different split.
random.shuffle(sandwich_train_index)
random.shuffle(not_sandwich_train_index)
random.shuffle(not_sandwich_files)

Copy each image into the directory for its assigned split (train/val/test) and class.

In [None]:
def copy_files_to_split_dirs(files, split_labels, class_dir_name):
    """Copy files into ``../data/images/<split>/<class_dir_name>``.

    Parameters
    ----------
    files : list of (source_directory, filename) tuples
        Candidate files for one class.
    split_labels : list of int
        Element i selects the split for files[i]: 0 -> train, 1 -> val,
        any other value -> test.
    class_dir_name : str
        Final path component of the destination ('sandwich' or 'not_sandwich').

    Raises
    ------
    ValueError
        If the destination file already exists (for example from an earlier
        run, or because two source folders contain the same filename). Files
        copied before the collision stay in place.

    The destination directories must already exist; they are not created here.
    """
    split_dir_names = {0: 'train', 1: 'val'}

    # zip stops at the shorter sequence: files beyond the number of labels are
    # intentionally left uncopied (this is how the classes are balanced).
    for (source_directory, filename), split_label in zip(files, split_labels):
        split_dir_name = split_dir_names.get(split_label, 'test')
        output_directory = '../data/images/{}/{}'.format(split_dir_name, class_dir_name)
        source_path = os.path.join(source_directory, filename)
        dest_path = os.path.join(output_directory, filename)

        # Refuse to overwrite: an existing file may belong to a different
        # split, and silently replacing it could leak data between splits.
        if os.path.isfile(dest_path):
            raise ValueError('File already exists: {}'.format(dest_path))

        shutil.copyfile(source_path, dest_path)


copy_files_to_split_dirs(sandwich_files, sandwich_train_index, 'sandwich')
copy_files_to_split_dirs(not_sandwich_files, not_sandwich_train_index, 'not_sandwich')