## Environment settings

### Libraries

In [None]:
import os
import random

from shutil import copyfile
from shutil import move

### Random seed

In [None]:
# Single seed value reused wherever randomness is involved, for reproducibility
SEED = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects processes spawned later -- confirm.
os.environ['PYTHONHASHSEED'] = '{}'.format(SEED)

# Seed Python's built-in pseudo-random generator
random.seed(SEED)

# Data Pre-Processing
* Training set : 60%
* Validation set : 20%
* Testing set : 20%

**Stratified sampling procedure**: the class proportions are very different from one another and must be preserved in every split in order to avoid biased predictions.

Parameters setting:

In [None]:
# Name of the original dataset folder; it must sit in the same directory as this notebook
dataset_dir = 'leaf_dataset'

# Names of the folders that will hold each split
sub_dir_s = ['training', 'validation', 'testing']

# Fraction of every class assigned to each split (60% / 20% / 20%)
train, val, test = 0.6, 0.2, 0.2

# Class names, one dataset sub-folder per entry
labels = [
    'Apple',
    'Blueberry',
    'Cherry',
    'Corn',
    'Grape',
    'Orange',
    'Peach',
    'Pepper',
    'Potato',
    'Raspberry',
    'Soybean',
    'Squash',
    'Strawberry',
    'Tomato',
]

Directories organization:

In [None]:
# Getting current working directory
path = os.getcwd()


def _make_dir(dir_path):
    """Create the directory ``dir_path`` and print the outcome.

    An already existing directory is reported as such instead of as a
    failure, so re-running the cell does not print misleading errors.
    Any other ``OSError`` is reported and swallowed (best effort), so the
    remaining directories are still attempted.
    """
    try:
        os.mkdir(dir_path)
    except FileExistsError:
        if os.path.isdir(dir_path):
            print ("Directory %s already exists" % dir_path)
        else:
            # Something that is not a directory occupies the path
            print ("Creation of the directory %s failed" % dir_path)
    except OSError:
        print ("Creation of the directory %s failed" % dir_path)
    else:
        print ("Successfully created the directory %s " % dir_path)


# Operative directories (training, validation, testing)
for sub in sub_dir_s:
    # Full path of the split directory
    name = os.path.join(path, sub)
    _make_dir(name)

    # One sub-directory per label (leaf category) inside the split directory
    for label in labels:
        class_name = os.path.join(name, label)
        _make_dir(class_name)

Splitting procedure:

In [None]:
# Stratified sampling procedure
def stratified_sampling(labels, dataset_path, train_p, val_p):
    """Split the files of every class into train / validation / test lists.

    Each class is shuffled and split independently, so every class keeps
    the same proportions in the three resulting sets.

    params:
    - labels : names of the classes to consider; each one must be a
      sub-directory of the dataset directory
    - dataset_path : dataset directory, resolved against the current
      working directory
    - train_p : proportion of each class assigned to the training set
    - val_p : proportion of each class assigned to the validation set;
      whatever remains after train and validation goes to the test set

    returns:
    - (train_set, val_set, test_set) : three lists with one entry per
      label (same order as ``labels``), each entry being the list of file
      names of that class assigned to the set

    raises:
    - FileNotFoundError : if the directory of a class does not exist
    """
    # Return lists
    train_set = []
    val_set = []
    test_set = []

    # For each class
    for label in labels:
        class_dir = os.path.join(os.getcwd(), dataset_path, label)
        if not os.path.isdir(class_dir):
            raise FileNotFoundError("Class directory not found: %s" % class_dir)

        # Files directly inside the class directory. They are sorted first
        # because the listing order depends on the filesystem; without it
        # the same seed could give a different split on another machine.
        class_imgs = sorted(next(os.walk(class_dir))[2])

        # Shuffling (uses the global `random` state, seed it beforehand)
        random.shuffle(class_imgs)

        # Split boundaries, truncated towards zero
        class_len = len(class_imgs)
        train_end = int(train_p * class_len)
        val_end = int((train_p + val_p) * class_len)

        # Append the slices at the index of the current class
        train_set.append(class_imgs[:train_end])
        val_set.append(class_imgs[train_end:val_end])
        test_set.append(class_imgs[val_end:])

    return train_set, val_set, test_set

Populating the directories:

In [None]:
# Stratified sampling
train_set, val_set, test_set = stratified_sampling(labels, dataset_dir, train_p=train, val_p=val)

# Getting current working directory
path = os.getcwd()

# Per-class file lists of each split, keyed by the split directory name.
# (A plain dict is used instead of rebinding the builtin name `list`, which
# would break any later `list(...)` call in the notebook.)
split_sets = {'training': train_set, 'validation': val_set}

# Operative directories (training, validation, testing)
for sub in sub_dir_s:
    # Any directory other than training/validation receives the test split
    class_files = split_sets.get(sub, test_set)

    # The split lists hold one entry per label, in the order of `labels`
    for label, imgs in zip(labels, class_files):
        # Source: the class folder of the full dataset
        src_path = os.path.join(dataset_dir, label)

        # Destination: the class folder inside the target split directory
        dst_path = os.path.join(path, sub, label)

        # Copying each image to the new directory
        for img in imgs:
            copyfile(os.path.join(src_path, img), os.path.join(dst_path, img))

Checking numbers:

In [None]:
def count_samples_classes(labels, dir_path):
    """Count the files stored in each class directory.

    params:
    - labels : names of the classes; each one must be a sub-directory of
      ``dir_path``
    - dir_path : directory containing one sub-directory per class

    returns:
    - list of ``(label, count)`` tuples in the order of ``labels``, where
      ``count`` is the number of files directly inside the class directory
      (nested directories and their content are not counted)

    raises:
    - FileNotFoundError : if the directory of a class does not exist
    """
    # Counters list
    counters = []

    # For each class
    for label in labels:
        class_dir = os.path.join(dir_path, label)
        if not os.path.isdir(class_dir):
            raise FileNotFoundError("Class directory not found: %s" % class_dir)

        # Files directly inside the class directory
        class_samples = next(os.walk(class_dir))[2]

        # Storing the counter bound with the class target
        counters.append((label, len(class_samples)))

    return counters

In [None]:
# Working directory, with a trailing separator so folder names can be appended
path = os.getcwd() + '/'

# Heading to print and folder to inspect, in display order
report_sections = (
    ('Original Dataset:', dataset_dir),
    ('\nTraining set:', 'training'),
    ('\nValidation set:', 'validation'),
    ('\nTesting set:', 'testing'),
)

# Print the per-class sample counts of the full dataset and of each split
for heading, folder in report_sections:
    print(heading)
    print(count_samples_classes(labels, path + folder))

Creating the split dataset folder:

In [None]:
target = 'leaf_dataset_splitted' # Created in the same directory of this notebook

# Create the container folder unless it is already there
os.makedirs(target, exist_ok=True)

split_names = ('training', 'validation', 'testing')

# shutil.move places the source *inside* a destination that is an existing
# directory, so a re-run would silently produce e.g. target/training/training.
# Check every destination before moving anything, so a failure leaves the
# three split folders untouched.
for split_name in split_names:
    destination = os.path.join(target, split_name)
    if os.path.exists(destination):
        raise FileExistsError(
            "Destination %s already exists; remove it before moving the splits" % destination
        )

# Move the split folders into the container folder
for split_name in split_names:
    move(split_name, os.path.join(target, split_name))