# Data Pre-Processing
* Training set : 80%
* Validation set : 20%
* Testing set : not created in this notebook (the test split is commented out in the code below)

**Stratified sampling procedure**: the class proportions are very different from one another and must be preserved in every split in order to avoid biased predictions.

In [1]:
import os
from shutil import copyfile
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

Parameters setting:

In [2]:
# Name of the directory holding the full dataset, and the split
# sub-directories that are created and populated further down.
dataset_dir = 'full_dataset'
sub_dir_s = ['training', 'validation']

# Splitting proportions: 80% training, 20% validation.
# No separate testing split is produced (it is disabled throughout the notebook).
train = 0.8
val = 0.2

# Class labels; each one is also the name of a class sub-directory.
labels = [
    'Apple',
    'Blueberry',
    'Cherry',
    'Corn',
    'Grape',
    'Orange',
    'Peach',
    'Pepper',
    'Potato',
    'Raspberry',
    'Soybean',
    'Squash',
    'Strawberry',
    'Tomato',
]

Directories organization:

In [3]:
# Create one directory per split under the current working directory and,
# inside each of them, one directory per class label.
path = os.getcwd()

for sub in sub_dir_s:
    # full path of the split directory
    name = os.path.join(path, sub)

    # the split directory first, then its class directories (mkdir needs the parent)
    for target in [name] + [os.path.join(name, label) for label in labels]:
        # An existing directory is not an error: report it as such so that
        # re-running the cell does not print a misleading "failed" for every path.
        if os.path.isdir(target):
            print ("Directory %s already exists" % target)
            continue

        try:
            os.mkdir(target)
        except OSError:
            print ("Creation of the directory %s failed" % target)
        else:
            print ("Successfully created the directory %s " % target)

Successfully created the directory /Users/mirkousuelli/OneDrive - Politecnico di Milano/GitHub/an2dl-homeworks/training_validation_testing/mirko/training 
Successfully created the directory /Users/mirkousuelli/OneDrive - Politecnico di Milano/GitHub/an2dl-homeworks/training_validation_testing/mirko/training/Apple 
Successfully created the directory /Users/mirkousuelli/OneDrive - Politecnico di Milano/GitHub/an2dl-homeworks/training_validation_testing/mirko/training/Blueberry 
Successfully created the directory /Users/mirkousuelli/OneDrive - Politecnico di Milano/GitHub/an2dl-homeworks/training_validation_testing/mirko/training/Cherry 
Successfully created the directory /Users/mirkousuelli/OneDrive - Politecnico di Milano/GitHub/an2dl-homeworks/training_validation_testing/mirko/training/Corn 
Successfully created the directory /Users/mirkousuelli/OneDrive - Politecnico di Milano/GitHub/an2dl-homeworks/training_validation_testing/mirko/training/Grape 
Successfully created the directory /

Splitting procedure:

In [4]:
# stratified sampling procedure
def stratified_sampling(labels, dataset_path, train_p, val_p, seed=None, root='../../'):
    """Split the files of every class into a training and a validation list.

    Each class is shuffled and split on its own, so both splits keep the
    class proportions found in the dataset directory.

    Parameters
    ----------
    labels : iterable of str
        Class labels; each one names a sub-directory of the dataset directory.
    dataset_path : str
        Dataset directory, resolved relative to ``root``.
    train_p : float
        Fraction of each class assigned to training. The per-class count is
        ``int(train_p * n)``, i.e. truncated towards zero.
    val_p : float
        Validation fraction. Accepted for interface compatibility only:
        validation always receives every file not assigned to training, so
        the value matches the outcome only when ``train_p + val_p == 1``.
    seed : int, optional
        When given, a dedicated ``random.Random(seed)`` is used so the split
        is reproducible. When ``None`` the module-level ``random`` state is
        used, as before.
    root : str, optional
        Directory that ``dataset_path`` is relative to. Defaults to two
        levels above the current working directory.

    Returns
    -------
    tuple of (list of list of str, list of list of str)
        ``(train_set, val_set)``; element ``i`` of each holds the file names
        (not full paths) selected for the i-th label.

    Raises
    ------
    OSError
        If a class directory cannot be listed (``FileNotFoundError`` when it
        does not exist).
    """
    # dedicated generator when a seed is requested, global state otherwise
    rng = random if seed is None else random.Random(seed)

    # return lists, one entry per class
    train_set = []
    val_set = []

    for label in labels:
        class_dir = os.path.join(root, dataset_path, label)

        # Every non-directory entry counts as a sample. Sorting first makes the
        # shuffle depend only on the generator state, not on listing order.
        with os.scandir(class_dir) as entries:
            class_imgs = sorted(entry.name for entry in entries if not entry.is_dir())

        rng.shuffle(class_imgs)

        # first part goes to training, the whole remainder to validation
        n_train = int(train_p * len(class_imgs))
        train_set.append(class_imgs[:n_train])
        val_set.append(class_imgs[n_train:])

    return train_set, val_set

Populating the directories:

In [5]:
# Fixed seed so that re-running this cell draws the same split again. Without
# it, a second run copies a different random split on top of the files already
# present, and the same image can end up in both training and validation.
# NOTE(review): this also relies on the dataset directory being listed in the
# same order on every run.
random.seed(42)

# stratified sampling: one list of file names per class, for each split
train_set, val_set = stratified_sampling(labels, dataset_dir, train_p=train, val_p=val)

# getting current working directory
path = os.getcwd()

# split directory -> per-class file lists (same order as `labels`)
splits = {'training': train_set, 'validation': val_set}

# operative directories (training, validation)
for sub in sub_dir_s:
    # an unknown sub-directory raises KeyError instead of silently reusing a split
    for label, imgs in zip(labels, splits[sub]):
        # source path taking the full dataset from the root
        src_path = '../../' + dataset_dir + '/' + label + '/'

        # destination path taking the target sub directory
        dst_path = path + '/' + sub + '/' + label + '/'

        # copying each image to the new directory
        for img in imgs:
            copyfile(src_path + img, dst_path + img)

Checking numbers:

In [6]:
def count_samples_classes(labels, dir_path):
    """Count the files found directly inside each class directory.

    For every label, the directory ``<dir_path>/<label>/`` is listed (top
    level only) and the number of files in it is recorded.

    Returns a list of ``(label, count)`` tuples in the order of ``labels``.
    """
    counters = []

    for label in labels:
        # first item yielded by os.walk describes the class directory itself
        _, _, files = next(os.walk('{}/{}/'.format(dir_path, label)))
        counters.append((label, len(files)))

    return counters

In [7]:
# current path
path = os.getcwd() + '/'

# (heading, directory) pairs, reported in this order;
# the testing set is not produced by this notebook, so it is left out
reports = [
    ('Full dataset:', '../../' + dataset_dir),
    ('\nTraining set:', path + 'training'),
    ('\nValidation set:', path + 'validation'),
]

# per-class sample counts of each directory
for heading, directory in reports:
    print(heading)
    print(count_samples_classes(labels, directory))

Full dataset:
[('Apple', 988), ('Blueberry', 467), ('Cherry', 583), ('Corn', 1206), ('Grape', 1458), ('Orange', 1748), ('Peach', 977), ('Pepper', 765), ('Potato', 716), ('Raspberry', 264), ('Soybean', 1616), ('Squash', 574), ('Strawberry', 673), ('Tomato', 5693)]

Training set:
[('Apple', 790), ('Blueberry', 373), ('Cherry', 466), ('Corn', 964), ('Grape', 1166), ('Orange', 1398), ('Peach', 781), ('Pepper', 612), ('Potato', 572), ('Raspberry', 211), ('Soybean', 1292), ('Squash', 459), ('Strawberry', 538), ('Tomato', 4554)]

Validation set:
[('Apple', 198), ('Blueberry', 94), ('Cherry', 117), ('Corn', 242), ('Grape', 292), ('Orange', 350), ('Peach', 196), ('Pepper', 153), ('Potato', 144), ('Raspberry', 53), ('Soybean', 324), ('Squash', 115), ('Strawberry', 135), ('Tomato', 1139)]
