In [2]:
import shutil
import os
import numpy as np
import cv2
import math

In [3]:
# Side length, in pixels, of the square test pictures: split_dataset passes it
# as final_width to get_center_square_subpicture before saving each test image.
test_image_w = 175
def get_files_from_folder(path):
    """Return the names of the regular files directly inside ``path``.

    Sub-directories (for example Jupyter's ``.ipynb_checkpoints``) are left
    out: the function promises files, and a directory can neither be copied
    with ``shutil.copy`` nor decoded as a picture. The names are sorted so the
    result does not depend on the arbitrary order in which ``os.listdir``
    reports directory entries.

    Args:
        path: Directory to list.

    Returns:
        A 1-D NumPy string array of file names (not full paths), sorted
        alphabetically. The array is empty when ``path`` holds no files.

    Raises:
        OSError: If ``path`` cannot be listed (propagated from ``os.listdir``).
    """
    files = sorted(
        name
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )
    # An explicit string dtype keeps an empty result a string array instead of
    # NumPy's default float64.
    return np.asarray(files, dtype=str)

def get_center_square_subpicture(original_image, final_width:int):
    """Crop the largest centered square out of an image and resize it.

    Args:
        original_image: Image as a NumPy array whose first two axes are
            height and width. Both 2-D (single-channel) and 3-D
            (multi-channel) arrays are accepted.
        final_width: Side length, in pixels, of the returned square image.

    Returns:
        The center crop resized to ``final_width`` x ``final_width`` using
        ``cv2.INTER_AREA`` interpolation.

    Raises:
        ValueError: If ``original_image`` is ``None``, which is what
            ``cv2.imread`` returns for a file it cannot read or decode.
    """
    if original_image is None:
        raise ValueError("original_image is None; the picture could not be loaded")

    input_h, input_w = original_image.shape[:2]
    square_width = min(input_w, input_h)
    # The shorter side fills the square; the surplus of the longer side is
    # trimmed equally from both ends (so one of the two offsets is always 0).
    offset_height = (input_h - square_width) // 2
    offset_width = (input_w - square_width) // 2

    # Index only the two spatial axes so grayscale (2-D) arrays work as well;
    # a channel axis, when present, is carried along untouched.
    image = original_image[
        offset_height:offset_height + square_width,
        offset_width:offset_width + square_width,
    ]

    new_dim = (final_width, final_width)
    return cv2.resize(image, new_dim, interpolation=cv2.INTER_AREA)

def split_dataset(path_to_data, path_to_train_data, path_to_test_data, train_ratio, seed=None):
    """Randomly split a folder-per-class picture collection into train and test sets.

    ``path_to_data`` is expected to hold one sub-directory per class, each
    named with its integer label. For every class, a random ``train_ratio``
    share of the files is copied unchanged to ``path_to_train_data/<class>``.
    The remaining files are loaded with OpenCV, center-cropped and resized to
    ``test_image_w`` pixels, and written as ``<class>_<index>.jpg`` to
    ``path_to_test_data/0`` (labels <= 3) or ``path_to_test_data/1``
    (labels > 3).

    Both output directories are deleted and rebuilt on every call.
    Sub-directories whose name is not an integer, and test pictures OpenCV
    cannot read, are skipped with a printed message.

    Args:
        path_to_data: Directory containing the per-class sub-directories.
        path_to_train_data: Output directory for the training pictures.
        path_to_test_data: Output directory for the test pictures.
        train_ratio: Fraction, within [0, 1], of each class assigned to the
            training set.
        seed: Optional seed for a reproducible split. With ``None`` (the
            default) NumPy's global random state is used.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
        FileNotFoundError: If ``path_to_data`` is not an existing directory.
    """
    # Validate before touching the disk so that bad arguments cannot wipe out
    # an existing split.
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")
    if not os.path.isdir(path_to_data):
        raise FileNotFoundError(f"Data directory not found: {path_to_data}")

    # RandomState offers the same ``choice`` call as the ``np.random`` module,
    # so the unseeded path keeps drawing from the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Only the immediate sub-directories are class folders.
    _, dirs, _ = next(os.walk(path_to_data))

    # Start from clean output directories.
    for output_dir in (path_to_train_data, path_to_test_data):
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    # Sorted so that a seeded run always visits the classes in the same order.
    for class_name in sorted(dirs):
        # Class folders are named by their integer label; anything else
        # (e.g. ".ipynb_checkpoints") is not a class.
        try:
            class_label = int(class_name)
        except ValueError:
            print(f"Skipping non-class directory: {class_name}")
            continue

        path_to_original = os.path.join(path_to_data, class_name)
        files = get_files_from_folder(path_to_original)
        elements = len(files)
        if elements <= 0:
            continue
        # A fixed file order is required for the seed to select the same files.
        files = np.sort(files)

        train_data_number = int(np.round(elements * train_ratio))
        train_index = rng.choice(elements, size=train_data_number, replace=False)
        test_index = np.delete(np.arange(elements), train_index)

        train_dataset = files[train_index]
        test_dataset = files[test_index]

        path_to_save_training = os.path.join(path_to_train_data, class_name)

        # Classes 1-2-3 and 4-5 are grouped, so the test set is stored
        # directly in those two groups.
        test_group = "0" if class_label <= 3 else "1"
        path_to_save_test = os.path.join(path_to_test_data, test_group)

        os.makedirs(path_to_save_training, exist_ok=True)
        os.makedirs(path_to_save_test, exist_ok=True)

        # Training pictures are copied exactly as they are.
        for file_name in train_dataset:
            src = os.path.join(path_to_original, file_name)
            dst = os.path.join(path_to_save_training, file_name)
            shutil.copy(src, dst)

        # Test pictures are cropped, resized and re-encoded as JPEG.
        for j, file_name in enumerate(test_dataset):
            src = os.path.join(path_to_original, file_name)
            img = cv2.imread(src)
            if img is None:
                # cv2.imread signals an unreadable or non-image file with None.
                print(f"Skipping unreadable image: {src}")
                continue
            img = get_center_square_subpicture(img, test_image_w)

            destination = os.path.join(path_to_save_test, f"{class_name}_{j}.jpg")
            cv2.imwrite(destination, img)



In [4]:
# Where the source pictures live and where the two subsets are written.
all_pictures_folder = os.path.join(".", "original_pictures")
train_folder_original = os.path.join(".", "tmp", "train_set")
test_folder_original = os.path.join(".", "dataset", "test")

# Split the pictures into train and test subsets. Re-running this cell wipes
# both output folders and draws a new random 80/20 split.
split_dataset(
    path_to_data=all_pictures_folder,
    path_to_train_data=train_folder_original,
    path_to_test_data=test_folder_original,
    train_ratio=0.8,
)