In [2]:

import shutil
import os
import numpy as np
import cv2


# Edge length, in pixels, of the square centre crop applied to test images.
test_image_w = 416

# Input tree: one sub-folder per class, holding the original pictures.
all_pictures_folder = os.path.join(".", "dataset", "original")

# Output trees written by the train/test split
# (both are wiped and rebuilt whenever the split is run).
test_folder_original = os.path.join(".", "dataset", "test")
train_folder_original = os.path.join(".", "tmp", "train_set")

In [11]:
def get_files_from_folder(path):
    """Return the names of the regular files located directly in ``path``.

    Sub-directories are left out because callers treat every returned entry
    as a file to copy or decode. The names are sorted because ``os.listdir``
    gives no ordering guarantee, which would otherwise make index-based
    selections differ between runs and machines.

    Args:
        path: Directory to list.

    Returns:
        1-D numpy array of file names (not full paths) in ascending order.
        The array is empty when the directory contains no files.

    Raises:
        OSError: If ``path`` cannot be listed (propagated from ``os.listdir``).
    """
    files = sorted(
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    )
    # dtype=str keeps the element type stable even for an empty listing.
    return np.asarray(files, dtype=str)


def center_crop(image,out_height,out_width):
    """Cut a centred window of at most ``out_height`` x ``out_width`` pixels.

    Works on any array whose first two axes are (rows, columns), so both
    2-D and 3-D arrays are accepted; trailing axes are kept untouched.

    If the image is smaller than the requested window along an axis, the
    full extent of that axis is returned instead. Without this clamp the
    offset becomes negative and the slice wraps around, yielding a
    mis-placed sliver of the image.

    Args:
        image: numpy array with at least two dimensions.
        out_height: Requested number of rows.
        out_width: Requested number of columns.

    Returns:
        The cropped region, as a slice of ``image`` (no copy is made).
    """
    input_height, input_width = image.shape[:2]

    # Never ask for more pixels than the image has.
    crop_height = min(out_height, input_height)
    crop_width = min(out_width, input_width)

    offset_height = (input_height - crop_height) // 2
    offset_width = (input_width - crop_width) // 2

    # Only the first two axes are indexed, so any remaining axes pass through.
    return image[offset_height:offset_height + crop_height,
                 offset_width:offset_width + crop_width]

def split_dataset(path_to_data, path_to_train_data, path_to_test_data, train_ratio,
                  seed=None, crop_size=None):
    """Randomly split a folder-per-class image tree into train and test trees.

    For every immediate sub-folder of ``path_to_data`` a random
    ``train_ratio`` share of its files is copied unchanged into the
    same-named sub-folder of ``path_to_train_data``. The remaining files are
    read with OpenCV, centre-cropped to a square and written as ``<j>.jpg``
    into the same-named sub-folder of ``path_to_test_data``.

    WARNING: ``path_to_train_data`` and ``path_to_test_data`` are deleted
    recursively before the split is written.

    Args:
        path_to_data: Source directory containing one sub-folder per class.
        path_to_train_data: Destination root for the training copies.
        path_to_test_data: Destination root for the cropped test images.
        train_ratio: Share of each class assigned to training, within [0, 1].
        seed: Optional integer making the split reproducible. When ``None``
            the global numpy random state is used, as before.
        crop_size: Side length in pixels of the square test crop. When
            ``None`` the module-level ``test_image_w`` is used.

    Raises:
        ValueError: If ``train_ratio`` lies outside [0, 1].
        FileNotFoundError: If ``path_to_data`` is not an existing directory.
    """
    # Validate everything BEFORE the destructive clean-up below, so a bad
    # call cannot wipe an existing split and then fail.
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")
    if not os.path.isdir(path_to_data):
        raise FileNotFoundError(f"Source folder not found: {path_to_data}")

    # get dirs
    _, dirs, _ = next(os.walk(path_to_data))

    rng = np.random if seed is None else np.random.RandomState(seed)
    if crop_size is None:
        crop_size = test_image_w

    #lets clean the directories
    for stale_folder in (path_to_train_data, path_to_test_data):
        if os.path.exists(stale_folder):
            shutil.rmtree(stale_folder)

    # Sorted so that a given seed always yields the same split; the order
    # reported by the file system is not guaranteed.
    for class_name in sorted(dirs):
        path_to_original = os.path.join(path_to_data, class_name)

        # Keep regular files only (a nested folder can be neither copied
        # with shutil.copy nor decoded as an image), in a stable order.
        names = [
            name for name in get_files_from_folder(path_to_original)
            if os.path.isfile(os.path.join(path_to_original, name))
        ]
        files = np.asarray(sorted(names), dtype=str)
        elements = len(files)
        if elements <= 0:
            continue
        train_data_number = int(np.round(elements * train_ratio))

        train_index = rng.choice(elements, size=train_data_number, replace=False)
        test_index = np.delete(np.arange(elements), train_index)

        train_dataset = files[train_index]
        test_dataset = files[test_index]

        path_to_save_training = os.path.join(path_to_train_data, class_name)
        path_to_save_test = os.path.join(path_to_test_data, class_name)

        #eventually creates dir
        os.makedirs(path_to_save_training, exist_ok=True)
        os.makedirs(path_to_save_test, exist_ok=True)

        # copy training data
        for name in train_dataset:
            src = os.path.join(path_to_original, name)
            dst = os.path.join(path_to_save_training, name)
            shutil.copy(src, dst)

        # copy test data
        for j, name in enumerate(test_dataset):
            src = os.path.join(path_to_original, name)
            img = cv2.imread(src)
            if img is None:
                # cv2.imread signals an unreadable / non-image file with None.
                print(f"Skipping unreadable image: {src}")
                continue
            img = center_crop(img, crop_size, crop_size)
            dst = os.path.join(path_to_save_test, f"{j}.jpg")
            if not cv2.imwrite(dst, img):
                print(f"Could not write test image: {dst}")



In [12]:
# Build the train and test subsets from the full picture collection.
# Every run draws a fresh random split (80% train, the rest test) and
# replaces whatever the train and test folders contained before.
split_dataset(
    path_to_data=all_pictures_folder,
    path_to_train_data=train_folder_original,
    path_to_test_data=test_folder_original,
    train_ratio=0.8,
)

trainIndex:
[ 628 2912 2519 ... 3753 3133 2622]
testIndex:
[   2    9   10   15   26   27   29   36   40   43   62   64   66   68
   90   93   96   98  108  143  145  147  149  155  158  160  163  164
  167  178  189  191  196  204  207  209  222  231  236  249  252  259
  260  263  265  273  283  284  287  296  298  305  311  312  314  317
  325  332  334  337  339  353  373  380  385  386  388  394  395  401
  404  413  416  417  418  429  433  434  438  444  445  453  458  464
  469  472  474  476  478  479  480  487  488  489  490  493  496  498
  502  512  515  530  541  542  544  556  562  563  564  572  575  576
  592  615  618  619  623  625  627  634  639  640  642  649  651  653
  659  666  667  672  681  692  693  697  698  702  704  706  710  711
  713  715  717  721  724  725  734  736  737  740  742  752  754  760
  762  767  770  779  800  804  805  808  811  818  820  823  831  833
  836  839  845  846  851  867  869  877  878  888  889  890  892  893
  894  896  897  9