# This script was built to process and split the data into appropriate training and testing datasets.

In [1]:
from PIL import Image as img
import os
import random
import numpy as np
import shutil as sh
from tqdm import tqdm
from simple_gate_detector import create_dir_system

In [2]:
# helper function
def choice_pair(N, train_percent):
    """
    Inputs
    ------
    N : type = int OR list OR numpy.ndarray
        contains the the data to select a random sample from
    train_percent : type = int OR float
              value that goes from 0.0 to 1.0
              represents the fraction of elements to split N at

    Outputs
    -------
    2 lists : The first one is the random sampling of N with percent % number of elements from N
              The second list contains all the elements from N that are not in the first list
    """
    numeric = None
    if isinstance(N, int):
        tmp = range(N)
        numeric=True
    elif isinstance(N, list) or isinstance(N, np.ndarray):
        tmp = np.array(N)
        numeric = False
    else:
        print("Unsupported input type [N]. N should be an int, list, or numpy array.")
    random.shuffle(tmp)
    if numeric:
        cut = int(N * train_percent)
    else:
        cut = int(len(N) * train_percent)
    return tmp[:cut], tmp[cut:]

## Create folders that will hold the training and testing data (organized appropriately)

In [3]:
#make the necessary folders, rename the existing ones if necessary [gates directories]
CURR_DIR_CUSTOM = os.getenv("CURR_DIR_CUSTOM")

create_dir_system('gates', CURR_DIR_CUSTOM)
create_dir_system('images', CURR_DIR_CUSTOM+'/gates/')
create_dir_system('labels', CURR_DIR_CUSTOM+'/gates/')

create_dir_system('test', CURR_DIR_CUSTOM+'/gates/images/')
create_dir_system('train', CURR_DIR_CUSTOM+'/gates/images/')

create_dir_system('test', CURR_DIR_CUSTOM+'/gates/labels/')
create_dir_system('train', CURR_DIR_CUSTOM+'/gates/labels/')


TypeError: expected str, bytes or os.PathLike object, not NoneType

## Split training and testing data - approximately 80/20 split for training/testing

In [None]:
#random sampling of 500x500 images and populating the train and test directories
train_list, test_list = choice_pair(os.listdir(CURR_DIR_CUSTOM+'/source'), 0.8)
for filename in tqdm(train_list):
    sh.copy2(CURR_DIR_CUSTOM+'/source/'+filename, CURR_DIR_CUSTOM+'/gates/images/train/')
    sh.copy2(CURR_DIR_CUSTOM+'/source_labels/'+filename[:-4]+'.txt', CURR_DIR_CUSTOM+'/gates/labels/train/')

for filename in tqdm(test_list):
    sh.copy2(CURR_DIR_CUSTOM+'/source/'+filename, CURR_DIR_CUSTOM+'/gates/images/test/')
    sh.copy2(CURR_DIR_CUSTOM+'/source_labels/'+filename[:-4]+'.txt', CURR_DIR_CUSTOM+'/gates/labels/test/')