# This script was built to process and split the data into appropriate training and testing datasets. 

In [1]:
from PIL import Image as img
import os
import random
import numpy as np
import shutil as sh
from tqdm import tqdm
import sys
from time import sleep

In [None]:
# Ask whether the resized-image directories should be left untouched.
# Any answer starting with "y" (case-insensitive, surrounding whitespace
# ignored) skips the rebuild; anything else, including an empty answer,
# does not skip it.
skip_img = input("Skip image directory reclassification? [yes/no]").strip().lower().startswith('y')

## Make the master_rs_500 and master_rs_1000 directories that will hold the resized images

In [None]:
if not skip_img:

    # Give each resized-image set a fresh, empty directory under ./source.
    # If a directory from a previous run is in the way, it is kept once as
    # ./source/OLD_<name> (replacing any older backup) before the new empty
    # directory is created.
    for dirname in ('master_rs_500', 'master_rs_1000'):
        target = './source/{}'.format(dirname)
        backup = './source/OLD_{}'.format(dirname)

        if os.path.lexists(target):
            print("{} directory already exists. Replacing current content of directory with current selection, removing any old directories".format(dirname))

            # Drop the previous backup first so the rename below cannot fail
            # on an existing, non-empty destination.
            if os.path.isdir(backup) and not os.path.islink(backup):
                sh.rmtree(backup)
            elif os.path.lexists(backup):
                os.remove(backup)

            os.rename(target, backup)

        os.makedirs(target)

## resize all the images to 500x500 and 1000x1000

In [None]:
if not skip_img:

    # Save a 500x500 and a 1000x1000 copy of every file in ./source/master
    # into ./source/master_rs_500 and ./source/master_rs_1000, keeping the
    # original file name.
    for file in tqdm(os.listdir('./source/master')):
        # The with-block closes the source image as soon as both copies are
        # written instead of leaving that to garbage collection.
        with img.open(os.path.join('./source/master', file)) as pos:
            pos.resize((500, 500)).save(os.path.join('./source/master_rs_500', file))
            pos.resize((1000, 1000)).save(os.path.join('./source/master_rs_1000', file))

In [2]:
# helper function
def choice_pair(N, percent):
    """
    Randomly split a collection of items into two disjoint parts.

    Inputs
    ------
    N : type = int OR float OR list OR numpy.ndarray
        If numeric, the items are the integers 0 .. int(N) - 1.
        Otherwise N itself holds the items; it is copied, never modified.
        For a multi-dimensional input the split is along the first axis.
    percent : type = int OR float
              value that goes from 0.0 to 1.0
              represents the fraction of items placed in the first part

    Outputs
    -------
    2 numpy arrays : The first one holds int(count * percent) randomly chosen items
                     The second one holds all the items that are not in the first

    Raises
    ------
    TypeError : if N is not an int, float, list or numpy.ndarray
    """
    if isinstance(N, (int, float)):
        total = N
        pool = np.arange(int(N))
    elif isinstance(N, (list, np.ndarray)):
        total = len(N)
        pool = np.array(N)
    else:
        raise TypeError(
            "N must be an int, float, list or numpy.ndarray, not {}".format(type(N).__name__)
        )

    # Shuffle positions rather than the array itself: swapping rows of a
    # multi-dimensional array in place goes through views and duplicates
    # rows, and a range object cannot be shuffled in place at all.
    order = list(range(len(pool)))
    random.shuffle(order)
    shuffled = pool[np.asarray(order, dtype=np.intp)]

    cut = int(total * percent)
    return shuffled[:cut], shuffled[cut:]

## Create folders that will hold the training and testing data (organized appropriately)

In [3]:
#make the necessary folders, rename the existing ones if necessary [gates directories]
# Build a fresh ./gates<size> tree for each image size:
#   gates<size>/images/{train,test} and gates<size>/labels/{train,test}
# A tree left over from a previous run is kept once as ./OLD_gates<size>
# (replacing any older backup) so the new tree always starts empty and
# always has the complete layout.
for imgsize in (500, 1000):
    root = './gates{}'.format(imgsize)
    backup = './OLD_gates{}'.format(imgsize)

    if os.path.lexists(root):
        print("Training and testing directories already exist. Replacing current content of directories with current selection")

        # Remove the previous backup first; renaming onto an existing
        # non-empty directory would fail.
        if os.path.isdir(backup) and not os.path.islink(backup):
            sh.rmtree(backup)
        elif os.path.lexists(backup):
            os.remove(backup)

        os.rename(root, backup)

    for kind in ('images', 'labels'):
        for split in ('train', 'test'):
            # makedirs also creates the intermediate gates<size>/<kind> levels
            os.makedirs(os.path.join(root, kind, split))


## Split training and testing data - approximately 80/20 split for training/testing

In [None]:
# Safety stop before the copy steps: they read each image's label file from
# the same ./source/master_rs_* directory as the image itself, so the user
# has to confirm the labels have been placed there.
print("WARNING: The following code will only work if you have created the label files and placed them in the master_rs_* directories corresponding to the image size")
to_continue = input("Are the images and the labels in the same directory? [yes/no]")
# Anything that does not start with "y" (including an empty answer) stops the
# script. sys.exit is used because the bare exit() helper is only meant for
# interactive sessions.
if not to_continue.strip().lower().startswith('y'):
    sys.exit()

In [None]:
#random sampling of 500x500 images and populating the train and test directories
# Draw a random 80/20 train/test split of the file names in
# ./source/master_subset, then copy each 500x500 image and its label file
# from ./source/master_rs_500 into the matching gates500 sub-directory.
# Paths are relative to the working directory, like the directory-creation
# steps, so the script is not tied to one machine's home directory.
train_list, test_list = choice_pair(os.listdir('./source/master_subset'), 0.8)
for split, filenames in (('train', train_list), ('test', test_list)):
    for filename in tqdm(filenames):
        # The label shares the image's base name and always ends in .txt,
        # both at the source and at the destination.
        label = os.path.splitext(filename)[0] + '.txt'
        sh.copy2(os.path.join('./source/master_rs_500', filename),
                 os.path.join('./gates500/images', split, filename))
        sh.copy2(os.path.join('./source/master_rs_500', label),
                 os.path.join('./gates500/labels', split, label))

In [None]:
#random sampling of 1000x1000 images and populating the train and test directories
# Draw a random 80/20 train/test split of the file names in
# ./source/master_subset, then copy each 1000x1000 image and its label file
# from ./source/master_rs_1000 into the matching gates1000 sub-directory.
# Paths are relative to the working directory, like the directory-creation
# steps, so the script is not tied to one machine's home directory.
train_list, test_list = choice_pair(os.listdir('./source/master_subset'), 0.8)
for split, filenames in (('train', train_list), ('test', test_list)):
    for filename in tqdm(filenames):
        # splitext copes with extensions of any length (.jpeg as well as .jpg)
        label = os.path.splitext(filename)[0] + '.txt'
        sh.copy2(os.path.join('./source/master_rs_1000', filename),
                 os.path.join('./gates1000/images', split, filename))
        sh.copy2(os.path.join('./source/master_rs_1000', label),
                 os.path.join('./gates1000/labels', split, label))