## Notebook Description
This notebook contains the code for creating the directories to hold the train/test/validation data sorted by class<br>
**Date of Last Update**: November 23rd 2020

---

**INPUTS**: Data/train folder <br>
folder containing image files sorted by class

**OUTPUTS**: Data/train, Data/val, Data/validation<br>
directories containing image files sorted by class and split into train (`Data/train`), test (`Data/val`), and validation (`Data/validation`) sets

---

### Previous Steps
- download images and perform manual inspection

### Next Steps
- selection of an off-the-shelf model

---

## Import Packages

In [1]:
import shutil
import os
import numpy as np
import argparse

---
## Define Helper Functions

In [2]:
def get_files_from_folder(data_path):
    """List the entries of a directory as a numpy array.

    Parameters
    ----------
    data_path : str
        Path of the directory to list.

    Returns
    -------
    numpy.ndarray
        The names returned by ``os.listdir(data_path)`` (every entry, not
        only regular files), in the order the operating system reports them.
    """
    return np.asarray(os.listdir(data_path))

In [3]:
def split(path_to_data, path_to_test_data, ratio=0.8, seed=42):
    """Move a random share of every class folder into a parallel directory tree.

    Each immediate sub-directory of ``path_to_data`` is treated as one class.
    For every class, ``round(n_entries * (1 - ratio))`` entries are selected
    at random and MOVED (not copied) into a sub-directory of the same name
    under ``path_to_test_data``; the rest stay where they are.

    Parameters
    ----------
    path_to_data : str
        Directory holding one sub-directory per class.
    path_to_test_data : str
        Directory that receives the moved entries; class sub-directories are
        created when missing.
    ratio : float, default 0.8
        Fraction of each class that stays in ``path_to_data``. Must lie in
        ``[0, 1]``.
    seed : int or None, default 42
        Seed of the random generator used to pick the entries. The same seed
        applied to the same folder content selects the same entries. ``None``
        asks numpy for a fresh, unpredictable seed.

    Returns
    -------
    None
        Prints ``"file split complete"`` when done.

    Raises
    ------
    ValueError
        If ``ratio`` is outside ``[0, 1]``.

    Notes
    -----
    The function is not idempotent: every call moves a further share of
    whatever is still left in ``path_to_data``.
    """
    # Reject a bad ratio before touching the file system; a negative ratio
    # would otherwise ask for more entries than a class contains.
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {!r}".format(ratio))

    # only the immediate sub-directories are class folders
    _, class_dirs, _ = next(os.walk(path_to_data))

    for class_name in class_dirs:
        path_to_original = os.path.join(path_to_data, class_name)
        path_to_save = os.path.join(path_to_test_data, class_name)
        os.makedirs(path_to_save, exist_ok=True)

        # os.listdir gives no ordering guarantee, so sort first: otherwise a
        # fixed seed would still not reproduce the same selection.
        files = sorted(os.listdir(path_to_original))
        n_to_move = int(np.round(len(files) * (1 - ratio)))

        # A private generator per class keeps the selection reproducible and
        # independent of class order, and leaves numpy's global random state
        # untouched.
        rng = np.random.RandomState(seed)
        rng.shuffle(files)

        for file_name in files[:n_to_move]:
            shutil.move(
                os.path.join(path_to_original, file_name),
                os.path.join(path_to_save, file_name),
            )

    print("file split complete")

## Define File Paths and Split Sizes

In [4]:
# Source directory: one sub-folder per class. Each split() call below moves files OUT of it.
path_to_data = 'Data/train'
# Destination of the second split (the one called with train_ratio).
# NOTE: this is the test set even though the folder is named 'val'.
path_to_test_data = 'Data/val'
# Destination of the first split (the one called with val_ratio).
path_to_val_data ='Data/validation' 
# split() moves round(n * (1 - ratio)) files per class, so each ratio is the
# fraction that STAYS in Data/train for that call.
val_ratio=0.95 
train_ratio=0.8

## Perform Splits

In [5]:
# Split 1 (train -> validation): with val_ratio = 0.95, round(5%) of each class's
# files is moved from Data/train into Data/validation.
# NOTE: files are moved, so re-running this cell moves a further share of what is left.
split(path_to_data, path_to_val_data, val_ratio)

file split complete


In [6]:
# Split 2 (train -> test): with train_ratio = 0.8, round(20%) of the files still left
# in each class folder of Data/train (after the validation split above) is moved into Data/val.
split(path_to_data, path_to_test_data, train_ratio)

file split complete
