In [None]:
import math
import os

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the paths configured in the next cell live under this mount point.
drive.mount('/content/drive')

In [None]:
# Root working folder on the mounted Drive. "PATH" looks like a placeholder -- replace with the real project path.
base_dir = "/content/drive/My Drive/PATH"
# Folder whose entries are treated as class folders later in the notebook ("FOLDERNAME" also looks like a placeholder).
source_dir = os.path.join(base_dir, "FOLDERNAME")
# Last expression of the cell: displays the entries found in source_dir.
os.listdir(source_dir)

# dataset exploration

### images per class

### least represented class

### size of train/val/test splits given percentages

In [None]:
# Every sub-directory of source_dir is treated as one class. Plain files are
# skipped so that the per-class os.listdir() below cannot fail on them, and the
# names are sorted because os.listdir() returns entries in arbitrary order.
classes = sorted(
    entry
    for entry in os.listdir(source_dir)
    if os.path.isdir(os.path.join(source_dir, entry))
)
if not classes:
    raise ValueError(f"No class directories found in {source_dir!r}")

# Number of directory entries (assumed to be images) in each class folder.
class_counts = {
    name: len(os.listdir(os.path.join(source_dir, name))) for name in classes
}

# Take the true minimum instead of comparing against a fixed sentinel, so the
# result is also correct when every class holds 1000 or more images.
# min() returns the first minimal key, i.e. ties go to the earliest class.
smallest_class = min(class_counts, key=class_counts.get)
smallest_class_count = class_counts[smallest_class]


In [None]:
# Headline: the least represented class and how many images it has.
output = f"Smallest class: {smallest_class}, {smallest_class_count} images"
print(output)

# Followed by one "<class> <count>" line per class.
for class_name in class_counts:
    print(class_name, class_counts[class_name])

In [None]:
def get_split_counts(dataset_size, train_test_split, train_val_split):
    """
    Compute how many images go into the training, validation and test sets.

    The test share is taken from the whole dataset; the remaining
    train+validation pool is then divided a second time. Every size is
    truncated towards zero, so the three counts may add up to slightly less
    than dataset_size.

    @param int dataset_size: the number of images in a dataset
    @param tuple ([0.0 - 1.0], [0.0 - 1.0]) train_test_split: (train+validation proportion, test proportion) (Must sum to 1)
    @param tuple ([0.0 - 1.0], [0.0 - 1.0]) train_val_split: (training proportion, validation proportion) of the train+validation pool (Must sum to 1)

    @returns tuple counts: (int train, int val, int test)
    @raises AssertionError: if a split does not sum to 1, or the counts exceed dataset_size
    """
    named_splits = (
        ("train_test_split", train_test_split),
        ("train_val_split", train_val_split),
    )
    for name, split in named_splits:
        # Compare with a tolerance: binary floating point cannot represent most
        # decimal fractions exactly, so an exact == 1 check can reject valid input.
        assert math.isclose(split[0] + split[1], 1.0), (
            f"{name} must sum to 1, got {split}"
        )

    test_size = int(dataset_size * train_test_split[1])

    train_val_size = int(dataset_size * train_test_split[0])
    train_size = int(train_val_size * train_val_split[0])
    val_size = int(train_val_size * train_val_split[1])

    # Sanity check: truncation may drop images but must never invent them.
    assert test_size + train_size + val_size <= dataset_size, (
        "split sizes exceed dataset_size"
    )

    return train_size, val_size, test_size

## 80/20 split of training/testing images. Of the 80% used for training, 80% will be used for actual training and the remaining 20% will be used for validation. This is based on a rule-of-thumb suggestion from [this Stack Overflow post](https://stackoverflow.com/questions/13610074/is-there-a-rule-of-thumb-for-how-to-divide-a-dataset-into-training-and-validation)

In [None]:
# Each pair is (first share, second share) and must sum to 1.
train_test_split = (0.8, 0.2)  # train+validation share, test share
train_val_split = (0.8, 0.2)   # training share, validation share of the train+validation pool

# Split sizes are derived from the smallest class count.
splits = get_split_counts(
    dataset_size=smallest_class_count,
    train_test_split=train_test_split,
    train_val_split=train_val_split,
)

In [None]:
# Copy class directory structure into train and test dirs
for split_name in ("test", "train"):
    split_dir = os.path.join(base_dir, split_name)
    # exist_ok=True keeps the cell safe to re-run when the directories are
    # already there, while still raising if the path exists but is not a
    # directory (instead of silently ignoring that situation).
    os.makedirs(split_dir, exist_ok=True)

    for class_name in classes:
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)


# Randomly choose images from each class directory then copy 20% into a testing directory and copy the rest into a training directory

# The training/validation split will be handled by Keras's ImageDataGenerator class

In [None]:
import random
import shutil

# Fixed seed so the same images are selected on every run.
random.seed(1)
for class_name in classes:
    src_path = os.path.join(source_dir, class_name)
    train_dir = os.path.join(base_dir, "train", class_name)
    test_dir = os.path.join(base_dir, "test", class_name)

    # os.listdir() returns entries in arbitrary order; sort first so that the
    # seeded sampling below picks the same files on every machine and run.
    available_images = sorted(os.listdir(src_path))

    # Downsample every class to the size of the smallest class.
    downsampled_images = random.sample(available_images, smallest_class_count)

    # splits[2] is the test-set size; everything not drawn here goes to train.
    test_images = random.sample(downsampled_images, splits[2])
    held_out = set(test_images)
    # Set membership avoids the quadratic cost of repeated list.remove() calls.
    train_images = [name for name in downsampled_images if name not in held_out]

    for image_name in test_images:
        shutil.copy(os.path.join(src_path, image_name), test_dir)

    for image_name in train_images:
        shutil.copy(os.path.join(src_path, image_name), train_dir)


