In [1]:
from glob import glob
import numpy as np
import random
import shutil
import os

# Source folders of labeled images; classes.txt is read from the first one.
paths = ["./iNaturalist/images/Poison Ivy"]
# Number of training images sampled from each folder in `paths`.
n_training_images_per_class = 500
# Fraction (not a percentage) of the final dataset held out for validation.
percent_test = 0.1

# =========== You Shouldn't Have To Edit Beyond This Point ===========
class_file_path = f"{paths[0]}/classes.txt"
dataset_name = f"v5_{n_training_images_per_class}"
dataset_path = f"./dataset/{dataset_name}"
# Running totals, filled in while the dataset is built.
total_training_images = total_test_images = 0


def build_dataset(dataset_path, training_labels, testing_labels):
    """Copy label files and their matching images into the dataset tree.

    Training labels go to ``<dataset_path>/labels/train`` and testing labels
    to ``<dataset_path>/labels/val``. For each label, the image sitting next
    to it (same file stem, extension matching ``.j*``) is copied into the
    corresponding ``images`` folder and always saved with a ``.jpg`` suffix.

    Args:
        dataset_path: Root of the dataset tree. The ``images/{train,val}`` and
            ``labels/{train,val}`` folders must already exist.
        training_labels: Paths of the label ``.txt`` files for the training set.
        testing_labels: Paths of the label ``.txt`` files for the validation set.

    Raises:
        FileNotFoundError: If no image is found next to a label file.
    """
    # Local import: the notebook imports only the ``glob`` function itself.
    from glob import escape as glob_escape

    steps = {"train": training_labels, "val": testing_labels}
    for split_name, label_paths in steps.items():
        for label_path in label_paths:
            # os.path understands the platform's separators, so this works on
            # POSIX as well as Windows, and it never confuses a record id with
            # an identical substring earlier in the directory path.
            source_dir, label_file = os.path.split(label_path)
            record_id = os.path.splitext(label_file)[0]

            # Escape the literal parts so characters such as "[" in a folder
            # or file name are not interpreted as glob wildcards.
            pattern = os.path.join(
                glob_escape(source_dir), glob_escape(record_id) + ".j*"
            )
            matches = glob(pattern)
            if not matches:
                raise FileNotFoundError(
                    "No image matching '" + pattern + "' for label " + label_path
                )
            img_path = matches[0]

            new_img_path = os.path.join(
                dataset_path, "images", split_name, record_id + ".jpg"
            )
            new_label_path = os.path.join(
                dataset_path, "labels", split_name, record_id + ".txt"
            )
            shutil.copyfile(img_path, new_img_path)
            shutil.copyfile(label_path, new_label_path)
            
            
def init_clean_dir(path):
    """Make ``path`` an empty directory.

    If something already exists at ``path`` it is removed with
    ``shutil.rmtree`` first. Missing parent directories are created as well,
    so the call also works when the parent folder does not exist yet.

    Args:
        path: Directory to (re)create.
    """
    if os.path.exists(path):
        shutil.rmtree(path)
    # makedirs (unlike mkdir) also creates any missing parent directories.
    os.makedirs(path)

    
def init_yolov5_dataset_directories(base_path):
    """Create a fresh, empty dataset folder layout under ``base_path``.

    The root is reset first, followed by ``images`` and ``labels`` and their
    ``train`` / ``val`` subfolders, each through ``init_clean_dir``.
    """
    # Order matters: every parent folder is listed before its children.
    sub_directories = (
        "",
        "/images",
        "/images/train",
        "/images/val",
        "/labels/",
        "/labels/train",
        "/labels/val",
    )
    for suffix in sub_directories:
        init_clean_dir(base_path + suffix)


def train_test_split(base_path, n_train, p_test):
    """Randomly pick disjoint training and testing label files from a folder.

    All ``*.txt`` files directly inside ``base_path`` except ``classes.txt``
    are treated as label files. The number of testing files is derived from
    ``n_train`` so that they make up roughly ``p_test`` of the combined set.

    Args:
        base_path: Folder containing the label ``.txt`` files.
        n_train: Number of label files to select for training.
        p_test: Fraction of the combined set reserved for testing; must
            satisfy ``0 <= p_test < 1``.

    Returns:
        A ``(for_training, for_testing)`` tuple of lists of label file paths.

    Raises:
        ValueError: If ``p_test`` is out of range, or if the folder holds
            fewer label files than ``n_train`` plus the derived test count.
    """
    if not 0 <= p_test < 1:
        raise ValueError("p_test must satisfy 0 <= p_test < 1, got " + str(p_test))

    n_test = int(n_train / (1 - p_test)) - n_train
    total_images_needed = n_train + n_test

    # Filter on the file name rather than removing a hard-coded
    # "<base_path>\\classes.txt" entry, which only matches on Windows and
    # raises when the file is absent. Sorting makes the candidate order
    # independent of the file system's listing order.
    labels = sorted(
        label
        for label in glob(base_path + "/*.txt")
        if os.path.basename(label) != "classes.txt"
    )

    if len(labels) < total_images_needed:
        raise ValueError(
            "Not Enough Labeled Images: need "
            + str(total_images_needed)
            + ", found "
            + str(len(labels))
            + " in "
            + base_path
        )

    # One sample without replacement, cut in two, gives disjoint random sets.
    chosen = random.sample(labels, total_images_needed)
    for_training = chosen[:n_train]
    for_testing = chosen[n_train:]
    return (for_training, for_testing)

## Set up the custom data set for YOLOv5 training

In [2]:
# Reset the counters here so that re-running this cell does not add to totals
# left over from a previous run.
total_training_images = 0
total_test_images = 0

init_yolov5_dataset_directories(dataset_path)

for path in paths:
    split = train_test_split(path, n_training_images_per_class, percent_test)
    if split is None:
        # No split was produced; fail with a clear message instead of a
        # TypeError from unpacking None.
        raise ValueError("Not enough labeled images in " + path)
    train, test = split
    build_dataset(dataset_path, train, test)
    total_training_images += len(train)
    total_test_images += len(test)

## Dataset Config YAML
Generate the YAML used to train the YOLOv5 model

In [3]:
# Read one class name per line. Blank lines are skipped so they are neither
# counted as classes nor written out as empty names.
with open(class_file_path, "r") as class_file:
    class_names = [line.strip() for line in class_file if line.strip()]
n_classes = len(class_names)

# Build the flow-style list, e.g. names: ['Poison Ivy'].
# Inside a YAML single-quoted scalar a literal quote is written as two quotes.
names = (
    "names: ["
    + ", ".join("'" + name.replace("'", "''") + "'" for name in class_names)
    + "]"
)

# Use "\n" so text mode writes the platform's normal line ending; a bare "\r"
# is not treated as a line break by most tools.
with open(dataset_path + "/data.yaml", "w") as f:
    f.write("train: ../data/images/train/\n")
    f.write("val: ../data/images/val/\n")
    f.write("\n")
    f.write("nc: " + str(n_classes) + "\n")
    f.write("\n")
    f.write(names)

## YOLOv5 YAML
Generate the YOLOv5 config YAML by taking the base YAML and editing the number of classes

In [4]:
# Read the base model config; the context manager closes the file afterwards.
with open("yolov5x.yaml", "r", encoding="utf-8") as base_config:
    lines = base_config.readlines()

# Replace the top-level "nc:" entry wherever it is, rather than assuming it is
# always the second line of the base file.
for index, line in enumerate(lines):
    if line.startswith("nc:"):
        lines[index] = "nc: " + str(n_classes) + "  # number of classes\n"
        break
else:
    raise ValueError("No top-level 'nc:' entry found in yolov5x.yaml")

with open(dataset_path + "/yolov5x.yaml", "w", encoding="utf-8") as f:
    f.writelines(lines)

## Create README
Write some details about the dataset to a README file.

In [5]:
msg = str(total_training_images) + " images in training set"
# Mode "w" starts a fresh README. The line ends with "\n" (not a bare "\r") so
# text mode writes the platform's normal line ending.
with open(dataset_path + "/README", "w") as f:
    f.write(msg + "\n")
print(msg)

500 images in training set


In [6]:
msg = str(total_test_images) + " images in test set"
# Mode "a" appends to the existing README. The line ends with "\n" (not a bare
# "\r") so text mode writes the platform's normal line ending.
with open(dataset_path + "/README", "a") as f:
    f.write(msg + "\n")
print(msg)

55 images in test set


In [7]:
# Append the class count as the final README line (no trailing line break).
msg = f"{n_classes} classes in total"
with open(dataset_path + "/README", "a") as f:
    f.write(msg)
print(msg)

1 classes in total


## Zip it up

Finally, compress everything so it can be uploaded to the cloud.

In [8]:
# Zip the contents of the dataset folder into <archive_base>.zip, then delete
# the uncompressed folder.
archive_base = "./dataset/" + dataset_name
shutil.make_archive(archive_base, "zip", dataset_path)
shutil.rmtree(dataset_path)