# YOLOv8 format
YOLOv8 expects the following layout: a `train`, a `val` and a `test` folder, each containing an `images` folder and a `labels` folder. For every image in an `images` folder there is a `.txt` file with the same name in the matching `labels` folder that holds the corresponding labels. To generate labels for segmentation, see the JSON2YOLO repository on GitHub: <https://github.com/ultralytics/JSON2YOLO>

Change the paths below to match your own setup.

In [44]:
# Collect the base names (file name without extension) of all detection and
# segmentation images in the raw-data folders.

import os

# Switch the working directory to '../gauge_detection' so that the relative
# 'data/...' paths used below resolve against that folder.
os.chdir('..')
print(os.getcwd())
os.chdir('gauge_detection')
print(os.getcwd())

# Detection raw data: images are read from '<dir>/images'
detection_directory = 'data/raw_data/detection'

# Segmentation raw data: images are read from '<dir>/images'
segmentation_directory = 'data/raw_data/segmentation'

# Get all detection image file names.
# Only '.jpg' entries are kept and the extension is removed with splitext
# instead of slicing off a fixed 4 characters, so stray files in the folder
# (or longer extensions) can no longer produce bogus base names.
detection_filenames = [
    os.path.splitext(detection_filename)[0]
    for detection_filename in os.listdir(detection_directory + '/images')
    if detection_filename.lower().endswith('.jpg')
]

print(detection_filenames)

# Get all segmentation image file names (same filtering as above).
segmentation_filenames = [
    os.path.splitext(segmentation_filename)[0]
    for segmentation_filename in os.listdir(segmentation_directory + '/images')
    if segmentation_filename.lower().endswith('.jpg')
]

print(segmentation_filenames)

c:\Lovelesh\Wireless Gauge Reader\analog_gauge_reader
c:\Lovelesh\Wireless Gauge Reader\analog_gauge_reader\gauge_detection
['002b612d-130', '003e3653-65', '0056c307-133', '00971356-res-287', '012256c2-118', '01aeb940-res-214', '01c399f7-26', '01f08d3f-124', '01f38e2e-138', '020227a7-115', '024cadc1-res-157', '029941c1-res-278', '02ab25a2-36', '02bbf3ee-120', '02bd4b8d-4', '02dbb7b9-res-109', '02ddb80d-173', '02df14bf-169', '0365937f-105', '03a64967-86', '03b8975b-207', '03bf9052-res-223', '03e838e8-res-181', '0414a2be-res-298', '0429f7fd-141', '045a7c1b-76', '049920a1-res-231', '04b8c186-161', '04c0072c-32', '04eae9b5-73', '04f25595-56', '04f7b297-142', '0530a7cd-res-102', '054d87a2-93', '05d43b20-res-14', '05e7ac1d-86', '060b3906-res-87', '062930e9-146', '0641b17f-res-61', '06480fe7-res-65', '06b03b57-174', '06d5e28b-189', '06fd3254-166', '0736c422-res-263', '075141d1-19', '075bf84d-146', '07a3b993-10', '07a8ac78-171', '07dc225c-res-105', '07eb3ae5-1', '07f5ffab-res-316', '07f843ca-6

In [45]:
# split images into train, val and test set

import random

def split_dataset(filenames, train_ratio=0.8, val_ratio=0.1, seed=None):
    """Randomly split file names into train, validation and test sets.

    The input is copied before shuffling, so the caller's list keeps its
    original order.

    Args:
        filenames: Iterable of file names to split.
        train_ratio: Fraction of the files assigned to the train set.
        val_ratio: Fraction of the files assigned to the validation set.
            Whatever is left after train and validation goes to the test set.
        seed: Optional seed for a reproducible shuffle. When None, the
            module-level ``random`` generator is used.

    Returns:
        Tuple ``(train_filenames, val_filenames, test_filenames)`` of lists.

    Raises:
        ValueError: If a ratio is outside [0, 1] or the two ratios sum to
            more than 1.
    """
    if not 0 <= train_ratio <= 1 or not 0 <= val_ratio <= 1:
        raise ValueError("train_ratio and val_ratio must be between 0 and 1")
    if train_ratio + val_ratio > 1:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(filenames)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    # Calculate the size of each set; int() truncates, so rounding
    # leftovers end up in the test set.
    num_files = len(shuffled)
    num_train = int(train_ratio * num_files)
    num_val = int(val_ratio * num_files)

    # Split the list into three consecutive slices
    train_filenames = shuffled[:num_train]
    val_filenames = shuffled[num_train:num_train + num_val]
    test_filenames = shuffled[num_train + num_val:]

    # Print the sizes of each set
    print(f"Number of files in train set: {len(train_filenames)}")
    print(f"Number of files in validation set: {len(val_filenames)}")
    print(f"Number of files in test set: {len(test_filenames)}")

    return train_filenames, val_filenames, test_filenames

In [46]:
# Create folder structure
import os
dir_base = ['detection', 'segmentation']
modes = ['train', 'val', 'test']
# Every dataset/split combination gets an 'images' and a 'labels' folder.
# exist_ok=True keeps the cell safe to re-run when the folders already exist.
for base in dir_base:
    for mode in modes:
        for sub in ('images', 'labels'):
            path = f'data/{base}/{mode}/{sub}'
            os.makedirs(path, exist_ok=True)

In [47]:
import shutil

def copy_pair(src_dir, target_dir, file_name, mode):
    """Copy one image and its label file into the split folder ``mode``.

    The image is taken from ``<src_dir>/images/<file_name>.jpg`` and the label
    from ``<src_dir>/labels/<file_name>.txt``; both are copied to the folder of
    the same kind below ``<target_dir>/<mode>/``.
    """
    # The image is copied first, then the label, each with shutil.copy2.
    for subfolder, extension in (('images', '.jpg'), ('labels', '.txt')):
        source = f'{src_dir}/{subfolder}/{file_name}{extension}'
        destination = f'{target_dir}/{mode}/{subfolder}/{file_name}{extension}'
        shutil.copy2(source, destination)

def copy_split(src_dir, target_dir, splits=None):
    """Copy the image/label pairs of every split into its target folder.

    Args:
        src_dir: Raw-data folder, passed through to ``copy_pair``.
        target_dir: Root of the new folder structure, passed through to
            ``copy_pair``.
        splits: Optional mapping from split name (e.g. ``'train'``) to the
            file names belonging to that split. When omitted, the
            notebook-level ``train_filenames``, ``val_filenames`` and
            ``test_filenames`` are used, which is the original behaviour.
    """
    if splits is None:
        # Backward-compatible default: read the current notebook globals.
        # Passing `splits` explicitly avoids copying with stale lists after
        # the globals have been reassigned for another dataset.
        splits = {
            'train': train_filenames,
            'val': val_filenames,
            'test': test_filenames,
        }
    for mode, names in splits.items():
        for name in names:
            copy_pair(src_dir, target_dir, name, mode)

def create_data_yaml(src_dir, target_dir):
    """Create the ``data.yaml`` file used for YOLO training.

    Class names are read from ``<src_dir>/classes.txt`` (one name per line)
    and the result is written to ``<target_dir>/data.yaml``. The number of
    classes and the class list are also printed.

    Args:
        src_dir: Folder that contains ``classes.txt``.
        target_dir: Dataset folder that receives ``data.yaml``; it is also
            written into the file as ``path: ../<target_dir>``.

    Raises:
        OSError: If ``classes.txt`` cannot be read or ``data.yaml`` cannot be
            written (e.g. ``FileNotFoundError``).
    """
    # Read the classes first, so a missing classes.txt does not leave a
    # truncated data.yaml behind. Trailing whitespace/newlines are removed and
    # blank lines skipped, otherwise an empty trailing line would be counted
    # as an extra, nameless class.
    with open(src_dir + '/' + 'classes.txt', 'r', encoding='utf-8') as infile:
        classes = [name for name in (line.rstrip() for line in infile) if name]
    num_of_classes = len(classes)
    print(num_of_classes)
    print(classes)

    yaml_lines = [
        '# Train/val/test sets as\n',
        '## 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]\n',
        '\n\n',
        'path: ../' + target_dir + ' # dataset root dir\n',
        'train: train/images         # train images (relative to \'path\')\n',
        'val: val/images             # val images (relative to \'path\')\n',
        'test: test/images           # test images (optional)\n',
        '\n',
        '# Classes\n',
        'nc: ' + str(num_of_classes) + '   # number of classes\n',
        # str() of a Python list of strings is written as a YAML flow sequence
        'names: ' + str(classes) + '       # class names\n',
    ]
    # Explicit encoding so the result does not depend on the platform default.
    with open(target_dir + '/' + 'data.yaml', 'w', encoding='utf-8') as outfile:
        outfile.writelines(yaml_lines)

# Run the same three steps for the detection and the segmentation data:
# split the file names, copy the files into the new folder structure and
# write the data.yaml. copy_split reads the train/val/test lists from the
# notebook-level variables assigned in each iteration.
for filenames, raw_directory, output_directory in (
    (detection_filenames, detection_directory, 'data/detection'),
    (segmentation_filenames, segmentation_directory, 'data/segmentation'),
):
    train_filenames, val_filenames, test_filenames = split_dataset(filenames)
    copy_split(raw_directory, output_directory)
    create_data_yaml(raw_directory, output_directory)

Number of files in train set: 1649
Number of files in validation set: 206
Number of files in test set: 207
1
['Gauge Face']
Number of files in train set: 1600
Number of files in validation set: 200
Number of files in test set: 200
1
['Needle']
