
# Data preparation

## DS1 Label transformation
This part deals with the transformation of the labels into the YOLOv5 format.

### Imports

In [None]:
import os
import random
import shutil
from shutil import copyfile

import numpy as np
from PIL import Image

### Read Test Data Labels

In [None]:
# Read the DS1 test annotation file into `lines`: one cleaned string per
# annotated image.

# os.path.join avoids the invalid "\s" escape of a hand-written Windows path
# and yields the same string on Windows.
input_labels_test = os.path.join("DS1", "simpleTestFullSizeAllPotholesSortedFullAnnotation.txt")
with open(input_labels_test) as file:
    # Collapse every run of whitespace to a single space rather than deleting
    # double spaces (deleting them would glue two neighbouring numbers
    # together). Blank lines are dropped because they describe no image.
    lines = [" ".join(line.split()) for line in file if line.strip()]

# The `with` block has already closed the file; no explicit close() is needed.

# Example ["Test data\G0011476.bmp 2 2176 1580 242 44 1726 1458 106 26"]
#lines[0]

### Create New Labels for Test Data

In [None]:
# Convert every DS1 test annotation line into one YOLO label file.
#
# Tokens of an annotation line after split():
#   [0] "Test"   [1] "data\<image>.bmp"   [2] number of potholes
#   then four integers per pothole, read here as x, y, width, height in
#   pixels. x and y are treated as the box corner: half the box size is added
#   to reach the centre.
# Each output row is "0 x_center y_center width height", all four values
# divided by the image size. Images that cannot be processed are listed in
# ErrorLogTest.txt together with the reason.
imgLabelNameTest = []

image_dir = os.path.join("DS1", "TestData")
path = os.path.join("DS1_new", "labels", "Test")
os.makedirs(path, exist_ok=True)  # otherwise every image would end up in the error log

with open("ErrorLogTest.txt", "w+") as error_log_test:
    for line in lines:
        line_list = line.split()
        if not line_list:
            continue  # blank line: nothing to convert

        name = line  # fallback label for the error log if parsing the name fails
        try:
            # Gain information
            name = line_list[1].replace("data\\", " ").replace(".bmp", "").strip()
            num_potholes = int(line_list[2])

            image_path = os.path.join(image_dir, name + ".JPG")
            with Image.open(image_path) as im:
                img_width, img_height = im.size

            # gain information for individual potholes
            potholes = []
            for n in range(num_potholes):
                first = 3 + 4 * n
                x, y, box_width, box_height = (int(value) for value in line_list[first:first + 4])
                # x_center, y_center, width, height
                # width/height are the 3rd and 4th value of the group, not the
                # 2nd and 3rd (which are y and width).
                potholes.append([(x + box_width / 2) / img_width,
                                 (y + box_height / 2) / img_height,
                                 box_width / img_width,
                                 box_height / img_height])

            # Write new file
            with open(os.path.join(path, name + ".txt"), "w+") as new_file:
                for pothole in potholes:
                    new_file.write("0 %f %f %f %f\n" % tuple(pothole))
            imgLabelNameTest.append(name)
        except Exception as err:
            # Best effort: keep converting the remaining images, but record why
            # this one failed instead of swallowing the reason.
            error_log_test.write("%s resulted in error: %r\n" % (name, err))

### Read Training Data Labels

In [None]:
# Read the DS1 training annotation file into `lines`: one cleaned string per
# annotated image (this replaces the test lines read earlier).

# os.path.join avoids the invalid "\s" escape of a hand-written Windows path
# and yields the same string on Windows.
input_labels_train = os.path.join("DS1", "simpleTrainFullPhotosSortedFullAnnotations.txt")
with open(input_labels_train) as file:
    # Collapse every run of whitespace to a single space rather than deleting
    # double spaces (deleting them would glue two neighbouring numbers
    # together). Blank lines are dropped because they describe no image.
    lines = [" ".join(line.split()) for line in file if line.strip()]

# The `with` block has already closed the file; no explicit close() is needed.

# Example: "Train data\Positive data\G0010110.bmp 2 2002 1370 58 18 1896 1342 50 16"
lines[0]

'Train data\\Positive data\\G0010033.bmp 6 1990 1406 66 14 1464 1442 92 16 1108 1450 54 16 558 1434 102 16 338 1450 72 18 262 1450 58 22'

### Create New Labels for Training Data
(Note: only images with potholes are considered)

In [None]:
# Convert every DS1 training annotation line into one YOLO label file.
#
# Tokens of an annotation line after split():
#   [0] "Train"   [1] "data\Positive"   [2] "data\<image>.bmp"
#   [3] number of potholes
#   then four integers per pothole, read here as x, y, width, height in
#   pixels. x and y are treated as the box corner: half the box size is added
#   to reach the centre.
# Each output row is "0 x_center y_center width height", all four values
# divided by the image size. Images that cannot be processed are listed in
# ErrorLogTrain.txt together with the reason.
imgLabelNameTrain = []

image_dir = os.path.join("DS1", "TrainData", "PositiveData")
path = os.path.join("DS1_new", "labels", "Train")
os.makedirs(path, exist_ok=True)  # otherwise every image would end up in the error log

with open("ErrorLogTrain.txt", "w+") as error_log_train:
    for line in lines:
        line_list = line.split()
        if not line_list:
            continue  # blank line: nothing to convert

        name = line  # fallback label for the error log if parsing the name fails
        try:
            # Gain information
            name = line_list[2].replace("data\\", " ").replace(".bmp", "").strip()
            num_potholes = int(line_list[3])

            image_path = os.path.join(image_dir, name + ".JPG")
            with Image.open(image_path) as im:
                img_width, img_height = im.size

            # gain information for individual potholes and change format
            potholes = []
            for n in range(num_potholes):
                first = 4 + 4 * n
                x, y, box_width, box_height = (int(value) for value in line_list[first:first + 4])
                # x_center, y_center, width, height
                # width/height are the 3rd and 4th value of the group, not the
                # 2nd and 3rd (which are y and width).
                potholes.append([(x + box_width / 2) / img_width,
                                 (y + box_height / 2) / img_height,
                                 box_width / img_width,
                                 box_height / img_height])

            # Write new file
            with open(os.path.join(path, name + ".txt"), "w+") as new_file:
                for pothole in potholes:
                    new_file.write("0 %f %f %f %f\n" % tuple(pothole))
            imgLabelNameTrain.append(name)
        except Exception as err:
            # Best effort: keep converting the remaining images, but record why
            # this one failed instead of swallowing the reason.
            error_log_train.write("%s resulted in error: %r\n" % (name, err))

### Copy the right pictures into new files

In [None]:
# Copy the DS1 images that received a label file into the new image folders.
# Copying takes long, so the cell does nothing unless the switch below is set
# to True (this replaces keeping the code inside a string literal, where the
# Windows paths also contained invalid escape sequences).
COPY_DS1_IMAGES = False

if COPY_DS1_IMAGES:
    ds1_copy_jobs = [
        (imgLabelNameTest,
         os.path.join("DS1", "TestData"),
         os.path.join("DS1_new", "images", "TestData")),
        (imgLabelNameTrain,
         os.path.join("DS1", "TrainData", "PositiveData"),
         os.path.join("DS1_new", "images", "TrainData")),
    ]
    for names, source_dir, target_dir in ds1_copy_jobs:
        os.makedirs(target_dir, exist_ok=True)
        for name in names:
            oldPath = os.path.join(source_dir, name + ".JPG")
            newPath = os.path.join(target_dir, name + ".JPG")
            copyfile(oldPath, newPath)

'\nfor name in imgLabelNameTest:\n    oldPath = "DS1\\TestData\\" + name +".JPG"\n    newPath = "DS1_new\\images\\TestData\\" + name + ".JPG"\n    copyfile(oldPath, newPath)\n    \nfor name in imgLabelNameTrain:\n    oldPath = "DS1\\TrainData\\PositiveData\\" + name +".JPG"\n    newPath = "DS1_new\\images\\TrainData\\" + name + ".JPG"\n    copyfile(oldPath, newPath)\n'

In [None]:
# Number of DS1 images (test + train) for which a label file was written.
convertedLabels = sum(len(names) for names in (imgLabelNameTest, imgLabelNameTrain))
print("Total images&labels converted to YOLO format: %d" % convertedLabels)

Total images&labels converted to YOLO format: 2030


## DS2 Removal of Duplicates

In [None]:
# Create a list of images without duplicates (training data).
#
# The picture number is parsed from the file name: "_png" and "potholes" are
# removed and the part before the first "." is read as an integer. Only the
# first file of each picture number is kept.

path = os.path.join("DS2", "training", "images")
# os.listdir returns entries in arbitrary order; sorting makes the choice of
# which duplicate survives reproducible.
files = sorted(os.listdir(path))

pictureNumber = [
    int(f.replace("_png", "").replace("potholes", "").split(".")[0])
    for f in files
]

# Track the numbers already taken instead of comparing neighbours, so the
# result does not depend on duplicates being adjacent and an empty folder
# simply yields an empty list.
imagesWithoutDublicates = []
seen_numbers = set()
for f, number in zip(files, pictureNumber):
    if number not in seen_numbers:
        seen_numbers.add(number)
        imagesWithoutDublicates.append(f)

In [None]:
# Copy all chosen images into the new folder.
# Disabled by default; set the switch to True to run the copy. The paths are
# built with os.path.join because "\t" inside "DS2\training" is a TAB
# character in a normal Python string.
COPY_DS2_TRAIN_IMAGES = False

if COPY_DS2_TRAIN_IMAGES:
    source_dir = os.path.join("DS2", "training", "images")
    target_dir = os.path.join("DS2_new", "training", "images")
    os.makedirs(target_dir, exist_ok=True)
    for name in imagesWithoutDublicates:
        oldPath = os.path.join(source_dir, name)
        newPath = os.path.join(target_dir, name)
        copyfile(oldPath, newPath)

'\nfor name in imagesWithoutDublicates:\n    oldPath = "DS2\training\\images\\"+ name\n    newPath = "DS2_new\training\\images\\"+ name\n    copyfile(oldPath, newPath)\n'

In [None]:
# Copy the labels for the chosen images.
# Disabled by default; set the switch to True to run the copy. The paths are
# built with os.path.join because "\t" inside "DS2\training" is a TAB
# character in a normal Python string.
COPY_DS2_TRAIN_LABELS = False

if COPY_DS2_TRAIN_LABELS:
    source_dir = os.path.join("DS2", "training", "labels")
    target_dir = os.path.join("DS2_new", "training", "labels")
    os.makedirs(target_dir, exist_ok=True)
    for name in imagesWithoutDublicates:
        # The label file carries the image's name with ".jpg" swapped for ".txt".
        label_name = name.replace(".jpg", ".txt")
        oldPath = os.path.join(source_dir, label_name)
        newPath = os.path.join(target_dir, label_name)
        copyfile(oldPath, newPath)

'\nfor name in imagesWithoutDublicates:\n    oldPath = "DS2\training\\labels\\"+ name.replace(".jpg", ".txt")\n    newPath = "DS2_new\training\\labels\\"+ name.replace(".jpg", ".txt")\n    copyfile(oldPath, newPath)\n'

In [None]:
# Copy test data (no duplicates), images and labels alike.
# Disabled by default; set the switch to True to run the copy.
# Each file is read from the folder that was listed (not from an unrelated
# `path` variable), and the paths are built with os.path.join because "\t"
# inside "DS2\test" is a TAB character in a normal Python string.
COPY_DS2_TEST = False

if COPY_DS2_TEST:
    for kind in ("images", "labels"):
        source_dir = os.path.join("DS2", "test", kind)
        target_dir = os.path.join("DS2_new", "test", kind)
        os.makedirs(target_dir, exist_ok=True)
        for file in os.listdir(source_dir):
            oldPath = os.path.join(source_dir, file)
            newPath = os.path.join(target_dir, file)
            copyfile(oldPath, newPath)

'\nfiles = os.listdir("DS2\test\\images")\n    for file in files:\n        oldPath =  path + "\\" + file\n        newPath = "DS2_new\test\\images\\" + file\n        copyfile(oldPath, newPath)\n        \nfiles = os.listdir("DS2\test\\labels")\n    for file in files:\n        oldPath =  path + "\\" + file\n        newPath = "DS2_new\test\\labels\\" + file\n        copyfile(oldPath, newPath)\n'

## Merge all Data together

In [None]:
# Create final dataset (without train-test split)
#
# All folders are built with os.path.join: written as plain strings,
# "DS2_new\training" and "DS2_new\test" contain "\t", which Python reads as a
# TAB character, so those four paths did not name the intended folders.

#  Get images:
path1 = os.path.join("DS1_new", "images", "TestData")
path2 = os.path.join("DS1_new", "images", "TrainData")
path3 = os.path.join("DS2_new", "training", "images")
path4 = os.path.join("DS2_new", "test", "images")

# get labels
path5 = os.path.join("DS1_new", "labels", "Test")
path6 = os.path.join("DS1_new", "labels", "Train")
path7 = os.path.join("DS2_new", "training", "labels")
path8 = os.path.join("DS2_new", "test", "labels")

imagepath = [path1, path2, path3, path4]
labelpath = [path5, path6, path7, path8]

# Disabled by default; set the switch to True to copy everything into
# Dataset_final/images and Dataset_final/labels.
MERGE_DATASETS = False

if MERGE_DATASETS:
    merge_jobs = [
        (imagepath, os.path.join("Dataset_final", "images")),
        (labelpath, os.path.join("Dataset_final", "labels")),
    ]
    for source_dirs, target_dir in merge_jobs:
        os.makedirs(target_dir, exist_ok=True)
        for source_dir in source_dirs:
            for file in os.listdir(source_dir):
                oldPath = os.path.join(source_dir, file)
                newPath = os.path.join(target_dir, file)
                copyfile(oldPath, newPath)

'\nfor path in imagepath:\n    files = os.listdir(path)\n    for file in files:\n        oldPath =  path + "\\" + file\n        newPath = "Dataset_final\\images\\" + file\n        copyfile(oldPath, newPath)\n      \nfor path in labelpath:\n    files = os.listdir(path)\n    for file in files:\n        oldPath =  path + "\\" + file\n        newPath = "Dataset_final\\labels\\" + file\n        copyfile(oldPath, newPath)\n'

## Split Train, Validation and Test Folders

In [None]:
# Folder that holds the merged "images" and "labels" directories and that
# receives the train/val/test folders.
root_dir = '.'
# Share of the images reserved for validation and for testing; the remaining
# 80% is meant for training.
val_ratio, test_ratio = 0.15, 0.05

In [None]:
# Generate the image and label folder of every split.
# exist_ok=True lets the cell be re-run (Restart & Run All) without failing
# when the folders already exist.
for split_name in ('train', 'val', 'test'):
    for kind in ('images', 'labels'):
        os.makedirs(root_dir + '/' + split_name + '/' + kind, exist_ok=True)

In [None]:
# Shuffle the image file names and partition them into train / val / test.
src = root_dir + "/images"  # Folder to copy images from
labels = root_dir + "/labels"  # Folder to copy labels from

# Sorted listing plus a fixed seed make the split reproducible across runs.
SPLIT_SEED = 42
allFileNames = sorted(os.listdir(src))
allLabelsFileNames = os.listdir(labels)

np.random.default_rng(SPLIT_SEED).shuffle(allFileNames)

# Training ends where validation + test begin. The parentheses matter:
# "1 - val_ratio + test_ratio" evaluates to 0.90 and would shrink validation
# to 5% instead of the configured 15%.
train_end = int(len(allFileNames) * (1 - (val_ratio + test_ratio)))
val_end = int(len(allFileNames) * (1 - test_ratio))
train_FileNames, val_FileNames, test_FileNames = np.split(np.array(allFileNames),
                                                          [train_end, val_end])

In [None]:
# Report how many files ended up in each partition and how many label files exist.
split_report = [
    ('Total images: ', len(allFileNames)),
    ('Training: ', len(train_FileNames)),
    ('Validation: ', len(val_FileNames)),
    ('Testing: ', len(test_FileNames)),
    ('Total Lables: ', len(allLabelsFileNames)),
]
for caption, count in split_report:
    print(caption, count)

In [None]:
# Copy every training image together with its label file into train/.
# An image is skipped (and reported) when the image or its label is missing.
print("Training Data Generation")
for name in train_FileNames.tolist():
    image_path = src + '/' + name
    # splitext handles extensions of any length; slicing off the last three
    # characters only worked for extensions such as "jpg".
    label_path = labels + '/' + os.path.splitext(name)[0] + ".txt"

    if not (os.path.isfile(image_path) and os.path.isfile(label_path)):
        print("skipping ---", name)
        continue

    shutil.copy(image_path, root_dir + '/train/images')
    shutil.copy(label_path, root_dir + '/train/labels')

In [None]:
# Copy every validation image together with its label file into val/.
# An image is skipped (and reported) when the image or its label is missing.
print("Validation Data Generation")
for name in val_FileNames.tolist():
    image_path = src + '/' + name
    # splitext handles extensions of any length; slicing off the last three
    # characters only worked for extensions such as "jpg".
    label_path = labels + '/' + os.path.splitext(name)[0] + ".txt"

    if not (os.path.isfile(image_path) and os.path.isfile(label_path)):
        print("skipping ---", name)
        continue

    shutil.copy(image_path, root_dir + '/val/images')
    shutil.copy(label_path, root_dir + '/val/labels')

In [None]:
# Copy every test image together with its label file into test/.
# An image is skipped (and reported) when the image or its label is missing.
print("Test Data Generation")
for name in test_FileNames.tolist():
    image_path = src + '/' + name
    # splitext handles extensions of any length; slicing off the last three
    # characters only worked for extensions such as "jpg".
    label_path = labels + '/' + os.path.splitext(name)[0] + ".txt"

    if not (os.path.isfile(image_path) and os.path.isfile(label_path)):
        print("skipping ---", name)
        continue

    shutil.copy(image_path, root_dir + '/test/images')
    shutil.copy(label_path, root_dir + '/test/labels')

In [None]:
# Console output kept for reference as a bare string literal, so this cell
# executes nothing. The lines follow the format printed by the split report
# and the three copy cells ("skipping ---" marks an image that was not copied
# because the image or its label file was missing).
# NOTE(review): 2419 of 2688 images is about 90% training, not the 80% the
# ratio comment announces - confirm which split was intended.
""" 
Log: 

Total images:  2688
Training:  2419
Validation:  134
Testing:  135
Total Lables:  2638
Training Data Generation
skipping --- G0064865.JPG
skipping --- potholes215_png.rf.11648a404fb9add73a0e8022d2b9559b.jpg
skipping --- G0094405.JPG
skipping --- G0041017.JPG
skipping --- G0145175.JPG
skipping --- G0011880.JPG
skipping --- G0064638.JPG
skipping --- potholes23_png.rf.418ea12bcc6904b3191873bf121c32ef.jpg
skipping --- G0090207.JPG
skipping --- G0066574.JPG
skipping --- G0028159.JPG
skipping --- G0052788.JPG
skipping --- G0134510.JPG
skipping --- G0051947.JPG
skipping --- G0066950.JPG
skipping --- G0267325.JPG
skipping --- potholes341_png.rf.2cfce3be37b54c6dcaba768ec80ebea7.jpg
skipping --- G0090202.JPG
skipping --- G0010765.JPG
skipping --- G0077749.JPG
skipping --- G0028261.JPG
skipping --- G0052767.JPG
skipping --- G0031377.JPG
skipping --- G0052998.JPG
skipping --- G0065278.JPG
skipping --- G0064999.JPG
skipping --- G0029081.JPG
skipping --- G0043320.JPG
skipping --- G0088599.JPG
skipping --- G0090433.JPG
skipping --- G0066263.JPG
skipping --- G0024296.JPG
skipping --- G0064442.JPG
skipping --- G0065283.JPG
skipping --- G0064147.JPG
skipping --- potholes324_png.rf.4ef8eaa79b5ef11c295d7784aecfa173.jpg
skipping --- G0040979.JPG
skipping --- G0066592.JPG
skipping --- potholes487_png.rf.a11b763bb961f503ed530abc9896fd72(1).jpg
skipping --- G0063501.JPG
skipping --- potholes117_png.rf.00646a96915d539964beb023b376d75e.jpg
skipping --- potholes195_png.rf.351094a628a41b2ceeb3c98c62d60079.jpg
skipping --- G0043317.JPG
skipping --- G0053102.JPG
skipping --- potholes168_png.rf.36c4527e94ed35643354f3a73c675619.jpg
skipping --- G0043164.JPG
skipping --- G0030659.JPG
skipping --- potholes240_png.rf.313b6bf417d763ac30c78007edf7ea71.jpg
Validation Data Generation
skipping --- G0031365.JPG
Test Data Generation

"""

Apart from this, the dataset from https://public.roboflow.ai/object-detection/pothole was also added manually. The final dataset consists of 2836 training images, 266 validation images and 201 test images.

**Ready to use data link: https://drive.google.com/file/d/1-lScdLoaW_yzuvQ9nph7raL76wuHz5v_/view?usp=sharing**