# Pascal VOC -> YOLO format

In [None]:
import glob
import os
import pickle
import xml.etree.ElementTree as ET
from os import listdir, getcwd
from os.path import join


def convert(size, box):
    """Normalise a pixel-space bounding box to centre/extent fractions.

    Args:
        size: Image dimensions, indexed as ``(width, height)``.
        box: Box edges, indexed as ``(xmin, xmax, ymin, ymax)`` -- both x
            edges come before the y edges.

    Returns:
        A tuple ``(x, y, w, h)``: the box centre and its width/height, with
        x and w multiplied by ``1/width`` and y and h by ``1/height``.
    """
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]
    xmin, xmax, ymin, ymax = box[0], box[1], box[2], box[3]

    # The centre is shifted by one pixel before scaling.
    # NOTE(review): presumably compensates for 1-based VOC pixel
    # coordinates -- confirm against the dataset's annotation convention.
    centre_x = (xmin + xmax) / 2.0 - 1
    centre_y = (ymin + ymax) / 2.0 - 1

    return (
        centre_x * scale_x,
        centre_y * scale_y,
        (xmax - xmin) * scale_x,
        (ymax - ymin) * scale_y,
    )


def convert_annotation(xml_file, output_file_path, classes):
    """Convert one Pascal VOC XML annotation into a YOLO label file.

    The image size is read from the ``size/width`` and ``size/height``
    elements. Every ``object`` element whose ``name`` is listed in
    ``classes`` produces one output line of the form
    ``"<class index> <x> <y> <w> <h>"``, where the four numbers come from
    ``convert``. Objects with other names are skipped; the ``difficult``
    flag is not consulted.

    Args:
        xml_file: Path of the VOC XML annotation to read.
        output_file_path: Path of the label file to write (overwritten).
        classes: Sequence of class names; a name's position in the sequence
            is the class index written to the label file.

    Raises:
        xml.etree.ElementTree.ParseError: If the XML is malformed.
        AttributeError: If an expected element (``size``, ``width``,
            ``height``, ``name``, ``bndbox`` or a box edge) is missing.
    """
    # ET.parse opens and closes the file itself when handed a path, so no
    # file handle is leaked.
    root = ET.parse(xml_file).getroot()
    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)

    # Build every line before touching the output path: if parsing or
    # conversion fails, no empty/partial label file is left behind to be
    # mistaken for a finished conversion on the next run.
    lines = []
    for obj in root.iter('object'):
        cls = obj.find('name').text
        if cls not in classes:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        # convert() expects the edges ordered (xmin, xmax, ymin, ymax).
        b = tuple(
            float(xmlbox.find(tag).text)
            for tag in ('xmin', 'xmax', 'ymin', 'ymax')
        )
        bb = convert((w, h), b)
        lines.append(str(cls_id) + " " + " ".join(str(a) for a in bb) + '\n')

    # The file is written even when no object matched, producing an empty
    # label file, exactly as before.
    with open(output_file_path, 'w') as out_file:
        out_file.writelines(lines)


In [None]:
import os
def get_files_in_folder_as_array(path):
    """Return the extension-less names of all entries in a folder.

    Only the final extension is removed (``"a.v2.jpg"`` -> ``"a.v2"``), so
    callers can rebuild the original file name by appending the extension
    again. Names are returned in sorted order so that results, and any
    seeded split derived from them, do not depend on the file system's
    listing order.

    Args:
        path: Folder to list.

    Returns:
        A list of entry names without their last extension; an empty list
        when ``path`` is not an existing directory.
    """
    if not os.path.isdir(path):
        return []
    return [
        os.path.splitext(filename)[0]
        for filename in sorted(os.listdir(path))
    ]


In [None]:
from sklearn.model_selection import train_test_split
from pathlib import Path

# Root of the read-only source dataset; one sub-folder per subset.
dataset_location = "/cluster/projects/vc/courses/TDT17/2022/open/RDD2022/"
subsets = ["China_Drone", "China_MotorBike", "Czech", "India", "Japan", "United_States", "Norway"]
# Overrides the full list above, so only the Norway subset is processed.
subsets = ["Norway"]
# Class names to keep; the list position is the class index in the labels.
classes = ['D00', 'D10', 'D20', 'D40']


for subset in subsets:
    subset_root = dataset_location + subset

    train_images_location = subset_root + "/train/images"
    test_images_location = subset_root + "/test/images"
    labels_location = subset_root + "/train/annotations/xmls/"

    train_val_files = get_files_in_folder_as_array(train_images_location)
    test_files = get_files_in_folder_as_array(test_images_location)

    # train_test_split raises on an empty input; skip the subset instead of
    # aborting the whole run when its training images are missing.
    if not train_val_files:
        print("No training images found in " + train_images_location + ", skipping " + subset)
        continue

    train_files, val_files = train_test_split(train_val_files, test_size=0.2, random_state=42)

    ####### Make folders and symlink in correct folders ######
    new_path = "./data/RDD2022/" + subset

    # Each split carries its own source image folder: train and val images
    # both live under train/images, test images under test/images.
    splits = [
        ("/train/", train_files, train_images_location),
        ("/val/", val_files, train_images_location),
        ("/test/", test_files, test_images_location),
    ]

    for split_dir, files, images_src_dir in splits:
        images_dst_dir = new_path + "/images" + split_dir
        labels_dst_dir = new_path + "/labels" + split_dir

        # Create the image folder once per split, and only if it will
        # actually receive files.
        if files:
            Path(images_dst_dir).mkdir(parents=True, exist_ok=True)

        for f in files:
            img_path = images_dst_dir + f + ".jpg"
            img_src = images_src_dir + "/" + f + ".jpg"

            # Replace a dangling link (e.g. one left by an earlier run that
            # pointed at the wrong source folder) instead of keeping it.
            if os.path.islink(img_path) and not os.path.exists(img_path):
                os.remove(img_path)
            if not os.path.lexists(img_path):
                os.symlink(img_src, img_path)

            # Converting xml annotations. Annotations are only looked up
            # under train/annotations/xmls, so files without a matching XML
            # there get no label file.
            label_path = labels_dst_dir + f + ".txt"
            label_src = labels_location + f + ".xml"
            if os.path.lexists(label_src) and not os.path.lexists(label_path):
                Path(labels_dst_dir).mkdir(parents=True, exist_ok=True)
                convert_annotation(label_src, label_path, classes=classes)
