# Rozdelenie dát do trénovacej, validačnej a testovacej množiny

* Množiny musia byť navzájom disjunktné
* ak má niektorá snímka kópie, musia byť v rovnakej množine ako pôvodný obrázok
* **TP**: pôvodné obrázky
* **hf**: horizontálne prevrátené obrázky
* **vf**: vertikálne prevrátené obrázky
* **syn**: syntetické obrázky, ktoré vznikli umiestnením TLE javu na iný obrázok
* **rot**: rotované obrázky
* **TN**: obrázky bez výskytu TLE

### Výsledné rozdelenie dát

|                    |  **TP**  | **hf** | **vf** | **syn**| **rot** | **TN** | **Num. of images** | **%** |
|--------------------|----------|--------|--------|--------|---------|--------|--------------------|-------|
| **Training Set**   |    399   |  399   |  399   |   374  |   399   |  100   |        2070        | 76.2  |
| **Validation Set** |     47   |   47   |   47   |   45   |    47   |  0     |        233         | 8.6  |
| **Testing Set**    |    14    |    0   |    0   |    0   |    0    |  400   |        414         | 15.2  |
| **SUMMARY**        |  **460** |**446** |**446** |**419** | **446** |**500** |      **2717**      |**100**|

### Načítanie knižníc

In [49]:
import os
import random
import shutil

### Načítanie názvov obrázkov a .txt súborov do premenných

In [None]:
# Root folder with one sub-folder per image category. Every category folder
# is read from its "images/" sub-folder and, except for TN, its "txt/" one.
directory = "/home/jovyan/data/lightning/LiviaMurankova/DATA/new/"
lists_folders = ["New_TP", "New_Horizontal_Flip", "New_Vertical_Flip", "New_Synthetic_Data", "New_Random_Rotation", "TN"]
types = ["images", "txt"]

# Key prefix used in image_lists / txt_lists for each entry of lists_folders
# (same order).
list_prefixes = ["TP", "hf", "vf", "syn", "rot", "TN"]


def _list_files(folder, suffix):
    """Return the sorted names of the files in *folder* that end with *suffix*.

    Sorting makes the result independent of the arbitrary order in which
    os.listdir returns directory entries.
    """
    return sorted(name for name in os.listdir(folder) if name.endswith(suffix))


# image_lists maps "<prefix>_images" -> list of .jpg file names,
# txt_lists maps "<prefix>_txt" -> list of .txt file names (no entry for TN).
image_lists = {}
txt_lists = {}
for prefix, list_name in zip(list_prefixes, lists_folders):
    images_dir = f"{directory}{list_name}/images"
    print(images_dir)
    image_lists[f"{prefix}_images"] = _list_files(images_dir, ".jpg")

    # The TN folder is only read for images, never for .txt files.
    if list_name != "TN":
        txt_dir = f"{directory}{list_name}/txt"
        print(txt_dir)
        txt_lists[f"{prefix}_txt"] = _list_files(txt_dir, ".txt")

### Počty súborov v jednotlivých priečinkoch

In [None]:
# Print "<list name> : <number of files>" for every image list and then for
# every txt list.
for file_group in (image_lists, txt_lists):
    for list_key, file_names in file_group.items():
        print(f"{list_key} : {len(file_names)}")

### Rozdelenie TP na originály a kópie

In [None]:
# Split the TP images into originals and copies: a file name that contains
# the substring "copy" is treated as a copy, every other name as an original.
all_tp_images = image_lists["TP_images"]
TP_copies = [name for name in all_tp_images if "copy" in name]
TP_original = [name for name in all_tp_images if "copy" not in name]

print(len(TP_copies))
print(len(TP_original))

# Bare expression kept as the last statement of the cell: five randomly
# chosen TP file names.
random.sample(all_tp_images, 5)

### Priradenie kópií k vybraným tp

In [None]:
# Assign copies to the selected TP images.
valid_images = random.sample(TP_original, 52)

# Add copies where needed. The list grows while it is being walked, so the
# copies appended here are themselves visited later; the index-based loop
# makes that explicit.
position = 0
while position < len(valid_images):
    # File name without its last four characters (the ".jpg" extension).
    stem = valid_images[position][:-4]
    for candidate in TP_copies:
        if candidate in valid_images:
            continue
        if stem in candidate:
            valid_images.append(candidate)
    position += 1

print(valid_images)
print(len(valid_images))

### Validačná množina

In [None]:
# validation set
#
# Picks 42 random original TP images, adds all of their copies, and copies
# these images, their labels and every matching augmented image/label into
# the YOLO "val" folders. The image paths are also written, one per line,
# to valid_dataset10.txt.

directory = "/home/jovyan/data/lightning/LiviaMurankova/DATA/new/"
dest_directory = "/home/jovyan/data/lightning/LiviaMurankova/YOLOv8/DATA_SPLIT_FOR_YOLO10/"

# Source folder of every augmented variant, keyed by the prefix used in the
# keys of image_lists ("<prefix>_images") and txt_lists ("<prefix>_txt").
augmented_folders = {
    "hf": "New_Horizontal_Flip",
    "vf": "New_Vertical_Flip",
    "syn": "New_Synthetic_Data",
    "rot": "New_Random_Rotation",
}

val_images_dir = dest_directory + "val/images/"
val_labels_dir = dest_directory + "val/labels/"
# shutil.copy does not create missing destination folders.
os.makedirs(val_images_dir, exist_ok=True)
os.makedirs(val_labels_dir, exist_ok=True)

# select 42 random original TP images
valid_originals = random.sample(TP_original, 42)

# Add the copies of every selected original so that an image and its copies
# end up in the same set. Iterating over the fixed list of originals avoids
# appending to a list while looping over it.
valid_images = list(valid_originals)
for item in valid_originals:
    for copy in TP_copies:
        if (item[:-4] in copy) and (copy not in valid_images):
            valid_images.append(copy)

# Create the file with the image names and copy the images to the YOLO valid
# dataset. Mode 'w' already truncates an existing file, and the with-block
# closes the file even when one of the copies fails.
with open('/home/jovyan/data/lightning/LiviaMurankova/YOLOv8/valid_dataset10.txt', 'w') as file:
    for item in valid_images:
        label_name = item.split(".")[0] + ".txt"
        file.write(val_images_dir + item + "\n")
        shutil.copy(directory + "New_TP/images/" + item, val_images_dir + item)
        shutil.copy(directory + "New_TP/txt/" + label_name, val_labels_dir + label_name)

        # Drop the first three and the last three characters of the file name;
        # the remainder is used as a substring key to find the augmented files.
        # NOTE(review): presumably this strips a "TP_" prefix and the "jpg"
        # of the extension (keeping the dot) - confirm against the file names.
        event_name = item[3:-3]
        for prefix, folder in augmented_folders.items():
            # Augmented images are listed in the dataset file and copied.
            for image in image_lists[prefix + "_images"]:
                if event_name in image:
                    file.write(val_images_dir + image + "\n")
                    shutil.copy(directory + folder + "/images/" + image, val_images_dir + image)
            # Augmented labels are only copied.
            for txt in txt_lists[prefix + "_txt"]:
                if event_name in txt:
                    shutil.copy(directory + folder + "/txt/" + txt, val_labels_dir + txt)

# Remove the chosen images from the TP_images list. The result stays a list
# because random.sample (used on it later) requires a sequence, not a set.
chosen_for_validation = set(valid_images)
image_lists["TP_images"] = [name for name in image_lists["TP_images"] if name not in chosen_for_validation]
print("num. of TP: " + str(len(image_lists["TP_images"])))

### Testovacia množina

In [None]:
# test set
# 14 positives to 400 negatives
#
# Copies the selected images into the YOLO "test/images" folder and writes
# their paths, one per line, to test_dataset10.txt. No label files are copied
# for the test set.

test_images_dir = dest_directory + "test/images/"
# shutil.copy does not create missing destination folders.
os.makedirs(test_images_dir, exist_ok=True)

# Select 14 random TP and 400 random TN images. sorted() always hands
# random.sample a sequence: the pools may have been replaced by sets in other
# cells, and sampling from a set raises TypeError on Python 3.11+.
# NOTE(review): the TP pool still contains copies, so a copy can be drawn
# here while its original stays outside the test set - confirm this is
# intended (the validation cell samples from TP_original only).
tp_select = random.sample(sorted(image_lists["TP_images"]), 14)
tn_select = random.sample(sorted(image_lists["TN_images"]), 400)
test_images = tp_select + tn_select

# Add the copies of the selected TP images, if there are any. Only names
# starting with "TP" are considered, and the fixed tp_select list is walked
# so that test_images is not modified while being iterated.
for item in tp_select:
    if item[0:2] != "TP":
        continue
    for copy in TP_copies:
        if (item[:-4] in copy) and (copy not in test_images):
            test_images.append(copy)

# Create the file with the image names and copy the images to the YOLO test
# dataset. The with-block closes the file even when a copy fails.
with open('/home/jovyan/data/lightning/LiviaMurankova/YOLOv8/test_dataset10.txt', 'w') as file:
    for item in test_images:
        file.write(test_images_dir + item + "\n")
        # Names starting with "TP" come from the TP folder, all others from TN.
        source_folder = "New_TP/images/" if item[0:2] == "TP" else "TN/images/"
        shutil.copy(directory + source_folder + item, test_images_dir + item)

# Remove everything that went into the test set (including the added copies)
# from the remaining pools. The pools stay lists so that random.sample can
# be used on them again.
used_in_test = set(test_images)
image_lists["TP_images"] = [name for name in image_lists["TP_images"] if name not in used_in_test]
image_lists["TN_images"] = [name for name in image_lists["TN_images"] if name not in used_in_test]
print("num. of TP: " + str(len(image_lists["TP_images"])))
print("num. of TN: " + str(len(image_lists["TN_images"])))

### Trénovacia množina

In [None]:
# train set
#
# Every TP image that is in neither the test nor the validation set goes to
# the YOLO "train" folders together with its label and all matching augmented
# images/labels; 100 random TN images are added. The image paths are written,
# one per line, to train_dataset10.txt.

directory = "/home/jovyan/data/lightning/LiviaMurankova/DATA/new/"
dest_directory = "/home/jovyan/data/lightning/LiviaMurankova/YOLOv8/DATA_SPLIT_FOR_YOLO10/"

# Source folder of every augmented variant, keyed by the prefix used in the
# keys of image_lists ("<prefix>_images") and txt_lists ("<prefix>_txt").
augmented_folders = {
    "hf": "New_Horizontal_Flip",
    "vf": "New_Vertical_Flip",
    "syn": "New_Synthetic_Data",
    "rot": "New_Random_Rotation",
}

train_images_dir = dest_directory + "train/images/"
train_labels_dir = dest_directory + "train/labels/"
# shutil.copy does not create missing destination folders.
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(train_labels_dir, exist_ok=True)

# Take all remaining TP images; the set makes the membership test O(1).
already_assigned = set(test_images) | set(valid_images)
train_images = [item for item in image_lists["TP_images"] if item not in already_assigned]

# Select 100 random TN images. sorted() always hands random.sample a
# sequence: the pool may be a set here, and sampling from a set raises
# TypeError on Python 3.11+.
tn_train_images = random.sample(sorted(image_lists["TN_images"]), 100)

# Create the file with the image names and copy the images to the YOLO train
# dataset. The with-block closes the file even when a copy fails.
with open('/home/jovyan/data/lightning/LiviaMurankova/YOLOv8/train_dataset10.txt', 'w') as file:
    for item in train_images:
        label_name = item.split(".")[0] + ".txt"
        file.write(train_images_dir + item + "\n")
        shutil.copy(directory + "New_TP/images/" + item, train_images_dir + item)
        shutil.copy(directory + "New_TP/txt/" + label_name, train_labels_dir + label_name)

        # Drop the first three and the last three characters of the file name;
        # the remainder is used as a substring key to find the augmented files.
        # NOTE(review): presumably this strips a "TP_" prefix and the "jpg"
        # of the extension (keeping the dot) - confirm against the file names.
        event_name = item[3:-3]
        for prefix, folder in augmented_folders.items():
            # Augmented images are listed in the dataset file and copied.
            for image in image_lists[prefix + "_images"]:
                if event_name in image:
                    file.write(train_images_dir + image + "\n")
                    shutil.copy(directory + folder + "/images/" + image, train_images_dir + image)
            # Augmented labels are only copied.
            for txt in txt_lists[prefix + "_txt"]:
                if event_name in txt:
                    shutil.copy(directory + folder + "/txt/" + txt, train_labels_dir + txt)

    # TN images have no label files; only the images are listed and copied.
    for item in tn_train_images:
        file.write(train_images_dir + item + "\n")
        shutil.copy(directory + "TN/images/" + item, train_images_dir + item)

# Remove the chosen images from the remaining pools (kept as lists).
used_tp = set(train_images)
image_lists["TP_images"] = [name for name in image_lists["TP_images"] if name not in used_tp]
print("num. of TP: " + str(len(image_lists["TP_images"])))

used_tn = set(tn_train_images)
image_lists["TN_images"] = [name for name in image_lists["TN_images"] if name not in used_tn]
print("num. of TN: " + str(len(image_lists["TN_images"])))