<a href="https://colab.research.google.com/github/mamoan/NOVA_DL_home_exercise/blob/main/3_train_val_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 3. Split the data into training (70%) and validation (30%)
## OBJECTIVE: randomly split the annotated data into
- *training* set (70% of the tiles): the data the model learns from, that is, the data used to fit the parameters of the machine learning model.
- *validation* set (30% of the tiles): the data used to provide an unbiased evaluation of a model fitted on the training set while tuning the model's hyperparameters.

We also need a third dataset, the *test* set, for a fully independent evaluation of the model's performance on unseen data. In this home exercise the test data have already been set aside.

**OUTPUT:**
- train and validation data organized in the following folders:

```
├── train
│   ├── images
│   └── labels
├── val
│   ├── images
│   └── labels
```


In [None]:
# Folder ID of the annotator whose tiles make up the smaller dataset
# (change this to your own folder ID)
annotator_ID = 8

# Source folders on the mounted Google Drive: one annotator's tiles and the full dataset
path_to_tiles_small = f"/content/drive/MyDrive/NOVA_course_home_exercise/data/annotated_data/train/{annotator_ID}"
path_to_tiles_full = "/content/drive/MyDrive/NOVA_course_home_exercise/data/annotated_data/train/full_data"

# Fraction of the tiles used for training; validation takes the remainder
split_train = 0.7
split_val = 1 - split_train

### 3.1 Load libraries

In [None]:
import os
import shutil
import random

# Mount Google Drive at /content/drive so that the dataset folders configured above
# (all located under /content/drive/MyDrive) can be read and written.
# Requires the `google.colab` package, i.e. a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 3.2 Create train and validation directories and subdivide each into "images" and "labels" sub-directories

In [None]:
# Full data: folder layout with a "train" and a "val" split, each holding
# an "images" and a "labels" sub-directory
train_dir_full = os.path.join(path_to_tiles_full, "train")
val_dir_full = os.path.join(path_to_tiles_full, "val")
val_img_dir_full = os.path.join(path_to_tiles_full, "val", "images")
train_img_dir_full = os.path.join(path_to_tiles_full, "train", "images")
val_ann_dir_full = os.path.join(path_to_tiles_full, "val", "labels")
train_ann_dir_full = os.path.join(path_to_tiles_full, "train", "labels")

# Create the folders; exist_ok=True means an already existing folder is not an error,
# so this cell can be re-run
for new_dir in (
    train_dir_full,
    val_dir_full,
    val_img_dir_full,
    train_img_dir_full,
    val_ann_dir_full,
    train_ann_dir_full,
):
    os.makedirs(new_dir, exist_ok=True)

In [None]:
# Smaller dataset: folder layout with a "train" and a "val" split, each holding
# an "images" and a "labels" sub-directory
train_dir_small = os.path.join(path_to_tiles_small, "train")
val_dir_small = os.path.join(path_to_tiles_small, "val")
val_img_dir_small = os.path.join(path_to_tiles_small, "val", "images")
train_img_dir_small = os.path.join(path_to_tiles_small, "train", "images")
val_ann_dir_small = os.path.join(path_to_tiles_small, "val", "labels")
train_ann_dir_small = os.path.join(path_to_tiles_small, "train", "labels")

# Create the folders; exist_ok=True means an already existing folder is not an error,
# so this cell can be re-run
for new_dir in (
    train_dir_small,
    val_dir_small,
    val_img_dir_small,
    train_img_dir_small,
    val_ann_dir_small,
    train_ann_dir_small,
):
    os.makedirs(new_dir, exist_ok=True)

### 3.3 Randomly sample tiles

#### For smaller dataset

In [None]:
# Names of the annotation (.txt) and image (.tif) files that sit directly in the
# small-dataset folder (os.listdir returns bare file names, not full paths)
txt_files = [
    name for name in os.listdir(path_to_tiles_small)
    if name.endswith(".txt")
]
img_files = [
    name for name in os.listdir(path_to_tiles_small)
    if name.endswith(".tif")
]

In [None]:
# Keep only the annotation files whose same-named .tif image exists in the
# small-dataset folder; annotations without an image are dropped from the split
txt_files_with_tif = []
for txt_file in txt_files:
    # file name without its extension
    stem = os.path.splitext(os.path.basename(txt_file))[0]
    # expected location of the matching image
    img_file = path_to_tiles_small + "/" + os.path.join(os.path.dirname(txt_file), stem + '.tif')
    if not os.path.exists(img_file):
        continue
    txt_files_with_tif.append(txt_file)



In [None]:
txt_files = txt_files_with_tif

# Shuffle the annotation files in place so that the split below is random
# (no seed is set, so every run produces a different split)
random.shuffle(txt_files)

# Number of files per split. The training fraction is taken from `split_train`
# (set in the configuration cell) rather than a second hard-coded 0.7, so that
# changing the configuration actually changes the split.
train_size = int(split_train * len(txt_files))
val_size = len(txt_files) - train_size

In [None]:
# Sanity check: number of annotation files to split, then how many of them go to validation
print(len(txt_files), val_size, sep="\n")

0
0


Move the text annotation files and respective images to the train and validation directories

In [None]:
# Distribute the shuffled annotation files: the first `train_size` entries go to the
# training folder, all remaining ones to the validation folder. Each .txt file is moved
# into "labels" and its same-named .tif image into "images".
for idx, txt_file in enumerate(txt_files):
    dest_dir = train_dir_small if idx < train_size else val_dir_small
    if not os.path.exists(path_to_tiles_small+"/"+txt_file):
        continue  # annotation file is not in the source folder
    tif_name = os.path.splitext(txt_file)[0]+".tif"
    src_file = os.path.join(path_to_tiles_small, txt_file)
    src_img = os.path.join(path_to_tiles_small, tif_name)
    if not os.path.exists(src_img):
        continue  # no matching image: leave the annotation where it is
    shutil.move(src_file, os.path.join(dest_dir, "labels", txt_file))
    shutil.move(src_img, os.path.join(dest_dir, "images", tif_name))



#### For the full dataset

In [None]:
# Names of the annotation (.txt) and image (.tif) files that sit directly in the
# full-dataset folder (os.listdir returns bare file names, not full paths)
txt_files = [
    name for name in os.listdir(path_to_tiles_full)
    if name.endswith(".txt")
]
img_files = [
    name for name in os.listdir(path_to_tiles_full)
    if name.endswith(".tif")
]

In [None]:
# Sanity check: number of annotation files, then number of image tiles, in the full dataset
print(len(txt_files), len(img_files), sep="\n")

383
3039


In [None]:
# Keep only the annotation files whose same-named .tif image exists in the
# full-dataset folder; annotations without an image are dropped from the split
txt_files_with_tif = []
for txt_file in txt_files:
    # file name without its extension
    stem = os.path.splitext(os.path.basename(txt_file))[0]
    # expected location of the matching image
    img_file = path_to_tiles_full + "/" + os.path.join(os.path.dirname(txt_file), stem + '.tif')
    if not os.path.exists(img_file):
        continue
    txt_files_with_tif.append(txt_file)

In [None]:
txt_files = txt_files_with_tif

# Shuffle the annotation files in place so that the split below is random
# (no seed is set, so every run produces a different split)
random.shuffle(txt_files)

# Number of files per split. The training fraction is taken from `split_train`
# (set in the configuration cell) rather than a second hard-coded 0.7, so that
# changing the configuration actually changes the split.
train_size = int(split_train * len(txt_files))
val_size = len(txt_files) - train_size

In [None]:
# Distribute the shuffled annotation files of the full dataset: the first `train_size`
# entries go to the training folder, all remaining ones to the validation folder.
# Each .txt file is moved into "labels" and its same-named .tif image into "images".
n_moved = 0
for i, txt_file in enumerate(txt_files):
    # train_dir_full / val_dir_full are the folders created in section 3.2
    # (the previously used names train_dir / val_dir are never defined in this notebook)
    dest_dir = train_dir_full if i < train_size else val_dir_full
    tif_name = os.path.splitext(txt_file)[0] + ".tif"
    src_file = os.path.join(path_to_tiles_full, txt_file)
    src_img = os.path.join(path_to_tiles_full, tif_name)
    # only move complete annotation/image pairs so labels and images stay in sync
    if os.path.exists(src_file) and os.path.exists(src_img):
        shutil.move(src_file, os.path.join(dest_dir, "labels", txt_file))
        shutil.move(src_img, os.path.join(dest_dir, "images", tif_name))
        n_moved += 1

# one summary line instead of one (cwd-relative, hence misleading) existence print per file
print("moved " + str(n_moved) + " of " + str(len(txt_files)) + " annotated tiles")