# Preparing dataset for Dinov2 (Key Point Detection)

Dinov2 requires the following format: train, val and test folders. These folders need to be further divided into images and labels as shown below:
```text
--training_data
    --train
        --images
        --labels
    --val
        --images
        --labels
    --test
        --images
        --labels
```
For each image in the images folder, there needs to be a .npy NumPy file with the same name in the corresponding labels folder.

Change the source image directory accordingly.

In [None]:
# Get all file names for detection

import os

print(os.getcwd())

# Root of the source dataset. The code in this notebook reads images from its
# 'images' sub-folder and labels from its 'labels' sub-folder.
image_directory = 'data/images_cropped_new'

# Collect the stem (file name without extension) of every .jpg image.
# The extension is split off with os.path.splitext instead of a fixed
# 4-character slice, and entries that are not .jpg files (hidden files,
# other formats) are skipped so they cannot produce mangled stems.
filenames = []
for filename in os.listdir(image_directory + '/images'):
    stem, ext = os.path.splitext(filename)
    if ext.lower() == '.jpg':
        filenames.append(stem)

In [10]:
# split images into train, val and test set

import random

def split_dataset(filenames):
    """Randomly split file names into train, validation and test lists.

    The names are shuffled, then cut into 80% train and 10% validation
    (both rounded down); the test set receives everything that remains.
    The size of each set is printed.

    Args:
        filenames: Iterable of file names to split. It is not modified.

    Returns:
        Tuple ``(train_filenames, val_filenames, test_filenames)`` of lists.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    shuffled = list(filenames)
    random.shuffle(shuffled)

    # Calculate the size of the train and validation sets
    num_files = len(shuffled)
    num_train = int(0.8 * num_files)  # 80% for training
    num_val = int(0.1 * num_files)    # 10% for validation
    val_end = num_train + num_val

    # Split the list into three sets; test takes the remainder so that no
    # file is lost to rounding.
    train_filenames = shuffled[:num_train]
    val_filenames = shuffled[num_train:val_end]
    test_filenames = shuffled[val_end:]

    # Print the sizes of each set
    print(f"Number of files in train set: {len(train_filenames)}")
    print(f"Number of files in validation set: {len(val_filenames)}")
    print(f"Number of files in test set: {len(test_filenames)}")

    return train_filenames, val_filenames, test_filenames

# Shuffle and partition the collected file stems. copy_split reads the three
# resulting lists as module-level globals, so this must run before it is called.
train_filenames, val_filenames, test_filenames = split_dataset(filenames)

Number of files in train set: 1649
Number of files in validation set: 206
Number of files in test set: 207


In [11]:
# Create folder structure
import os
# Output tree: data/<base>/<mode>/images and data/<base>/<mode>/labels.
dir_base = ['key_point_detection']
modes = ['train', 'val', 'test']
for base in dir_base:
    for mode in modes:
        for kind in ('images', 'labels'):
            path = f'data/{base}/{mode}/{kind}'
            # exist_ok lets the cell be re-run without failing on existing folders
            os.makedirs(path, exist_ok=True)

In [12]:
import shutil

# Copy one sample (its .jpg image and its .npy label) into the folder of the given split.
def copy_pair(src_dir, target_dir, file_name, mode):
    """Copy the image/label pair named ``file_name`` into a split folder.

    Args:
        src_dir: Source root containing 'images' and 'labels' sub-folders.
        target_dir: Destination root containing one folder per split.
        file_name: File name without extension, shared by image and label.
        mode: Name of the split sub-folder below ``target_dir``.
    """
    # The image is copied first, then the label.
    for subfolder, extension in (('images', ".jpg"), ('labels', ".npy")):
        leaf = file_name + extension
        source_path = '/'.join((src_dir, subfolder, leaf))
        destination_path = '/'.join((target_dir, mode, subfolder, leaf))
        shutil.copy2(source_path, destination_path)

# For each split, copy all of its images and labels into the matching split folder.
def copy_split(src_dir, target_dir):
    """Copy every image/label pair of the train, val and test splits.

    The split contents come from the module-level lists ``train_filenames``,
    ``val_filenames`` and ``test_filenames``.

    Args:
        src_dir: Source root passed through to ``copy_pair``.
        target_dir: Destination root passed through to ``copy_pair``.
    """
    splits = (
        ('train', train_filenames),
        ('val', val_filenames),
        ('test', test_filenames),
    )
    for mode, names in splits:
        for name in names:
            copy_pair(src_dir, target_dir, name, mode)

# Copy the images and labels of all three splits from the source dataset
# into the folder tree under data/<dir_base[0]>.
copy_split(image_directory, 'data/' + dir_base[0])