In [1]:
import torch
import os
import numpy as np
import shutil

# Root folder of the dataset; the cells below expect one sub-folder per identity in here.
# NOTE(review): hard-coded absolute Windows path -- adjust per machine before running.
dataset_dir = os.path.join("D:\\Data\\University\\MOTSynth","dataset_multiclip")

# There are too many identities, so only the (training_identities + testing_identities)
# folders with the most pictures are kept; every other identity folder is deleted from disk.
training_identities = 1500      # identities that go to the train split
testing_identities = 750        # identities kept on top of the training ones, for gallery/queries

# Upper bounds on the pictures moved per test identity; pictures beyond the caps are discarded.
max_number_queries_per_id = 4
max_number_gallery_per_id = 20

gallery_ratio = 0.80        # fraction (0..1) of EACH test identity's pictures sent to the gallery (before the caps above); the rest become queries.

def get_folders_from_folder(path):
    """Return the names of the sub-directories found directly inside ``path``.

    Entries come from ``os.listdir`` (so their order is whatever the OS
    reports) and are kept only when ``os.path.isdir`` is true for them.
    The result is a NumPy array of bare names, not full paths.
    """
    folder_names = []
    for name in os.listdir(path):
        if os.path.isdir(os.path.join(path, name)):
            folder_names.append(name)
    return np.asarray(folder_names)

def get_files_from_folder(path):
    """Return the names of all entries directly inside ``path`` as a NumPy array.

    Despite the name, nothing is filtered: sub-directories are returned
    alongside regular files, exactly as ``os.listdir`` reports them.
    """
    return np.asarray(os.listdir(path))

In [2]:
# Keep only the identities with the most pictures, delete the others, then
# give every kept folder a random numeric prefix: "<idx>_<original name>".
#
# NOTE: this cell is destructive and not idempotent. Identities that are not
# kept are removed from disk, and running the cell a second time would prefix
# the already-renamed folders again.
dirs = get_folders_from_folder(dataset_dir)

# Size the string field to the longest folder name so no path is silently
# truncated (a fixed 'U50' cuts longer names, which would make the
# rmtree/move calls below point at non-existent paths).
path_width = max((len(p) for p in dirs), default=1)
dtype = [('path', f'U{path_width}'), ('len', int)]
values = [(p, len(os.listdir(os.path.join(dataset_dir, p)))) for p in dirs]

a = np.array(values, dtype=dtype)       # structured array: (folder name, number of entries)
a = np.flip(np.sort(a, order='len'))    # descending by number of entries

total_number = training_identities + testing_identities
to_keep = a[:total_number]
to_delete = a[total_number:]

# Make a shortfall visible before anything is deleted or renamed.
if len(to_keep) < total_number:
    print(f"WARNING: only {len(to_keep)} identities available, "
          f"but {total_number} were requested (train + test).")

for i in to_delete:
    shutil.rmtree(os.path.join(dataset_dir, i['path']))

print(len(to_keep))
dirs = np.array([el['path'] for el in to_keep])

# Seeded generator so the identity -> numeric prefix assignment is reproducible.
identity_shuffle_seed = 0
np.random.default_rng(identity_shuffle_seed).shuffle(dirs)

# Renaming: the position in the shuffled order becomes the numeric prefix.
for idx, e in enumerate(dirs):
    newname = f"{idx}_{e}"
    shutil.move(os.path.join(dataset_dir, e), os.path.join(dataset_dir, newname))


636


In [3]:
# Order the identity folders by their numeric prefix ("<id>_<name>"), create
# the train / queries / gallery folders and move the first
# `training_identities` identities into train. The rest stay in place as the
# test set.
#
# The folder listing is taken BEFORE the split folders are created, so they
# are not mistaken for identities. os.mkdir raises if a split folder already
# exists, which also stops an accidental second run of this cell.
folder_names = get_folders_from_folder(dataset_dir)

# Size the string field to the longest folder name so no path is silently
# truncated (a fixed 'U50' cuts longer names and breaks the moves below).
path_width = max((len(p) for p in folder_names), default=1)
dtype = [('path', f'U{path_width}'), ('id', int)]
values = [(p, int(p.split('_')[0])) for p in folder_names]

a = np.array(values, dtype=dtype)       # structured array: (folder name, numeric id)
a = np.sort(a, order='id')
dirs = [el['path'] for el in a]
n_folders = len(dirs)

# With too few identities np.split would silently return an empty test set.
if n_folders <= training_identities:
    print(f"WARNING: {n_folders} identity folders found but training_identities="
          f"{training_identities}; the test split (gallery/queries) will be empty.")

train_dir = os.path.join(dataset_dir, "train")
os.mkdir(train_dir)
queries_dir = os.path.join(dataset_dir, "queries")
os.mkdir(queries_dir)
gallery_dir = os.path.join(dataset_dir, "gallery")
os.mkdir(gallery_dir)

train_set, test_set = np.split(dirs, [training_identities])

for i in train_set:
    shutil.move(os.path.join(dataset_dir, i), os.path.join(train_dir, i))


In [4]:
# For every test identity: shuffle its pictures, send roughly `gallery_ratio`
# of them to the gallery and the rest to the queries (each side capped), then
# delete the source folder together with any pictures beyond the caps.

# Seeded generator so the gallery/query assignment is reproducible.
image_shuffle_seed = 0
rng = np.random.default_rng(image_shuffle_seed)

for i in test_set:
    src_dir = os.path.join(dataset_dir, i)
    query_dst = os.path.join(queries_dir, i)
    os.mkdir(query_dst)
    gallery_dst = os.path.join(gallery_dir, i)
    os.mkdir(gallery_dst)

    elements = get_files_from_folder(src_dir)
    rng.shuffle(elements)
    n_elements = len(elements)

    # Keep at least one picture for the queries; the lower clamp avoids a
    # negative split index when the folder is empty.
    split_at = round(gallery_ratio * n_elements)
    split_at = max(0, min(split_at, n_elements - 1))
    gallery, queries = np.split(elements, [split_at])

    # Slicing applies the caps directly; a cap of 0 now moves nothing.
    for j in gallery[:max(max_number_gallery_per_id, 0)]:
        shutil.move(os.path.join(src_dir, j), os.path.join(gallery_dst, j))

    for j in queries[:max(max_number_queries_per_id, 0)]:
        shutil.move(os.path.join(src_dir, j), os.path.join(query_dst, j))

    # Whatever is still in src_dir exceeded the caps and is discarded.
    shutil.rmtree(src_dir)