In [1]:
import random 
import os
import shutil

In [2]:
# Merge the per-source datasets into one unified data directory.
#
# Every sample <name> from <source>_data is copied to all_data as
# <source>_<name>, so samples from different sources cannot collide.
# The <filename> tag inside each annotation XML is rewritten to match the
# renamed image.
data_dir = 'all_data'
filenames = data_dir + '/filename_list.txt'

# Create each sub-directory on its own: a pre-existing but incomplete
# data_dir would otherwise leave images/ or annotations/ missing.
os.makedirs(data_dir + '/images', exist_ok=True)
os.makedirs(data_dir + '/annotations', exist_ok=True)

filelist = []

source_dirs = ['bosch_mini', 'sim', 'rosbag']
# Alternative without the Bosch subset:
# source_dirs = ['sim', 'rosbag']
for source_dir in source_dirs:

    # Read the source filename_list.txt; blank lines (including a trailing
    # newline, or its absence) are ignored instead of being required.
    with open(source_dir + '_data/filename_list.txt', 'r') as list_file:
        source_filenames = [
            line.strip() for line in list_file.read().splitlines() if line.strip()
        ]

    # Copy image + annotation for every listed sample.
    for f in source_filenames:
        unified_name = source_dir + '_' + f

        # Copy the image. A sample whose image cannot be copied is skipped.
        src = source_dir + '_data/images/' + f + '.jpg'
        dst = data_dir + '/images/' + unified_name + '.jpg'
        try:
            shutil.copyfile(src, dst)
        except OSError as err:
            print("Error: {} -- copy failed: {}".format(src, err))
            continue

        # Copy the annotation and rewrite its filename reference.
        # NOTE: the image has already been copied at this point, so a failing
        # annotation leaves that image in data_dir without a list entry.
        src = source_dir + '_data/annotations/' + f + '.xml'
        dst = data_dir + '/annotations/' + unified_name + '.xml'
        repl_src = '<filename>' + f + '.jpg</filename>'
        repl_dst = '<filename>' + unified_name + '.jpg</filename>'
        try:
            with open(src, 'r') as xml_in:
                xml_content = xml_in.read()
            with open(dst, 'w') as xml_out:
                xml_out.write(xml_content.replace(repl_src, repl_dst))
        except (OSError, UnicodeError) as err:
            print("Error: {} -- copy failed: {}".format(src, err))
            continue

        # Only samples with both files in place make it into the unified list.
        filelist.append(unified_name)

# Dump the unified filenames, one per line (no trailing newline).
with open(filenames, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(filelist))

In [3]:
# Usage: put this Jupyter notebook in the same directory as the train+valid
# dataset, set val_percent and run.

# Directory holding the combined train + val data. The same split can be run
# on a single source by pointing this at 'rosbag_data', 'bosch_mini_data' or
# 'sim_data' instead.
data_dir = 'all_data'
filename = data_dir+'/filename_list.txt' # file names are in it
val_percent = 0.2 # fraction of the samples that goes to the validation set
random.seed(1789) # fixed seed so the shuffle and the later split are reproducible

# Read all the file names, closing the file afterwards. Blank lines and
# stray whitespace / '\r' are dropped so they are never counted as samples.
with open(filename, 'r') as list_file:
    filename_trainval = [
        line.strip() for line in list_file.read().splitlines() if line.strip()
    ]

random.shuffle(filename_trainval) # randomize in place
n_samples = len(filename_trainval)
print("number of samples: ", n_samples)

number of samples:  7518


In [4]:
# Draw the validation set: int(val_percent * n_samples) names sampled
# without replacement from the shuffled list.
filename_val = random.sample(filename_trainval, int(val_percent*n_samples))

# Everything not drawn for validation goes to training. Membership is tested
# against a set so the split is linear in the number of samples rather than
# scanning the validation list once per name; the resulting list is identical.
val_lookup = set(filename_val)
filename_train = [f for f in filename_trainval if f not in val_lookup]

print("number of samples in training set:", len(filename_train))
print("number of samples in validation set:", len(filename_val))

number of samples in training set: 6015
number of samples in validation set: 1503


In [5]:
# Create a folder for the training set and one for the validation set, and
# put the matching image and annotation files (plus a filename list) in each.

# Explicit mapping from set name to its file names, so a set name can never
# silently reuse the previous iteration's list.
filenames_by_set = {'train': filename_train, 'val': filename_val}

for set_ in ['train', 'val']:

    # Create <data_dir>_<set> with subdirectories `images` and `annotations`.
    set_dir = data_dir+'_'+set_
    annotation_dir = set_dir+'/annotations'
    image_dir = set_dir+'/images'
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(annotation_dir, exist_ok=True)

    filename_set = filenames_by_set[set_]

    # Write the text file listing the file names, one per line. The `with`
    # block closes the file, so no explicit close() is needed.
    with open(set_dir+'/filename_list.txt', 'w') as fl:
        fl.writelines('%s\n' % f for f in filename_set)

    for f in filename_set:
        # Copy the image file; a failure is reported and the loop carries on.
        image_src = data_dir+'/images/'+f+'.jpg'
        try:
            shutil.copyfile(image_src, image_dir+'/'+f+'.jpg')
        except OSError as err:
            print( "[IMAGE] error: {} not copied ({})".format(image_src, err))

        # Copy the annotation file. The message reports the annotations path
        # (the original printed the images path by mistake).
        annotation_src = data_dir+'/annotations/'+f+'.xml'
        try:
            shutil.copyfile(annotation_src, annotation_dir+'/'+f+'.xml')
        except OSError as err:
            print( "[ANNOTATION] error: {} not copied ({})".format(annotation_src, err))
            
