# PREPARING A LARGE-SCALE IMAGE DATASET WITH TENSORFLOW'S TFRECORD FILES


#### Data Structure
flowers\
    
    flower_photos\
    
        tulips\
            ....jpg
            ....jpg
            ....jpg
        sunflowers\
            ....jpg
        roses\
            ....jpg
        dandelion\
            ....jpg
        daisy\
            ....jpg

# WRITING A TFRECORD FILE

In [3]:
import random
import tensorflow as tf
from dataset_utils import _dataset_exists, _get_filenames_and_classes, write_label_file, _convert_dataset


# ============= CONFIGURATION =============
# Root directory that holds the image folders shown in the data structure above.
dataset_dir = "flowers/"
# Fraction of all images set aside for the validation split.
validation_size = 0.3
# Shard count handed to the dataset helpers for each split.
num_shards = 2
# Seed for the shuffle, so the train/validation split is repeatable.
random_seed = 0
# Name handed to the dataset helpers for the output TFRecord files.
tfrecord_filename = 'flower'

# ============= CHECKS ==============
# Check if there is a tfrecord_filename entered
if not tfrecord_filename:
    raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')

# Check if there is a dataset directory entered
if not dataset_dir:
    raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')
# ========== END OF CHECKS ============

# If the TFRecord files already exist in the directory, skip the conversion
# entirely. The conversion lives in the `else` branch so that the
# "exiting" message is actually true: previously the script printed it and
# then carried on re-creating the files anyway.
if _dataset_exists(dataset_dir=dataset_dir, _NUM_SHARDS=num_shards, output_filename=tfrecord_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
else:
    # Get the list of photo filenames and the list of class names
    # (per the helper's name, parsed from the subdirectories of dataset_dir).
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)

    print(len(photo_filenames))

    # Map each class name to an integer id (its position in class_names),
    # to be used as the label for predictions later.
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Number of validation examples: the requested fraction, rounded down.
    num_validation = int(validation_size * len(photo_filenames))

    # Shuffle with a fixed seed, then split into validation (the first
    # num_validation files) and training (the rest).
    random.seed(random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir=dataset_dir,
                     tfrecord_filename=tfrecord_filename,
                     _NUM_SHARDS=num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir=dataset_dir,
                     tfrecord_filename=tfrecord_filename,
                     _NUM_SHARDS=num_shards)

    # Finally, write the labels file (integer id -> class name, the inverse
    # of class_names_to_ids).
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, dataset_dir)

    print('\nFinished converting the %s dataset!' % (tfrecord_filename))

3670
>> Converting image 2569/2569 shard 1
>> Converting image 1101/1101 shard 1

Finished converting the flower dataset!
