In [5]:
!unzip "/content/cell_images_32.zip" -d /content/cell_images_32 # extract the training-image archive into /content/cell_images_32

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  inflating: /content/cell_images_32/cell_image_ (5776).png  
  inflating: /content/cell_images_32/cell_image_ (4366).png  
  inflating: /content/cell_images_32/cell_image_ (2884).png  
 extracting: /content/cell_images_32/cell_image_ (3498).png  
  inflating: /content/cell_images_32/cell_image_ (4110).png  
  inflating: /content/cell_images_32/cell_image_ (948).png  
  inflating: /content/cell_images_32/cell_image_ (403).png  
  inflating: /content/cell_images_32/cell_image_ (4972).png  
  inflating: /content/cell_images_32/cell_image_ (4493).png  
  inflating: /content/cell_images_32/cell_image_ (2130).png  
  inflating: /content/cell_images_32/cell_image_ (3352).png  
  inflating: /content/cell_images_32/cell_image_ (2742).png  
  inflating: /content/cell_images_32/cell_image_ (1999).png  
  inflating: /content/cell_images_32/cell_image_ (3241).png  
  inflating: /content/cell_images_32/cell_image_ (1979).png  
  inflating: /content/

In [6]:
import argparse
from pathlib import Path
import random
import sys
import json

import numpy as np
import tensorflow as tf

# Default data paths; these are the fallbacks for the command-line options
# defined further down.
DEFAULT_TRAIN_INPUT_DIR = Path('/content/cell_images_32')  # where the training archive is unzipped
DEFAULT_TEST_INPUT_DIR = Path('/content/test')
DEFAULT_OUTPUT_DIR = Path('./tfrecords-output')  # relative to the current working directory
# Default number of TFRecords shard files per data set.
DEFAULT_NUM_SHARDS_TRAIN = 1
DEFAULT_NUM_SHARDS_TEST = 1

In [7]:
def _int64_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding an ``Int64List``.

    ``value`` is forwarded unchanged as the ``value=`` argument of
    ``tf.train.Int64List``.
    """
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a ``BytesList``.

    ``value`` is forwarded unchanged as the ``value=`` argument of
    ``tf.train.BytesList``; the converter in this notebook passes
    one-element lists of ``bytes``.
    """
    bytes_list = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a ``FloatList``.

    ``value`` is forwarded unchanged as the ``value=`` argument of
    ``tf.train.FloatList``.
    """
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

In [8]:
# Command-line style configuration, parsed from an empty argument list so that
# every option takes its default when run inside a notebook.
args = None

parser = argparse.ArgumentParser()

# Input / output locations.
parser.add_argument(
    '--train-input-dir',
    dest='train_input_dir',
    type=Path,
    default=DEFAULT_TRAIN_INPUT_DIR,
    help='Train input directory to convert image files.',
)
parser.add_argument(
    '--test-input-dir',
    dest='test_input_dir',
    type=Path,
    default=DEFAULT_TEST_INPUT_DIR,
    help='Test input directory to convert image files.',
)
parser.add_argument(
    '--output-dir',
    dest='output_dir',
    type=Path,
    default=DEFAULT_OUTPUT_DIR,
    help='Output directory to store TFRecords files.',
)

# Shard counts.
parser.add_argument(
    '--num-shards-train',
    dest='num_shards_train',
    type=int,
    default=DEFAULT_NUM_SHARDS_TRAIN,
    help='Number of shards to divide training set TFRecords into.',
)
parser.add_argument(
    '--num-shards-test',
    dest='num_shards_test',
    type=int,
    default=DEFAULT_NUM_SHARDS_TEST,
    help='Number of shards to divide test set TFRecords into.',
)

# An explicit empty list keeps argparse from reading the kernel's sys.argv.
args = parser.parse_args([])

In [13]:
class TFRecordsConverter:
    """Converts a directory of PNG images into TFRecords shard files.

    On construction the converter collects and shuffles the ``*.png`` files
    found in the training input directory and records their count in
    ``dataset.json`` inside the output directory. ``convert()`` then writes
    the images to one or more ``train-<n>.tfrecords`` files.

    Only the training set is processed: the test-set options are stored on
    the instance, but no test images are collected or written.
    """

    def __init__(self, config=None):
        """Collect the training files and prepare the output directory.

        Args:
            config: Optional object exposing ``train_input_dir``,
                ``test_input_dir``, ``output_dir``, ``num_shards_train`` and
                ``num_shards_test`` attributes (for example an
                ``argparse.Namespace``). When omitted, the module-level
                ``args`` is used.
        """
        if config is None:
            config = args

        self.train_input_path = config.train_input_dir
        self.test_input_path = config.test_input_dir
        self.output_path = config.output_dir
        self.num_shards_train = config.num_shards_train
        self.num_shards_test = config.num_shards_test
        # parents=True so a nested output directory can be created in one go.
        self.output_path.mkdir(parents=True, exist_ok=True)

        self.train_filenames, self.num_files_train = self.process_image(
            self.train_input_path)

        self.dump_json({'num_train_images': self.num_files_train})

        # Counter for total number of images processed.
        self.counter = 0

    def process_image(self, input_path):
        """Return the ``*.png`` files directly under ``input_path``, shuffled.

        Args:
            input_path: ``pathlib.Path`` of the directory to scan
                (non-recursive).

        Returns:
            A ``(filenames, num_files)`` tuple: the list of image paths in
            random order, and its length.
        """
        # Sort first so the shuffle starts from a deterministic order; the
        # final order then depends only on the state of the `random` module.
        filenames = sorted(input_path.glob('*.png'))
        random.shuffle(filenames)
        return filenames, len(filenames)

    def dump_json(self, json_dict):
        """Merge ``json_dict`` into ``dataset.json`` in the output directory.

        Keys already present in an existing file are overwritten by the
        values in ``json_dict``; other existing keys are kept.
        """
        json_file = self.output_path / 'dataset.json'

        if json_file.exists():
            with json_file.open('r') as f:
                merged = json.load(f)
            merged.update(json_dict)
        else:
            merged = json_dict

        with json_file.open('w') as f:
            json.dump(merged, f)

    def write_tfrecords_file(self, output_path, filenames, indices):
        """Write the selected images to a single TFRecords file.

        Each record is a ``tf.train.Example`` with two bytes features:
        ``filename`` (the file's base name, UTF-8 encoded) and
        ``image/encoded`` (the raw file contents).

        Args:
            output_path: Destination path of the TFRecords file.
            filenames: Sequence of ``pathlib.Path`` image files.
            indices: Iterable of positions in ``filenames`` to write.

        Raises:
            SystemExit: If an image file cannot be read.
        """
        # The context manager closes the writer even if a read fails midway.
        with tf.io.TFRecordWriter(str(output_path)) as writer:
            for i in indices:
                filename = filenames[i]
                try:
                    im_data = filename.read_bytes()
                except OSError as err:
                    # Name the offending file and exit with a failure status.
                    print(f'The image file cannot be read: {filename} ({err})')
                    sys.exit(1)

                example = tf.train.Example(features=tf.train.Features(feature={
                    'filename': _bytes_feature(
                        [bytes(filename.name, encoding="utf-8")]),
                    'image/encoded': _bytes_feature([im_data]),
                }))
                writer.write(example.SerializeToString())

                self.counter += 1
                if self.counter % 1000 == 0:
                    print('Processed {} images...'.format(self.counter))

    def convert(self):
        """Write the shuffled training images to TFRecords shard files.

        The first ``num_shards_train - 1`` shards each receive
        ``num_files_train // (num_shards_train - 1)`` images; whatever is left
        over goes into a final shard, which is skipped when empty.

        Test-set conversion is not implemented (no test files are collected),
        so ``num_shards_test`` has no effect here.
        """
        print('Processing training set TFRecords...')
        if self.num_shards_train > 1:
            files_per_shard = self.num_files_train // (self.num_shards_train - 1)
        else:
            files_per_shard = 0

        start = 0
        for shard_index in range(self.num_shards_train - 1):
            shard_path = self.output_path / f'train-{shard_index}.tfrecords'
            # Contiguous slice of the shuffled file list for this shard.
            file_indices = np.arange(start, start + files_per_shard, dtype=int)
            start += files_per_shard
            self.write_tfrecords_file(
                shard_path, self.train_filenames, file_indices)

        # The remaining images will go in the final shard.
        file_indices = np.arange(start, self.num_files_train, dtype=int)
        if len(file_indices) > 0:
            # NOTE(review): the final shard is numbered `num_shards_train`,
            # not `num_shards_train - 1`, so shard numbering skips one index
            # (a single-shard run produces train-1.tfrecords). Kept as is
            # because existing outputs already use this name; confirm with
            # downstream readers before renumbering.
            final_shard_path = (
                self.output_path / f'train-{self.num_shards_train}.tfrecords')
            self.write_tfrecords_file(
                final_shard_path, self.train_filenames, file_indices)

        # TODO: test-set sharding. It needs test files to be collected first;
        # the previous code here read `self.num_files_test`, which is never
        # assigned, and raised AttributeError when num_shards_test > 1.

        print(f'\nProcessed {self.counter} total images...')
        print(f'Number of training examples: {self.num_files_train}')
        print(f'TFRecords files saved to {str(self.output_path)}')

In [14]:
# Build the converter from the parsed `args` (this scans the training image
# directory and writes dataset.json), then write the TFRecords shard files.
converter = TFRecordsConverter()
converter.convert()

Processing training set TFRecords...
Processed 1000 images...
Processed 2000 images...
Processed 3000 images...
Processed 4000 images...
Processed 5000 images...
Processed 6000 images...

Processed 6000 total images...
Number of training examples: 6000


AttributeError: ignored

In [15]:
# Bundle the generated ./tfrecords-output directory into a single archive.
# NOTE(review): the archive name is spelled "tfreocrds" (presumably meant
# "tfrecords"); confirm nothing references the current name before renaming.
!zip -r tfreocrds-output.zip ./tfrecords-output

  adding: tfrecords-output/ (stored 0%)
  adding: tfrecords-output/dataset.json (stored 0%)
  adding: tfrecords-output/train-1.tfrecords (deflated 17%)
