Can't use tensorflow example with TFRecord file or FileReader #18

Closed
jxmelody opened this issue Jul 3, 2018 · 13 comments

jxmelody commented Jul 3, 2018

The "example/tensorflow" directory shows how to use DALI with TensorFlow, but it is implemented with ops.MXNetReader to read the images and labels. When I try to use ops.FileReader or ops.TFRecordReader to read the files instead, this error occurs:

DALI data_tensor_shape = ShapeAt(&pipe_handle_, 0) failed: [/opt/dali/dali/pipeline/data/tensor.h:188] Assert on "tl->IsDenseTensor()" failed: All tensors in the input TensorList must have the same shape and be densely packed.

I have no idea what this error means. Each pipeline class returns the same data structure, so why does MXNetReader work while the others do not?

My code (mostly copied from the example):

class RN50Pipeline(Pipeline):
    def __init__(self, batch_size, num_threads, device_id, num_gpus):
        super(RN50Pipeline, self).__init__(batch_size,
                                         num_threads,
                                         device_id)
        self.input = ops.MXNetReader(path = rec_files, index_path = idx_files,
                                     shard_id = device_id, num_shards = num_gpus)

        self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB)
        self.resize = ops.Resize(device = "gpu", random_resize = True,
                                 resize_a = 256, resize_b = 480,
                                 image_type = types.RGB,
                                 interp_type = types.INTERP_LINEAR)
        self.cmn = ops.CropMirrorNormalize(device = "gpu",
                                            output_dtype = types.FLOAT,
                                            crop = (227, 227),
                                            image_type = types.RGB,
                                            mean = [128., 128., 128.],
                                            std = [1., 1., 1.])
        self.uniform = ops.Uniform(range = (0.0, 1.0))

    def define_graph(self):
        inputs, labels = self.input(name="Reader")
        images = self.decode(inputs)
        images = self.resize(images)
        output = self.cmn(images, crop_pos_x = self.uniform(),
                          crop_pos_y = self.uniform())
        return (output, labels.gpu())
class FileReadPipeline(Pipeline):
    def __init__(self,batch_size, num_threads, device_id):
        super(FileReadPipeline, self).__init__(batch_size, num_threads, device_id, seed = 12)
        self.input = ops.FileReader(file_root = image_dir, random_shuffle = True, initial_fill = 21)
        self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB)
        self.resize = ops.Resize(device = "gpu", random_resize = True, 
                                 resize_a = 256, resize_b = 480,
                                 image_type = types.RGB,
                                 interp_type = types.INTERP_LINEAR)
        self.cmn = ops.CropMirrorNormalize(device = "gpu",
                                            output_dtype = types.FLOAT,
                                            crop = (227, 227),
                                            image_type = types.RGB,
                                            mean = [128., 128., 128.],
                                            std = [1., 1., 1.])
        self.uniform = ops.Uniform(range = (0.0, 1.0))
    def define_graph(self):
        jpegs, labels = self.input()
        images = self.decode(jpegs)
        resized_images = self.resize(images)
        output = self.cmn(resized_images, crop_pos_x = self.uniform(),
                           crop_pos_y = self.uniform())
        # images are on the GPU
        return (output, labels.gpu())

class TFRecordPipeline(Pipeline):
    def __init__(self, batch_size, num_threads, device_id):
        super(TFRecordPipeline, self).__init__(batch_size,
                                         num_threads,
                                         device_id)
        self.input = ops.TFRecordReader(path = tfrecord, 
                                        index_path = tfrecord_idx,
                                        features = {"image/encoded" : tfrec.FixedLenFeature((), tfrec.string, ""),
                                         "image/class/text":          tfrec.FixedLenFeature((), tfrec.string, "")})
        self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB)
        self.resize = ops.Resize(device = "gpu", random_resize = True,
                                 resize_a = 256, resize_b = 480,
                                 image_type = types.RGB,
                                 interp_type = types.INTERP_LINEAR)
        self.cmn = ops.CropMirrorNormalize(device = "gpu",
                                            output_dtype = types.FLOAT,
                                            crop = (227, 227),
                                            image_type = types.RGB,
                                            mean = [128., 128., 128.],
                                            std = [1., 1., 1.])
        self.uniform = ops.Uniform(range = (0.0, 1.0))

    def define_graph(self):
        inputs = self.input()
        images = self.decode(inputs["image/encoded"])
        resized_images = self.resize(images)
        output = self.cmn(resized_images, crop_pos_x = self.uniform(),
                           crop_pos_y = self.uniform())
        return (output, inputs["image/class/text"].gpu())

def get_batch_test_dali(batch_size):
  
    global DEVICES

    pipes = [FileReadPipeline(batch_size=BATCH_SIZE, num_threads=2, device_id = device_id) for device_id in range(DEVICES)]  # does not work
    # pipes = [RN50Pipeline(batch_size=BATCH_SIZE, num_threads=2, device_id = device_id, num_gpus = DEVICES) for device_id in range(DEVICES)]  # works
    # pipes = [TFRecordPipeline(batch_size=batch_size, num_threads=2, device_id = 0) for device_id in range(DEVICES)]  # does not work

    serialized_pipes = [pipe.serialize() for pipe in pipes]
    del pipes
    daliop = dali_tf.DALIIterator()
    images = []
    labels = []
    for d in range(DEVICES):
        with tf.device('/gpu:%i' % d):
            image, label = daliop(serialized_pipeline = serialized_pipes[d],
                batch_size = BATCH_SIZE,
                height = 227,
                width = 227,
                device_id = d)
            images.append(image)
            labels.append(label)

    return [images, labels]

def main_run():
 
    test_batch = get_batch_test_dali(BATCH_SIZE)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        step = 0
        while step < NUM_DATA / BATCH_SIZE + 1:
            print('step', step)
            imgs = []
            get_batch = sess.run(test_batch)  #error occurs
            for i in range(len(get_batch[0][0])):
                img = get_batch[0][0][i].transpose((1,2,0)) + 128
                imgs.append(img)
            maxx = sess.run(softmax, feed_dict={x: imgs})
            step = step + 1
    sess.close()
ptrendx commented Jul 3, 2018

Hmm, that's strange - @Kh4L could you look at it?

JanuszL commented Jul 3, 2018

Hi,
The following code works for me (I corrected some errors from your example):

class FileReadPipeline(Pipeline):
    def __init__(self,batch_size, num_threads, device_id):
        super(FileReadPipeline, self).__init__(batch_size, num_threads, device_id, seed = 12)
        self.input = ops.FileReader(file_root = image_dir, random_shuffle = True, initial_fill = 21)
        self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB)
        self.resize = ops.Resize(device = "gpu", random_resize = True, 
                                resize_a = 256, resize_b = 480,
                                image_type = types.RGB,
                                interp_type = types.INTERP_LINEAR)
        self.cmn = ops.CropMirrorNormalize(device = "gpu",
                                            output_dtype = types.FLOAT,
                                            crop = (227, 227),
                                            image_type = types.RGB,
                                            mean = [128., 128., 128.],
                                            std = [1., 1., 1.])
        self.uniform = ops.Uniform(range = (0.0, 1.0))

    def define_graph(self):
        jpegs, labels = self.input()
        images = self.decode(jpegs)
        resized_images = self.resize(images)
        output = self.cmn(resized_images, crop_pos_x = self.uniform(),
                        crop_pos_y = self.uniform())
        # images are on the GPU
        return (output, labels.gpu())

I will check the TFRecordPipeline later as well.

jxmelody commented Jul 3, 2018

Hi @JanuszL,
Thanks for your reply.
I checked my code and found it was a bit confusing, because I had changed it before raising this issue, so I have updated it.
Actually, what confuses me most is the serialization part. What is the difference between a serialized and a non-serialized pipeline? Must the pipeline be serialized when using TensorFlow?
Lastly, I noticed that your code above differs from mine in the last line:

return (output, labels.gpu())
# return (output, labels)

So, is the .gpu() necessary?

(Sorry, I can't run the code right now, but I will check whether the change works tomorrow (9 hours later...), thanks again ;))

Kh4L commented Jul 3, 2018

Hi @jxmelody ,
For TFRecordPipeline you have to define the graph by overriding def define_graph(self):.

As @JanuszL wrote, in FileReadPipeline you are returning images, that is, the result of images = self.decode(jpegs). You actually want to return output (the result of CropMirrorNormalize).

Yes, .gpu() is necessary because FileReader returns CPU tensors and tensorflow-gpu expects only GPU tensors as output.
So your define_graph should return

return (output, labels.gpu())

The serialized pipeline is a string representing your pipeline in protobuf format: it contains the parameters and the graph defined in Python. Our DALI TensorFlow op needs this serialized pipeline to build and run the actual DALI pipeline internally.
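
To make that concrete, here is a minimal sketch of the flow, reusing the FileReadPipeline class and BATCH_SIZE from the code above; the nvidia.dali.plugin.tf import path for dali_tf is assumed, since the imports were not shown in this thread:

import tensorflow as tf
import nvidia.dali.plugin.tf as dali_tf  # assumed import path for the DALI TensorFlow plugin

# Build the pipeline in Python and turn it into a protobuf string.
pipe = FileReadPipeline(batch_size=BATCH_SIZE, num_threads=2, device_id=0)
serialized = pipe.serialize()
del pipe  # the TF op rebuilds and runs the pipeline internally from the serialized string

# Hand the serialized pipeline to the DALI TensorFlow op.
daliop = dali_tf.DALIIterator()
with tf.device('/gpu:0'):
    image, label = daliop(serialized_pipeline=serialized,
                          batch_size=BATCH_SIZE,
                          height=227, width=227,
                          device_id=0)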

jxmelody commented Jul 4, 2018

Hi @Kh4L,
Sorry, I forgot to paste the define_graph function of TFRecordPipeline, but I did actually write it and it does not work... Anyway, I have added it to my code above so you can check it.

As for FileReadPipeline, after correcting the errors as @JanuszL wrote, it works now; it was my mistake. Thank you both!

But TFRecordPipeline still does not work, even though I return .gpu() as you said.

Thank you for explaining the serialization. So as far as I understand, it's necessary to serialize the pipeline first before using sess.run([image, label]), isn't it?

JanuszL commented Jul 6, 2018

Hi,
Yes, indeed you need to serialize the pipeline first and then use it to initialize daliop. Please follow the TensorFlow-ResNet50 example.
TFRecordPipeline works for me. You need to use image/class/label as the labels; image/class/text is rather a human-readable representation and is not meant for training.

class TFRecordPipeline(Pipeline):
    def __init__(self, batch_size, num_threads, device_id):
        super(TFRecordPipeline, self).__init__(batch_size,
                                        num_threads,
                                        device_id)
        self.input = ops.TFRecordReader(path = tfrecord, 
                                        index_path = tfrecord_idx,
                                        features = {"image/encoded" : tfrec.FixedLenFeature((), tfrec.string, ""),
                                        'image/class/label':         tfrec.FixedLenFeature([1], tfrec.int64,  -1)
                                        })
        self.decode = ops.nvJPEGDecoder(device = "mixed", output_type = types.RGB)
        self.resize = ops.Resize(device = "gpu", resize_a = 256, resize_b = 256)
        self.cmnp = ops.CropMirrorNormalize(device = "gpu",
                                            output_dtype = types.FLOAT,
                                            crop = (224, 224),
                                            image_type = types.RGB,
                                            mean = [0., 0., 0.],
                                            std = [1., 1., 1.])
        self.uniform = ops.Uniform(range = (0.0, 1.0))

    def define_graph(self):
        inputs = self.input()
        images = self.decode(inputs["image/encoded"])
        resized_images = self.resize(images)
        output = self.cmnp(resized_images, crop_pos_x = self.uniform(),
                        crop_pos_y = self.uniform())
        return (output, inputs["image/class/label"].gpu())
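
As a side note on the snippet above, tfrecord_idx points to an index file that TFRecordReader needs next to the TFRecord itself. A minimal sketch of generating it, assuming the tfrecord2idx script shipped with DALI is on your PATH and using hypothetical file paths:

import os
from subprocess import call

tfrecord = "data/train.tfrecord"      # hypothetical path to an existing TFRecord file
tfrecord_idx = "idx_files/train.idx"  # hypothetical path for the generated index

# Build the index once with DALI's tfrecord2idx script so TFRecordReader can seek into the file.
if not os.path.exists("idx_files"):
    os.mkdir("idx_files")
if not os.path.isfile(tfrecord_idx):
    call(["tfrecord2idx", tfrecord, tfrecord_idx])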

cliffwoolley commented:

What's the next step to improve this one? Do we need to document the example better, improve it generally, ... ?

JanuszL commented Jul 12, 2018

I think we need to provide an example with different readers that all do the same thing in the end.

JanuszL commented Jul 12, 2018

Tracked internally in DALI-133

jxmelody commented:

@JanuszL thanks, you are right. I changed image/class/text to image/class/label, and now it works!

JanuszL added this to the Release_0.1.1+ milestone Jul 19, 2018
JanuszL commented Jul 19, 2018

I have just prepared an example based on your experience: #58.

JanuszL commented Jul 25, 2018

Just merged #58.
It will be included in the next minor release.

JanuszL closed this as completed Jul 25, 2018
biswajitcsecu commented:

It is working.
