
Error with last batch that is smaller than the others #78

Closed

skeydan opened this issue Sep 29, 2021 · 4 comments

skeydan commented Sep 29, 2021

I'm getting an error due to the last batch of the training data loader being smaller than the others (it has 31 instead of 32 items).
(Something I thought we had gotten rid of a long time ago.)

The trigger is a custom augmentation function that does resizing (which affects dimensions 3 and 4, but not dimension 1).

The reproducing example is based on https://github.com/mlverse/luz/blob/master/vignettes/examples/pets-unet.Rmd - the model is identical, and the data loading code is very similar, apart from a slight modification I needed (or thought I needed) to make to pet_dataset, and the use of augmentation.

Transforms that do not involve resizing don't trigger the error, so I'm not reproducing them here.

library(torch)
library(torchvision)
library(torchdatasets)
library(luz)

dir <- "~/.torch-datasets/oxford_pet_dataset"

pet_dataset <- torch::dataset(
  
  inherit = oxford_pet_dataset,
  
  initialize = function(..., size, augmentation = NULL) {
    
    self$augmentation <- augmentation
    
    input_transform <- function(x) {
      x %>%
        transform_to_tensor() %>%
        transform_resize(size) %>%
        transform_normalize(mean = c(0.485, 0.456, 0.406),
                            std = c(0.229, 0.224, 0.225))
    }
    
    target_transform <- function(x) {
      x <- torch_tensor(x, dtype = torch_long())
      x <- x[newaxis,..]
      x <- transform_resize(x, size, interpolation = 0)
    }
    
    self$split <- split
    
    super$initialize(
      ...,
      transform = input_transform,
      target_transform = target_transform
    )
    
  },
  .getitem = function(i) {
    
    item <- super$.getitem(i)
    if (!is.null(self$augmentation)) 
      self$augmentation(item)
    else
      list(x = item$x, y = item$y[1,..])
  }
)

augmentation <- function(item) {
  
  x <- item$x
  y <- item$y
  
  # more transforms ...

  angle <- 12
  
  x <- transform_resize(x, size = c(292, 292))
  y <- transform_resize(y, size = c(292, 292), interpolation = 0)
  
  x <- transform_rotate(x, angle)
  y <- transform_rotate(y, angle, resample = 0)
  
  x <- transform_center_crop(x, size = c(244, 244))
  y <- transform_center_crop(y, size = c(244, 244))
  
  # more transforms ...
  
  list(x = x, y = y[1,..])
  
}


train_ds <- pet_dataset(root = dir,
                        split = "train",
                        size = c(224, 224),
                        augmentation = augmentation)

valid_ds <- pet_dataset(root = dir,
                        split = "valid",
                        size = c(224, 224))


train_dl <- dataloader(train_ds, batch_size = 32, shuffle = TRUE)
valid_dl <- dataloader(valid_ds, batch_size = 32)


x <- coro::collect(train_dl, 1)

x[[1]]$y %>% torch_max()
x[[1]]$y %>% torch_min()

encoder <- torch::nn_module(
  initialize = function() {
    model <- model_mobilenet_v2(pretrained = TRUE)
    self$stages <- nn_module_list(list(
      nn_identity(),
      model$features[1:2],
      model$features[3:4],
      model$features[5:7],
      model$features[8:14],
      model$features[15:18]
    ))
    
    for (par in self$parameters) {
      par$requires_grad_(FALSE)
    }
    
  },
  forward = function(x) {
    features <- list()
    for (i in 1:length(self$stages)) {
      x <- self$stages[[i]](x)
      features[[length(features) + 1]] <- x
    }
    features
  }
)
# The decoder blocks are composed of an upsample layer + a convolution
# with same padding.
decoder_block <- nn_module(
  initialize = function(in_channels, skip_channels, out_channels) {
    self$upsample <- nn_conv_transpose2d(
      in_channels = in_channels, 
      out_channels = out_channels,
      kernel_size = 2,
      stride = 2
    )
    self$activation <- nn_relu()
    self$conv <- nn_conv2d(
      in_channels = out_channels + skip_channels, 
      out_channels = out_channels,
      kernel_size = 3,
      padding = "same"
    )
  },
  forward = function(x, skip) {
    x <- x %>% 
      self$upsample() %>% 
      self$activation()
    
    input <- torch_cat(list(x, skip), dim = 2)
    
    input %>% 
      self$conv() %>% 
      self$activation()
  }
)
# We build the decoder by making a sequence of `decoder_blocks` matching
# the sizes to be compatible with the encoder sizes.
decoder <- nn_module(
  initialize = function(
    decoder_channels = c(256, 128, 64, 32, 16),
    encoder_channels = c(16, 24, 32, 96, 320)
  ) {
    
    encoder_channels <- rev(encoder_channels)
    skip_channels <- c(encoder_channels[-1], 3)
    in_channels <- c(encoder_channels[1], decoder_channels)
    
    depth <- length(encoder_channels)
    
    self$blocks <- nn_module_list()
    for (i in seq_len(depth)) {
      self$blocks$append(decoder_block(
        in_channels = in_channels[i],
        skip_channels = skip_channels[i],
        out_channels = decoder_channels[i]
      ))
    }
    
  },
  forward = function(features) {
    features <- rev(features)
    x <- features[[1]]
    for (i in seq_along(self$blocks)) {
      x <- self$blocks[[i]](x, features[[i+1]])
    }
    x
  }
)
# Finally, the model is the composition of encoder and decoder + an output
# layer that will produce the distribution over each of the possible
# classes.
model <- nn_module(
  initialize = function() {
    self$encoder <- encoder()
    self$decoder <- decoder()
    self$output <- nn_sequential(
      nn_conv2d(16, 3, 3, padding = "same")
    )
  },
  forward = function(x) {
    x %>% 
      self$encoder() %>% 
      self$decoder() %>% 
      self$output()
  }
)

model <- model %>%
  setup(optimizer = optim_adam, loss = nn_cross_entropy_loss())


fitted <- model %>%
  set_opt_hparams(lr = 1e-3) %>%
  fit(train_dl, epochs = 10, valid_data = valid_dl)


Error:

Error in (function (tensors, dim)  : 
  Sizes of tensors must match except in dimension 2. Got 31 and 32 (The offending index is 0)
Exception raised from check_shape_except_dim at /pytorch/aten/src/ATen/native/cuda/Shape.cu:192 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x69 (0x7fadf40f31d9 in /home/key/libtorch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xd2 (0x7fadf40ef812 in /home/key/libtorch/lib/libc10.so)
frame #2: at::native::(anonymous namespace)::check_shape_except_dim(at::Tensor const&, at::Tensor const&, int, int) + 0x17a (0x7facee7b4f1a in /home/key/libtorch/lib/libtorch_cuda_cu.so)
frame #3: at::native::cat_out_cuda(c10::ArrayRef<at::Tensor>, long, at::Tensor&) + 0x879 (0x7facee7b5859 in /home/key/libtorch/lib/libtorch_cuda_cu.so)
frame #4: at::native:

skeydan added the "bug" label Sep 29, 2021

dfalbel (Member) commented Sep 29, 2021

Is it really the batch dimension?
I think it's probably a convolution that is returning a size 31 image tensor instead of the expected 32.

The problem is likely in transform_center_crop, which seems to be returning tensors of different sizes:

dim(transform_center_crop(torch_randn(3, 225, 224), size = c(244, 244)))
3 244 243
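
(A minimal sketch of where such a mismatch surfaces, with hypothetical channel counts and spatial sizes: the torch_cat() in decoder_block requires the upsampled feature map and the skip connection to agree in every dimension except the channel one, so a feature map that comes out one pixel short fails the shape check.)

library(torch)

up   <- nn_conv_transpose2d(in_channels = 96, out_channels = 32, kernel_size = 2, stride = 2)
x    <- torch_randn(1, 96, 16, 16)    # upsamples to 1 x 32 x 32 x 32
skip <- torch_randn(1, 32, 31, 31)    # skip feature map that is one pixel short
torch_cat(list(up(x), skip), dim = 2) # fails: spatial sizes 32 vs 31 don't match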

dfalbel removed the "bug" label Sep 29, 2021

skeydan (Author) commented Sep 29, 2021

Hm... I may have been misled by the 0 in

Sizes of tensors must match except in dimension 2. Got 31 and 32 (The offending index is `0`)

BTW, that 2 is R-level, and the 0 is Python, right?

Still ... all images are 292x292 before the crop:

x <- transform_resize(x, size = c(292, 292))
y <- transform_resize(y, size = c(292, 292), interpolation = 0)

x <- transform_rotate(x, angle)
y <- transform_rotate(y, angle, resample = 0)

x <- transform_center_crop(x, size = c(244, 244))
y <- transform_center_crop(y, size = c(244, 244))

...?

dfalbel (Member) commented Sep 29, 2021

Actually, the expected initial size is c(224, 224).

If you change the lines to:

x <- transform_center_crop(x, size = c(224, 224))
y <- transform_center_crop(y, size = c(224, 224))

that should work.
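
(A quick check, sketched under the assumption that the c(224, 224) crop above is the only change needed: every augmented item should then come out at 3 x 224 x 224, so all collated batches share the same spatial sizes.)

batch <- coro::collect(train_dl, 1)[[1]]
dim(batch$x)   # e.g. 32 3 224 224
dim(batch$y)   # e.g. 32 224 224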

skeydan (Author) commented Sep 29, 2021

ugh. thanks.

skeydan closed this as completed Sep 29, 2021