Skip to content
Permalink
Browse files
btrfs: fix deadlock when defragging transparent huge pages
Attempting to defragment a Btrfs file containing a transparent huge page
immediately deadlocks with the following stack trace:

  #0  context_switch (kernel/sched/core.c:4940:2)
  #1  __schedule (kernel/sched/core.c:6287:8)
  #2  schedule (kernel/sched/core.c:6366:3)
  #3  io_schedule (kernel/sched/core.c:8389:2)
  #4  wait_on_page_bit_common (mm/filemap.c:1356:4)
  #5  __lock_page (mm/filemap.c:1648:2)
  #6  lock_page (./include/linux/pagemap.h:625:3)
  #7  pagecache_get_page (mm/filemap.c:1910:4)
  #8  find_or_create_page (./include/linux/pagemap.h:420:9)
  #9  defrag_prepare_one_page (fs/btrfs/ioctl.c:1068:9)
  #10 defrag_one_range (fs/btrfs/ioctl.c:1326:14)
  #11 defrag_one_cluster (fs/btrfs/ioctl.c:1421:9)
  #12 btrfs_defrag_file (fs/btrfs/ioctl.c:1523:9)
  #13 btrfs_ioctl_defrag (fs/btrfs/ioctl.c:3117:9)
  #14 btrfs_ioctl (fs/btrfs/ioctl.c:4872:10)
  #15 vfs_ioctl (fs/ioctl.c:51:10)
  #16 __do_sys_ioctl (fs/ioctl.c:874:11)
  #17 __se_sys_ioctl (fs/ioctl.c:860:1)
  #18 __x64_sys_ioctl (fs/ioctl.c:860:1)
  #19 do_syscall_x64 (arch/x86/entry/common.c:50:14)
  #20 do_syscall_64 (arch/x86/entry/common.c:80:7)
  #21 entry_SYSCALL_64+0x7c/0x15b (arch/x86/entry/entry_64.S:113)

A huge page is represented by a compound page, which consists of a
struct page for each PAGE_SIZE page within the huge page. The first
struct page is the "head page", and the remaining are "tail pages".

Defragmentation attempts to lock each page in the range. However,
lock_page() on a tail page actually locks the corresponding head page.
So, if defragmentation tries to lock more than one struct page in a
compound page, it tries to lock the same head page twice and deadlocks
with itself.

Ideally, we should be able to defragment transparent huge pages.
However, THP for filesystems is currently read-only, so a lot of code is
not ready to use huge pages for I/O. For now, let's just return
ETXTBSY.

This can be reproduced with the following on a kernel with
CONFIG_READ_ONLY_THP_FOR_FS=y:

  $ cat create_thp_file.c
  #include <fcntl.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdint.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <sys/mman.h>

  /* 1 MiB of zero bytes; the source buffer handed to each write() call. */
  static const char zeroes[1 << 20];
  /*
   * Length of the file to create (2 MiB). The same value is used as the
   * length and alignment of the file mapping.
   */
  static const size_t FILE_SIZE = (size_t)2 << 20;

  /*
   * Create a FILE_SIZE-byte file of zeroes at argv[1], map it read-only and
   * executable at a FILE_SIZE-aligned address, request huge pages for the
   * mapping with MADV_HUGEPAGE, and then poll /proc/self/smaps until that
   * mapping reports a non-zero FilePmdMapped value.
   *
   * Returns EXIT_SUCCESS once the mapping reports FilePmdMapped > 0, and
   * EXIT_FAILURE on a usage error or when any system/library call fails.
   * The polling loop has no timeout: it keeps retrying every 6 seconds until
   * the condition is met.
   */
  int main(int argc, char **argv)
  {
          if (argc != 2) {
                  fprintf(stderr, "usage: %s PATH\n", argv[0]);
                  return EXIT_FAILURE;
          }
          int fd = creat(argv[1], 0777);
          if (fd == -1) {
                  perror("creat");
                  return EXIT_FAILURE;
          }
          /* Fill the file with zeroes, tolerating short writes. */
          size_t written = 0;
          while (written < FILE_SIZE) {
                  ssize_t ret = write(fd, zeroes,
                                      sizeof(zeroes) < FILE_SIZE - written ?
                                      sizeof(zeroes) : FILE_SIZE - written);
                  if (ret < 0) {
                          perror("write");
                          return EXIT_FAILURE;
                  }
                  written += ret;
          }
          close(fd);
          /* Reopen read-only; the mapping below is made from this descriptor. */
          fd = open(argv[1], O_RDONLY);
          if (fd == -1) {
                  perror("open");
                  return EXIT_FAILURE;
          }

          /*
           * Reserve some address space so that we can align the file mapping to
           * the huge page size.
           */
          void *placeholder_map = mmap(NULL, FILE_SIZE * 2, PROT_NONE,
                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (placeholder_map == MAP_FAILED) {
                  perror("mmap (placeholder)");
                  return EXIT_FAILURE;
          }

          /* Round up to the next FILE_SIZE boundary inside the reservation. */
          void *aligned_address =
                  (void *)(((uintptr_t)placeholder_map + FILE_SIZE - 1) & ~(FILE_SIZE - 1));

          /* MAP_FIXED replaces part of the placeholder with the file mapping. */
          void *map = mmap(aligned_address, FILE_SIZE, PROT_READ | PROT_EXEC,
                           MAP_SHARED | MAP_FIXED, fd, 0);
          if (map == MAP_FAILED) {
                  perror("mmap");
                  return EXIT_FAILURE;
          }
          if (madvise(map, FILE_SIZE, MADV_HUGEPAGE) < 0) {
                  perror("madvise");
                  return EXIT_FAILURE;
          }

          char *line = NULL;
          size_t line_capacity = 0;
          FILE *smaps_file = fopen("/proc/self/smaps", "r");
          if (!smaps_file) {
                  perror("fopen");
                  return EXIT_FAILURE;
          }
          for (;;) {
                  /* Read one byte every 4096 bytes to fault the mapping in. */
                  for (size_t off = 0; off < FILE_SIZE; off += 4096)
                          ((volatile char *)map)[off];

                  ssize_t ret;
                  /* True while the lines being parsed belong to our mapping. */
                  bool this_mapping = false;
                  while ((ret = getline(&line, &line_capacity, smaps_file)) > 0) {
                          unsigned long start, end, huge;
                          if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
                                  /* Header line of a mapping: "start-end ...". */
                                  this_mapping = (start <= (uintptr_t)map &&
                                                  (uintptr_t)map < end);
                          } else if (this_mapping &&
                                     /* %lu matches the unsigned long target. */
                                     sscanf(line, "FilePmdMapped: %lu", &huge) == 1 &&
                                     huge > 0) {
                                  return EXIT_SUCCESS;
                          }
                  }

                  /* Not huge-mapped yet: wait, then re-read smaps from the top. */
                  sleep(6);
                  rewind(smaps_file);
                  fflush(smaps_file);
          }
  }
  $ ./create_thp_file huge
  $ btrfs fi defrag -czstd ./huge

Signed-off-by: Omar Sandoval <osandov@fb.com>
  • Loading branch information
osandov committed Oct 19, 2021
1 parent 5ece081 commit 39c9507c4eca68030bd65cab791db0aaf5314360
Showing 1 changed file with 14 additions and 0 deletions.
@@ -1069,6 +1069,20 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
if (!page)
return ERR_PTR(-ENOMEM);

/*
* Since we can defragment files opened read-only, we can encounter
* transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
* can't do I/O using huge pages yet, so return an error for now.
* Filesystem transparent huge pages are typically only used for
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
if (PageCompound(page)) {
unlock_page(page);
put_page(page);
return -ETXTBSY;
}

ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);

0 comments on commit 39c9507

Please sign in to comment.