<a href="https://colab.research.google.com/github/olga-terekhova/pdf-utilities/blob/main/Generate_PDF_thumbnails.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [183]:
# initialize libraries
!pip install PyMuPDF

import fitz  # PyMuPDF
from math import ceil, floor, sqrt



In [184]:
def choose_grid(
    n,
    target_ratio=1.294
):
    """
    Pick a (rows, cols) grid able to hold n thumbnails.

    Every column count from 1 to n is tried; each is paired with the
    smallest row count that still fits n cells. The shape whose rows / cols
    quotient lies nearest to target_ratio is returned. When several shapes
    are equally near, the one with the fewest columns is kept.

    Raises ValueError when n is zero or negative.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")

    def shape_for(col_count):
        # Fewest rows that still provide at least n cells for this width.
        return ceil(n / col_count), col_count

    def distance_from_target(shape):
        row_count, col_count = shape
        return abs(row_count / col_count - target_ratio)

    # min() keeps the first of several equal minima, i.e. the candidate
    # with the smallest column count.
    return min(
        (shape_for(col_count) for col_count in range(1, n + 1)),
        key=distance_from_target,
    )

In [185]:
def calc_max_num_per_page(
    target_ratio=1.294, min_acceptable=50, max_acceptable=80
    ):
    """
    Choose the grid shape used for a completely filled page.

    Searches integer pairs (x, y) whose quotient y / x is as close as
    possible to target_ratio. A caller that passes height / width as
    target_ratio therefore gets x as the count across the width (columns)
    and y as the count down the height (rows).

    Pairs whose cell count x * y lies within
    [min_acceptable, max_acceptable] are preferred. If the search window
    contains no such pair, the pair with the closest ratio overall is used
    instead. Among equally good pairs the first one found (smallest x, then
    smallest y) wins.

    Parameters:
        target_ratio: desired y / x quotient; must be positive.
        min_acceptable: smallest acceptable number of cells per page.
        max_acceptable: largest acceptable number of cells per page.

    Returns:
        (x * y, (x, y)) for the chosen pair.

    Raises:
        ValueError: if target_ratio is not positive or the acceptable
            range is empty or invalid.
    """
    target = float(target_ratio)
    if target <= 0:
        raise ValueError("target_ratio must be positive")
    if min_acceptable < 0 or max_acceptable < 1 or min_acceptable > max_acceptable:
        raise ValueError(
            "expected 0 <= min_acceptable <= max_acceptable and max_acceptable >= 1"
        )

    # If x * y == n and y / x == target, then x == sqrt(n / target) and
    # y == sqrt(n * target); rounding outwards gives the search window.
    # Lower bounds are clamped to 1 so that extreme ratios can neither
    # divide by zero (x == 0) nor yield an empty grid (y == 0).
    min_x = max(1, floor(sqrt(min_acceptable / target)))
    min_y = max(1, floor(sqrt(min_acceptable * target)))
    max_x = ceil(sqrt(max_acceptable / target))
    max_y = ceil(sqrt(max_acceptable * target))

    best_in_range = None   # closest ratio among pairs within the limits
    best_overall = None    # closest ratio among all pairs in the window
    for x in range(min_x, max_x + 1):
        for y in range(min_y, max_y + 1):
            num = x * y
            candidate = (abs(y / x - target), num, (x, y))
            if best_overall is None or candidate[0] < best_overall[0]:
                best_overall = candidate
            # The rounded window also contains pairs outside the limits
            # (e.g. 7 x 7 = 49 for limits 50..80), so check explicitly.
            if min_acceptable <= num <= max_acceptable and (
                best_in_range is None or candidate[0] < best_in_range[0]
            ):
                best_in_range = candidate

    chosen = best_in_range if best_in_range is not None else best_overall
    return (chosen[1], chosen[2])

In [186]:
def make_page_grid(
    src_pdf, out_pdf,
    page_size=(612, 792),  # Letter portrait
    margin=36, gutter=10, caption_h=16,
    heading_h=64,
    heading_font_size=56,
    heading_input='Thumbnails',
    target_ratio=1.294,
    font_size=9, dpi=300, jpeg_quality=90
):
    """
    Place the source pdf pages into a grid on every page of the output pdf.

    Each output page gets a heading at the top and, below it, a grid of
    cells. Every cell shows one source page (scaled to fit, centered) with
    its 1-based page number in a caption strip at the bottom of the cell.

    Parameters:
        src_pdf: source document, passed to fitz.open.
        out_pdf: destination, passed to the output document's save().
        page_size: (width, height) of the output pages.
        margin: blank border on all four sides of the output page.
        gutter: spacing between neighbouring cells, in both directions.
        caption_h: height of the page-number strip inside each cell.
        heading_h: height reserved for the heading below the top margin.
        heading_input: heading content, handed to insert_htmlbox.
        font_size: font size of the page-number captions.
        target_ratio: ignored; the ratio is recomputed from the content
            area (height / width). Kept for interface compatibility.
        heading_font_size, dpi, jpeg_quality: currently not used by this
            function. Kept for interface compatibility.

    Raises:
        ValueError: if the source has no pages, or if the margins, heading,
            gutters and caption leave no room for the thumbnails.
    """
    with fitz.open(src_pdf) as src, fitz.open() as dst:
        N = src.page_count
        if N <= 0:
            raise ValueError("source PDF has no pages")

        # Define geometry of the destination page
        pw, ph = page_size                 # page width, page height
        x0, y0 = margin, margin            # left / top edge of usable area
        yh = y0 + heading_h                # top of the grid (below heading)
        x1, y1 = pw - margin, ph - margin  # right / bottom edge
        content_w, content_h = x1 - x0, y1 - yh
        if content_w <= 0 or content_h <= 0:
            raise ValueError("margins and heading leave no room for content")

        # Aspect ratio (height / width) of the area the grid has to fill
        target_ratio = content_h / content_w

        # calc_max_num_per_page returns (x, y) with y / x close to
        # height / width, i.e. (columns, rows). Unpack in that order so the
        # full-sheet grid has the same orientation as the one choose_grid
        # produces (rows / cols close to target_ratio).
        max_per_sheet, (cols_max, rows_max) = calc_max_num_per_page(target_ratio)

        if N >= max_per_sheet:
            rows, cols = rows_max, cols_max
        else:
            rows, cols = choose_grid(N, target_ratio=target_ratio)
        per_sheet = rows * cols
        num_sheets = ceil(N / per_sheet)

        # Define geometry of the cells
        gx = gy = gutter  # space between cells
        cell_w = (content_w - (cols - 1) * gx) / cols
        cell_h = (content_h - (rows - 1) * gy) / rows
        if cell_w <= 0 or cell_h - caption_h <= 0:
            raise ValueError("gutter and caption leave no room for thumbnails")

        # Generate a grid for every destination page
        for sheet in range(num_sheets):
            page = dst.new_page(width=pw, height=ph)

            heading_rect = fitz.Rect(x0, y0, x1, yh)
            page.insert_htmlbox(heading_rect, heading_input)

            start_idx = sheet * per_sheet
            end_idx = min(start_idx + per_sheet, N)

            for idx in range(start_idx, end_idx):
                # Row-major position of this page within the sheet
                r, c = divmod(idx - start_idx, cols)

                # Cell rect
                cx0 = x0 + c * (cell_w + gx)
                cy0 = yh + r * (cell_h + gy)
                cx1 = cx0 + cell_w
                cy1 = cy0 + cell_h

                caption_rect = fitz.Rect(cx0, cy1 - caption_h, cx1, cy1)
                image_rect = fitz.Rect(cx0, cy0, cx1, cy1 - caption_h)

                # Fit source page into image_rect, keeping its aspect ratio
                src_page = src.load_page(idx)
                sw, sh = src_page.rect.width, src_page.rect.height
                src_ar = sw / sh
                img_w, img_h = image_rect.width, image_rect.height
                img_ar = img_w / img_h

                if src_ar >= img_ar:
                    # width-limited: full width, centered vertically
                    draw_w = img_w
                    draw_h = draw_w / src_ar
                    dx = image_rect.x0
                    dy = image_rect.y0 + (img_h - draw_h) / 2
                else:
                    # height-limited: full height, centered horizontally
                    draw_h = img_h
                    draw_w = draw_h * src_ar
                    dx = image_rect.x0 + (img_w - draw_w) / 2
                    dy = image_rect.y0

                final_rect = fitz.Rect(dx, dy, dx + draw_w, dy + draw_h)

                # Place page thumbnail
                page.show_pdf_page(final_rect, src, idx)

                # Page number
                page.insert_textbox(caption_rect, str(idx + 1),
                                    fontsize=font_size, fontname="times-roman", align=1)

        dst.save(out_pdf)

In [187]:
# Run with all layout defaults: read "input.pdf" and write the thumbnail
# grid to "output.pdf".
make_page_grid("input.pdf", "output.pdf")