Skip to content

Commit

Permalink
Merge 758be21 into d0ee203
Browse files Browse the repository at this point in the history
  • Loading branch information
stumpylog committed May 23, 2022
2 parents d0ee203 + 758be21 commit 33cc296
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 17 deletions.
25 changes: 17 additions & 8 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -424,14 +424,23 @@ PAPERLESS_OCR_IMAGE_DPI=<num>
the produced PDF documents are A4 sized.

PAPERLESS_OCR_MAX_IMAGE_PIXELS=<num>
Paperless will not OCR images that have more pixels than this limit.
This is intended to prevent decompression bombs from overloading paperless.
Increasing this limit is desired if you face a DecompressionBombError despite
the concerning file not being malicious; this could e.g. be caused by invalidly
recognized metadata.
If you have enough resources or if you are certain that your uploaded files
are not malicious you can increase this value to your needs.
The default value is 256000000, an image with more pixels than that would not be parsed.
Paperless will raise a warning when OCRing images which are over this limit and
will not OCR images which are more than twice this limit. Note this does not
prevent the document from being consumed, but could result in missing text content.

If unset, will default to the value determined by
`Pillow <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS>`_.

.. note::

Increasing this limit could cause Paperless to consume additional resources
when consuming a file. Be sure you have sufficient system resources.

.. caution::

The limit is intended to prevent malicious files from consuming system resources
and causing crashes and other errors. Only increase this value if you are certain
your documents are not malicious and you need the text which was not OCRed.

PAPERLESS_OCR_USER_ARGS=<json>
OCRmyPDF offers many more options. Use this parameter to specify any
Expand Down
8 changes: 4 additions & 4 deletions src/paperless/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import re
from typing import Final
from typing import Optional
from typing import Set
from urllib.parse import urlparse

Expand Down Expand Up @@ -551,10 +552,9 @@ def default_threads_per_worker(task_workers) -> int:
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
)

OCR_MAX_IMAGE_PIXELS = os.environ.get(
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
256000000,
)
# Maximum image size, in pixels, configured for OCR.
#
# Stays None when PAPERLESS_OCR_MAX_IMAGE_PIXELS is not present in the
# environment, so no explicit limit is configured by this module.  When the
# variable is present it is converted with int(); a value that is not a valid
# integer therefore raises ValueError while the settings are being loaded.
#
# The environment is read exactly once and the name is annotated exactly once,
# so type checkers see a single, consistent Optional[int] declaration.
_ocr_max_image_pixels_env = os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")
OCR_MAX_IMAGE_PIXELS: Optional[int] = (
    int(_ocr_max_image_pixels_env)
    if _ocr_max_image_pixels_env is not None
    else None
)

OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

Expand Down
20 changes: 18 additions & 2 deletions src/paperless_tesseract/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
from documents.parsers import ParseError
from PIL import Image

Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS


class NoTextFoundException(Exception):
pass
Expand Down Expand Up @@ -225,6 +223,24 @@ def construct_ocrmypdf_parameters(
f"they will not be used. Error: {e}",
)

if settings.OCR_MAX_IMAGE_PIXELS is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
if max_pixels_mpixels > 0:

self.log(
"debug",
f"Calculated {max_pixels_mpixels} megapixels for OCR",
)

ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
else:
self.log(
"warning",
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
"this value must be at least 1 megapixel if set",
)

return ocrmypdf_args

def parse(self, document_path, mime_type, file_name=None):
Expand Down
4 changes: 1 addition & 3 deletions src/paperless_text/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from PIL import ImageDraw
from PIL import ImageFont

Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS


class TextDocumentParser(DocumentParser):
"""
Expand All @@ -28,7 +26,7 @@ def read_text():
font = ImageFont.truetype(
font=settings.THUMBNAIL_FONT_NAME,
size=20,
layout_engine=ImageFont.LAYOUT_BASIC,
layout_engine=ImageFont.Layout.BASIC,
)
draw.text((5, 5), read_text(), font=font, fill="black")

Expand Down

0 comments on commit 33cc296

Please sign in to comment.