Skip to content

Commit

Permalink
More smoothly handle the case of a password protected PDF for barcodes
Browse files Browse the repository at this point in the history
  • Loading branch information
stumpylog committed Oct 24, 2022
1 parent f8ce628 commit d52fbbb
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 12 deletions.
49 changes: 38 additions & 11 deletions src/documents/barcodes.py
Expand Up @@ -10,9 +10,12 @@
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from pikepdf import PdfImage
from pikepdf.models.image import HifiPrintImageNotTranscodableError
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
Expand Down Expand Up @@ -120,7 +123,9 @@ def _pikepdf_barcode_scan(pdf_filepath: str):
pdfimage = PdfImage(page.images[image_key])

if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError()
raise BarcodeImageFormatError(
"Unable to decode CCITTFaxDecode images",
)

# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive
Expand All @@ -132,7 +137,7 @@ def _pikepdf_barcode_scan(pdf_filepath: str):
separator_page_numbers.append(page_num)

def _pdf2image_barcode_scan(pdf_filepath: str):
# use a temporary directory in case the file os too big to handle in memory
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
Expand All @@ -150,20 +155,42 @@ def _pdf2image_barcode_scan(pdf_filepath: str):
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)

# Chose the scanner
if settings.CONSUMER_USE_LEGACY_DETECTION:
_pdf2image_barcode_scan(pdf_filepath)
logger.debug("Using pdf2image for barcodes")
scanner_function = _pdf2image_barcode_scan
else:
try:
_pikepdf_barcode_scan(pdf_filepath)
except Exception as e:
logger.debug("Using pikepdf for barcodes")
scanner_function = _pikepdf_barcode_scan

logger.warning(
f"Exception using pikepdf for barcodes,"
f" falling back to pdf2image: {e}",
)
# Reset this incase pikepdf got part way through
# Run the scanner
try:
scanner_function(pdf_filepath)
# Neither method can handle password protected PDFs without it being
# provided. Log it and continue
except (PasswordError, PDFPageCountError) as e:
logger.warning(
f"File is likely password protected, not splitting: {e}",
)
# Handle pikepdf related image decoding issues with a fallback
except (BarcodeImageFormatError, HifiPrintImageNotTranscodableError) as e:
logger.warning(
f"Falling back to pdf2image because: {e}",
)
try:
separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath)
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
# We're not sure what happened, but allow the consumption to continue
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)

else:
logger.warning(
Expand Down
Binary file added src/documents/tests/samples/password-is-test.pdf
Binary file not shown.
39 changes: 38 additions & 1 deletion src/documents/tests/test_barcodes.py
Expand Up @@ -174,7 +174,7 @@ def test_scan_file_for_separating_barcodes(self):
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])

def test_scan_file_for_separating_barcodes2(self):
def test_scan_file_for_separating_barcodes_none_present(self):
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
Expand Down Expand Up @@ -585,3 +585,40 @@ def test_consume_barcode_supported_no_extension_file(self):

with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")

def test_scan_file_for_separating_barcodes_password_pikepdf(self):
"""
GIVEN:
- Password protected PDF
- pikepdf based scanning
WHEN:
- File is scanned for barcode
THEN:
- Scanning handle the exception without exception
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)

self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])

@override_settings(CONSUMER_USE_LEGACY_DETECTION=True)
def test_scan_file_for_separating_barcodes_password_pdf2image(self):
"""
GIVEN:
- Password protected PDF
- pdf2image based scanning
WHEN:
- File is scanned for barcode
THEN:
- Scanning handle the exception without exception
"""
test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)

self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])

0 comments on commit d52fbbb

Please sign in to comment.