diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eb0c60fa..2277acbf 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,12 +10,11 @@ jobs: strategy: matrix: python-version: - #- "3.5" - - "3.6" + #- "3.6" - "3.7" - "3.8" - "3.9" - #- "3.10" + - "3.10" steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index c5b2b20e..c2b1b457 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The full documentation is available [here](https://developers.mindee.com/docs/ge ## Requirements -This library is officially supported on Python 3.6 to 3.9. +This library is officially supported on Python 3.7 to 3.10. ## Install diff --git a/mindee/__init__.py b/mindee/__init__.py index 5f718e30..8c0834e0 100644 --- a/mindee/__init__.py +++ b/mindee/__init__.py @@ -93,14 +93,13 @@ def _wrap_response(self, input_file, response, document_type): "Receipt API %s HTTP error: %s" % (response.status_code, json.dumps(dict_response)) ) - elif response.status_code > 201: + if response.status_code > 201: return Response( http_response=dict_response, pages=[], document=None, document_type=document_type, ) - return Response.format_response(dict_response, document_type, input_file) def parse_passport( @@ -264,8 +263,8 @@ def dump(self, path): :param path: file path for storing the response object :return: (void) save the json response """ - with open(path, "w") as fp: - json.dump(self.http_response, fp) + with open(path, "w") as handle: + json.dump(self.http_response, handle) @staticmethod def load(json_path): @@ -274,8 +273,8 @@ def load(json_path): :return: Full response object loaded from json file """ try: - with open(json_path) as fp: - json_response = json.load(fp) + with open(json_path) as handle: + json_response = json.load(handle) file_input = Inputs.load( json_response["input_type"], diff --git a/mindee/documents/passport.py b/mindee/documents/passport.py index 02f36ede..bf09b631 100644 --- a/mindee/documents/passport.py +++ b/mindee/documents/passport.py @@ -283,15 +283,15 @@ def __mrz_last_name_checksum(self): return True @staticmethod - def check_sum(s): + def check_sum(to_check: str) -> str: """ https://en.wikipedia.org/wiki/Machine-readable_passport - :param s: string + :param to_check: string :return: checksum value for string s """ checker = 0 alpha_to_num = {c: 10 + i for i, c in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ")} - for i, c in enumerate(s): + for i, chk in enumerate(to_check): if i % 3 == 0: weight = 7 elif i % 3 == 1: @@ -299,12 +299,12 @@ def check_sum(s): else: weight = 1 - if c == "<": + if chk == "<": val = 0 - elif c.isalpha(): - val = alpha_to_num[c] + elif chk.isalpha(): + val = alpha_to_num[chk] else: - val = int(c) + val = int(chk) checker += val * weight return str(checker % 10) diff --git a/mindee/inputs.py b/mindee/inputs.py index 45961997..6c55a33f 100644 --- a/mindee/inputs.py +++ b/mindee/inputs.py @@ -2,7 +2,7 @@ import os from base64 import decodebytes from mimetypes import guess_type -import fitz +import pikepdf class Inputs: @@ -48,7 +48,7 @@ def __init__( self.file_extension = guess_type(file)[0] if input_type == "dummy": - self.file_object = "" + self.file_object = None self.input_type = "" self.filename = "" self.filepath = "" @@ -60,6 +60,8 @@ def __init__( ) if self.file_extension == "application/pdf": + self.check_pdf_open() + count_pages = self.count_pdf_pages() if cut_pdf is True: @@ -99,12 +101,8 @@ def count_pdf_pages(self): :return: Number of pages in the Input file for pdfs """ self.file_object.seek(0) - src = fitz.open( - stream=self.file_object.read(), - filetype=self.file_extension, - filename=self.filename, - ) - return len(src) + with pikepdf.open(self.file_object) as pdf: + return len(pdf.pages) def merge_pdf_pages(self, pages_number): """ @@ -112,34 +110,35 @@ def merge_pdf_pages(self, pages_number): :return: (void) Set the Input.file with the reconstructed pdf stream """ self.file_object.seek(0) - src = fitz.open(stream=self.file_object.read(), filetype="pdf") - doc = fitz.open() - pdf_pages = [src[n] for n in pages_number] - for spage in pdf_pages: - width = spage.MediaBoxSize[0] - height = spage.MediaBoxSize[1] - r = fitz.Rect(0, 0, width, height) - page = doc.new_page(-1, width=width, height=height) - try: - page.showPDFpage(r, src, spage.number) - except: - pass + new_pdf = pikepdf.Pdf.new() + with pikepdf.open(self.file_object) as pdf: + for page_n in pages_number: + new_pdf.pages.append(pdf.pages[page_n]) self.file_object.close() - self.file_object = io.BytesIO(doc.write()) + self.file_object = io.BytesIO() + new_pdf.save(self.file_object) def check_if_document_is_empty(self): """ :return: (void) Check if the document contain only empty pages """ + self.file_object.seek(0) + with pikepdf.open(self.file_object) as pdf: + for _, page in enumerate(pdf.pages): + if ( + "/Font" in page["/Resources"].keys() + or "/XObject" in page["/Resources"].keys() + or page["/Contents"]["/Length"] > 1000 + ): + return + raise Exception("PDF pages are empty") + def check_pdf_open(self): + """ + :return: (void) Check if the document can be opened using pikepdf + """ self.file_object.seek(0) - src = fitz.open(stream=self.file_object.read(), filetype="pdf") - fitz.open() - for page in src: - if ( - len(page.get_images()) > 0 - or len(page.get_cdrawings()) > 1 - or len(page.get_text()) > 0 - ): - return - raise Exception("PDF pages are empty") + try: + pikepdf.open(self.file_object) + except Exception as err: + raise Exception("Couldn't open PDF file. %s" % err) diff --git a/pyproject.toml b/pyproject.toml index 5e44d396..9d611433 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,10 @@ [tool.black] line-length = 88 -target-version = ['py35', 'py36', 'py37'] +target-version = ['py36', 'py37', 'py38'] include = '\.pyi?$' [[tool.mypy.overrides]] -module = ['fitz',] +module = ['pikepdf',] ignore_missing_imports = true [tool.pylint.'MESSAGES CONTROL'] @@ -23,9 +23,7 @@ disable=[ 'unidiomatic-typecheck', 'arguments-differ', 'inconsistent-return-statements', - 'invalid-name', 'super-init-not-called', - 'no-else-raise', 'raise-missing-from', 'consider-iterating-dictionary', 'unspecified-encoding', diff --git a/requirements.txt b/requirements.txt index c76f16ac..67c50588 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,8 +10,16 @@ chardet==4.0.0 # via requests idna==2.10 # via requests -pymupdf==1.18.17 +lxml==4.7.1 + # via pikepdf +packaging==21.3 + # via pikepdf +pikepdf==4.3.1 # via mindee (setup.py) +pillow==9.0.0 + # via pikepdf +pyparsing==3.0.6 + # via packaging pytz==2021.3 # via mindee (setup.py) requests==2.25.1 diff --git a/setup.py b/setup.py index 84156d82..05616b3f 100644 --- a/setup.py +++ b/setup.py @@ -16,13 +16,13 @@ requirements = [ + "pikepdf==4.3.1", "pytz==2021.3", - "PyMuPDF==1.18.17", "requests==2.25.1", ] test_requirements = [ - "pytest==6.1.2", + "pytest==6.2.5", "pytest-cov==2.11.1", ] diff --git a/tests/data/pdfs/blank.pdf b/tests/data/pdfs/blank.pdf new file mode 100644 index 00000000..86563d08 Binary files /dev/null and b/tests/data/pdfs/blank.pdf differ diff --git a/tests/data/pdfs/blank_1.pdf b/tests/data/pdfs/blank_1.pdf new file mode 100644 index 00000000..3cb3597c Binary files /dev/null and b/tests/data/pdfs/blank_1.pdf differ diff --git a/tests/data/pdfs/not_blank_image_only.pdf b/tests/data/pdfs/not_blank_image_only.pdf new file mode 100644 index 00000000..efa2a92b Binary files /dev/null and b/tests/data/pdfs/not_blank_image_only.pdf differ diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 5c9bf421..819cdb87 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,14 +1,13 @@ import pytest from mindee import Inputs -import fitz -def test_mpdf_reconstruct(): +def test_pdf_reconstruct(): with pytest.raises(Exception): Inputs("./tests/data/invoices/invoice_6p.pdf", cut_pdf=True, n_pdf_pages=4) -def test_mpdf_reconstruct_check_n_pages(): +def test_pdf_reconstruct_check_n_pages(): input_obj_3 = Inputs( "./tests/data/invoices/invoice_6p.pdf", cut_pdf=True, n_pdf_pages=3 ) @@ -24,21 +23,23 @@ def test_mpdf_reconstruct_check_n_pages(): input_obj_2.file_object.seek(0) input_obj_1.file_object.seek(0) - src_3 = fitz.open( - stream=input_obj_3.file_object.read(), - filetype="application/pdf", - filename="test.pdf", - ) - src_2 = fitz.open( - stream=input_obj_2.file_object.read(), - filetype="application/pdf", - filename="test.pdf", - ) - src_1 = fitz.open( - stream=input_obj_1.file_object.read(), - filetype="application/pdf", - filename="test.pdf", - ) - assert len(src_3) == 3 - assert len(src_2) == 2 - assert len(src_1) == 1 + assert input_obj_3.count_pdf_pages() == 3 + assert input_obj_2.count_pdf_pages() == 2 + assert input_obj_1.count_pdf_pages() == 1 + + +def test_input_from_stream(): + with open("./tests/data/invoices/invoice_6p.pdf", "rb") as fp: + input_obj_1 = Inputs(fp, input_type="stream", cut_pdf=True, n_pdf_pages=1) + assert input_obj_1.count_pdf_pages() == 1 + + +def test_pdf_blank_check(): + with pytest.raises(Exception): + Inputs("./tests/data/pdfs/blank.pdf") + + with pytest.raises(Exception): + Inputs("./tests/data/pdfs/blank_1.pdf") + + input_not_blank = Inputs("./tests/data/pdfs/not_blank_image_only.pdf") + assert input_not_blank.count_pdf_pages() == 1