mindee · ianardee · Jan 17, 2022 · Jan 14, 2022 · Jan 14, 2022 · Jan 14, 2022
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -10,12 +10,11 @@ jobs:
     strategy:
       matrix:
         python-version:
-          #- "3.5"
-          - "3.6"
+          #- "3.6"
           - "3.7"
           - "3.8"
           - "3.9"
-          #- "3.10"
+          - "3.10"
     steps:
     - uses: actions/checkout@v2
 

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ The full documentation is available [here](https://developers.mindee.com/docs/ge
 
 ## Requirements
 
-This library is officially supported on Python 3.6 to 3.9.
+This library is officially supported on Python 3.7 to 3.10.
 
 ## Install
 

diff --git a/mindee/__init__.py b/mindee/__init__.py
@@ -93,14 +93,13 @@ def _wrap_response(self, input_file, response, document_type):
                 "Receipt API %s HTTP error: %s"
                 % (response.status_code, json.dumps(dict_response))
             )
-        elif response.status_code > 201:
+        if response.status_code > 201:
             return Response(
                 http_response=dict_response,
                 pages=[],
                 document=None,
                 document_type=document_type,
             )
-
         return Response.format_response(dict_response, document_type, input_file)
 
     def parse_passport(
@@ -264,8 +263,8 @@ def dump(self, path):
         :param path: file path for storing the response object
         :return: (void) save the json response
         """
-        with open(path, "w") as fp:
-            json.dump(self.http_response, fp)
+        with open(path, "w") as handle:
+            json.dump(self.http_response, handle)
 
     @staticmethod
     def load(json_path):
@@ -274,8 +273,8 @@ def load(json_path):
         :return: Full response object loaded from json file
         """
         try:
-            with open(json_path) as fp:
-                json_response = json.load(fp)
+            with open(json_path) as handle:
+                json_response = json.load(handle)
 
             file_input = Inputs.load(
                 json_response["input_type"],

diff --git a/mindee/documents/passport.py b/mindee/documents/passport.py
@@ -283,28 +283,28 @@ def __mrz_last_name_checksum(self):
             return True
 
     @staticmethod
-    def check_sum(s):
+    def check_sum(to_check: str) -> str:
         """
         https://en.wikipedia.org/wiki/Machine-readable_passport
-        :param s: string
+        :param to_check: string
-        :return: checksum value for string s
+        :return: checksum value for string to_check
         """
         checker = 0
         alpha_to_num = {c: 10 + i for i, c in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ")}
-        for i, c in enumerate(s):
+        for i, chk in enumerate(to_check):
             if i % 3 == 0:
                 weight = 7
             elif i % 3 == 1:
                 weight = 3
             else:
                 weight = 1
 
-            if c == "<":
+            if chk == "<":
                 val = 0
-            elif c.isalpha():
-                val = alpha_to_num[c]
+            elif chk.isalpha():
+                val = alpha_to_num[chk]
             else:
-                val = int(c)
+                val = int(chk)
             checker += val * weight
         return str(checker % 10)
 

diff --git a/mindee/inputs.py b/mindee/inputs.py
@@ -2,7 +2,7 @@
 import os
 from base64 import decodebytes
 from mimetypes import guess_type
-import fitz
+import pikepdf
 
 
 class Inputs:
@@ -48,7 +48,7 @@ def __init__(
             self.file_extension = guess_type(file)[0]
 
         if input_type == "dummy":
-            self.file_object = ""
+            self.file_object = None
             self.input_type = ""
             self.filename = ""
             self.filepath = ""
@@ -60,6 +60,8 @@ def __init__(
             )
 
         if self.file_extension == "application/pdf":
+            self.check_pdf_open()
+
             count_pages = self.count_pdf_pages()
 
             if cut_pdf is True:
@@ -99,47 +101,44 @@ def count_pdf_pages(self):
         :return: Number of pages in the Input file for pdfs
         """
         self.file_object.seek(0)
-        src = fitz.open(
-            stream=self.file_object.read(),
-            filetype=self.file_extension,
-            filename=self.filename,
-        )
-        return len(src)
+        with pikepdf.open(self.file_object) as pdf:
+            return len(pdf.pages)
 
     def merge_pdf_pages(self, pages_number):
         """
         :param pages_number: List of pages number to use for merging in the original pdf
         :return: (void) Set the Input.file with the reconstructed pdf stream
         """
         self.file_object.seek(0)
-        src = fitz.open(stream=self.file_object.read(), filetype="pdf")
-        doc = fitz.open()
-        pdf_pages = [src[n] for n in pages_number]
-        for spage in pdf_pages:
-            width = spage.MediaBoxSize[0]
-            height = spage.MediaBoxSize[1]
-            r = fitz.Rect(0, 0, width, height)
-            page = doc.new_page(-1, width=width, height=height)
-            try:
-                page.showPDFpage(r, src, spage.number)
-            except:
-                pass
+        new_pdf = pikepdf.Pdf.new()
+        with pikepdf.open(self.file_object) as pdf:
+            for page_n in pages_number:
+                new_pdf.pages.append(pdf.pages[page_n])
         self.file_object.close()
-        self.file_object = io.BytesIO(doc.write())
+        self.file_object = io.BytesIO()
+        new_pdf.save(self.file_object)
 
     def check_if_document_is_empty(self):
         """
-        :return: (void) Check if the document contain only empty pages
+        :return: (void) Check if the document contains only empty pages
         """
+        self.file_object.seek(0)
+        with pikepdf.open(self.file_object) as pdf:
+            for _, page in enumerate(pdf.pages):
+                if (
+                    "/Font" in page["/Resources"].keys()
+                    or "/XObject" in page["/Resources"].keys()
+                    or page["/Contents"]["/Length"] > 1000
+                ):
+                    return
+            raise Exception("PDF pages are empty")
 
+    def check_pdf_open(self):
+        """
+        :return: (void) Raise an Exception if pikepdf cannot open the document
+        """
         self.file_object.seek(0)
-        src = fitz.open(stream=self.file_object.read(), filetype="pdf")
-        fitz.open()
-        for page in src:
-            if (
-                len(page.get_images()) > 0
-                or len(page.get_cdrawings()) > 1
-                or len(page.get_text()) > 0
-            ):
-                return
-        raise Exception("PDF pages are empty")
+        try:
+            pikepdf.open(self.file_object).close()
+        except Exception as err:
+            raise Exception("Couldn't open PDF file. %s" % err) from err
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,10 +1,10 @@
 [tool.black]
 line-length = 88
-target-version = ['py35', 'py36', 'py37']
+target-version = ['py36', 'py37', 'py38']
 include = '\.pyi?$'
 
 [[tool.mypy.overrides]]
-module = ['fitz',]
+module = ['pikepdf',]
 ignore_missing_imports = true
 
 [tool.pylint.'MESSAGES CONTROL']
@@ -23,9 +23,7 @@ disable=[
   'unidiomatic-typecheck',
   'arguments-differ',
   'inconsistent-return-statements',
-  'invalid-name',
   'super-init-not-called',
-  'no-else-raise',
   'raise-missing-from',
   'consider-iterating-dictionary',
   'unspecified-encoding',

diff --git a/requirements.txt b/requirements.txt
@@ -10,8 +10,16 @@ chardet==4.0.0
     # via requests
 idna==2.10
     # via requests
-pymupdf==1.18.17
+lxml==4.7.1
+    # via pikepdf
+packaging==21.3
+    # via pikepdf
+pikepdf==4.3.1
     # via mindee (setup.py)
+pillow==9.0.0
+    # via pikepdf
+pyparsing==3.0.6
+    # via packaging
 pytz==2021.3
     # via mindee (setup.py)
 requests==2.25.1

diff --git a/setup.py b/setup.py
@@ -16,13 +16,13 @@
 
 
 requirements = [
+    "pikepdf==4.3.1",
     "pytz==2021.3",
-    "PyMuPDF==1.18.17",
     "requests==2.25.1",
 ]
 
 test_requirements = [
-    "pytest==6.1.2",
+    "pytest==6.2.5",
     "pytest-cov==2.11.1",
 ]
 

diff --git a/tests/data/pdfs/blank.pdf b/tests/data/pdfs/blank.pdf
diff --git a/tests/data/pdfs/blank_1.pdf b/tests/data/pdfs/blank_1.pdf
diff --git a/tests/data/pdfs/not_blank_image_only.pdf b/tests/data/pdfs/not_blank_image_only.pdf
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
@@ -1,14 +1,13 @@
 import pytest
 from mindee import Inputs
-import fitz
 
 
-def test_mpdf_reconstruct():
+def test_pdf_reconstruct():
     with pytest.raises(Exception):
         Inputs("./tests/data/invoices/invoice_6p.pdf", cut_pdf=True, n_pdf_pages=4)
 
 
-def test_mpdf_reconstruct_check_n_pages():
+def test_pdf_reconstruct_check_n_pages():
     input_obj_3 = Inputs(
         "./tests/data/invoices/invoice_6p.pdf", cut_pdf=True, n_pdf_pages=3
     )
@@ -24,21 +23,23 @@ def test_mpdf_reconstruct_check_n_pages():
     input_obj_2.file_object.seek(0)
     input_obj_1.file_object.seek(0)
 
-    src_3 = fitz.open(
-        stream=input_obj_3.file_object.read(),
-        filetype="application/pdf",
-        filename="test.pdf",
-    )
-    src_2 = fitz.open(
-        stream=input_obj_2.file_object.read(),
-        filetype="application/pdf",
-        filename="test.pdf",
-    )
-    src_1 = fitz.open(
-        stream=input_obj_1.file_object.read(),
-        filetype="application/pdf",
-        filename="test.pdf",
-    )
-    assert len(src_3) == 3
-    assert len(src_2) == 2
-    assert len(src_1) == 1
+    assert input_obj_3.count_pdf_pages() == 3
+    assert input_obj_2.count_pdf_pages() == 2
+    assert input_obj_1.count_pdf_pages() == 1
+
+
+def test_input_from_stream():
+    with open("./tests/data/invoices/invoice_6p.pdf", "rb") as fp:
+        input_obj_1 = Inputs(fp, input_type="stream", cut_pdf=True, n_pdf_pages=1)
+    assert input_obj_1.count_pdf_pages() == 1
+
+
+def test_pdf_blank_check():
+    with pytest.raises(Exception):
+        Inputs("./tests/data/pdfs/blank.pdf")
+
+    with pytest.raises(Exception):
+        Inputs("./tests/data/pdfs/blank_1.pdf")
+
+    input_not_blank = Inputs("./tests/data/pdfs/not_blank_image_only.pdf")
+    assert input_not_blank.count_pdf_pages() == 1