Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,11 @@ jobs:
strategy:
matrix:
python-version:
#- "3.5"
- "3.6"
#- "3.6"
- "3.7"
- "3.8"
- "3.9"
#- "3.10"
- "3.10"
steps:
- uses: actions/checkout@v2

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ The full documentation is available [here](https://developers.mindee.com/docs/ge

## Requirements

This library is officially supported on Python 3.6 to 3.9.
This library is officially supported on Python 3.7 to 3.10.

## Install

Expand Down
11 changes: 5 additions & 6 deletions mindee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,13 @@ def _wrap_response(self, input_file, response, document_type):
"Receipt API %s HTTP error: %s"
% (response.status_code, json.dumps(dict_response))
)
elif response.status_code > 201:
if response.status_code > 201:
return Response(
http_response=dict_response,
pages=[],
document=None,
document_type=document_type,
)

return Response.format_response(dict_response, document_type, input_file)

def parse_passport(
Expand Down Expand Up @@ -264,8 +263,8 @@ def dump(self, path):
:param path: file path for storing the response object
:return: (void) save the json response
"""
with open(path, "w") as fp:
json.dump(self.http_response, fp)
with open(path, "w") as handle:
json.dump(self.http_response, handle)

@staticmethod
def load(json_path):
Expand All @@ -274,8 +273,8 @@ def load(json_path):
:return: Full response object loaded from json file
"""
try:
with open(json_path) as fp:
json_response = json.load(fp)
with open(json_path) as handle:
json_response = json.load(handle)

file_input = Inputs.load(
json_response["input_type"],
Expand Down
14 changes: 7 additions & 7 deletions mindee/documents/passport.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,28 +283,28 @@ def __mrz_last_name_checksum(self):
return True

@staticmethod
def check_sum(s):
def check_sum(to_check: str) -> str:
"""
https://en.wikipedia.org/wiki/Machine-readable_passport
:param s: string
:param to_check: string
:return: checksum value for string to_check
"""
checker = 0
alpha_to_num = {c: 10 + i for i, c in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ")}
for i, c in enumerate(s):
for i, chk in enumerate(to_check):
if i % 3 == 0:
weight = 7
elif i % 3 == 1:
weight = 3
else:
weight = 1

if c == "<":
if chk == "<":
val = 0
elif c.isalpha():
val = alpha_to_num[c]
elif chk.isalpha():
val = alpha_to_num[chk]
else:
val = int(c)
val = int(chk)
checker += val * weight
return str(checker % 10)

Expand Down
61 changes: 30 additions & 31 deletions mindee/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from base64 import decodebytes
from mimetypes import guess_type
import fitz
import pikepdf


class Inputs:
Expand Down Expand Up @@ -48,7 +48,7 @@ def __init__(
self.file_extension = guess_type(file)[0]

if input_type == "dummy":
self.file_object = ""
self.file_object = None
self.input_type = ""
self.filename = ""
self.filepath = ""
Expand All @@ -60,6 +60,8 @@ def __init__(
)

if self.file_extension == "application/pdf":
self.check_pdf_open()

count_pages = self.count_pdf_pages()

if cut_pdf is True:
Expand Down Expand Up @@ -99,47 +101,44 @@ def count_pdf_pages(self):
:return: Number of pages in the Input file for pdfs
"""
self.file_object.seek(0)
src = fitz.open(
stream=self.file_object.read(),
filetype=self.file_extension,
filename=self.filename,
)
return len(src)
with pikepdf.open(self.file_object) as pdf:
return len(pdf.pages)

def merge_pdf_pages(self, pages_number):
"""
:param pages_number: List of pages number to use for merging in the original pdf
:return: (void) Set the Input.file with the reconstructed pdf stream
"""
self.file_object.seek(0)
src = fitz.open(stream=self.file_object.read(), filetype="pdf")
doc = fitz.open()
pdf_pages = [src[n] for n in pages_number]
for spage in pdf_pages:
width = spage.MediaBoxSize[0]
height = spage.MediaBoxSize[1]
r = fitz.Rect(0, 0, width, height)
page = doc.new_page(-1, width=width, height=height)
try:
page.showPDFpage(r, src, spage.number)
except:
pass
new_pdf = pikepdf.Pdf.new()
with pikepdf.open(self.file_object) as pdf:
for page_n in pages_number:
new_pdf.pages.append(pdf.pages[page_n])
self.file_object.close()
self.file_object = io.BytesIO(doc.write())
self.file_object = io.BytesIO()
new_pdf.save(self.file_object)

def check_if_document_is_empty(self):
    """
    Look for at least one page with content in the PDF held by
    ``self.file_object``.

    A page is treated as non-empty when its ``/Resources`` entry lists a
    ``/Font`` or an ``/XObject`` key, or when the ``/Length`` of its
    ``/Contents`` is greater than 1000.

    :return: (void) returns as soon as one non-empty page is found
    :raises Exception: "PDF pages are empty" when no page qualifies
    """
    # Rewind first: the stream may have been read by an earlier operation.
    self.file_object.seek(0)
    with pikepdf.open(self.file_object) as pdf:
        for page in pdf.pages:
            resources = page["/Resources"]
            if "/Font" in resources.keys():
                return
            if "/XObject" in resources.keys():
                return
            # Only consulted when the page declares neither fonts nor XObjects.
            if page["/Contents"]["/Length"] > 1000:
                return
    raise Exception("PDF pages are empty")

def check_pdf_open(self):
"""
:return: (void) Check if the document can be opened using pikepdf
"""
self.file_object.seek(0)
src = fitz.open(stream=self.file_object.read(), filetype="pdf")
fitz.open()
for page in src:
if (
len(page.get_images()) > 0
or len(page.get_cdrawings()) > 1
or len(page.get_text()) > 0
):
return
raise Exception("PDF pages are empty")
try:
pikepdf.open(self.file_object)
except Exception as err:
raise Exception("Couldn't open PDF file. %s" % err)
6 changes: 2 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[tool.black]
line-length = 88
target-version = ['py35', 'py36', 'py37']
target-version = ['py36', 'py37', 'py38']
include = '\.pyi?$'

[[tool.mypy.overrides]]
module = ['fitz',]
module = ['pikepdf',]
ignore_missing_imports = true

[tool.pylint.'MESSAGES CONTROL']
Expand All @@ -23,9 +23,7 @@ disable=[
'unidiomatic-typecheck',
'arguments-differ',
'inconsistent-return-statements',
'invalid-name',
'super-init-not-called',
'no-else-raise',
'raise-missing-from',
'consider-iterating-dictionary',
'unspecified-encoding',
Expand Down
10 changes: 9 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,16 @@ chardet==4.0.0
# via requests
idna==2.10
# via requests
pymupdf==1.18.17
lxml==4.7.1
# via pikepdf
packaging==21.3
# via pikepdf
pikepdf==4.3.1
# via mindee (setup.py)
pillow==9.0.0
# via pikepdf
pyparsing==3.0.6
# via packaging
pytz==2021.3
# via mindee (setup.py)
requests==2.25.1
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@


requirements = [
"pikepdf==4.3.1",
"pytz==2021.3",
"PyMuPDF==1.18.17",
"requests==2.25.1",
]

test_requirements = [
"pytest==6.1.2",
"pytest==6.2.5",
"pytest-cov==2.11.1",
]

Expand Down
Binary file added tests/data/pdfs/blank.pdf
Binary file not shown.
Binary file added tests/data/pdfs/blank_1.pdf
Binary file not shown.
Binary file added tests/data/pdfs/not_blank_image_only.pdf
Binary file not shown.
43 changes: 22 additions & 21 deletions tests/test_inputs.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import pytest
from mindee import Inputs
import fitz


def test_mpdf_reconstruct():
def test_pdf_reconstruct():
with pytest.raises(Exception):
Inputs("./tests/data/invoices/invoice_6p.pdf", cut_pdf=True, n_pdf_pages=4)


def test_mpdf_reconstruct_check_n_pages():
def test_pdf_reconstruct_check_n_pages():
input_obj_3 = Inputs(
"./tests/data/invoices/invoice_6p.pdf", cut_pdf=True, n_pdf_pages=3
)
Expand All @@ -24,21 +23,23 @@ def test_mpdf_reconstruct_check_n_pages():
input_obj_2.file_object.seek(0)
input_obj_1.file_object.seek(0)

src_3 = fitz.open(
stream=input_obj_3.file_object.read(),
filetype="application/pdf",
filename="test.pdf",
)
src_2 = fitz.open(
stream=input_obj_2.file_object.read(),
filetype="application/pdf",
filename="test.pdf",
)
src_1 = fitz.open(
stream=input_obj_1.file_object.read(),
filetype="application/pdf",
filename="test.pdf",
)
assert len(src_3) == 3
assert len(src_2) == 2
assert len(src_1) == 1
assert input_obj_3.count_pdf_pages() == 3
assert input_obj_2.count_pdf_pages() == 2
assert input_obj_1.count_pdf_pages() == 1


def test_input_from_stream():
    """Build an Inputs from an open binary stream, cut to one page, and check the page count."""
    pdf_path = "./tests/data/invoices/invoice_6p.pdf"
    with open(pdf_path, "rb") as stream:
        stream_input = Inputs(
            stream, input_type="stream", cut_pdf=True, n_pdf_pages=1
        )
        page_total = stream_input.count_pdf_pages()
        assert page_total == 1


def test_pdf_blank_check():
    """Inputs must raise for the two blank fixture PDFs and load the image-only PDF as one page."""
    blank_fixtures = (
        "./tests/data/pdfs/blank.pdf",
        "./tests/data/pdfs/blank_1.pdf",
    )
    for blank_path in blank_fixtures:
        with pytest.raises(Exception):
            Inputs(blank_path)

    image_only_input = Inputs("./tests/data/pdfs/not_blank_image_only.pdf")
    page_total = image_only_input.count_pdf_pages()
    assert page_total == 1