Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions mindee/inputs.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
import base64
import io
import mimetypes
import os
from mimetypes import guess_type
from typing import BinaryIO, Optional, Tuple

import pikepdf

from mindee.logger import logger

# Register both HEIC-family extensions under the single "image/heic" type so
# that `mimetypes.guess_type` reports "image/heic" for ".heic" and ".heif".
mimetypes.add_type(type="image/heic", ext=".heic", strict=True)
mimetypes.add_type(type="image/heic", ext=".heif", strict=True)

# Mime types allowed for input documents. Each type is listed exactly once;
# "application/pdf" previously appeared both first and last.
ALLOWED_MIME_TYPES = [
    "application/pdf",
    "image/heic",
    "image/png",
    "image/jpg",
    "image/jpeg",
    "image/tiff",
    "image/webp",
]

# Input-type tag with the value "file"; presumably what a document's
# `input_type` is set to when built from a file object -- TODO confirm.
INPUT_TYPE_FILE = "file"
Expand Down Expand Up @@ -54,7 +59,7 @@ def __init__(
logger.debug("Loaded new document '%s' from %s", self.filename, self.input_type)

def _check_mimetype(self) -> None:
file_mimetype = guess_type(self.filename)[0]
file_mimetype = mimetypes.guess_type(self.filename)[0]
if file_mimetype:
self.file_mimetype = file_mimetype
else:
Expand Down
Binary file added tests/data/receipt/receipt.heic
Binary file not shown.
Binary file added tests/data/receipt/receipt.tif
Binary file not shown.
Binary file added tests/data/receipt/receipt.tiff
Binary file not shown.
72 changes: 56 additions & 16 deletions tests/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,29 @@
import pytest

from mindee.inputs import Base64Document, BytesDocument, FileDocument, PathDocument
from tests import INVOICE_DATA_DIR, RECEIPT_DATA_DIR

#
# PDF
#


def test_pdf_reconstruct_fail():
    """Cutting ``invoice_10p.pdf`` with ``n_pdf_pages=4`` raises ``AssertionError``."""
    with pytest.raises(AssertionError):
        PathDocument(
            f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
            cut_pdf=True,
            n_pdf_pages=4,
        )


def test_pdf_reconstruct_ok():
    """With default options, ``invoice_10p.pdf`` is exposed as an ``io.BytesIO``."""
    input_file = PathDocument(f"{INVOICE_DATA_DIR}/invoice_10p.pdf")
    assert isinstance(input_file.file_object, io.BytesIO)


def test_read_contents():
input_doc = PathDocument("./tests/data/invoice/invoice.pdf")
def test_pdf_read_contents():
input_doc = PathDocument(f"{INVOICE_DATA_DIR}/invoice.pdf")
contents = input_doc.read_contents(close_file=False)
assert contents[0] == "invoice.pdf"
assert isinstance(contents[1], bytes)
Expand All @@ -31,27 +36,28 @@ def test_read_contents():


def test_pdf_reconstruct_no_cut():
    """With ``cut_pdf=False`` all 10 pages remain and the file object is a reader."""
    input_file = PathDocument(f"{INVOICE_DATA_DIR}/invoice_10p.pdf", cut_pdf=False)
    assert input_file.count_pdf_pages() == 10
    assert isinstance(input_file.file_object, io.BufferedReader)


def test_pdf_reconstruct_check_n_pages():
input_obj_3 = PathDocument(
"./tests/data/invoice/invoice_10p.pdf",
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
cut_pdf=True,
n_pdf_pages=3,
)
input_obj_2 = PathDocument(
"./tests/data/invoice/invoice_10p.pdf",
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
cut_pdf=True,
n_pdf_pages=2,
)
input_obj_1 = PathDocument(
"./tests/data/invoice/invoice_10p.pdf",
f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
cut_pdf=True,
n_pdf_pages=1,
)
assert input_obj_1.file_mimetype == "application/pdf"

# re-initialize file pointer
input_obj_3.file_object.seek(0)
Expand All @@ -63,40 +69,44 @@ def test_pdf_reconstruct_check_n_pages():
assert input_obj_1.count_pdf_pages() == 1


def test_pdf_input_from_path():
    """A PDF loaded by path is typed ``application/pdf`` and cut to one page."""
    input_obj_1 = PathDocument(
        f"{INVOICE_DATA_DIR}/invoice_10p.pdf",
        cut_pdf=True,
        n_pdf_pages=1,
    )
    assert input_obj_1.file_mimetype == "application/pdf"
    assert input_obj_1.count_pdf_pages() == 1


def test_pdf_input_from_file():
    """A PDF loaded from a binary file handle is typed and cut to one page."""
    with open(f"{INVOICE_DATA_DIR}/invoice_10p.pdf", "rb") as fp:
        input_obj_1 = FileDocument(fp, cut_pdf=True, n_pdf_pages=1)
        # Checked while `fp` is still open, so the assertions hold whether or
        # not the document keeps reading from the handle it was given.
        assert input_obj_1.file_mimetype == "application/pdf"
        assert input_obj_1.count_pdf_pages() == 1


def test_pdf_input_from_base64():
    """The text of ``invoice_10p.txt`` loads as a one-page ``application/pdf``."""
    with open(f"{INVOICE_DATA_DIR}/invoice_10p.txt", "rt") as fp:
        input_obj_1 = Base64Document(
            fp.read(),
            filename="invoice_10p.pdf",
            cut_pdf=True,
            n_pdf_pages=1,
        )
    assert input_obj_1.file_mimetype == "application/pdf"
    assert input_obj_1.count_pdf_pages() == 1


def test_pdf_input_from_bytes():
    """The raw bytes of ``invoice_10p.pdf`` load as a one-page ``application/pdf``."""
    with open(f"{INVOICE_DATA_DIR}/invoice_10p.pdf", "rb") as fp:
        input_obj_1 = BytesDocument(
            fp.read(),
            filename="invoice_10p.pdf",
            cut_pdf=True,
            n_pdf_pages=1,
        )
    assert input_obj_1.file_mimetype == "application/pdf"
    assert input_obj_1.count_pdf_pages() == 1


Expand All @@ -109,3 +119,33 @@ def test_pdf_blank_check():

input_not_blank = PathDocument("./tests/data/pdfs/not_blank_image_only.pdf")
assert input_not_blank.count_pdf_pages() == 1


#
# Images
#


def test_tif_input_from_path():
    """Both ``receipt.tif`` and ``receipt.tiff`` are detected as ``image/tiff``."""
    # Same order as before: the ".tif" sample is built and checked first.
    for extension in ("tif", "tiff"):
        document = PathDocument(
            f"{RECEIPT_DATA_DIR}/receipt.{extension}",
            cut_pdf=True,
            n_pdf_pages=1,
        )
        assert document.file_mimetype == "image/tiff"


def test_heic_input_from_path():
    """``receipt.heic`` loaded by path is detected as ``image/heic``."""
    heic_path = f"{RECEIPT_DATA_DIR}/receipt.heic"
    document = PathDocument(heic_path, cut_pdf=True, n_pdf_pages=1)
    detected = document.file_mimetype
    assert detected == "image/heic"