diff --git a/.github/workflows/_static-analysis.yml b/.github/workflows/_static-analysis.yml index 1942a07c0..b95c9c16d 100644 --- a/.github/workflows/_static-analysis.yml +++ b/.github/workflows/_static-analysis.yml @@ -13,7 +13,7 @@ jobs: submodules: recursive - name: Set up JDK - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: "11" distribution: "temurin" diff --git a/src/main/java/com/mindee/extraction/PDFExtractor.java b/src/main/java/com/mindee/extraction/PDFExtractor.java index 33d71519e..55b5f7b02 100644 --- a/src/main/java/com/mindee/extraction/PDFExtractor.java +++ b/src/main/java/com/mindee/extraction/PDFExtractor.java @@ -111,7 +111,6 @@ public List extractSubDocuments(List> pageIndexes) return extractedPDFs; } - /** * Extract invoices from the given page indexes (from an invoice-splitter prediction). * @@ -119,18 +118,17 @@ public List extractSubDocuments(List> pageIndexes) * @return a list of extracted files. * @throws IOException Throws if the file can't be accessed. */ - public List extractInvoices(List pageIndexes) - throws IOException { + public List extractInvoices( + List pageIndexes + ) throws IOException { List> indexes = pageIndexes.stream().map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes) .collect(Collectors.toList()); - return extractSubDocuments(indexes); } - /** * Extract invoices from the given page indexes (from an invoice-splitter prediction). * @@ -139,8 +137,10 @@ public List extractInvoices(List extractInvoices(List pageIndexes, - boolean strict) throws IOException { + public List extractInvoices( + List pageIndexes, + boolean strict + ) throws IOException { List> correctPageIndexes = new ArrayList<>(); if (!strict) { return extractInvoices(pageIndexes); diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index ed49b905e..41c812309 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -1,6 +1,7 @@ package com.mindee.input; import com.mindee.image.ImageCompressor; +import com.mindee.pdf.PDFUtils; import com.mindee.pdf.PdfBoxApi; import com.mindee.pdf.PdfCompressor; import com.mindee.pdf.PdfOperation; @@ -48,6 +49,17 @@ public LocalInputSource(String fileAsBase64, String filename) { this.filename = filename; } + /** + * Get the number of pages in the document. + * @return the number of pages in the current file. + * @throws IOException If an I/O error occurs during the PDF operation. + */ + public int getPageCount() throws IOException { + if (!this.isPdf()) { + return 1; + } + return PDFUtils.getNumberOfPages(this.file); + } /** * Applies PDF-specific operations on the current file based on the specified {@code PageOptions}. diff --git a/src/main/java/com/mindee/pdf/PDFUtils.java b/src/main/java/com/mindee/pdf/PDFUtils.java index f21fa0f7f..ca3536aa6 100644 --- a/src/main/java/com/mindee/pdf/PDFUtils.java +++ b/src/main/java/com/mindee/pdf/PDFUtils.java @@ -39,7 +39,16 @@ private PDFUtils() { * @param inputSource The PDF file. */ public static int getNumberOfPages(LocalInputSource inputSource) throws IOException { - PDDocument document = Loader.loadPDF(inputSource.getFile()); + return getNumberOfPages(inputSource.getFile()); + } + + /** + * Get the number of pages in the PDF. + * + * @param pdfBytes The PDF file as a byte array. + */ + public static int getNumberOfPages(byte[] pdfBytes) throws IOException { + PDDocument document = Loader.loadPDF(pdfBytes); int pageCount = document.getNumberOfPages(); document.close(); return pageCount; diff --git a/src/test/java/com/mindee/extraction/InvoiceSplitterAutoExtractionIT.java b/src/test/java/com/mindee/extraction/InvoiceSplitterAutoExtractionIT.java index 035141150..c84936d0a 100644 --- a/src/test/java/com/mindee/extraction/InvoiceSplitterAutoExtractionIT.java +++ b/src/test/java/com/mindee/extraction/InvoiceSplitterAutoExtractionIT.java @@ -42,10 +42,8 @@ protected Document getInvoiceSplitterPrediction() throws protected PredictResponse getInvoicePrediction(LocalInputSource invoicePDF) throws IOException, MindeeException { return client.parse(InvoiceV4.class, invoicePDF); - } - protected String prepareInvoiceReturn(String rstFilePath, Document invoicePrediction) throws IOException { List rstRefLines = Files.readAllLines(Paths.get(rstFilePath)); @@ -60,7 +58,7 @@ protected String prepareInvoiceReturn(String rstFilePath, Document in } @Test - public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, InterruptedException { + public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedException { Document document = getInvoiceSplitterPrediction(); InvoiceSplitterV1 inference = document.getInference(); @@ -71,29 +69,37 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, Interrup Assertions.assertEquals(2, extractedPDFsStrict.size()); Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(0).getFilename()); Assertions.assertEquals("default_sample_002-002.pdf", extractedPDFsStrict.get(1).getFilename()); - PredictResponse invoice0 = - getInvoicePrediction(extractedPDFsStrict.get(0).asInputSource()); + PredictResponse invoice0 = getInvoicePrediction( + extractedPDFsStrict.get(0).asInputSource() + ); String testStringRSTInvoice0 = prepareInvoiceReturn( "src/test/resources/products/invoices/response_v4/summary_full_invoice_p1.rst", - invoice0.getDocument()); - Assertions.assertEquals(testStringRSTInvoice0, String.join(String.format("%n"), - invoice0.getDocument().toString().split(System.lineSeparator()))); - - PredictResponse invoice1 = - getInvoicePrediction(extractedPDFsStrict.get(1).asInputSource()); + invoice0.getDocument() + ); + double invoice0Ratio = levenshteinRatio( + testStringRSTInvoice0, + String.join( + String.format("%n"), + invoice0.getDocument().toString().split(System.lineSeparator()) + ) + ); + Assertions.assertTrue(invoice0Ratio > 0.90); + PredictResponse invoice1 = getInvoicePrediction( + extractedPDFsStrict.get(1).asInputSource() + ); String testStringRSTInvoice1 = prepareInvoiceReturn( "src/test/resources/products/invoices/response_v4/summary_full_invoice_p2.rst", - invoice1.getDocument()); - Assertions.assertTrue( - levenshteinRatio( + invoice1.getDocument() + ); + double invoice1Ratio = levenshteinRatio( testStringRSTInvoice1, - String.join(String.format("%n"), - invoice1.getDocument().toString().split(System.lineSeparator()) + String.join( + String.format("%n"), + invoice1.getDocument().toString().split(System.lineSeparator()) ) - ) > 0.97); - - + ); + Assertions.assertTrue(invoice1Ratio > 0.90); } } diff --git a/src/test/java/com/mindee/input/FileCompressionTest.java b/src/test/java/com/mindee/input/FileCompressionTest.java new file mode 100644 index 000000000..0d76e8305 --- /dev/null +++ b/src/test/java/com/mindee/input/FileCompressionTest.java @@ -0,0 +1,287 @@ +package com.mindee.input; + +import com.mindee.image.ImageCompressor; +import com.mindee.pdf.PdfCompressor; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +public class FileCompressionTest { + + @Test + public void fromInputSource_imageQuality_should_Compress() throws IOException { + LocalInputSource receiptInput = + new LocalInputSource("src/test/resources/file_types/receipt.jpg"); + + receiptInput.compress(40); + Path outputPath = Paths.get("src/test/resources/output/compresstest.jpg"); + + Files.write(outputPath, receiptInput.getFile()); + + Assertions.assertTrue(Files.exists(outputPath)); + + long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); + long compressedFileSize = Files.size(outputPath); + + Assertions.assertTrue( + compressedFileSize < initialFileSize, + "Compressed file size (" + compressedFileSize + + ") should be less than initial file size (" + initialFileSize + ")" + ); + } + + @Test + public void testImageQualityCompressesFromCompressor() throws IOException { + Path outputDir = Paths.get("src/test/resources/output"); + LocalInputSource receiptInput = + new LocalInputSource("src/test/resources/file_types/receipt.jpg"); + List compresses = Arrays.asList( + ImageCompressor.compressImage(receiptInput.getFile(), 100), + ImageCompressor.compressImage(receiptInput.getFile()), + ImageCompressor.compressImage(receiptInput.getFile(), 50), + ImageCompressor.compressImage(receiptInput.getFile(), 10), + ImageCompressor.compressImage(receiptInput.getFile(), 1) + ); + + List outputPaths = Arrays.asList( + outputDir.resolve("compress100.jpg"), + outputDir.resolve("compress75.jpg"), + outputDir.resolve("compress50.jpg"), + outputDir.resolve("compress10.jpg"), + outputDir.resolve("compress1.jpg") + ); + + for (int i = 0; i < compresses.size(); i++) { + Files.write(outputPaths.get(i), compresses.get(i)); + } + + long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); + List compressedFileSizes = outputPaths.stream() + .map(path -> { + try { + return Files.size(path); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + + Assertions.assertTrue( + initialFileSize < compressedFileSizes.get(0), + "Compressed file size (" + compressedFileSizes.get(0) + + ") should be less than initial file size (" + initialFileSize + ")" + ); + Assertions.assertTrue( + initialFileSize < compressedFileSizes.get(1), + "Compressed file size (" + compressedFileSizes.get(1) + + ") should be less than initial file size (" + initialFileSize + ")" + ); + Assertions.assertTrue( + compressedFileSizes.get(1) > compressedFileSizes.get(2), + "Compressed file size (" + compressedFileSizes.get(2) + + ") should be less than initial file size (" + compressedFileSizes.get(1) + ")" + ); + Assertions.assertTrue( + compressedFileSizes.get(2) > compressedFileSizes.get(3), + "Compressed file size (" + compressedFileSizes.get(3) + + ") should be less than initial file size (" + compressedFileSizes.get(2) + ")" + ); + Assertions.assertTrue( + compressedFileSizes.get(3) > compressedFileSizes.get(4), + "Compressed file size (" + compressedFileSizes.get(4) + + ") should be less than initial file size (" + compressedFileSizes.get(3) + ")" + ); + } + + @Test + public void testImageResizeFromInputSource() throws IOException { + Path outputDir = Paths.get("src/test/resources/output"); + LocalInputSource imageResizeInput = + new LocalInputSource("src/test/resources/file_types/receipt.jpg"); + imageResizeInput.compress(75, 250, 1000); + Path outputPath = outputDir.resolve("resize_indirect.jpg"); + Files.write(outputPath, imageResizeInput.getFile()); + + long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); + long resizedFileSize = Files.size(outputPath); + Assertions.assertTrue(resizedFileSize < initialFileSize); + + BufferedImage resizedImage = ImageIO.read(outputPath.toFile()); + Assertions.assertEquals(250, resizedImage.getWidth()); + Assertions.assertEquals(333, resizedImage.getHeight()); + } + + @Test + public void testImageResizeFromCompressor() throws IOException { + Path outputDir = Paths.get("src/test/resources/output"); + LocalInputSource imageResizeInput = + new LocalInputSource("src/test/resources/file_types/receipt.jpg"); + List resizes = Arrays.asList( + ImageCompressor.compressImage(imageResizeInput.getFile(), 75, 500, null), + ImageCompressor.compressImage(imageResizeInput.getFile(), 75, 250, 500), + ImageCompressor.compressImage(imageResizeInput.getFile(), 75, 500, 250), + ImageCompressor.compressImage(imageResizeInput.getFile(), 75, null, 250) + ); + + List outputPaths = Arrays.asList( + outputDir.resolve("resize500xnull.jpg"), + outputDir.resolve("resize250x500.jpg"), + outputDir.resolve("resize500x250.jpg"), + outputDir.resolve("resizenullx250.jpg") + ); + + for (int i = 0; i < resizes.size(); i++) { + Files.write(outputPaths.get(i), resizes.get(i)); + } + + long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); + List resizedFileSizes = outputPaths.stream() + .map(path -> { + try { + return Files.size(path); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toList()); + + Assertions.assertTrue( + initialFileSize > resizedFileSizes.get(0), + "Resized file size (" + resizedFileSizes.get(0) + + ") should be less than initial file size (" + initialFileSize + ")" + ); + Assertions.assertTrue( + resizedFileSizes.get(0) > resizedFileSizes.get(1), + "Resized file size (" + resizedFileSizes.get(1) + + ") should be less than initial file size (" + initialFileSize + ")" + ); + Assertions.assertTrue( + resizedFileSizes.get(1) > resizedFileSizes.get(2), + "Resized file size (" + resizedFileSizes.get(2) + + ") should be less than initial file size (" + resizedFileSizes.get(1) + ")" + ); + Assertions.assertEquals(resizedFileSizes.get(2), resizedFileSizes.get(3), + "Resized file size (" + resizedFileSizes.get(3) + + ") should be less than initial file size (" + resizedFileSizes.get(2) + ")" + ); + } + + @Test + public void testPdfResizeFromInputSource() throws IOException { + Path outputDir = Paths.get("src/test/resources/output"); + Path inputPath = Paths.get("src/test/resources/products/invoice_splitter/default_sample.pdf"); + Path outputPath = outputDir.resolve("resize_indirect.pdf"); + + LocalInputSource pdfResizeInput = new LocalInputSource(inputPath.toString()); + pdfResizeInput.compress(75); + Files.write(outputPath, pdfResizeInput.getFile()); + + long initialFileSize = Files.size(inputPath); + long renderedFileSize = Files.size(outputPath); + + Assertions.assertTrue( + renderedFileSize < initialFileSize, + "Resized file size (" + renderedFileSize + + ") should be less than initial file size (" + initialFileSize + ")" + ); + } + + @Test + public void testPdfResizeFromCompressor() throws IOException { + Path outputDir = Paths.get("src/test/resources/output"); + Path inputPath = Paths.get("src/test/resources/products/invoice_splitter/default_sample.pdf"); + LocalInputSource pdfResizeInput = new LocalInputSource(inputPath.toString()); + + List resizes = Arrays.asList( + PdfCompressor.compressPdf(pdfResizeInput.getFile()), + PdfCompressor.compressPdf(pdfResizeInput.getFile(), 75), + PdfCompressor.compressPdf(pdfResizeInput.getFile(), 50), + PdfCompressor.compressPdf(pdfResizeInput.getFile(), 10) + ); + + List outputPaths = Arrays.asList( + outputDir.resolve("compress85.pdf"), + outputDir.resolve("compress75.pdf"), + outputDir.resolve("compress50.pdf"), + outputDir.resolve("compress10.pdf") + ); + + for (int i = 0; i < resizes.size(); i++) { + Files.write(outputPaths.get(i), resizes.get(i)); + } + + long initialFileSize = Files.size(inputPath); + List renderedFileSizes = outputPaths.stream() + .map(path -> { + try { + return Files.size(path); + } catch (IOException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + + Assertions.assertTrue( + initialFileSize > renderedFileSizes.get(0), + "Compressed file size (" + renderedFileSizes.get(0) + + ") should be less than initial file size (" + initialFileSize + ")" + ); + Assertions.assertTrue( + renderedFileSizes.get(0) > renderedFileSizes.get(1), + "Compressed file size (" + renderedFileSizes.get(1) + + ") should be less than initial file size (" + renderedFileSizes.get(0) + ")" + ); + Assertions.assertTrue( + renderedFileSizes.get(1) > renderedFileSizes.get(2), + "Compressed file size (" + renderedFileSizes.get(2) + + ") should be less than initial file size (" + renderedFileSizes.get(1) + ")" + ); + Assertions.assertTrue( + renderedFileSizes.get(2) > renderedFileSizes.get(3), + "Compressed file size (" + renderedFileSizes.get(3) + + ") should be less than initial file size (" + renderedFileSizes.get(2) + ")" + ); + } + + @Test + public void testPdfResizeWithTextKeepsText() throws IOException { + Path inputPath = Paths.get("src/test/resources/file_types/pdf/multipage.pdf"); + LocalInputSource initialWithText = new LocalInputSource(inputPath.toString()); + byte[] compressedWithText = + PdfCompressor.compressPdf(initialWithText.getFile(), 100, true, false); + + PDDocument originalDoc = Loader.loadPDF(initialWithText.getFile()); + PDDocument compressedDoc = Loader.loadPDF(compressedWithText); + + Assertions.assertEquals(originalDoc.getNumberOfPages(), compressedDoc.getNumberOfPages()); + Assertions.assertNotEquals(originalDoc.hashCode(), compressedDoc.hashCode()); + PDFTextStripper textStripper = new PDFTextStripper(); + for (int i = 0; i < originalDoc.getNumberOfPages(); i++) { + textStripper.setStartPage(i + 1); + textStripper.setEndPage(i + 1); + // The character extractor seems to ignore some whitespaces as they are sometimes used for + // positioning, so we ignore them in the return string. + String originalText = textStripper.getText(originalDoc).trim().replaceAll(" ", ""); + String compressedText = textStripper.getText(compressedDoc).trim().replaceAll(" ", ""); + + Assertions.assertEquals(originalText, compressedText); + Assertions.assertNotEquals( + originalDoc.getPage(i).hashCode(), + compressedDoc.getPage(i).hashCode() + ); + } + + originalDoc.close(); + compressedDoc.close(); + } +} diff --git a/src/test/java/com/mindee/input/LocalInputSourceTest.java b/src/test/java/com/mindee/input/LocalInputSourceTest.java index e9689839d..c72de8475 100644 --- a/src/test/java/com/mindee/input/LocalInputSourceTest.java +++ b/src/test/java/com/mindee/input/LocalInputSourceTest.java @@ -1,82 +1,69 @@ package com.mindee.input; import com.mindee.MindeeException; -import com.mindee.image.ImageCompressor; -import com.mindee.pdf.PdfCompressor; -import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import javax.imageio.ImageIO; import org.apache.commons.codec.binary.Base64; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.text.PDFTextStripper; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; public class LocalInputSourceTest { + void assertMultipagePDF(LocalInputSource inputSource, File file) throws IOException { + Assertions.assertNotNull(inputSource); + + String filename = inputSource.getFilename(); + boolean isPdf = inputSource.isPdf(); + boolean hasSourceText = inputSource.hasSourceText(); + int numberOfPages = inputSource.getPageCount(); + + Assertions.assertTrue(isPdf); + Assertions.assertTrue(hasSourceText); + Assertions.assertEquals(3, numberOfPages); + Assertions.assertEquals("multipage_cut-3.pdf", filename); + Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(file.toPath())); + } + @Test - void loadDocument_withFile_mustReturnAValidLocalInputSource() throws IOException { - File file = new File("src/test/resources/file_types/pdf/multipage.pdf"); + void loadPDF_withFile_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/pdf/multipage_cut-3.pdf"); LocalInputSource localInputSource = new LocalInputSource(file); - Assertions.assertNotNull(localInputSource); - Assertions.assertArrayEquals(localInputSource.getFile(), Files.readAllBytes(file.toPath())); + assertMultipagePDF(localInputSource, file); } @Test - void loadDocument_withInputStream_mustReturnAValidLocalInputSource() throws IOException { - File file = new File("src/test/resources/file_types/pdf/multipage.pdf"); + void loadPDF_withInputStream_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/pdf/multipage_cut-3.pdf"); LocalInputSource localInputSource = new LocalInputSource( Files.newInputStream(file.toPath()), - "multipage.pdf" + "multipage_cut-3.pdf" ); - Assertions.assertNotNull(localInputSource); - Assertions.assertArrayEquals(localInputSource.getFile(), Files.readAllBytes(file.toPath())); + assertMultipagePDF(localInputSource, file); } @Test - void loadDocument_withByteArray_mustReturnAValidLocalInputSource() throws IOException { - File file = new File("src/test/resources/file_types/pdf/multipage.pdf"); + void loadPDF_withByteArray_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/pdf/multipage_cut-3.pdf"); LocalInputSource localInputSource = new LocalInputSource( Files.readAllBytes(file.toPath()), - "multipage.pdf" - ); - Assertions.assertNotNull(localInputSource); - Assertions.assertArrayEquals(localInputSource.getFile(), Files.readAllBytes(file.toPath())); - } - - @Test - void loadDocument_withBase64Encoded_mustReturnAValidLocalInputSource() throws IOException { - File file = new File("src/test/resources/file_types/pdf/multipage.pdf"); - String encodedFile = Base64.encodeBase64String(Files.readAllBytes(file.toPath())); - LocalInputSource localInputSource = new LocalInputSource( - encodedFile, - "multipage.pdf" + "multipage_cut-3.pdf" ); - Assertions.assertNotNull(localInputSource); - Assertions.assertArrayEquals(localInputSource.getFile(), Files.readAllBytes(file.toPath())); + assertMultipagePDF(localInputSource, file); } @Test - void pdf_inputSource_withText_mustDetectSourceText() throws MindeeException, IOException { - File file = new File("src/test/resources/file_types/pdf/multipage.pdf"); + void loadPDF_withBase64Encoded_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/pdf/multipage_cut-3.pdf"); String encodedFile = Base64.encodeBase64String(Files.readAllBytes(file.toPath())); LocalInputSource localInputSource = new LocalInputSource( encodedFile, - "multipage.pdf" + "multipage_cut-3.pdf" ); - Assertions.assertNotNull(localInputSource); - Assertions.assertTrue(localInputSource.hasSourceText()); + assertMultipagePDF(localInputSource, file); } @Test - void pdf_inputSource_withoutText_mustNotDetectSourceText() throws MindeeException, IOException { + void loadPDF__withoutText_mustNotDetectSourceText() throws MindeeException, IOException { File file = new File("src/test/resources/products/invoice_splitter/default_sample.pdf"); String encodedFile = Base64.encodeBase64String(Files.readAllBytes(file.toPath())); LocalInputSource localInputSource = new LocalInputSource( @@ -84,283 +71,61 @@ void pdf_inputSource_withoutText_mustNotDetectSourceText() throws MindeeExceptio "default_sample.pdf" ); Assertions.assertNotNull(localInputSource); + Assertions.assertTrue(localInputSource.isPdf()); Assertions.assertFalse(localInputSource.hasSourceText()); } - @Test - void image_inputSource_mustNotDetectSourceText() throws MindeeException, IOException { - File file = new File("src/test/resources/products/expense_receipts/default_sample.jpg"); - String encodedFile = Base64.encodeBase64String(Files.readAllBytes(file.toPath())); - LocalInputSource localInputSource = new LocalInputSource( - encodedFile, - "default_sample.jpg" - ); - Assertions.assertNotNull(localInputSource); - Assertions.assertFalse(localInputSource.hasSourceText()); - } - - @Test - public void fromInputSource_imageQuality_should_Compress() throws IOException { - LocalInputSource receiptInput = - new LocalInputSource("src/test/resources/file_types/receipt.jpg"); - - receiptInput.compress(40); - Path outputPath = Paths.get("src/test/resources/output/compresstest.jpg"); - - Files.write(outputPath, receiptInput.getFile()); - - Assertions.assertTrue(Files.exists(outputPath)); - - long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); - long compressedFileSize = Files.size(outputPath); - - Assertions.assertTrue( - compressedFileSize < initialFileSize, - "Compressed file size (" + compressedFileSize + - ") should be less than initial file size (" + initialFileSize + ")" - ); - } - - @Test - public void testImageQualityCompressesFromCompressor() throws IOException { - Path outputDir = Paths.get("src/test/resources/output"); - LocalInputSource receiptInput = - new LocalInputSource("src/test/resources/file_types/receipt.jpg"); - List compresses = Arrays.asList( - ImageCompressor.compressImage(receiptInput.getFile(), 100), - ImageCompressor.compressImage(receiptInput.getFile()), - ImageCompressor.compressImage(receiptInput.getFile(), 50), - ImageCompressor.compressImage(receiptInput.getFile(), 10), - ImageCompressor.compressImage(receiptInput.getFile(), 1) - ); - - List outputPaths = Arrays.asList( - outputDir.resolve("compress100.jpg"), - outputDir.resolve("compress75.jpg"), - outputDir.resolve("compress50.jpg"), - outputDir.resolve("compress10.jpg"), - outputDir.resolve("compress1.jpg") - ); - - for (int i = 0; i < compresses.size(); i++) { - Files.write(outputPaths.get(i), compresses.get(i)); - } + void assertImage(LocalInputSource inputSource, File file) throws IOException { + Assertions.assertNotNull(inputSource); - long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); - List compressedFileSizes = outputPaths.stream() - .map(path -> { - try { - return Files.size(path); - } catch (IOException e) { - throw new RuntimeException(e); - } - }).collect(Collectors.toList()); + String filename = inputSource.getFilename(); + boolean isPdf = inputSource.isPdf(); + boolean hasSourceText = inputSource.hasSourceText(); + int numberOfPages = inputSource.getPageCount(); - Assertions.assertTrue( - initialFileSize < compressedFileSizes.get(0), - "Compressed file size (" + compressedFileSizes.get(0) + - ") should be less than initial file size (" + initialFileSize + ")" - ); - Assertions.assertTrue( - initialFileSize < compressedFileSizes.get(1), - "Compressed file size (" + compressedFileSizes.get(1) + - ") should be less than initial file size (" + initialFileSize + ")" - ); - Assertions.assertTrue( - compressedFileSizes.get(1) > compressedFileSizes.get(2), - "Compressed file size (" + compressedFileSizes.get(2) + - ") should be less than initial file size (" + compressedFileSizes.get(1) + ")" - ); - Assertions.assertTrue( - compressedFileSizes.get(2) > compressedFileSizes.get(3), - "Compressed file size (" + compressedFileSizes.get(3) + - ") should be less than initial file size (" + compressedFileSizes.get(2) + ")" - ); - Assertions.assertTrue( - compressedFileSizes.get(3) > compressedFileSizes.get(4), - "Compressed file size (" + compressedFileSizes.get(4) + - ") should be less than initial file size (" + compressedFileSizes.get(3) + ")" - ); + Assertions.assertFalse(isPdf); + Assertions.assertFalse(hasSourceText); + Assertions.assertEquals(1, numberOfPages); + Assertions.assertEquals("receipt.jpg", filename); + Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(file.toPath())); } @Test - public void testImageResizeFromInputSource() throws IOException { - Path outputDir = Paths.get("src/test/resources/output"); - LocalInputSource imageResizeInput = - new LocalInputSource("src/test/resources/file_types/receipt.jpg"); - imageResizeInput.compress(75, 250, 1000); - Path outputPath = outputDir.resolve("resize_indirect.jpg"); - Files.write(outputPath, imageResizeInput.getFile()); - - long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); - long resizedFileSize = Files.size(outputPath); - Assertions.assertTrue(resizedFileSize < initialFileSize); - - BufferedImage resizedImage = ImageIO.read(outputPath.toFile()); - Assertions.assertEquals(250, resizedImage.getWidth()); - Assertions.assertEquals(333, resizedImage.getHeight()); + void loadImage_withFile_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/receipt.jpg"); + LocalInputSource localInputSource = new LocalInputSource(file); + assertImage(localInputSource, file); } @Test - public void testImageResizeFromCompressor() throws IOException { - Path outputDir = Paths.get("src/test/resources/output"); - LocalInputSource imageResizeInput = - new LocalInputSource("src/test/resources/file_types/receipt.jpg"); - List resizes = Arrays.asList( - ImageCompressor.compressImage(imageResizeInput.getFile(), 75, 500, null), - ImageCompressor.compressImage(imageResizeInput.getFile(), 75, 250, 500), - ImageCompressor.compressImage(imageResizeInput.getFile(), 75, 500, 250), - ImageCompressor.compressImage(imageResizeInput.getFile(), 75, null, 250) - ); - - List outputPaths = Arrays.asList( - outputDir.resolve("resize500xnull.jpg"), - outputDir.resolve("resize250x500.jpg"), - outputDir.resolve("resize500x250.jpg"), - outputDir.resolve("resizenullx250.jpg") - ); - - for (int i = 0; i < resizes.size(); i++) { - Files.write(outputPaths.get(i), resizes.get(i)); - } - - long initialFileSize = Files.size(Paths.get("src/test/resources/file_types/receipt.jpg")); - List resizedFileSizes = outputPaths.stream() - .map(path -> { - try { - return Files.size(path); - } catch (IOException e) { - throw new RuntimeException(e); - } - }).collect(Collectors.toList()); - - Assertions.assertTrue( - initialFileSize > resizedFileSizes.get(0), - "Resized file size (" + resizedFileSizes.get(0) + - ") should be less than initial file size (" + initialFileSize + ")" - ); - Assertions.assertTrue( - resizedFileSizes.get(0) > resizedFileSizes.get(1), - "Resized file size (" + resizedFileSizes.get(1) + - ") should be less than initial file size (" + initialFileSize + ")" - ); - Assertions.assertTrue( - resizedFileSizes.get(1) > resizedFileSizes.get(2), - "Resized file size (" + resizedFileSizes.get(2) + - ") should be less than initial file size (" + resizedFileSizes.get(1) + ")" - ); - Assertions.assertEquals(resizedFileSizes.get(2), resizedFileSizes.get(3), - "Resized file size (" + resizedFileSizes.get(3) + - ") should be less than initial file size (" + resizedFileSizes.get(2) + ")" + void loadImage_withInputStream_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/receipt.jpg"); + LocalInputSource localInputSource = new LocalInputSource( + Files.newInputStream(file.toPath()), + "receipt.jpg" ); + assertImage(localInputSource, file); } @Test - public void testPdfResizeFromInputSource() throws IOException { - Path outputDir = Paths.get("src/test/resources/output"); - Path inputPath = Paths.get("src/test/resources/products/invoice_splitter/default_sample.pdf"); - Path outputPath = outputDir.resolve("resize_indirect.pdf"); - - LocalInputSource pdfResizeInput = new LocalInputSource(inputPath.toString()); - pdfResizeInput.compress(75); - Files.write(outputPath, pdfResizeInput.getFile()); - - long initialFileSize = Files.size(inputPath); - long renderedFileSize = Files.size(outputPath); - - Assertions.assertTrue( - renderedFileSize < initialFileSize, - "Resized file size (" + renderedFileSize + - ") should be less than initial file size (" + initialFileSize + ")" + void loadImage_withByteArray_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/receipt.jpg"); + LocalInputSource localInputSource = new LocalInputSource( + Files.readAllBytes(file.toPath()), + "receipt.jpg" ); + assertImage(localInputSource, file); } @Test - public void testPdfResizeFromCompressor() throws IOException { - Path outputDir = Paths.get("src/test/resources/output"); - Path inputPath = Paths.get("src/test/resources/products/invoice_splitter/default_sample.pdf"); - LocalInputSource pdfResizeInput = new LocalInputSource(inputPath.toString()); - - List resizes = Arrays.asList( - PdfCompressor.compressPdf(pdfResizeInput.getFile()), - PdfCompressor.compressPdf(pdfResizeInput.getFile(), 75), - PdfCompressor.compressPdf(pdfResizeInput.getFile(), 50), - PdfCompressor.compressPdf(pdfResizeInput.getFile(), 10) - ); - - List outputPaths = Arrays.asList( - outputDir.resolve("compress85.pdf"), - outputDir.resolve("compress75.pdf"), - outputDir.resolve("compress50.pdf"), - outputDir.resolve("compress10.pdf") - ); - - for (int i = 0; i < resizes.size(); i++) { - Files.write(outputPaths.get(i), resizes.get(i)); - } - - long initialFileSize = Files.size(inputPath); - List renderedFileSizes = outputPaths.stream() - .map(path -> { - try { - return Files.size(path); - } catch (IOException e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - - Assertions.assertTrue( - initialFileSize > renderedFileSizes.get(0), - "Compressed file size (" + renderedFileSizes.get(0) + - ") should be less than initial file size (" + initialFileSize + ")" - ); - Assertions.assertTrue( - renderedFileSizes.get(0) > renderedFileSizes.get(1), - "Compressed file size (" + renderedFileSizes.get(1) + - ") should be less than initial file size (" + renderedFileSizes.get(0) + ")" - ); - Assertions.assertTrue( - renderedFileSizes.get(1) > renderedFileSizes.get(2), - "Compressed file size (" + renderedFileSizes.get(2) + - ") should be less than initial file size (" + renderedFileSizes.get(1) + ")" - ); - Assertions.assertTrue( - renderedFileSizes.get(2) > renderedFileSizes.get(3), - "Compressed file size (" + renderedFileSizes.get(3) + - ") should be less than initial file size (" + renderedFileSizes.get(2) + ")" + void loadImage_withBase64Encoded_mustReturnAValidLocalInputSource() throws IOException { + File file = new File("src/test/resources/file_types/receipt.jpg"); + String encodedFile = Base64.encodeBase64String(Files.readAllBytes(file.toPath())); + LocalInputSource localInputSource = new LocalInputSource( + encodedFile, + "receipt.jpg" ); + assertImage(localInputSource, file); } - @Test - public void testPdfResizeWithTextKeepsText() throws IOException { - Path inputPath = Paths.get("src/test/resources/file_types/pdf/multipage.pdf"); - LocalInputSource initialWithText = new LocalInputSource(inputPath.toString()); - byte[] compressedWithText = - PdfCompressor.compressPdf(initialWithText.getFile(), 100, true, false); - - PDDocument originalDoc = Loader.loadPDF(initialWithText.getFile()); - PDDocument compressedDoc = Loader.loadPDF(compressedWithText); - - Assertions.assertEquals(originalDoc.getNumberOfPages(), compressedDoc.getNumberOfPages()); - Assertions.assertNotEquals(originalDoc.hashCode(), compressedDoc.hashCode()); - PDFTextStripper textStripper = new PDFTextStripper(); - for (int i = 0; i < originalDoc.getNumberOfPages(); i++) { - textStripper.setStartPage(i + 1); - textStripper.setEndPage(i + 1); - // The character extractor seems to ignore some whitespaces as they are sometimes used for - // positioning, so we ignore them in the return string. - String originalText = textStripper.getText(originalDoc).trim().replaceAll(" ", ""); - String compressedText = textStripper.getText(compressedDoc).trim().replaceAll(" ", ""); - - Assertions.assertEquals(originalText, compressedText); - Assertions.assertNotEquals( - originalDoc.getPage(i).hashCode(), - compressedDoc.getPage(i).hashCode() - ); - } - - originalDoc.close(); - compressedDoc.close(); - } } diff --git a/src/test/java/com/mindee/input/URLInputSourceTest.java b/src/test/java/com/mindee/input/URLInputSourceTest.java index 3ca2d4f04..e61d4952a 100644 --- a/src/test/java/com/mindee/input/URLInputSourceTest.java +++ b/src/test/java/com/mindee/input/URLInputSourceTest.java @@ -16,7 +16,6 @@ public class URLInputSourceTest { private static final String TEST_URL = "https://example.com/testfile.pdf"; - private static final String TEST_LOCAL_FILENAME = "testfile.pdf"; private TestableURLInputSource urlInputSource; @BeforeEach diff --git a/src/test/resources b/src/test/resources index bc8356c1c..128075122 160000 --- a/src/test/resources +++ b/src/test/resources @@ -1 +1 @@ -Subproject commit bc8356c1ce52d60351ed3430d336f33366025012 +Subproject commit 1280751220ee3673a79697e1b00be494545ad5f5