Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_static-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
submodules: recursive

- name: Set up JDK
uses: actions/setup-java@v4
uses: actions/setup-java@v5
with:
java-version: "11"
distribution: "temurin"
Expand Down
14 changes: 7 additions & 7 deletions src/main/java/com/mindee/extraction/PDFExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -111,26 +111,24 @@ public List<ExtractedPDF> extractSubDocuments(List<List<Integer>> pageIndexes)
return extractedPDFs;
}


/**
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
*
* @param pageIndexes List of page indexes.
* @return a list of extracted files.
* @throws IOException Throws if the file can't be accessed.
*/
public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup> pageIndexes)
throws IOException {
public List<ExtractedPDF> extractInvoices(
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes
) throws IOException {

List<List<Integer>> indexes =
pageIndexes.stream().map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes)
.collect(Collectors.toList());


return extractSubDocuments(indexes);
}


/**
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
*
Expand All @@ -139,8 +137,10 @@ public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup
* @return a list of extracted files.
* @throws IOException Throws if the file can't be accessed.
*/
public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
boolean strict) throws IOException {
public List<ExtractedPDF> extractInvoices(
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
boolean strict
) throws IOException {
List<List<Integer>> correctPageIndexes = new ArrayList<>();
if (!strict) {
return extractInvoices(pageIndexes);
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/com/mindee/input/LocalInputSource.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.mindee.input;

import com.mindee.image.ImageCompressor;
import com.mindee.pdf.PDFUtils;
import com.mindee.pdf.PdfBoxApi;
import com.mindee.pdf.PdfCompressor;
import com.mindee.pdf.PdfOperation;
Expand Down Expand Up @@ -48,6 +49,17 @@ public LocalInputSource(String fileAsBase64, String filename) {
this.filename = filename;
}

/**
 * Report how many pages the current file contains.
 * A file that is not a PDF is always counted as a single page.
 *
 * @return the number of pages in the current file.
 * @throws IOException If an I/O error occurs during the PDF operation.
 */
public int getPageCount() throws IOException {
  // Only PDFs are handed to the PDF page counter; anything else counts as one page.
  return this.isPdf() ? PDFUtils.getNumberOfPages(this.file) : 1;
}

/**
* Applies PDF-specific operations on the current file based on the specified {@code PageOptions}.
Expand Down
11 changes: 10 additions & 1 deletion src/main/java/com/mindee/pdf/PDFUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,16 @@ private PDFUtils() {
* @param inputSource The PDF file.
*/
public static int getNumberOfPages(LocalInputSource inputSource) throws IOException {
PDDocument document = Loader.loadPDF(inputSource.getFile());
return getNumberOfPages(inputSource.getFile());
}

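/**
* Get the number of pages in the PDF.
*
* @param pdfBytes The PDF file as a byte array.
* @return the number of pages in the PDF.
* @throws IOException If an I/O error occurs while loading the PDF.
*/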
public static int getNumberOfPages(byte[] pdfBytes) throws IOException {
PDDocument document = Loader.loadPDF(pdfBytes);
int pageCount = document.getNumberOfPages();
document.close();
return pageCount;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,8 @@ protected Document<InvoiceSplitterV1> getInvoiceSplitterPrediction() throws
protected PredictResponse<InvoiceV4> getInvoicePrediction(LocalInputSource invoicePDF) throws
IOException, MindeeException {
return client.parse(InvoiceV4.class, invoicePDF);

}


protected String prepareInvoiceReturn(String rstFilePath, Document<InvoiceV4> invoicePrediction)
throws IOException {
List<String> rstRefLines = Files.readAllLines(Paths.get(rstFilePath));
Expand All @@ -60,7 +58,7 @@ protected String prepareInvoiceReturn(String rstFilePath, Document<InvoiceV4> in
}

@Test
public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, InterruptedException {
public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedException {
Document<InvoiceSplitterV1> document = getInvoiceSplitterPrediction();
InvoiceSplitterV1 inference = document.getInference();

Expand All @@ -71,29 +69,37 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, Interrup
Assertions.assertEquals(2, extractedPDFsStrict.size());
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(0).getFilename());
Assertions.assertEquals("default_sample_002-002.pdf", extractedPDFsStrict.get(1).getFilename());
PredictResponse<InvoiceV4> invoice0 =
getInvoicePrediction(extractedPDFsStrict.get(0).asInputSource());

PredictResponse<InvoiceV4> invoice0 = getInvoicePrediction(
extractedPDFsStrict.get(0).asInputSource()
);
String testStringRSTInvoice0 = prepareInvoiceReturn(
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p1.rst",
invoice0.getDocument());
Assertions.assertEquals(testStringRSTInvoice0, String.join(String.format("%n"),
invoice0.getDocument().toString().split(System.lineSeparator())));

PredictResponse<InvoiceV4> invoice1 =
getInvoicePrediction(extractedPDFsStrict.get(1).asInputSource());
invoice0.getDocument()
);
double invoice0Ratio = levenshteinRatio(
testStringRSTInvoice0,
String.join(
String.format("%n"),
invoice0.getDocument().toString().split(System.lineSeparator())
)
);
Assertions.assertTrue(invoice0Ratio > 0.90);

PredictResponse<InvoiceV4> invoice1 = getInvoicePrediction(
extractedPDFsStrict.get(1).asInputSource()
);
String testStringRSTInvoice1 = prepareInvoiceReturn(
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p2.rst",
invoice1.getDocument());
Assertions.assertTrue(
levenshteinRatio(
invoice1.getDocument()
);
double invoice1Ratio = levenshteinRatio(
testStringRSTInvoice1,
String.join(String.format("%n"),
invoice1.getDocument().toString().split(System.lineSeparator())
String.join(
String.format("%n"),
invoice1.getDocument().toString().split(System.lineSeparator())
)
) > 0.97);


);
Assertions.assertTrue(invoice1Ratio > 0.90);
}
}
Loading
Loading