diff --git a/.gitignore b/.gitignore index 514fd563..818808e9 100644 --- a/.gitignore +++ b/.gitignore @@ -21,8 +21,6 @@ composer.lock .env.local _test.php .php-cs-fixer.cache -local_test/ -LocalTest.php .phpunit* *.log /build @@ -30,3 +28,8 @@ LocalTest.php .phpdoc docs/_build tests/LocalTestNotUnit.php + +# For local testing +/output/ +local_test/ +LocalTest.php diff --git a/examples/MultiReceiptsAutoExtractionExample.php b/examples/MultiReceiptsAutoExtractionExample.php index 3a921404..05f07282 100644 --- a/examples/MultiReceiptsAutoExtractionExample.php +++ b/examples/MultiReceiptsAutoExtractionExample.php @@ -1,7 +1,7 @@ image = $image; $this->filename = $filename; $this->saveFormat = $saveFormat; + $this->pageId = $pageIndex; + $this->elementId = $index; } /** * Writes the image to a file. * Uses the default image format and filename. * - * @param string $outputPath The output directory (must exist). + * @param string $outputPath The output directory (must exist). + * @param null|string $format The image format to use. Defaults to the save format if not provided. + * @param integer $quality Quality of the saved image. + * * @return void * @throws \ImagickException Throws if the image can't be processed. */ - public function writeToFile(string $outputPath): void + public function writeToFile(string $outputPath, ?string $format = null, int $quality = 100): void { $imagePath = $outputPath . DIRECTORY_SEPARATOR . $this->filename; - $format = $this->getEncodedImageFormat($this->saveFormat); + $format = $this->getEncodedImageFormat($format ?? 
$this->saveFormat); $this->image->setImageFormat($format); + $this->image->stripImage(); + $quality = min(100, max(0, $quality)); + if ('png' === $format) { + $finalQuality = round($quality * 0.09); + $this->image->setOption('png:compression-level', $finalQuality); + } elseif (in_array($format, ['jpg', 'jpeg'])) { + $this->image->setImageCompression(\Imagick::COMPRESSION_JPEG); + } + $this->image->setImageCompressionQuality($quality); $this->image->writeImage($imagePath); } /** * Returns the image in a format suitable for sending to a client for parsing. * - * @throws \ImagickException Throws if the image can't be processed. * @return BytesInput Bytes input for the image. + * + * @throws \ImagickException Throws if the image can't be processed. */ public function asInputSource(): BytesInput { $format = $this->getEncodedImageFormat($this->saveFormat); $this->image->setImageFormat($format); + return new BytesInput($this->image->getImageBlob(), $this->filename); } @@ -82,7 +105,7 @@ public function asInputSource(): BytesInput * Get the encoded image format. * * @param string $saveFormat Format to save the file as. - * @return string + * @return string Encoded image format. */ private function getEncodedImageFormat(string $saveFormat): string { diff --git a/src/Extraction/ExtractedPdf.php b/src/Extraction/ExtractedPdf.php index 1a34ee1b..860d642c 100644 --- a/src/Extraction/ExtractedPdf.php +++ b/src/Extraction/ExtractedPdf.php @@ -16,24 +16,21 @@ class ExtractedPdf { /** - * File object for an ExtractedPdf. - * - * @var string + * @var string name of the original file */ - protected string $pdfBytes; + public string $filename; /** - * Name of the original file. - * - * @var string + * @var string File object for an ExtractedPdf. */ - protected string $filename; + protected string $pdfBytes; /** * Initializes a new instance of the ExtractedPdf class. * * @param string $pdfBytes A binary string representation of the PDF. 
* @param string $filename Name of the original file. + * * @throws MindeeUnhandledException Throws if PDF operations aren't supported. */ public function __construct(string $pdfBytes, string $filename) @@ -47,16 +44,18 @@ public function __construct(string $pdfBytes, string $filename) /** * Wrapper for pdf GetPageCount(). * - * @return integer The number of pages in the file. + * @return integer the number of pages in the file + * * @throws MindeePDFException Throws if FPDI is unable to process the file. */ public function getPageCount(): int { try { - $pdfHandle = new FPDI(); + $pdfHandle = new Fpdi(); $tempFilename = tempnam(sys_get_temp_dir(), 'extracted_pdf_'); file_put_contents($tempFilename, $this->pdfBytes); + return $pdfHandle->setSourceFile($tempFilename); } catch (PdfParserException $e) { throw new MindeePDFException( @@ -76,8 +75,13 @@ public function getPageCount(): int public function writeToFile(string $outputPath): void { $pdfPath = $outputPath . DIRECTORY_SEPARATOR . $this->filename; - if (basename($outputPath) !== '') { - $pdfPath = realpath($outputPath); + if ('' !== basename($outputPath)) { + if (!($pdfPath = realpath($outputPath))) { + $pdfPath = $outputPath; + } + } + if (!str_ends_with(strtolower($pdfPath), 'pdf')) { + $pdfPath .= '.pdf'; } file_put_contents($pdfPath, $this->pdfBytes); } @@ -85,7 +89,7 @@ public function writeToFile(string $outputPath): void /** * Return the file in a format suitable for sending to MindeeClient for parsing. * - * @return BytesInput Bytes input for the image. + * @return BytesInput bytes input for the image */ public function asInputSource(): BytesInput { @@ -93,7 +97,7 @@ public function asInputSource(): BytesInput } /** - * @return string The pdf bytes. + * @return string the pdf bytes */ public function getPdfBytes(): string { @@ -101,7 +105,7 @@ public function getPdfBytes(): string } /** - * @return string The name of the file. 
+ * @return string the name of the file */ public function getFilename(): string { diff --git a/src/Extraction/ImageExtractor.php b/src/Extraction/ImageExtractor.php index 37dcab68..16bc580a 100644 --- a/src/Extraction/ImageExtractor.php +++ b/src/Extraction/ImageExtractor.php @@ -6,9 +6,9 @@ use Mindee\Error\MindeeGeometryException; use Mindee\Error\MindeeImageException; use Mindee\Error\MindeePDFException; -use Mindee\Error\MindeeUnhandledException; use Mindee\Geometry\BBox; use Mindee\Geometry\BBoxUtils; +use Mindee\Geometry\Polygon; use Mindee\Input\LocalInputSource; use Mindee\Parsing\DependencyChecker; use Mindee\Parsing\Standard\BaseField; @@ -19,36 +19,30 @@ class ImageExtractor { /** - * Array of extracted page images. - * - * @var array + * @var \Imagick[] Array of extracted page images. */ - private array $pageImages = []; + protected array $pageImages = []; + /** - * Name of the file. - * - * @var string + * @var string Name of the file. */ - private string $filename; + protected string $filename; + /** - * Format to save the image as. - * - * @var string + * @var string Format to save the image as. */ - private string $saveFormat; + protected string $saveFormat; + /** - * Local input object used by the ImageExtractor. - * - * @var LocalInputSource + * @var LocalInputSource Local input object used by the ImageExtractor. */ protected LocalInputSource $inputSource; - /** - * @param LocalInputSource $localInput Local Input, accepts all compatible formats. - * @param string|null $saveFormat Save format, will be coerced to jpg by default. - * @throws MindeeUnhandledException|MindeePDFException Throws if PDF operations aren't supported, - * or if the file can't be read, respectively. + * @param LocalInputSource $localInput Local input, accepts all compatible formats. + * @param null|string $saveFormat Save format, will be coerced to jpg by default. 
+ * + * @throws MindeePDFException Throws if PDF operations aren't supported, or if the file can't be read, respectively. */ public function __construct(LocalInputSource $localInput, ?string $saveFormat = null) { @@ -58,8 +52,8 @@ public function __construct(LocalInputSource $localInput, ?string $saveFormat = $this->inputSource = $localInput; $extension = pathinfo($localInput->fileName, PATHINFO_EXTENSION); - if ($saveFormat === null) { - if ($extension && strtolower($extension) !== 'pdf') { + if (null === $saveFormat) { + if ($extension && 'pdf' !== strtolower($extension)) { $this->saveFormat = $extension; } else { $this->saveFormat = 'jpg'; @@ -89,7 +83,9 @@ public function __construct(LocalInputSource $localInput, ?string $saveFormat = * Renders the input PDF's pages as individual images. * * @param string $fileBytes Input pdf. - * @return array A list of pages. + * + * @return \Imagick[] A list of pages. + * * @throws MindeeImageException Throws if the image can't be handled. */ public static function pdfToImages(string $fileBytes): array @@ -100,7 +96,7 @@ public static function pdfToImages(string $fileBytes): array $imagick->readImageBlob($fileBytes); foreach ($imagick as $page) { - $page->setImageFormat('png'); + $page->setImageFormat('jpg'); $images[] = $page; } @@ -116,8 +112,7 @@ public static function pdfToImages(string $fileBytes): array /** * Gets the number of pages in the file. - * - * @return integer + * @return integer Page count. */ public function getPageCount(): int { @@ -129,8 +124,9 @@ public function getPageCount(): int * * @param array $fields List of Fields to extract. * @param integer $pageIndex The page index to extract, begins at 0. - * @param string|null $outputName The base output filename, must have an image extension. - * @return array A list of extracted images. + * @param null|string $outputName The base output filename, must have an image extension. 
+ * + * @return array a list of extracted images */ public function extractImagesFromPage(array $fields, int $pageIndex, ?string $outputName = null): array { @@ -141,29 +137,73 @@ public function extractImagesFromPage(array $fields, int $pageIndex, ?string $ou /** * Extracts images from a page. * - * @param array $fields List of Fields to extract. - * @param integer $pageIndex The page index to extract, begins at 0. - * @param string $outputName Name of the created file. - * @return array An array of created images. + * @param array $polygons List of polygons to extract. + * @param integer $pageIndex The page index to extract, begins at 0. + * @param null|string $filenamePrefix Output filename prefix. + * @param null|string $format Save format for extracted images. Defaults to the original format. + * + * @return array an array of created images + * @throws MindeeImageException Throws if the image can't be processed. */ - private function extractFromPage(array $fields, int $pageIndex, string $outputName): array - { - $splitName = $this->splitNameStrict($outputName); - $filename = sprintf("%s_page-%03d.%s", $splitName[0], $pageIndex + 1, $this->saveFormat); + public function extractPolygonsFromPage( + array $polygons, + int $pageIndex, + ?string $filenamePrefix = null, + ?string $format = null + ): array { + $saveFormat = $format ?? 
$this->saveFormat; $extractedImages = []; - $i = 0; - foreach ($fields as $field) { - $extractedImage = $this->extractImage($field, $pageIndex, $i + 1, $filename); - if ($extractedImage !== null) { - $extractedImages[] = $extractedImage; + try { + foreach ($polygons as $i => $polygon) { + $filenamePrefix ??= $this->filename; + $outputFilename = sprintf('%s-%d.%s', $filenamePrefix, $i, $saveFormat); + $extractedImages[] = $this->extractPolygonFromPage( + $polygon, + $pageIndex, + $i, + $outputFilename, + $saveFormat + ); } - $i++; + } catch (\ImagickException $e) { + throw new MindeeImageException($e->getMessage(), $e->getCode(), $e); } return $extractedImages; } + /** + * Extracts a cropped portion from an image. + * + * @param Polygon $polygon Polygon to extract. + * @param integer $pageIndex Page index to extract from. + * @param integer $index Index to use for naming the extracted image. + * @param null|string $filename Output filename. + * @param null|string $format Output format. + * + * @return ExtractedImage Extracted image data. + * @throws MindeeImageException Throws if the image can't be processed. + */ + public function extractPolygonFromPage( + Polygon $polygon, + int $pageIndex, + int $index, + ?string $filename = null, + ?string $format = null + ): ExtractedImage { + $bbox = BBoxUtils::generateBBoxFromPolygon($polygon); + try { + $extractedImageData = $this->extractImageFromBbox($bbox, $pageIndex); + } catch (\ImagickException $e) { + throw new MindeeImageException($e->getMessage(), $e->getCode(), $e); + } + $filename ??= $this->filename; + $format ??= $this->saveFormat; + $filename ??= sprintf('%s.%s_page%d-%d.%s', $filename, $format, $pageIndex, $index, $format); + return new ExtractedImage($extractedImageData, $filename, $format, $pageIndex, $index); + } + /** * Extracts a single image from a Position field. 
* @@ -171,41 +211,43 @@ private function extractFromPage(array $fields, int $pageIndex, string $outputNa * @param integer $pageIndex The page index to extract, begins at 0. * @param integer $index The index to use for naming the extracted image. * @param string $filename The output filename. - * @return ExtractedImage|null The extracted image, or null if the field does not have valid position data. + * @param string $format The output format. + * + * @return null|ExtractedImage The extracted image, or null if the field does not have valid position data. + * * @throws MindeeGeometryException Throws if a field does not contain positional data. */ - public function extractImage(BaseField $field, int $pageIndex, int $index, string $filename): ?ExtractedImage - { - $splitName = $this->splitNameStrict($filename); - $boundingBox = null; + public function extractImage( + BaseField $field, + int $pageIndex, + int $index, + string $filename, + string $format + ): ?ExtractedImage { + $polygon = null; if (!empty($field->polygon)) { - $boundingBox = $field->polygon; + $polygon = $field->polygon; } elseif (!empty($field->boundingBox)) { - $boundingBox = $field->boundingBox; + $polygon = $field->boundingBox; } elseif (!empty($field->quadrangle)) { - $boundingBox = $field->quadrangle; + $polygon = $field->quadrangle; } elseif (!empty($field->rectangle)) { - $boundingBox = $field->rectangle; + $polygon = $field->rectangle; } - if ($boundingBox === null) { + if (null === $polygon) { throw new MindeeGeometryException( - "Provided field has no valid position data.", + 'Provided field has no valid position data.', ErrorCode::GEOMETRIC_OPERATION_FAILED ); } - $bbox = BBoxUtils::generateBBoxFromPolygon($boundingBox); - $fieldFilename = sprintf("%s_%03d.%s", $splitName[0], $index, $this->saveFormat); - $extractedImageData = $this->extractImageFromBbox($bbox, $pageIndex); - - return new ExtractedImage($extractedImageData, $fieldFilename, $this->saveFormat); + return 
$this->extractPolygonFromPage($polygon, $pageIndex, $index, $filename, $format); } /** * Getter for the local input source. - * * @return LocalInputSource */ public function getInputSource(): LocalInputSource @@ -213,14 +255,43 @@ public function getInputSource(): LocalInputSource return $this->inputSource; } + /** + * Extracts images from a page. + * + * @param array $fields List of Fields to extract. + * @param integer $pageIndex The page index to extract, begins at 0. + * @param string $outputName Name of the created file. + * @param string $format The output format. + * + * @return array an array of created images + */ + protected function extractFromPage(array $fields, int $pageIndex, string $outputName, string $format = 'jpg'): array + { + $format ??= $this->saveFormat; + $extractedImages = []; + + $i = 0; + foreach ($fields as $field) { + $filename = sprintf('%s_page%d-%d.%s', $outputName, $pageIndex, $i, $format); + $extractedImage = $this->extractImage($field, $pageIndex, $i, $filename, $format); + if (null !== $extractedImage) { + $extractedImages[] = $extractedImage; + } + ++$i; + } + + return $extractedImages; + } + /** * Extracts an image from a set of coordinates. * - * @param BBox $bbox BBox coordinates. - * @param integer $pageIndex The page index to extract, begins at 0. + * @param BBox $bbox BBox coordinates. + * @param integer|float $pageIndex The page index to extract, begins at 0. * @return \Imagick + * @throws \ImagickException Throws if the image can't be processed. 
*/ - private function extractImageFromBbox(BBox $bbox, int $pageIndex): \Imagick + protected function extractImageFromBbox(BBox $bbox, int|float $pageIndex): \Imagick { $image = $this->pageImages[$pageIndex]->clone(); $width = $image->getImageWidth(); @@ -231,7 +302,7 @@ private function extractImageFromBbox(BBox $bbox, int $pageIndex): \Imagick $minY = round($bbox->getMinY() * $height); $maxY = round($bbox->getMaxY() * $height); - $image->cropImage($maxX - $minX, $maxY - $minY, $minX, $minY); + $image->cropImage((int)($maxX - $minX), (int)($maxY - $minY), (int)$minX, (int)$minY); return $image; } @@ -240,13 +311,13 @@ private function extractImageFromBbox(BBox $bbox, int $pageIndex): \Imagick * Splits the filename into name and extension. * * @param string $filename Name of the file. - * @return array + * @return array An array containing the name and extension of the file. */ - private function splitNameStrict(string $filename): array + protected static function splitNameStrict(string $filename): array { return [ pathinfo($filename, PATHINFO_FILENAME), - pathinfo($filename, PATHINFO_EXTENSION) + pathinfo($filename, PATHINFO_EXTENSION), ]; } } diff --git a/src/Extraction/PdfExtractor.php b/src/Extraction/PdfExtractor.php index 23605f17..36bac699 100644 --- a/src/Extraction/PdfExtractor.php +++ b/src/Extraction/PdfExtractor.php @@ -2,18 +2,14 @@ namespace Mindee\Extraction; -use InvalidArgumentException; use Mindee\Error\MindeePDFException; -use Mindee\Error\MindeeUnhandledException; use Mindee\Input\LocalInputSource; use Mindee\Parsing\DependencyChecker; -use Mindee\Product\InvoiceSplitter\InvoiceSplitterV1InvoicePageGroup; use Mindee\Product\InvoiceSplitter\InvoiceSplitterV1InvoicePageGroups; use setasign\Fpdi\Fpdi; use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException; use setasign\Fpdi\PdfParser\Filter\FilterException; use setasign\Fpdi\PdfParser\PdfParserException; -use setasign\Fpdi\PdfParser\Type\PdfTypeException; use 
setasign\Fpdi\PdfReader\PdfReaderException; /** @@ -22,18 +18,20 @@ class PdfExtractor { /** - * @var string Bytes representation of a file. + * @var string bytes representation of a file */ private string $pdfBytes; + /** - * @var string Name of the file. + * @var string name of the file */ private string $fileName; /** * @param LocalInputSource $localInput Local Input, accepts all compatible formats. - * @throws MindeeUnhandledException|MindeePDFException Throws if PDF operations aren't supported, - * or if the file can't be read, respectively. + * + * @throws MindeePDFException Throws if PDF operations aren't supported, or if the file + * can't be read, respectively. */ public function __construct(LocalInputSource $localInput) { @@ -59,50 +57,54 @@ public function __construct(LocalInputSource $localInput) * Wrapper for pdf GetPageCount(). * * @return integer The number of pages in the file. + * * @throws MindeePDFException Throws if FPDI is unable to process the file. */ public function getPageCount(): int { try { - $pdfHandle = new FPDI(); + $pdfHandle = new Fpdi(); $tempFilename = tempnam(sys_get_temp_dir(), 'extracted_pdf_'); file_put_contents($tempFilename, $this->pdfBytes); + return $pdfHandle->setSourceFile($tempFilename); } catch (PdfParserException $e) { throw new MindeePDFException("Couldn't open PDF file. FPDI sent the following: ", 0, $e); } } - /** * Extracts sub-documents from the source document using list of page indexes. * - * @param array $pageIndexes List of sub-lists of pages to keep. - * @return array List of extracted documents. - * @throws MindeePDFException Throws if FDPF/FPDI wasn't able to handle the pdf during the extraction. - * @throws InvalidArgumentException Throws if invalid indexes are provided. + * @param array|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep. 
+ * + * @return ExtractedPdf[] list of extracted documents + * + * @throws MindeePDFException Throws if FDPF/FPDI wasn't able to handle the pdf during the extraction. + * @throws \InvalidArgumentException Throws if invalid indexes are provided. */ - public function extractSubDocuments(array $pageIndexes): array + public function extractSubDocuments(mixed $pageIndexes): array { $extractedPdfs = []; foreach ($pageIndexes as $pageIndexElem) { if (empty($pageIndexElem)) { - throw new InvalidArgumentException("Empty indexes not allowed for extraction."); + throw new \InvalidArgumentException('Empty indexes not allowed for extraction.'); } $extension = pathinfo($this->fileName, PATHINFO_EXTENSION); $prefix = pathinfo($this->fileName, PATHINFO_FILENAME); $fieldFilename = sprintf( - "%s_%03d-%03d.%s", + '%s_%03d-%03d.%s', $prefix, $pageIndexElem[0] + 1, $pageIndexElem[count($pageIndexElem) - 1] + 1, $extension ); + try { - $pdf = new FPDI(); + $pdf = new Fpdi(); $tempFilename = tempnam(sys_get_temp_dir(), 'extracted_pdf_'); file_put_contents($tempFilename, $this->pdfBytes); $pdf->setSourceFile($tempFilename); @@ -114,13 +116,12 @@ public function extractSubDocuments(array $pageIndexes): array $mergedPdfBytes = $pdf->Output('S'); } catch ( - PdfParserException | CrossReferenceException | FilterException | - PdfTypeException | + PdfParserException | PdfReaderException $e ) { - throw new MindeePDFException("PDF file couldn't be processed during extraction."); + throw new MindeePDFException("PDF file couldn't be processed during extraction.", 0, $e); } $extractedPdfs[] = new ExtractedPdf($mergedPdfBytes, $fieldFilename); } @@ -131,21 +132,22 @@ public function extractSubDocuments(array $pageIndexes): array /** * Extracts invoices as complete PDFs from the document. * - * @param array| InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep. - * @param boolean $strict Whether to trust confidence scores or not. 
- * @return array A list of extracted invoices. + * @param array|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep. + * @param boolean $strict Whether to trust confidence scores or not. + * + * @return ExtractedPdf[] a list of extracted invoices */ - public function extractInvoices($pageIndexes, bool $strict = false): array + public function extractInvoices(mixed $pageIndexes, bool $strict = false): array { if (empty($pageIndexes)) { return []; } if (!$strict) { - $indexes = array_map(function ($invoicePageIndexes) { - return $invoicePageIndexes->pageIndexes; - }, (array)$pageIndexes); + $indexes = array_map(fn ($invoicePageIndexes) => $invoicePageIndexes->pageIndexes, (array) $pageIndexes); + return $this->extractSubDocuments($indexes); - } elseif (is_array($pageIndexes[0])) { + } + if (is_array($pageIndexes[0])) { return $this->extractSubDocuments($pageIndexes); } @@ -158,7 +160,7 @@ public function extractInvoices($pageIndexes, bool $strict = false): array $confidence = $pageIndex->confidence; $pageList = $pageIndex->pageIndexes; - if ($confidence >= 0.5 && $previousConfidence === null) { + if ($confidence >= 0.5 && null === $previousConfidence) { $currentList = $pageList; } elseif ($confidence >= 0.5 && $i !== count($pageIndexes) - 1) { if (!empty($currentList)) { @@ -178,13 +180,14 @@ public function extractInvoices($pageIndexes, bool $strict = false): array } $previousConfidence = $confidence; - $i++; + ++$i; } + return $this->extractSubDocuments($correctPageIndexes); } /** - * @return string Name of the file. + * @return string name of the file */ public function getFileName(): string { diff --git a/src/V1/Image/ImageExtractor.php b/src/V1/Image/ImageExtractor.php new file mode 100644 index 00000000..72656282 --- /dev/null +++ b/src/V1/Image/ImageExtractor.php @@ -0,0 +1,12 @@ +localInput = $localInput; + } + + /** + * Extracts a crop zone from a file. + * + * @param CropItem $crop Crop to extract. 
+ * + * @return ExtractedImage extracted image + */ + public function extractCrop(CropItem $crop): ExtractedImage + { + return $this->extractCrops([$crop])[0]; + } + + /** + * Extracts multiple crop zones from a file. + * + * @param CropItem[] $crops List of crops to extract. + * @return CropFiles list of extracted files + */ + public function extractCrops(array $crops): CropFiles + { + $imageExtractor = new ImageExtractor($this->localInput); + $extractedImages = []; + + $cropsPerPage = []; + foreach ($crops as $crop) { + $cropsPerPage[$crop->location->page][] = $crop; + } + + foreach ($cropsPerPage as $page => $pageCrops) { + $polygons = array_map(fn ($c) => $c->location->polygon, $pageCrops); + $filenamePrefix = sprintf('%s_page%d', $this->localInput->fileName, $page); + + $images = $imageExtractor->extractPolygonsFromPage( + $polygons, + $page, + $filenamePrefix + ); + array_push($extractedImages, ...$images); + } + + return new CropFiles(...$extractedImages); + } +} diff --git a/src/V2/FileOperations/CropFiles.php b/src/V2/FileOperations/CropFiles.php new file mode 100644 index 00000000..14c5999f --- /dev/null +++ b/src/V2/FileOperations/CropFiles.php @@ -0,0 +1,59 @@ + + */ +class CropFiles extends \ArrayObject +{ + /** + * Builds a new CropFiles collection. + * + * @param ExtractedImage ...$items Items. + */ + public function __construct(ExtractedImage ...$items) + { + parent::__construct($items); + } + + /** + * Save all extracted crops to disk. + * + * @param string $path The directory path to save the extracted crops to. + * @param string $prefix Prefix to add to the filename. + * @param null|string $fileFormat File format to save the crops as. + * @param integer $quality Quality of the saved image. + * + * @throws MindeeException If directory creation fails. 
+ * @return void + */ + public function saveAllToDisk( + string $path, + string $prefix = 'crop', + ?string $fileFormat = null, + int $quality = 100 + ): void { + $format ??= $fileFormat; + $idx = 1; + + foreach ($this as $crop) { + $formattedIdx = sprintf('%03d', $idx); + $filename = sprintf('%s_%s.jpg', $prefix, $formattedIdx); + $crop->filename = $filename; + + try { + $crop->writeToFile($path, $format, $quality); + } catch (\ImagickException $e) { + throw new MindeeException('Failed to save crop to disk.', 0, $e); + } + + ++$idx; + } + } +} diff --git a/src/V2/FileOperations/Split.php b/src/V2/FileOperations/Split.php new file mode 100644 index 00000000..43a076e2 --- /dev/null +++ b/src/V2/FileOperations/Split.php @@ -0,0 +1,73 @@ +localInput = $inputSource; + } + + /** + * Expands a range to a list of integers. + * + * @param integer $start Start of the range. + * @param integer $end End of the range. + * + * @return int[] + * + * @throws MindeeInputException If the start page is greater than the end page. + */ + public static function expandRange(int $start, int $end): array + { + if ($start > $end || $start < 0) { + throw new MindeeInputException('Invalid page range provided.'); + } + + return range($start, $end); + } + + /** + * Extracts a single split from the input file. + * + * @param int[] $split Split range to extract. + * + * @return ExtractedPdf 2D array of extracted pages + */ + public function extractSingleSplit(array $split): ExtractedPdf + { + return $this->extractSplits([$split])[0]; + } + + /** + * Extracts the splits from the input file. + * + * @param int[][] $splits List of split ranges to extract. 
+ * + * @return SplitFiles list of extracted files + */ + public function extractSplits(array $splits): SplitFiles + { + $pdfExtractor = new PdfExtractor($this->localInput); + $expandedPageIndexes = array_map(fn (array $split) => self::expandRange($split[0], $split[1]), $splits); + + return new SplitFiles(...$pdfExtractor->extractSubDocuments($expandedPageIndexes)); + } +} diff --git a/src/V2/FileOperations/SplitFiles.php b/src/V2/FileOperations/SplitFiles.php new file mode 100644 index 00000000..b9fefd66 --- /dev/null +++ b/src/V2/FileOperations/SplitFiles.php @@ -0,0 +1,58 @@ + + */ +class SplitFiles extends \ArrayObject +{ + /** + * Builds a new SplitFiles collection. + * + * @param ExtractedPdf ...$items Items. + */ + public function __construct(ExtractedPdf ...$items) + { + parent::__construct($items); + } + + /** + * Save all extracted splits to disk. + * + * @param string $path The directory path to save the extracted splits to. + * @param string $prefix Prefix to add to the filename. + * + * @throws MindeeException If directory creation fails. + * @return void + */ + public function saveAllToDisk(string $path, string $prefix = 'split'): void + { + if (!is_dir($path)) { + if (!mkdir($path, 0o777, true) && !is_dir($path)) { + throw new MindeeException(sprintf('Directory "%s" was not created', $path)); + } + } + + $idx = 1; + + foreach ($this as $split) { + $formattedIdx = sprintf('%03d', $idx); + $filename = sprintf('%s_%s.pdf', $prefix, $formattedIdx); + $filePath = rtrim($path, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 
$filename; + + try { + $split->writeToFile($filePath); + } catch (\Exception $e) { + throw new MindeeException('Failed to save split to disk.', 0, $e->getMessage()); + } + + ++$idx; + } + } +} diff --git a/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php b/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php index a82ba53c..766d8016 100644 --- a/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php +++ b/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php @@ -5,7 +5,7 @@ use Mindee\Error\MindeeUnhandledException; use Mindee\Extraction\ExtractedImage; use Mindee\Extraction\ExtractedPdf; -use Mindee\Extraction\ImageExtractor; +use Mindee\V1\Image\ImageExtractor; use Mindee\Extraction\PdfExtractor; use Mindee\Input\PathInput; use PHPUnit\Framework\TestCase; @@ -31,8 +31,8 @@ public function testNoExtractedImage() $this->expectException(MindeeUnhandledException::class); $inputImage = ""; $filename = "dummy"; - $saveFormat = "pdf;"; - new ExtractedImage($inputImage, $filename, $saveFormat); + $saveFormat = "pdf"; + new ExtractedImage($inputImage, $filename, $saveFormat, 0, 0); } public function testNoExtractedPdf() { diff --git a/tests/V1/Extraction/ImageExtractorTest.php b/tests/V1/Extraction/ImageExtractorTest.php index a0aa503f..15c25af3 100644 --- a/tests/V1/Extraction/ImageExtractorTest.php +++ b/tests/V1/Extraction/ImageExtractorTest.php @@ -3,7 +3,7 @@ namespace V1\Extraction; use Mindee\Client; -use Mindee\Extraction\ImageExtractor; +use Mindee\V1\Image\ImageExtractor; use Mindee\Input\LocalResponse; use Mindee\Input\PathInput; use Mindee\Product\BarcodeReader\BarcodeReaderV1; @@ -39,7 +39,7 @@ public function testGivenAnImageShouldExtractPositionFields() $source = $extractedImage->asInputSource(); $this->assertEquals( - sprintf("default_sample_page-001_%03d.jpg", $i + 1), + sprintf("default_sample.jpg_page0-%d.jpg", $i), $source->fileName ); } @@ -68,7 +68,7 @@ public function testGivenAnImageShouldExtractValueFields() 
$this->assertNotNull($extractedImage->image); $source = $extractedImage->asInputSource(); $this->assertEquals( - sprintf("barcodes_1D_page-001_%03d.jpg", $i + 1), + sprintf("barcodes_1D.jpg_page0-%d.jpg", $i), $source->fileName ); $extractedImage->writeToFile(\TestingUtilities::getRootDataDir() . "/output"); @@ -103,7 +103,7 @@ public function testGivenAPdfShouldExtractPositionFields() $source = $extractedImage->asInputSource(); $this->assertEquals( - sprintf("multipage_sample_page-%03d_%03d.jpg", $page->id + 1, $i + 1), + sprintf("multipage_sample.pdf_page%d-%d.jpg", $page->id, $i), $source->fileName ); } diff --git a/tests/V2/FileOperations/CropFunctional.php b/tests/V2/FileOperations/CropFunctional.php new file mode 100644 index 00000000..93878fb3 --- /dev/null +++ b/tests/V2/FileOperations/CropFunctional.php @@ -0,0 +1,104 @@ +client = new ClientV2($apiKey); + $this->cropModelId = getenv('MINDEE_V2_CROP_MODEL_ID') ?: ''; + $this->findocModelId = getenv('MINDEE_V2_FINDOC_MODEL_ID') ?: ''; + + $this->outputDir = getcwd() . '/output'; + if (!is_dir($this->outputDir)) { + mkdir($this->outputDir, 0777, true); + } + } + + protected function tearDown(): void + { + $file1 = $this->outputDir . '/crop_001.jpg'; + $file2 = $this->outputDir . '/crop_002.jpg'; + + if (file_exists($file1)) { + unlink($file1); + } + if (file_exists($file2)) { + unlink($file2); + } + } + + private function checkFindocReturn(InferenceResponse $findocResponse): void + { + $this->assertGreaterThan(0, strlen($findocResponse->inference->model->id)); + + $totalAmount = $findocResponse->inference->result->fields['total_amount']; + $this->assertNotNull($totalAmount); + $this->assertGreaterThan(0, $totalAmount->value); + } + + public function testExtractCropsFromImageCorrectly(): void + { + $inputSource = new PathInput(\TestingUtilities::getV2ProductDir() . 
'/crop/default_sample.jpg'); + $cropParams = new CropParameters($this->cropModelId); + + $response = $this->client->enqueueAndGetResult(CropResponse::class, $inputSource, $cropParams); + + $this->assertNotNull($response); + $this->assertCount(2, $response->inference->result->crops); + + $cropOperation = new Crop($inputSource); + $extractedImages = $cropOperation->extractCrops($response->inference->result->crops); + + $this->assertCount(2, $extractedImages); + $this->assertEquals('default_sample.jpg_page0-0.jpg', $extractedImages[0]->filename); + $this->assertEquals('default_sample.jpg_page0-1.jpg', $extractedImages[1]->filename); + + $extractionInput = $extractedImages[0]->asInputSource(); + $findocParams = new InferenceParameters($this->findocModelId); + + $invoice0 = $this->client->enqueueAndGetResult(InferenceResponse::class, $extractionInput, $findocParams); + + $this->checkFindocReturn($invoice0); + + $extractedImages->saveAllToDisk($this->outputDir, quality: 50); + + $file1Info = filesize($this->outputDir . '/crop_001.jpg'); + $this->assertGreaterThanOrEqual(98000, $file1Info); + $this->assertLessThanOrEqual(110000, $file1Info); + + $file2Info = filesize($this->outputDir . '/crop_002.jpg'); + $this->assertGreaterThanOrEqual(98000, $file2Info); + $this->assertLessThanOrEqual(110000, $file2Info); + } + + public function testExtractCropsFromEachPdfPageCorrectly(): void + { + $inputSource = new PathInput(\TestingUtilities::getV2ProductDir() . 
'/crop/multipage_sample.pdf'); + $cropParams = new CropParameters($this->cropModelId); + + $response = $this->client->enqueueAndGetResult(CropResponse::class, $inputSource, $cropParams); + $cropOperation = new Crop($inputSource); + $extractedImages = $cropOperation->extractCrops($response->inference->result->crops); + + $this->assertCount(5, $extractedImages); + $this->assertEquals('multipage_sample.pdf_page0-0.jpg', $extractedImages[0]->filename); + $this->assertEquals('multipage_sample.pdf_page1-0.jpg', $extractedImages[3]->filename); + } +} diff --git a/tests/V2/FileOperations/CropTest.php b/tests/V2/FileOperations/CropTest.php new file mode 100644 index 00000000..51ddb48c --- /dev/null +++ b/tests/V2/FileOperations/CropTest.php @@ -0,0 +1,68 @@ +cropDataDir = \TestingUtilities::getV2DataDir() . '/products/crop'; + } + + public function testProcessesSinglePageCropSplitCorrectly(): void + { + $inputSample = new PathInput($this->cropDataDir . '/default_sample.jpg'); + + $localResponse = new LocalResponse($this->cropDataDir . '/crop_single.json'); + $doc = $localResponse->deserializeResponse(CropResponse::class); + + $cropOperation = new Crop($inputSample); + $extractedCrops = $cropOperation->extractCrops($doc->inference->result->crops); + + $this->assertCount(1, $extractedCrops); + + $this->assertEquals(0, $extractedCrops[0]->pageId); + $this->assertEquals(0, $extractedCrops[0]->elementId); + + $bitmap0 = $extractedCrops[0]->image; + + $this->assertEquals(2822, $bitmap0->width ?? $bitmap0->getWidth()); + $this->assertEquals(1572, $bitmap0->height ?? $bitmap0->getHeight()); + } + + public function testProcessesMultiPageReceiptSplitCorrectly(): void + { + $inputSample = new PathInput($this->cropDataDir . '/multipage_sample.pdf'); + + $localResponse = new LocalResponse($this->cropDataDir . 
'/crop_multiple.json'); + $doc = $localResponse->deserializeResponse(CropResponse::class); + + $cropOperation = new Crop($inputSample); + $extractedCrops = $cropOperation->extractCrops($doc->inference->result->crops); + + $this->assertCount(2, $extractedCrops); + + $this->assertEquals(0, $extractedCrops[0]->pageId); + $this->assertEquals(0, $extractedCrops[0]->elementId); + + $bitmap0 = $extractedCrops[0]->image; + $this->assertEquals(156, $bitmap0->width ?? $bitmap0->getWidth()); + $this->assertEquals(757, $bitmap0->height ?? $bitmap0->getHeight()); + + $this->assertEquals(0, $extractedCrops[1]->pageId); + $this->assertEquals(1, $extractedCrops[1]->elementId); + + $bitmap1 = $extractedCrops[1]->image; + $this->assertEquals(188, $bitmap1->width ?? $bitmap1->getWidth()); + $this->assertEquals(691, $bitmap1->height ?? $bitmap1->getHeight()); + } +} diff --git a/tests/V2/FileOperations/SplitFunctional.php b/tests/V2/FileOperations/SplitFunctional.php new file mode 100644 index 00000000..dd12191d --- /dev/null +++ b/tests/V2/FileOperations/SplitFunctional.php @@ -0,0 +1,95 @@ +client = new ClientV2($apiKey); + $this->splitModelId = getenv('MINDEE_V2_SPLIT_MODEL_ID') ?: ''; + $this->findocModelId = getenv('MINDEE_V2_FINDOC_MODEL_ID') ?: ''; + + $this->outputDir = getcwd() . '/output'; + if (!is_dir($this->outputDir)) { + mkdir($this->outputDir, 0777, true); + } + } + + protected function tearDown(): void + { + $file1 = $this->outputDir . '/split_001.pdf'; + $file2 = $this->outputDir . 
'/split_002.pdf'; + + if (file_exists($file1)) { + unlink($file1); + } + if (file_exists($file2)) { + unlink($file2); + } + } + + private function checkFindocReturn(InferenceResponse $findocResponse): void + { + $this->assertGreaterThan(0, strlen($findocResponse->inference->model->id)); + + $totalAmount = $findocResponse->inference->result->fields['total_amount']; + $this->assertNotNull($totalAmount); + $this->assertGreaterThan(0, $totalAmount->value); + } + + public function testExtractSplitsFromPdfCorrectly(): void + { + $inputSource = new PathInput(\TestingUtilities::getV2ProductDir() . '/split/default_sample.pdf'); + $splitParams = new SplitParameters($this->splitModelId); + + $response = $this->client->enqueueAndGetResult(SplitResponse::class, $inputSource, $splitParams); + + $this->assertNotNull($response); + $this->assertCount(2, $response->inference->result->splits); + + $splitOperation = new Split($inputSource); + $extractedSplits = $splitOperation->extractSplits( + array_map(fn($s) => $s->pageRange, $response->inference->result->splits) + ); + + $this->assertCount(2, $extractedSplits); + $this->assertEquals('default_sample_001-001.pdf', $extractedSplits[0]->filename); + $this->assertEquals('default_sample_002-002.pdf', $extractedSplits[1]->filename); + + $inferenceInput = $extractedSplits[0]->asInputSource(); + $findocParams = new InferenceParameters($this->findocModelId); + + $invoice0 = $this->client->enqueueAndGetResult(InferenceResponse::class, $inferenceInput, $findocParams); + + $this->checkFindocReturn($invoice0); + + $extractedSplits->saveAllToDisk($this->outputDir); + + for ($i = 0; $i < count($extractedSplits); $i++) { + $fileName = sprintf('split_%03d.pdf', $i + 1); + $filePath = $this->outputDir . '/' . 
$fileName; + + $this->assertFileExists($filePath); + $this->assertGreaterThan(0, filesize($filePath)); + + $localInput = new PathInput($filePath); + $this->assertEquals($extractedSplits[$i]->getPageCount(), $localInput->getPageCount()); + } + } +} \ No newline at end of file diff --git a/tests/V2/FileOperations/SplitTest.php b/tests/V2/FileOperations/SplitTest.php new file mode 100644 index 00000000..97f8e833 --- /dev/null +++ b/tests/V2/FileOperations/SplitTest.php @@ -0,0 +1,55 @@ +splitDataDir = \TestingUtilities::getV2DataDir() . '/products/split'; + $this->finDocDataDir = \TestingUtilities::getV2DataDir() . '/products/extraction/financial_document'; + } + + public function testProcessesSinglePageSplitCorrectly(): void + { + $inputSample = new PathInput($this->finDocDataDir . '/default_sample.jpg'); + + $localResponse = new LocalResponse($this->splitDataDir . '/split_single.json'); + $doc = $localResponse->deserializeResponse(SplitResponse::class); + + $splitOperation = new Split($inputSample); + $splits = $doc->inference->result->splits; + $extractedSplits = $splitOperation->extractSplits(array_map(fn($s) => $s->pageRange, $splits)); + + $this->assertCount(1, $extractedSplits); + + $this->assertEquals(1, $extractedSplits[0]->getPageCount()); + } + + public function testProcessesMultiPageReceiptSplitCorrectly(): void + { + $inputSample = new PathInput($this->splitDataDir . '/invoice_5p.pdf'); + + $localResponse = new LocalResponse($this->splitDataDir . 
'/split_multiple.json'); + $doc = $localResponse->deserializeResponse(SplitResponse::class); + + $splitOperation = new Split($inputSample); + $splits = $doc->inference->result->splits; + $extractedSplits = $splitOperation->extractSplits(array_map(fn($s) => $s->pageRange, $splits)); + + $this->assertCount(3, $extractedSplits); + + $this->assertEquals(1, $extractedSplits[0]->getPageCount()); + $this->assertEquals(3, $extractedSplits[1]->getPageCount()); + $this->assertEquals(1, $extractedSplits[2]->getPageCount()); + } +}