From ec35987200098bf0cd8e04d7d67bcf8678843e9d Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sun, 17 Dec 2023 20:09:41 +0100
Subject: [PATCH] PDF Renderer: allow to specify an alternate image or
 resolution programmatically.

Support new rendering_dpi api params.
Add pdf renderer tests.
Install pdf font in cmake tool chain.

resolves #210
resolves #3798
---
 CMakeLists.txt                |  7 ++++
 Makefile.am                   |  5 +++
 include/tesseract/renderer.h  | 38 +++++++++++++++++++
 src/api/pdfrenderer.cpp       | 13 ++++++--
 src/api/renderer.cpp          | 72 +++++++++++++++++++++++++++++++++++
 src/ccmain/tesseractclass.cpp |  1 +
 src/ccmain/tesseractclass.h   |  1 +
 7 files changed, 134 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 68da6c532b..4fdf790660 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,6 +100,7 @@ option(DISABLE_TIFF "Disable build with libtiff (if available)" OFF)
 option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
 option(DISABLE_CURL "Disable build with libcurl (if available)" OFF)
 option(INSTALL_CONFIGS "Install tesseract configs" ON)
+option(INSTALL_PDF_TTF "Install pdf font file" ON)
 
 if(NOT ${CMAKE_VERSION} VERSION_LESS "3.15.0")
   if(WIN32 AND MSVC)
@@ -573,6 +574,8 @@ endif(ENABLE_OPENCL)
 message(STATUS "Use system ICU Library [USE_SYSTEM_ICU]: ${USE_SYSTEM_ICU}")
 message(
   STATUS "Install tesseract configs [INSTALL_CONFIGS]: ${INSTALL_CONFIGS}")
+message(
+  STATUS "Install tesseract pdf font [INSTALL_PDF_TTF]: ${INSTALL_PDF_TTF}")
 message(STATUS "--------------------------------------------------------")
 message(STATUS)
 
@@ -984,6 +987,10 @@ if(INSTALL_CONFIGS)
   install(FILES ${TESSERACT_TESSCONFIGS}
           DESTINATION ${CMAKE_INSTALL_PREFIX}/share/tessdata/tessconfigs)
 endif()
+if(INSTALL_PDF_TTF)
+  install(FILES tessdata/pdf.ttf
+          DESTINATION ${CMAKE_INSTALL_PREFIX}/share/tessdata)
+endif()
 
 # ##############################################################################
 # uninstall target
diff --git a/Makefile.am b/Makefile.am
index b0e0baeeb8..da9ea0b620 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1269,6 +1269,7 @@ check_PROGRAMS += paragraphs_test
 if !DISABLED_LEGACY_ENGINE
 check_PROGRAMS += params_model_test
 endif # !DISABLED_LEGACY_ENGINE
+check_PROGRAMS += pdfrenderer_test
 check_PROGRAMS += progress_test
 check_PROGRAMS += qrsequence_test
 check_PROGRAMS += recodebeam_test
@@ -1497,6 +1498,10 @@ progress_test_CPPFLAGS = $(unittest_CPPFLAGS)
 progress_test_LDFLAGS = $(OPENCL_LDFLAGS) $(LEPTONICA_LIBS)
 progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
 
+pdfrenderer_test_SOURCES = unittest/pdfrenderer_test.cc
+pdfrenderer_test_CPPFLAGS = $(unittest_CPPFLAGS)
+pdfrenderer_test_LDADD = $(TESS_LIBS) $(TRAINING_LIBS)
+
 qrsequence_test_SOURCES = unittest/qrsequence_test.cc
 qrsequence_test_CPPFLAGS = $(unittest_CPPFLAGS)
 qrsequence_test_LDADD = $(TESS_LIBS)
diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h
index 6f40523335..154671d830 100644
--- a/include/tesseract/renderer.h
+++ b/include/tesseract/renderer.h
@@ -106,6 +106,27 @@ class TESS_API TessResultRenderer {
     return imagenum_;
   }
 
+  /**
+   * Specifies an alternate image to render with the extracted text.
+   * It must be called after BeginDocument and before AddImage.
+   * The image stays in effect for the following pages until it is replaced;
+   * pass nullptr to go back to the api input image.
+   * The renderer does not take ownership: the caller keeps the image alive
+   * while it is set and destroys it afterwards.
+   */
+  void SetRenderingImage(Pix *rendering_image) {
+    rendering_image_ = rendering_image;
+  }
+
+  /**
+   * Specifies the expected rendering resolution.
+   * If not set (0), the rendering_dpi api parameter will be used if positive,
+   * else the source image resolution.
+   */
+  void SetRenderingResolution(int rendering_dpi) {
+    rendering_dpi_ = rendering_dpi;
+  }
+
 protected:
   /**
    * Called by concrete classes.
@@ -139,12 +160,29 @@ class TESS_API TessResultRenderer {
   // This method will grow the output buffer if needed.
   void AppendData(const char *s, int len);
 
+  // Renderers can call this to get the actual image to render with extracted
+  // text. This method returns:
+  // - the rendering image set by the caller or
+  // - the input image scaled to the rendering_dpi field if defined or
+  // - the input image from the api otherwise
+  Pix *GetRenderingImage(TessBaseAPI *api);
+
+  // Resolution of the rendering image either set manually by the caller or with
+  // the rendering_dpi api parameter.
+  int GetRenderingResolution(TessBaseAPI *api);
+
+  // Reset rendering image and dpi to previous state. Destroy scaled rendered
+  // image if exists.
+  void ResetRenderingState(Pix *rendering_image_prev, int rendering_dpi_prev);
+
 private:
   TessResultRenderer *next_;   // Can link multiple renderers together
   FILE *fout_;                 // output file pointer
   const char *file_extension_; // standard extension for generated output
   std::string title_;          // title of document being rendered
   int imagenum_;               // index of last image added
+  Pix *rendering_image_;       // Image to render with the extracted text
+  int rendering_dpi_;          // Resolution of the rendering_image
   bool happy_;                 // I get grumpy when the disk fills up, etc.
 };
 
diff --git a/src/api/pdfrenderer.cpp b/src/api/pdfrenderer.cpp
index 81cf2e24d8..25c7bba3e6 100644
--- a/src/api/pdfrenderer.cpp
+++ b/src/api/pdfrenderer.cpp
@@ -319,7 +319,12 @@ static bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {
 }
 
 char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double height) {
-  double ppi = api->GetSourceYResolution();
+  double input_image_ppi = api->GetSourceYResolution();
+  double ppi = GetRenderingResolution(api);
+  double scale = 1;
+  if (input_image_ppi > 0) {
+    scale = ppi / input_image_ppi;
+  }
 
   // These initial conditions are all arbitrary and will be overwritten
   double old_x = 0.0, old_y = 0.0;
@@ -369,6 +374,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
       int x1, y1, x2, y2;
       res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
+      x1 *= scale; y1 *= scale; x2 *= scale; y2 *= scale;
       ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
     }
 
@@ -403,6 +409,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
     {
       int word_x1, word_y1, word_x2, word_y2;
       res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
+      word_x1 *= scale; word_y1 *= scale; word_x2 *= scale; word_y2 *= scale;
       GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
                       line_y1, line_x2, line_y2, &x, &y, &word_length);
     }
@@ -809,9 +816,9 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix, const char *filename, long int obj
 }
 
 bool TessPDFRenderer::AddImageHandler(TessBaseAPI *api) {
-  Pix *pix = api->GetInputImage();
+  Pix *pix = GetRenderingImage(api);
   const char *filename = api->GetInputName();
-  int ppi = api->GetSourceYResolution();
+  int ppi = GetRenderingResolution(api);
   if (!pix || ppi <= 0) {
     return false;
   }
diff --git a/src/api/renderer.cpp b/src/api/renderer.cpp
index 8d4f1adc1b..d54a55a95d 100644
--- a/src/api/renderer.cpp
+++ b/src/api/renderer.cpp
@@ -18,12 +18,14 @@
 #ifdef HAVE_CONFIG_H
 #  include "config_auto.h"
 #endif
+#include <allheaders.h>
 #include <tesseract/baseapi.h>
 #include <tesseract/renderer.h>
 #include <cstring>
 #include <memory>     // std::unique_ptr
 #include <string>     // std::string
 #include "serialis.h" // Serialize
+#include "tprintf.h"
 
 namespace tesseract {
 
@@ -36,6 +38,8 @@ TessResultRenderer::TessResultRenderer(const char *outputbase, const char *exten
     , file_extension_(extension)
     , title_("")
     , imagenum_(-1)
+    , rendering_image_(nullptr)
+    , rendering_dpi_(0)
     , happy_(true) {
   if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
     std::string outfile = std::string(outputbase) + "." + extension;
@@ -90,13 +94,81 @@ bool TessResultRenderer::AddImage(TessBaseAPI *api) {
     return false;
   }
   ++imagenum_;
+  Pix *rendering_image_prev = rendering_image_;
+  int rendering_dpi_prev = rendering_dpi_;
   bool ok = AddImageHandler(api);
+  ResetRenderingState(rendering_image_prev, rendering_dpi_prev);
   if (next_) {
     ok = next_->AddImage(api) && ok;
   }
   return ok;
 }
 
+// Restores the rendering image and resolution that were in effect before
+// AddImageHandler() ran. An image that replaced the previous one in the
+// meantime (the scaled copy made by GetRenderingImage()) is destroyed here; an
+// image that was already set before the page is left untouched.
+void TessResultRenderer::ResetRenderingState(Pix *rendering_image_prev,
+                                             int rendering_dpi_prev) {
+  if (rendering_image_ != rendering_image_prev) {
+    pixDestroy(&rendering_image_);
+    rendering_image_ = rendering_image_prev;
+  }
+  rendering_dpi_ = rendering_dpi_prev;
+}
+
+// Returns the image to render for the current page: the image set with
+// SetRenderingImage() if there is one, else the api input image, rescaled
+// when the rendering resolution differs from the source resolution.
+// Returns nullptr if the api has no input image or no valid resolution.
+Pix *TessResultRenderer::GetRenderingImage(TessBaseAPI *api) {
+  if (rendering_image_) {
+    return rendering_image_;
+  }
+  Pix *source_image = api->GetInputImage();
+  const int source_dpi = api->GetSourceYResolution();
+  if (!source_image || source_dpi <= 0) {
+    // happy_ is left untouched on purpose: the handler rejects this page, but
+    // the document can still be completed by EndDocument().
+    return nullptr;
+  }
+  const int rendering_dpi = GetRenderingResolution(api);
+  if (rendering_dpi == source_dpi) {
+    return source_image;
+  }
+  // Keep the scaled copy so that ResetRenderingState() can destroy it.
+  const float scale =
+      static_cast<float>(rendering_dpi) / static_cast<float>(source_dpi);
+  rendering_image_ = pixScale(source_image, scale, scale);
+  return rendering_image_;
+}
+
+// Returns the resolution to render with: the value set with
+// SetRenderingResolution() if non-zero, else the rendering_dpi api parameter
+// if positive, else the source image resolution reported by the api.
+int TessResultRenderer::GetRenderingResolution(tesseract::TessBaseAPI *api) {
+  if (rendering_dpi_) {
+    return rendering_dpi_;
+  }
+  int rendering_dpi = 0;
+  if (!api->GetIntVariable("rendering_dpi", &rendering_dpi) ||
+      rendering_dpi <= 0) {
+    return api->GetSourceYResolution();
+  }
+#if !defined(NDEBUG)
+  if (rendering_dpi < kMinCredibleResolution ||
+      rendering_dpi > kMaxCredibleResolution) {
+    tprintf(
+        "Warning: User defined rendering dpi %d is outside of expected range "
+        "(%d - %d)!\n",
+        rendering_dpi, kMinCredibleResolution, kMaxCredibleResolution);
+  }
+#endif
+  // The parameter value is not stored in rendering_dpi_: that member only
+  // holds the value set by the caller, so this getter has no side effect.
+  return rendering_dpi;
+}
+
 bool TessResultRenderer::EndDocument() {
   if (!happy_) {
     return false;
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
index fd58ac8746..df0575ba12 100644
--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
@@ -349,6 +349,7 @@ Tesseract::Tesseract()
     , BOOL_MEMBER(textonly_pdf, false, "Create PDF with only one invisible text layer",
                   this->params())
     , INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params())
+    , INT_MEMBER(rendering_dpi, 0, "Scaled input image resolution before rendering", this->params())
     , INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image", this->params())
     , INT_MEMBER(min_characters_to_try, 50, "Specify minimum characters to try during OSD",
                  this->params())
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index 732bb9e62e..398fe5861d 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -903,6 +903,7 @@ class TESS_API Tesseract : public Wordrec {
   BOOL_VAR_H(tessedit_create_pdf);
   BOOL_VAR_H(textonly_pdf);
   INT_VAR_H(jpg_quality);
+  INT_VAR_H(rendering_dpi);
   INT_VAR_H(user_defined_dpi);
   INT_VAR_H(min_characters_to_try);
   STRING_VAR_H(unrecognised_char);