Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding video processing capability to Tesseract component. #243

Merged
merged 4 commits into from
May 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cpp/TesseractOCRTextDetection/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
This repository contains source code and model data for the OpenMPF Tesseract
OCR text detection component.

The component extracts text found in an image, reported as a single track
detection. PDF documents can also be processed with one track detection per
The component extracts text found in an image, video, or generic document.
Image results are reported as a single track
detection [per specified language setting](#detecting-multiple-languages).
Video results are reported as a single track detection per frame and language setting.
PDF documents can also be processed with one track detection per
page. The first page corresponds to the detection property `PAGE_NUM=1`. For
debugging purposes, images converted from documents are stored in a temporary
job directory under `plugin/TesseractOCR/tmp-[job-id]-[random tag]`. This
Expand Down
552 changes: 310 additions & 242 deletions cpp/TesseractOCRTextDetection/TesseractOCRTextDetection.cpp

Large diffs are not rendered by default.

32 changes: 20 additions & 12 deletions cpp/TesseractOCRTextDetection/TesseractOCRTextDetection.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ namespace MPF {

class TessApiWrapper;

class TesseractOCRTextDetection : public MPFImageDetectionComponentAdapter {
class TesseractOCRTextDetection : public MPFDetectionComponent {

public:
bool Init() override;
Expand All @@ -71,6 +71,10 @@ namespace MPF {

std::vector<MPFGenericTrack> GetDetections(const MPFGenericJob &job) override;

std::vector<MPFVideoTrack> GetDetections(const MPFVideoJob &job) override;

std::vector<MPFAudioTrack> GetDetections(const MPFAudioJob &job) override;

std::string GetDetectionType() override;

bool Supports(MPFDetectionDataType data_type) override;
Expand Down Expand Up @@ -158,13 +162,11 @@ namespace MPF {

struct Image_results{
std::vector<OCR_output> detections_by_lang;
MPFDetectionError job_status;
};

struct OCR_results {
std::string text_result;
std::string lang;
MPFDetectionError job_status;
double confidence;
};

Expand All @@ -180,7 +182,6 @@ namespace MPF {

struct PDF_page_results {
std::set<std::string> all_missing_languages;
MPFDetectionError job_status;
std::vector<MPFGenericTrack> *tracks;
};

Expand All @@ -201,7 +202,12 @@ namespace MPF {
}
};

bool process_ocr_text(Properties &detection_properties, const MPFImageJob &job, const OCR_output &ocr_out,
std::vector<MPFImageLocation> process_image_job(const MPFJob &job,
TesseractOCRTextDetection::OCR_filter_settings &ocr_fset,
cv::Mat &image_data,
const std::string &run_dir);

bool process_ocr_text(Properties &detection_properties, const MPFJob &job, const OCR_output &ocr_out,
const TesseractOCRTextDetection::OCR_filter_settings &ocr_fset,
int page_num = -1);

Expand All @@ -222,29 +228,32 @@ namespace MPF {
static void process_parallel_image_runs(OCR_job_inputs &inputs, Image_results &results);
static void process_serial_image_runs(OCR_job_inputs &inputs, Image_results &results);

void preprocess_image(const MPFImageJob &job, cv::Mat &input_image, const OCR_filter_settings &ocr_fset);
void rescale_image(const MPFImageJob &job, cv::Mat &input_image, const OCR_filter_settings &ocr_fset);
void preprocess_image(const MPFJob &job, cv::Mat &input_image, const OCR_filter_settings &ocr_fset);
void rescale_image(const MPFJob &job, cv::Mat &input_image, const OCR_filter_settings &ocr_fset);

static void process_tesseract_lang_model(OCR_job_inputs &input, OCR_results &result);

void set_default_parameters();

void set_read_config_parameters();

void load_settings(const MPFJob &job, OCR_filter_settings &ocr_fset, const Text_type &text_type = Unknown);
void load_settings(const MPFJob &job, OCR_filter_settings &ocr_fset);
void load_image_preprocessing_settings(const MPFJob &job,
OCR_filter_settings &ocr_fset,
const Text_type &text_type = Unknown);

void sharpen(cv::Mat &image, double weight);

static std::string process_osd_lang(const std::string &script_type,
const OCR_filter_settings &ocr_fset);

void get_OSD(OSBestResult &best_result, cv::Mat &imi, const MPFImageJob &job,
void get_OSD(OSBestResult &best_result, cv::Mat &imi, const MPFJob &job,
OCR_filter_settings &ocr_fset,
Properties &detection_properties,
std::string &tessdata_script_dir, std::set<std::string> &missing_languages);

bool get_OSD_rotation(OSResults *results, cv::Mat &imi_scaled, cv::Mat &imi_original,
int &rotation, const MPFImageJob &job, OCR_filter_settings &ocr_fset);
int &rotation, const MPFJob &job, OCR_filter_settings &ocr_fset);

static std::string return_valid_tessdir(const std::string &job_name,
const std::string &lang_str,
Expand All @@ -265,8 +274,7 @@ namespace MPF {

void check_default_languages(const OCR_filter_settings &ocr_fset,
const std::string &job_name,
const std::string &run_dir,
MPFDetectionError &job_status);
const std::string &run_dir);
};

// The primary reason this class exists is that tesseract::TessBaseAPI segfaults when copying or moving.
Expand Down
44 changes: 36 additions & 8 deletions cpp/TesseractOCRTextDetection/sample_tesseract_ocr_detector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ using std::to_string;
void print_usage(char *argv[]) {

std::cout << "Usage: " << argv[0] <<
" <-i | -g> [--osd] [--oem TESSERACT_OEM] <IMAGE_URI | GENERIC_URI> [TESSERACT_LANGUAGE]" <<
" <-i | -v | -g> [--osd] [--oem TESSERACT_OEM] <IMAGE_URI | VIDEO_URI <START_FRAME> <END_FRAME> | GENERIC_URI> [TESSERACT_LANGUAGE]" <<
std::endl << std::endl;
std::cout << "Notes: " << std::endl << std::endl;
std::cout << " -i | -g : Specifies whether to process an image (-i <IMAGE_URI>) or generic document (-g <GENERIC_URI>)." <<
std::cout << " <-i | -v | -g> : Specifies whether to process an image (-i <IMAGE_URI>), video (-v <VIDEO_URI> <START_FRAME> <END_FRAME>), or generic document (-g <GENERIC_URI>)." <<
std::endl << std::endl;
std::cout << " --osd : When provided, runs the job with automatic orientation and script detection (OSD). " <<
std::endl;
Expand Down Expand Up @@ -102,8 +102,8 @@ bool check_options(const std::string &next_option, const int &argc, char *argv[
if (next_option == "--osd") {
algorithm_properties["ENABLE_OSD_AUTOMATION"] = "true";
uri_index++;
} else if (next_option == "--oem" || argc - uri_index > 2) {
std::cout << "Updating OEM MODE " << argv[uri_index + 1];
} else if (next_option == "--oem" && argc - uri_index > 2) {
std::cout << "Updating OEM MODE " << argv[uri_index + 1] << std::endl;
algorithm_properties["TESSERACT_OEM"] = argv[uri_index + 1];
uri_index += 2;
} else {
Expand Down Expand Up @@ -131,18 +131,30 @@ int main(int argc, char *argv[]) {
algorithm_properties["SHARPEN"] = "1.0";
algorithm_properties["ENABLE_OSD_AUTOMATION"] = "false";

int uri_index = 2;
int uri_index = 2, video_params = 0, start_frame = 0, end_frame = 1;

std::string next_option = std::string(argv[uri_index]);
if (check_options(next_option, argc, argv, algorithm_properties, uri_index)) {
next_option = std::string(argv[uri_index]);
check_options(next_option, argc, argv, algorithm_properties, uri_index);
}

if (argc - uri_index == 1) {
if (media_option == "-v") {
video_params = 2;
if (argc - uri_index < 3) {
print_usage(argv);
return 0;

}
start_frame = std::stoi(argv[uri_index+1]);
end_frame = std::stoi(argv[uri_index+2]);
}

if (argc - uri_index - video_params == 1) {
uri = argv[uri_index];
} else if (argc - uri_index == 2) {
} else if (argc - uri_index - video_params == 2) {
uri = argv[uri_index];
algorithm_properties["TESSERACT_LANGUAGE"] = argv[uri_index + 1];
algorithm_properties["TESSERACT_LANGUAGE"] = argv[uri_index + video_params + 1];
} else {
print_usage(argv);
return 0;
Expand Down Expand Up @@ -176,6 +188,22 @@ int main(int argc, char *argv[]) {
print_detection_properties(locations[i].detection_properties, locations[i].confidence);
}
}
else if (media_option == "-v") {
// Run uri as a video data file.
std::cout << "Running job on video data uri: " << uri << std::endl;
MPFVideoJob job(job_name, uri, start_frame, end_frame, algorithm_properties, media_properties);
int count = 0;
for (auto track: im.GetDetections(job)) {
std::cout << "Track number: " << count << std::endl;
std::map<int, MPFImageLocation> locations = track.frame_locations;
std::cout << "Number of image locations: " << locations.size() << std::endl << std::endl;
for (const auto &location: locations) {
std::cout << "Frame number: " << location.first << std::endl;
print_detection_properties(location.second.detection_properties, location.second.confidence);
}
count ++;
}
}
else {
print_usage(argv);
}
Expand Down
13 changes: 13 additions & 0 deletions cpp/TesseractOCRTextDetection/test/data/NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,19 @@ Custom generated pdf for testing document text extraction.
# test-backslash.png
Custom generated image for testing escaped backslash tagging.

# test-video-detection.avi
Short clip of three separate image frames for testing video detection capability.
Contains public domain text from the following sources:

https://en.wikipedia.org/wiki/Diazepam
(Japanese Translation)
Public Domain

http://www.un.org/en/universal-declaration-human-rights/
English text from the Universal
Declaration of Human Rights.
Public Domain

# text-demo.png
Text extracted from open source project https://github.com/tesseract-ocr/tesseract.

Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,25 @@ MPFGenericJob createPDFJob(const std::string &uri, const std::map<std::string, s
return job;
}

/**
 * Builds an MPF video job for use in the OCR tests.
 *
 * The job is always named "OCR_test" and is created with empty media
 * properties; its algorithm properties are populated by
 * setAlgorithmProperties() from the supplied custom map.
 *
 * @param uri    - Path to existing input media.
 * @param start  - Video start frame.
 * @param end    - Video end frame.
 * @param custom - Custom job algorithm properties.
 *
 * @return - An MPF video job with the specified algorithm properties.
 */
MPFVideoJob createVideoJob(const std::string &uri, const int &start, const int &end,
                           const std::map<std::string, std::string> &custom = {}) {
    // Algorithm properties are filled in first so the job is constructed in one step below.
    Properties job_algorithm_props;
    setAlgorithmProperties(job_algorithm_props, custom);

    // No media properties are supplied for test jobs.
    Properties empty_media_props;
    std::string test_job_name("OCR_test");

    return MPFVideoJob(test_job_name, uri, start, end, job_algorithm_props, empty_media_props);
}

/**
* Helper function for running given image job. Checks if job results is not empty.
*
Expand Down Expand Up @@ -136,6 +155,27 @@ void runDocumentDetection(const std::string &doc_path, TesseractOCRTextDetection
ASSERT_FALSE(generic_tracks.empty());
}


/**
* Helper function for running given video job. Checks if job results is not empty.
*
* @param vid_path - Path of given video.
* @param ocr - TesseractOCRTextDetection component for running given job.
* @param video_tracks - Output vector of video detection tracks for given job.
* @param start - Video start frame.
* @param end - Video end frame.
* @param custom - Mapping of input job properties.
*/
void runVideoDetection(const std::string &vid_path, TesseractOCRTextDetection &ocr,
std::vector<MPFVideoTrack> &video_tracks,
const int &start, const int &end,
const std::map<std::string, std::string> &custom = {}) {
MPFVideoJob job = createVideoJob(vid_path, start, end, custom);
video_tracks = ocr.GetDetections(job);
ASSERT_FALSE(video_tracks.empty());
}


/**
* Helper function for checking if running given image job will return no results.
*
Expand Down Expand Up @@ -485,6 +525,40 @@ TEST(TESSERACTOCR, CustomModelTest) {
ASSERT_TRUE(ocr.Close());
}

TEST(TESSERACTOCR, VideoProcessingTest) {

    // Ensure video processing works as expected: run frames 0 through 2 of the
    // sample clip with English as the requested language and OSD automation on,
    // then check the properties reported for each frame.

    TesseractOCRTextDetection ocr;
    ocr.SetRunDirectory("../plugin");
    std::vector<MPFVideoTrack> track_results;
    std::vector<MPFImageLocation> results;
    ASSERT_TRUE(ocr.Init());

    std::map<std::string,std::string> custom_properties = {{"TESSERACT_LANGUAGE", "eng"},
                                                           {"ENABLE_OSD_AUTOMATION","TRUE"}};

    ASSERT_NO_FATAL_FAILURE(runVideoDetection("data/test-video-detection.avi", ocr, track_results, 0, 2, custom_properties));

    // Flatten every track's frame locations into one list, in track order.
    // Iterate by const reference so that tracks and (frame, location) pairs are
    // not copied on each iteration; only the location pushed into results is copied.
    for (const auto &track_result: track_results) {
        for (const auto &result: track_result.frame_locations) {
            results.push_back(result.second);
        }
    }

    // NOTE(review): the trailing integer is presumably the index into results
    // (one entry per frame here) -- confirm against assertInImage.
    assertInImage("data/test-video-detection.avi", "Testing Text Detection", results, "TEXT", 0);
    assertInImage("data/test-video-detection.avi", "eng", results, "TEXT_LANGUAGE", 0);

    assertInImage("data/test-video-detection.avi", "Japanese", results, "OSD_PRIMARY_SCRIPT", 1);
    assertInImage("data/test-video-detection.avi", "Japanese", results, "MISSING_LANGUAGE_MODELS", 1);

    assertInImage("data/test-video-detection.avi", "All human beings", results, "TEXT", 2);
    assertInImage("data/test-video-detection.avi", "Latin", results, "TEXT_LANGUAGE", 2);


    ASSERT_TRUE(ocr.Close());
}

TEST(TESSERACTOCR, ImageProcessingTest) {

// Ensure contrast and unstructured image processing settings are enabled.
Expand Down Expand Up @@ -546,8 +620,6 @@ TEST(TESSERACTOCR, ImageProcessingTest) {
ASSERT_TRUE(ocr.Close());
}



TEST(TESSERACTOCR, ModelTest) {

// Ensure user can specify custom model directory locations.
Expand Down