From 2bfb407cef88281c59765e2a34031db78f8fd363 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Fri, 14 Apr 2023 15:22:39 +0700 Subject: [PATCH] fix: improve Google Cloud OCR processing - gzip all JSON OCRs when saving OCR file on disk - add new `created_at` field to save the timestamp of OCR generation --- lib/ProductOpener/Images.pm | 20 +++++---- lib/ProductOpener/Test.pm | 44 +++++++++++++++++++ scripts/run_cloud_vision_ocr.pl | 2 +- stop_words.txt | 1 + .../run_cloud_vision_ocr/ocr_data.json | 3 -- tests/integration/run_cloud_vision_ocr.t | 6 +-- .../send_image_to_cloud_vision/ocr_data.json | 3 -- .../ocr_data_2.json | 3 -- .../ocr_data_3.json | 3 -- tests/unit/send_image_to_cloud_vision.t | 31 +++++++------ 10 files changed, 78 insertions(+), 38 deletions(-) delete mode 100644 tests/integration/expected_test_results/run_cloud_vision_ocr/ocr_data.json delete mode 100644 tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data.json delete mode 100644 tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_2.json delete mode 100644 tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_3.json diff --git a/lib/ProductOpener/Images.pm b/lib/ProductOpener/Images.pm index 7d9dcb9f31c80..54d4f9e4e2b0f 100644 --- a/lib/ProductOpener/Images.pm +++ b/lib/ProductOpener/Images.pm @@ -140,6 +140,7 @@ use ProductOpener::URL qw/:all/; use ProductOpener::Users qw/:all/; use ProductOpener::Text qw/:all/; +use IO::Compress::Gzip qw(gzip $GzipError); use Log::Any qw($log); use Encode; use JSON::PP; @@ -2002,7 +2003,7 @@ sub extract_text_from_image ($product_ref, $id, $field, $ocr_engine, $results_re } elsif ($ocr_engine eq 'google_cloud_vision') { - my $json_file = "$www_root/images/products/$path/$filename.json"; + my $json_file = "$www_root/images/products/$path/$filename.json.gz"; open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log"); my $cloudvision_ref = 
send_image_to_cloud_vision($image, $json_file, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs); close $gv_logs; @@ -2047,7 +2048,7 @@ Call to Google Cloud vision API =head4 $image_path - str path to image -=head4 $json_file - str path to the file where we will store OCR result as JSON +=head4 $json_file - str path to the file where we will store OCR result as gzipped JSON =head4 $features_ref - hash reference - the "features" parameter of Google Cloud Vision @@ -2108,14 +2109,17 @@ sub send_image_to_cloud_vision ($image_path, $json_file, $features_ref, $gv_logs $cloudvision_ref = decode_json($json_response); - $log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info(); + # Adding creation timestamp, to know when the OCR has been generated + $cloudvision_ref->{created_at} = time(); - # UTF-8 issue , see https://stackoverflow.com/questions/4572007/perl-lwpuseragent-mishandling-utf-8-response - $json_response = decode("utf8", $json_response); + $log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info(); - if (open(my $OUT, ">:encoding(UTF-8)", $json_file)) { - print($OUT $json_response); - close($OUT); + if (open(my $OUT, ">:raw", $json_file)) { + my $gzip_handle = IO::Compress::Gzip->new($OUT) + or die "Cannot create gzip filehandle: $GzipError\n"; + my $encoded_json = encode_json($cloudvision_ref); + $gzip_handle->print($encoded_json); + $gzip_handle->close; print($gv_logs "--> cloud vision success for $image_path\n"); } diff --git a/lib/ProductOpener/Test.pm b/lib/ProductOpener/Test.pm index ffc688b751f4f..b9e15ebd910d5 100644 --- a/lib/ProductOpener/Test.pm +++ b/lib/ProductOpener/Test.pm @@ -51,6 +51,8 @@ BEGIN { &remove_all_orgs &check_not_production &wait_for + &read_gzip_file + &check_ocr_result ); # symbols to export on request %EXPORT_TAGS = (all => [@EXPORT_OK]); } @@ -66,15 +68,57 @@ use ProductOpener::Store "store"; use Carp qw/confess/; use Data::DeepAccess qw(deep_exists 
deep_get deep_set); use Getopt::Long; +use IO::Uncompress::AnyInflate qw(anyinflate $AnyInflateError); use Test::More; use JSON "decode_json"; use File::Basename "fileparse"; use File::Path qw/make_path remove_tree/; use File::Copy; use Path::Tiny qw/path/; +use Scalar::Util qw(looks_like_number); use Log::Any qw($log);
+=head2 read_gzip_file($filepath)
+
+Read a gzipped file and return its decompressed content. Dies on failure.
+
+=head3 Parameters
+
+=head4 String $filepath
+The path of the gzipped file.
+
+=cut
+
+sub read_gzip_file ($filepath) {
+	open(my $input, "<:raw", $filepath) or die "Cannot open '$filepath': $!\n";
+	anyinflate($input => \my $buffer) or die "anyinflate failed: $AnyInflateError\n";
+	return $buffer;
+}
+
+=head2 check_ocr_result($ocr_result)
+
+Check that OCR result returned by Google Cloud Vision is as expected:
+ - `responses` field is an array reference holding a single [response] object
+ - `created_at` numeric field
+
+=head3 Parameters
+
+=head4 Hash reference $ocr_result
+Decoded OCR result JSON, as returned by Google Cloud Vision.
+
+=cut
+
+sub check_ocr_result ($ocr_result) {
+	ok(defined $ocr_result->{responses}, "OCR result contains the 'responses' field");
+	# count the elements of the array, not the reference to it (which is always 1 item)
+	my @responses = (ref($ocr_result->{responses}) eq 'ARRAY') ? @{$ocr_result->{responses}} : ();
+	is(scalar @responses, 1, "OCR result contains a single response");
+	my $created_at = $ocr_result->{created_at} // 'undef';
+	ok(looks_like_number($created_at), "OCR result `created_at` field is valid, $created_at");
+	return;
+}
+
 =head2 init_expected_results($filepath) Handles test options around expected_results initialization diff --git a/scripts/run_cloud_vision_ocr.pl b/scripts/run_cloud_vision_ocr.pl index 68376a3a952f4..1c5bb7800d1a1 100755 --- a/scripts/run_cloud_vision_ocr.pl +++ b/scripts/run_cloud_vision_ocr.pl @@ -69,7 +69,7 @@ ($file) my $json_file = $destination; $json_file =~ s/\.([^\.]+)$//; - $json_file .= ".json"; + $json_file .= ".json.gz"; print $LOG "file: $file destination: $destination code: $code image_url: $image_url json_file: $json_file\n"; open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log"); diff --git a/stop_words.txt b/stop_words.txt index f6d991ceb7aad..18fa94991cd0e 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -217,3 +217,4 @@ weigher weighers www xml +gzipped \ No newline at end of file diff --git a/tests/integration/expected_test_results/run_cloud_vision_ocr/ocr_data.json b/tests/integration/expected_test_results/run_cloud_vision_ocr/ocr_data.json deleted file mode 100644 index 1f8d1d822dd56..0000000000000 --- a/tests/integration/expected_test_results/run_cloud_vision_ocr/ocr_data.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "ocr" : "success" -} diff --git a/tests/integration/run_cloud_vision_ocr.t b/tests/integration/run_cloud_vision_ocr.t index 35b732caf9ab4..8fa22cfd5e2bc 100644 --- a/tests/integration/run_cloud_vision_ocr.t +++ b/tests/integration/run_cloud_vision_ocr.t @@ -39,7 +39,7 @@ dircopy("$sample_products_images_path/$product_code_path", $image_dir); fcopy($input_image_path, "$image_dir/2.jpg"); # fake
responses for OCR and robtoff my @responses = ( - HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"ocr": "success"}'), + HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"responses": [{}]}'), HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"robotoff": "success"}'), ); my $dump_path = File::Temp->newdir(); @@ -56,10 +56,10 @@ is(scalar @requests, 2, "Two request issued"); my $ocr_request = retrieve("$dump_path/req-0.sto"); my $request_json_body = decode_json($ocr_request->content()); compare_to_expected_results($request_json_body, "$expected_result_dir/ocr_request_body.json", $update_expected_results); -my $ocr_content = read_file("$image_dir/2.json"); +my $ocr_content = read_gzip_file("$image_dir/2.json.gz"); ok($ocr_content, "OCR file is not empty"); my $ocr_data = decode_json($ocr_content); -compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results); +check_ocr_result($ocr_data); my $robotoff_request = retrieve("$dump_path/req-1.sto"); # we have url encoded parameters, and order might change --> convert to hash my $request_content = url_params_mixed($robotoff_request->content()); diff --git a/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data.json b/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data.json deleted file mode 100644 index 07046c2aa314e..0000000000000 --- a/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "foo" : "blah" -} diff --git a/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_2.json b/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_2.json deleted file mode 100644 index 8dd659ba8030f..0000000000000 --- a/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_2.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "foo" : "bar" -} diff --git a/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_3.json 
b/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_3.json deleted file mode 100644 index 8dd659ba8030f..0000000000000 --- a/tests/unit/expected_test_results/send_image_to_cloud_vision/ocr_data_3.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "foo" : "bar" -} diff --git a/tests/unit/send_image_to_cloud_vision.t b/tests/unit/send_image_to_cloud_vision.t index 4f483c823d176..45be363b60ccb 100644 --- a/tests/unit/send_image_to_cloud_vision.t +++ b/tests/unit/send_image_to_cloud_vision.t @@ -14,6 +14,9 @@ use ProductOpener::Images qw/:all/; my ($test_id, $test_dir, $expected_result_dir, $update_expected_results) = (init_expected_results(__FILE__)); +# Default OCR response, containing a single response element +my $ocr_default_response = '{"responses": [{}]}'; + my @ua_requests = (); # put responses for call to requests here, we will pop first my @ua_responses = (); @@ -35,9 +38,9 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg"; # normal test open(my $gv_logs, ">:encoding(UTF-8)", $gv_logs_path); - my $json_path = $tmp_dir . "/small-img.json"; + my $json_path = $tmp_dir . "/small-img.json.gz"; # expected response - my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}'); + my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response); push @ua_responses, $response; send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs); close($gv_logs); @@ -45,16 +48,16 @@ my $image_path = dirname(__FILE__) . 
"/inputs/small-img.jpg"; my $issued_request = shift @ua_requests; my $request_json_body = decode_json($issued_request->content()); compare_to_expected_results($request_json_body, "$expected_result_dir/request_body.json", $update_expected_results); - my $ocr_content = read_file($json_path); + my $ocr_content = read_gzip_file($json_path); ok($ocr_content, "normal test - OCR file is not empty"); my $ocr_data = decode_json($ocr_content); - compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results); + check_ocr_result($ocr_data); my $logs = read_file($gv_logs_path); like($logs, qr/cloud vision success/, "normal test - cloud vision success in logs"); # test new request updates open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path); - $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}'); + $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response); push @ua_responses, $response; send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs); close($gv_logs); @@ -63,15 +66,15 @@ my $image_path = dirname(__FILE__) . 
"/inputs/small-img.jpg"; $request_json_body = decode_json($issued_request->content()); compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_2.json", $update_expected_results); - $ocr_content = read_file($json_path); + $ocr_content = read_gzip_file($json_path); $ocr_data = decode_json($ocr_content); - compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_2.json", $update_expected_results); + check_ocr_result($ocr_data); $logs = read_file($gv_logs_path); like($logs, qr/cloud vision success/, "test request update - cloud vision success in logs"); # test with different feature set \@CLOUD_VISION_FEATURES_TEXT open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path); - $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}'); + $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response); push @ua_responses, $response; send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs); close($gv_logs); @@ -80,19 +83,19 @@ my $image_path = dirname(__FILE__) . 
"/inputs/small-img.jpg"; $request_json_body = decode_json($issued_request->content()); compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_3.json", $update_expected_results); - $ocr_content = read_file($json_path); + $ocr_content = read_gzip_file($json_path); $ocr_data = decode_json($ocr_content); - compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_3.json", $update_expected_results); + check_ocr_result($ocr_data); $logs = read_file($gv_logs_path); like($logs, qr/cloud vision success/, "test request features text - cloud vision success in logs"); # test with bad json path open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path); - $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}'); + $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response); push @ua_responses, $response; send_image_to_cloud_vision( $image_path, - "/var/lib/not-a-directory/not-writable.json", + "/var/lib/not-a-directory/not-writable.json.gz", \@CLOUD_VISION_FEATURES_FULL, $gv_logs ); close($gv_logs); @@ -105,8 +108,8 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg"; # test bad request open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path); - $json_path = $tmp_dir . "/small-img2.json"; - $response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), '{"foo": "blah"}'); + $json_path = $tmp_dir . "/small-img2.json.gz"; + $response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), $ocr_default_response); push @ua_responses, $response; send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs); close($gv_logs);