Skip to content

Commit

Permalink
fix: improve Google Cloud OCR processing
Browse files Browse the repository at this point in the history
- gzip all JSON OCRs when saving OCR file on disk
- add new `created_at` field to save the timestamp of OCR generation
  • Loading branch information
raphael0202 committed May 12, 2023
1 parent c78a458 commit 2bfb407
Show file tree
Hide file tree
Showing 10 changed files with 78 additions and 38 deletions.
20 changes: 12 additions & 8 deletions lib/ProductOpener/Images.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ use ProductOpener::URL qw/:all/;
use ProductOpener::Users qw/:all/;
use ProductOpener::Text qw/:all/;

use IO::Compress::Gzip qw(gzip $GzipError);
use Log::Any qw($log);
use Encode;
use JSON::PP;
Expand Down Expand Up @@ -2002,7 +2003,7 @@ sub extract_text_from_image ($product_ref, $id, $field, $ocr_engine, $results_re
}
elsif ($ocr_engine eq 'google_cloud_vision') {

my $json_file = "$www_root/images/products/$path/$filename.json";
my $json_file = "$www_root/images/products/$path/$filename.json.gz";
open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log");
my $cloudvision_ref = send_image_to_cloud_vision($image, $json_file, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs);
close $gv_logs;
Expand Down Expand Up @@ -2047,7 +2048,7 @@ Call to Google Cloud vision API
=head4 $image_path - str path to image
=head4 $json_file - str path to the file where we will store OCR result as JSON
=head4 $json_file - str path to the file where we will store OCR result as gzipped JSON
=head4 $features_ref - hash reference - the "features" parameter of Google Cloud Vision
Expand Down Expand Up @@ -2108,14 +2109,17 @@ sub send_image_to_cloud_vision ($image_path, $json_file, $features_ref, $gv_logs

$cloudvision_ref = decode_json($json_response);

$log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info();
# Adding creation timestamp, to know when the OCR has been generated
$cloudvision_ref->{created_at} = time();

# UTF-8 issue , see https://stackoverflow.com/questions/4572007/perl-lwpuseragent-mishandling-utf-8-response
$json_response = decode("utf8", $json_response);
$log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info();

if (open(my $OUT, ">:encoding(UTF-8)", $json_file)) {
print($OUT $json_response);
close($OUT);
if (open(my $OUT, ">:raw", $json_file)) {
my $gzip_handle = IO::Compress::Gzip->new($OUT)
or die "Cannot create gzip filehandle: $GzipError\n";
my $encoded_json = encode_json($cloudvision_ref);
$gzip_handle->print($encoded_json);
$gzip_handle->close;

print($gv_logs "--> cloud vision success for $image_path\n");
}
Expand Down
44 changes: 44 additions & 0 deletions lib/ProductOpener/Test.pm
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ BEGIN {
&remove_all_orgs
&check_not_production
&wait_for
&read_gzip_file
&check_ocr_result
); # symbols to export on request
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
Expand All @@ -66,15 +68,57 @@ use ProductOpener::Store "store";
use Carp qw/confess/;
use Data::DeepAccess qw(deep_exists deep_get deep_set);
use Getopt::Long;
use IO::Uncompress::AnyInflate qw(anyinflate $AnyInflateError);
use Test::More;
use JSON "decode_json";
use File::Basename "fileparse";
use File::Path qw/make_path remove_tree/;
use File::Copy;
use Path::Tiny qw/path/;
use Scalar::Util qw(looks_like_number);

use Log::Any qw($log);

=head2 read_gzip_file($filepath)

Read a gzipped file and return its decompressed content as a byte string.

=head3 Parameters

=head4 String $filepath

The path of the gzipped file.

=head3 Return value

The decompressed content, as raw bytes (no character decoding is applied).

Dies if the file cannot be opened or if decompression fails.

=cut

sub read_gzip_file ($filepath) {
	# Open in raw mode: the compressed stream is binary, and no I/O layer
	# must alter it before it reaches the inflater.
	# The system error ($!) is reported so that a failure is diagnosable.
	open(my $input, "<:raw", $filepath) or die "Cannot open '$filepath': $!\n";
	my $buffer;
	anyinflate($input => \$buffer) or die "anyinflate failed: $AnyInflateError\n";
	close($input);
	return $buffer;
}

=head2 check_ocr_result($ocr_result)

Check that the OCR result returned by Google Cloud Vision is as expected:

- a single [response] object in the `responses` field
- a numeric `created_at` field

Each check is reported through Test::More (C<ok> / C<is>).

=head3 Parameters

=head4 Hash reference $ocr_result

The decoded OCR result (i.e. the data structure obtained by decoding the
JSON returned by Google Cloud Vision), not the JSON string itself.

=cut

sub check_ocr_result ($ocr_result) {
	my $responses_ref = $ocr_result->{responses};
	my $created_at = $ocr_result->{created_at};

	ok(defined $responses_ref, "OCR result contains the 'responses' field");

	# `responses` is an array reference: count the elements it points to.
	# Copying the reference itself into an array would always yield exactly
	# one element, making the check below pass whatever the content.
	# A missing or non-array value counts as 0 so that the check fails
	# cleanly instead of dying on a bad dereference.
	my $responses_count = (ref($responses_ref) eq 'ARRAY') ? scalar @{$responses_ref} : 0;
	is($responses_count, 1, "OCR result contains a single response");

	# Avoid an "uninitialized value" warning in the test name when the field is missing
	my $created_at_label = $created_at // 'undef';
	ok((defined $created_at and looks_like_number($created_at)),
		"OCR result `created_at` field is valid, $created_at_label");
	return;
}

=head2 init_expected_results($filepath)
Handles test options around expected_results initialization
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_cloud_vision_ocr.pl
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ ($file)

my $json_file = $destination;
$json_file =~ s/\.([^\.]+)$//;
$json_file .= ".json";
$json_file .= ".json.gz";

print $LOG "file: $file destination: $destination code: $code image_url: $image_url json_file: $json_file\n";
open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log");
Expand Down
1 change: 1 addition & 0 deletions stop_words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,4 @@ weigher
weighers
www
xml
gzipped

This file was deleted.

6 changes: 3 additions & 3 deletions tests/integration/run_cloud_vision_ocr.t
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dircopy("$sample_products_images_path/$product_code_path", $image_dir);
fcopy($input_image_path, "$image_dir/2.jpg");
# fake responses for OCR and robotoff
my @responses = (
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"ocr": "success"}'),
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"responses": [{}]}'),
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"robotoff": "success"}'),
);
my $dump_path = File::Temp->newdir();
Expand All @@ -56,10 +56,10 @@ is(scalar @requests, 2, "Two request issued");
my $ocr_request = retrieve("$dump_path/req-0.sto");
my $request_json_body = decode_json($ocr_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/ocr_request_body.json", $update_expected_results);
my $ocr_content = read_file("$image_dir/2.json");
my $ocr_content = read_gzip_file("$image_dir/2.json.gz");
ok($ocr_content, "OCR file is not empty");
my $ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results);
check_ocr_result($ocr_data);
my $robotoff_request = retrieve("$dump_path/req-1.sto");
# we have url encoded parameters, and order might change --> convert to hash
my $request_content = url_params_mixed($robotoff_request->content());
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

31 changes: 17 additions & 14 deletions tests/unit/send_image_to_cloud_vision.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ use ProductOpener::Images qw/:all/;

my ($test_id, $test_dir, $expected_result_dir, $update_expected_results) = (init_expected_results(__FILE__));

# Default OCR response, containing a single response element
my $ocr_default_response = '{"responses": [{}]}';

my @ua_requests = ();
# put responses for call to requests here, we will pop first
my @ua_responses = ();
Expand All @@ -35,26 +38,26 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";

# normal test
open(my $gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
my $json_path = $tmp_dir . "/small-img.json";
my $json_path = $tmp_dir . "/small-img.json.gz";
# expected response
my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}');
my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
is(scalar @ua_requests, 1, "Normal test - One request issued to cloud vision");
my $issued_request = shift @ua_requests;
my $request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body.json", $update_expected_results);
my $ocr_content = read_file($json_path);
my $ocr_content = read_gzip_file($json_path);
ok($ocr_content, "normal test - OCR file is not empty");
my $ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results);
check_ocr_result($ocr_data);
my $logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "normal test - cloud vision success in logs");

# test new request updates
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
Expand All @@ -63,15 +66,15 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";
$request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_2.json",
$update_expected_results);
$ocr_content = read_file($json_path);
$ocr_content = read_gzip_file($json_path);
$ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_2.json", $update_expected_results);
check_ocr_result($ocr_data);
$logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "test request update - cloud vision success in logs");

# test with different feature set \@CLOUD_VISION_FEATURES_TEXT
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs);
close($gv_logs);
Expand All @@ -80,19 +83,19 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";
$request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_3.json",
$update_expected_results);
$ocr_content = read_file($json_path);
$ocr_content = read_gzip_file($json_path);
$ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_3.json", $update_expected_results);
check_ocr_result($ocr_data);
$logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "test request features text - cloud vision success in logs");

# test with bad json path
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision(
$image_path,
"/var/lib/not-a-directory/not-writable.json",
"/var/lib/not-a-directory/not-writable.json.gz",
\@CLOUD_VISION_FEATURES_FULL, $gv_logs
);
close($gv_logs);
Expand All @@ -105,8 +108,8 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";

# test bad request
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$json_path = $tmp_dir . "/small-img2.json";
$response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), '{"foo": "blah"}');
$json_path = $tmp_dir . "/small-img2.json.gz";
$response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
Expand Down

0 comments on commit 2bfb407

Please sign in to comment.