Skip to content

Commit

Permalink
fix: gzip all JSON OCRs when saving OCR file on disk (#8320)
Browse files Browse the repository at this point in the history
To save space, all OCR files were gzipped, but newly generated files are still saved as plain-text JSON files.
Also add a created_at field to the OCR JSON file, containing the timestamp at which the OCR file was generated. This is useful for deciding whether old OCR files should be regenerated.
  • Loading branch information
raphael0202 committed May 17, 2023
1 parent 9983382 commit 45df380
Show file tree
Hide file tree
Showing 10 changed files with 78 additions and 38 deletions.
20 changes: 12 additions & 8 deletions lib/ProductOpener/Images.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ use ProductOpener::URL qw/:all/;
use ProductOpener::Users qw/:all/;
use ProductOpener::Text qw/:all/;

use IO::Compress::Gzip qw(gzip $GzipError);
use Log::Any qw($log);
use Encode;
use JSON::PP;
Expand Down Expand Up @@ -2002,7 +2003,7 @@ sub extract_text_from_image ($product_ref, $id, $field, $ocr_engine, $results_re
}
elsif ($ocr_engine eq 'google_cloud_vision') {

my $json_file = "$www_root/images/products/$path/$filename.json";
my $json_file = "$www_root/images/products/$path/$filename.json.gz";
open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log");
my $cloudvision_ref = send_image_to_cloud_vision($image, $json_file, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs);
close $gv_logs;
Expand Down Expand Up @@ -2047,7 +2048,7 @@ Call to Google Cloud vision API
=head4 $image_path - str path to image
=head4 $json_file - str path to the file where we will store OCR result as JSON
=head4 $json_file - str path to the file where we will store OCR result as gzipped JSON
=head4 $features_ref - hash reference - the "features" parameter of Google Cloud Vision
Expand Down Expand Up @@ -2108,14 +2109,17 @@ sub send_image_to_cloud_vision ($image_path, $json_file, $features_ref, $gv_logs

$cloudvision_ref = decode_json($json_response);

$log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info();
# Adding creation timestamp, to know when the OCR has been generated
$cloudvision_ref->{created_at} = time();

# UTF-8 issue , see https://stackoverflow.com/questions/4572007/perl-lwpuseragent-mishandling-utf-8-response
$json_response = decode("utf8", $json_response);
$log->info("saving google cloud vision json response to file", {path => $json_file}) if $log->is_info();

if (open(my $OUT, ">:encoding(UTF-8)", $json_file)) {
print($OUT $json_response);
close($OUT);
if (open(my $OUT, ">:raw", $json_file)) {
my $gzip_handle = IO::Compress::Gzip->new($OUT)
or die "Cannot create gzip filehandle: $GzipError\n";
my $encoded_json = encode_json($cloudvision_ref);
$gzip_handle->print($encoded_json);
$gzip_handle->close;

print($gv_logs "--> cloud vision success for $image_path\n");
}
Expand Down
44 changes: 44 additions & 0 deletions lib/ProductOpener/Test.pm
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ BEGIN {
&remove_all_orgs
&check_not_production
&wait_for
&read_gzip_file
&check_ocr_result
); # symbols to export on request
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
Expand All @@ -66,15 +68,57 @@ use ProductOpener::Store "store";
use Carp qw/confess/;
use Data::DeepAccess qw(deep_exists deep_get deep_set);
use Getopt::Long;
use IO::Uncompress::AnyInflate qw(anyinflate $AnyInflateError);
use Test::More;
use JSON "decode_json";
use File::Basename "fileparse";
use File::Path qw/make_path remove_tree/;
use File::Copy;
use Path::Tiny qw/path/;
use Scalar::Util qw(looks_like_number);

use Log::Any qw($log);

=head2 read_gzip_file($filepath)
Read gzipped file and return binary content
=head3 Parameters
=head4 String $filepath
The path of the gzipped file.
=cut

# Read a compressed file and return its inflated content as a byte string.
#
# Parameters:
#   $filepath - path of the gzipped file to read.
#
# Returns the decompressed bytes (not decoded: callers such as decode_json
# expect raw UTF-8 octets).
#
# Dies with "Cannot open '<path>': <reason>" if the file cannot be opened and
# with "anyinflate failed: <reason>" if decompression fails.
sub read_gzip_file ($filepath) {
	# Compressed data is binary: open in raw mode so no I/O layer can alter the bytes,
	# and report the system error so that failures can be diagnosed.
	open(my $input, "<:raw", $filepath) or die "Cannot open '$filepath': $!\n";
	my $buffer;
	my $inflated = anyinflate($input => \$buffer);
	# Capture the error before close(), then release the handle on both paths.
	my $inflate_error = $AnyInflateError;
	close($input);
	$inflated or die "anyinflate failed: $inflate_error\n";
	return $buffer;
}

=head2 check_ocr_result($ocr_result)
Check that OCR result returned by Google Cloud Vision is as expected:
- a single [response] object in `responses` field
- `created_at` integer field
=head3 Parameters
=head4 Hash reference $ocr_result
Decoded OCR result JSON (hash reference, e.g. the output of decode_json), as returned by Google Cloud Vision.
=cut

# Run Test::More assertions on a decoded OCR result:
#   - the 'responses' field is defined,
#   - 'responses' holds exactly one element,
#   - 'created_at' is defined and looks like a number.
#
# Parameters:
#   $ocr_result - hash reference of the decoded OCR JSON.
#
# Returns nothing; the outcome is reported through the test framework.
sub check_ocr_result ($ocr_result) {
	my $responses_ref = $ocr_result->{responses};
	ok(defined $responses_ref, "OCR result contains the 'responses' field");

	# Dereference the list before counting: copying the array reference itself into
	# an array always yields one element, which would make the next check vacuous.
	my @responses = (ref($responses_ref) eq 'ARRAY') ? @{$responses_ref} : ();
	is(scalar @responses, 1, "OCR result contains a single response");

	my $created_at = $ocr_result->{created_at};
	# Avoid an "uninitialized value" warning in the test name when the field is missing
	my $created_at_label = $created_at // 'undef';
	ok((defined $created_at and looks_like_number($created_at)),
		"OCR result `created_at` field is valid, $created_at_label");
	return;
}

=head2 init_expected_results($filepath)
Handles test options around expected_results initialization
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_cloud_vision_ocr.pl
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ ($file)

my $json_file = $destination;
$json_file =~ s/\.([^\.]+)$//;
$json_file .= ".json";
$json_file .= ".json.gz";

print $LOG "file: $file destination: $destination code: $code image_url: $image_url json_file: $json_file\n";
open(my $gv_logs, ">>:encoding(UTF-8)", "$data_root/logs/cloud_vision.log");
Expand Down
1 change: 1 addition & 0 deletions stop_words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,4 @@ weigher
weighers
www
xml
gzipped

This file was deleted.

6 changes: 3 additions & 3 deletions tests/integration/run_cloud_vision_ocr.t
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dircopy("$sample_products_images_path/$product_code_path", $image_dir);
fcopy($input_image_path, "$image_dir/2.jpg");
# fake responses for OCR and robotoff
my @responses = (
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"ocr": "success"}'),
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"responses": [{}]}'),
HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"robotoff": "success"}'),
);
my $dump_path = File::Temp->newdir();
Expand All @@ -56,10 +56,10 @@ is(scalar @requests, 2, "Two request issued");
my $ocr_request = retrieve("$dump_path/req-0.sto");
my $request_json_body = decode_json($ocr_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/ocr_request_body.json", $update_expected_results);
my $ocr_content = read_file("$image_dir/2.json");
my $ocr_content = read_gzip_file("$image_dir/2.json.gz");
ok($ocr_content, "OCR file is not empty");
my $ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results);
check_ocr_result($ocr_data);
my $robotoff_request = retrieve("$dump_path/req-1.sto");
# we have url encoded parameters, and order might change --> convert to hash
my $request_content = url_params_mixed($robotoff_request->content());
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

31 changes: 17 additions & 14 deletions tests/unit/send_image_to_cloud_vision.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ use ProductOpener::Images qw/:all/;

my ($test_id, $test_dir, $expected_result_dir, $update_expected_results) = (init_expected_results(__FILE__));

# Default OCR response, containing a single response element
my $ocr_default_response = '{"responses": [{}]}';

my @ua_requests = ();
# put responses for call to requests here, we will pop first
my @ua_responses = ();
Expand All @@ -35,26 +38,26 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";

# normal test
open(my $gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
my $json_path = $tmp_dir . "/small-img.json";
my $json_path = $tmp_dir . "/small-img.json.gz";
# expected response
my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}');
my $response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
is(scalar @ua_requests, 1, "Normal test - One request issued to cloud vision");
my $issued_request = shift @ua_requests;
my $request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body.json", $update_expected_results);
my $ocr_content = read_file($json_path);
my $ocr_content = read_gzip_file($json_path);
ok($ocr_content, "normal test - OCR file is not empty");
my $ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data.json", $update_expected_results);
check_ocr_result($ocr_data);
my $logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "normal test - cloud vision success in logs");

# test new request updates
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
Expand All @@ -63,15 +66,15 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";
$request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_2.json",
$update_expected_results);
$ocr_content = read_file($json_path);
$ocr_content = read_gzip_file($json_path);
$ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_2.json", $update_expected_results);
check_ocr_result($ocr_data);
$logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "test request update - cloud vision success in logs");

# test with different feature set \@CLOUD_VISION_FEATURES_TEXT
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "bar"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_TEXT, $gv_logs);
close($gv_logs);
Expand All @@ -80,19 +83,19 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";
$request_json_body = decode_json($issued_request->content());
compare_to_expected_results($request_json_body, "$expected_result_dir/request_body_3.json",
$update_expected_results);
$ocr_content = read_file($json_path);
$ocr_content = read_gzip_file($json_path);
$ocr_data = decode_json($ocr_content);
compare_to_expected_results($ocr_data, "$expected_result_dir/ocr_data_3.json", $update_expected_results);
check_ocr_result($ocr_data);
$logs = read_file($gv_logs_path);
like($logs, qr/cloud vision success/, "test request features text - cloud vision success in logs");

# test with bad json path
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), '{"foo": "blah"}');
$response = HTTP::Response->new("200", "OK", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision(
$image_path,
"/var/lib/not-a-directory/not-writable.json",
"/var/lib/not-a-directory/not-writable.json.gz",
\@CLOUD_VISION_FEATURES_FULL, $gv_logs
);
close($gv_logs);
Expand All @@ -105,8 +108,8 @@ my $image_path = dirname(__FILE__) . "/inputs/small-img.jpg";

# test bad request
open($gv_logs, ">:encoding(UTF-8)", $gv_logs_path);
$json_path = $tmp_dir . "/small-img2.json";
$response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), '{"foo": "blah"}');
$json_path = $tmp_dir . "/small-img2.json.gz";
$response = HTTP::Response->new("403", "Not authorized", HTTP::Headers->new(), $ocr_default_response);
push @ua_responses, $response;
send_image_to_cloud_vision($image_path, $json_path, \@CLOUD_VISION_FEATURES_FULL, $gv_logs);
close($gv_logs);
Expand Down

0 comments on commit 45df380

Please sign in to comment.