Skip to content

Commit

Permalink
fix: parsing of dots in packaging / recycling instructions (#7948)
Browse files Browse the repository at this point in the history
* fix: parsing of dots in packaging / recycling instructions + better handling of commas
  • Loading branch information
stephanegigandet committed Jan 6, 2023
1 parent 6062126 commit 1b2e690
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 3 deletions.
11 changes: 10 additions & 1 deletion lib/ProductOpener/Packaging.pm
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,9 @@ sub parse_packaging_component_data_from_text_phrase ($text, $text_language) {
$text = $';
}

# We might have escaped dots and commas inside numbers from analyze_and_combine_packaging_data()
$text =~ s/(\d)\\(\.|\,)(\d)/$1$2$3/g;

# Also try to match the canonicalized form so that we can match the extended synonyms that are only available in canonicalized form
my $textid = get_string_id_for_lang($text_language, $text);

Expand Down Expand Up @@ -793,7 +796,13 @@ sub analyze_and_combine_packaging_data ($product_ref, $response_ref) {
# Packaging text field (populated by OCR of the packaging image and/or contributors or producers)
if (defined $product_ref->{packaging_text}) {

my @packaging_text_entries = split(/,|;|\n/, $product_ref->{packaging_text});
# Separate phrases by matching:
# . , ; and newlines
# but we want to keep commas and dots that are inside numbers (3.40 or 1,5)
# so we escape them first
my $packaging_text = $product_ref->{packaging_text};
$packaging_text =~ s/(\d)(\.|,)(\d)/$1\\$2$3/g;
my @packaging_text_entries = split(/(?<!\\)\.|(?<!\\),|;|\n/, $packaging_text);
push(@phrases, @packaging_text_entries);
$number_of_packaging_text_entries = scalar @packaging_text_entries;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"lc" : "fr",
"misc_tags" : [
"en:packagings-not-complete",
"en:packagings-not-empty-but-not-complete",
"en:packagings-not-empty"
],
"packaging_text" : "6 bouteilles en plastique transparent PET de 1,5 L à recycler",
"packagings" : [
{
"material" : "en:pet-polyethylene-terephthalate",
"number_of_units" : 6,
"quantity_per_unit" : "1,5 l",
"quantity_per_unit_unit" : "l",
"quantity_per_unit_value" : 1.5,
"recycling" : "en:recycle",
"shape" : "en:bottle"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"lc" : "fr",
"misc_tags" : [
"en:packagings-not-complete",
"en:packagings-not-empty-but-not-complete",
"en:packagings-not-empty"
],
"packaging_text" : "1 boîte en métal,4 bouteilles (plastique).",
"packagings" : [
{
"material" : "en:metal",
"number_of_units" : 1,
"shape" : "en:box"
},
{
"material" : "en:plastic",
"number_of_units" : 4,
"shape" : "en:bottle"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"lc" : "fr",
"misc_tags" : [
"en:packagings-not-complete",
"en:packagings-not-empty-but-not-complete",
"en:packagings-not-empty"
],
"packaging_text" : "Film plastique à jeter. Étui carton à recycler.",
"packagings" : [
{
"material" : "en:plastic",
"recycling" : "en:discard",
"shape" : "en:film"
},
{
"material" : "en:cardboard",
"recycling" : "en:recycle",
"shape" : "en:sleeve"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"en:packagings-not-empty-but-not-complete",
"en:packagings-not-empty"
],
"packaging_text" : "barquette en plastique à jeter;film plastique à jeter; boîte en carton à recycler",
"packaging_text" : "barquette en plastique à jeter; film plastique à jeter; boîte en carton à recycler",
"packagings" : [
{
"material" : "en:plastic",
Expand Down
29 changes: 28 additions & 1 deletion tests/unit/packaging.t
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ boîte en carton à recycler"
'packaging_text_fr_multiple_semi_colon',
{
lc => "fr",
packaging_text => "barquette en plastique à jeter;film plastique à jeter; boîte en carton à recycler"
packaging_text => "barquette en plastique à jeter; film plastique à jeter; boîte en carton à recycler"
}
],
[
Expand Down Expand Up @@ -517,6 +517,33 @@ boîte en carton à recycler"
}
],

# dots were not parsed correctly
[
'fr-dot-to-separate-components',
{
lc => "fr",
packaging_text => "Film plastique à jeter. Étui carton à recycler.",
}
],

# comma inside a number: don't split
[
'fr-comma-inside-a-number',
{
lc => "fr",
packaging_text => "6 bouteilles en plastique transparent PET de 1,5 L à recycler",
}
],

# comma without spaces, not in a number: split
[
'fr-comma-without-space',
{
lc => "fr",
packaging_text => "1 boîte en métal,4 bouteilles (plastique).",
}
],

);

my $json = JSON->new->allow_nonref->canonical;
Expand Down

0 comments on commit 1b2e690

Please sign in to comment.