Skip to content

Commit

Permalink
fix: conflict between additive variants and the word "and" (#8905)
Browse files Browse the repository at this point in the history
* update parsing additives with variants

* update test

* perltidy
  • Loading branch information
benbenben2 committed Aug 28, 2023
1 parent 9ea491c commit c8d6e44
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 3 deletions.
33 changes: 31 additions & 2 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4859,8 +4859,33 @@ sub preparse_ingredients_text ($product_lc, $text) {

# colorants alimentaires E (124,122,133,104,110)
# Roman-numeral variant suffixes (i to xv) that may follow an additive number,
# written as a regex alternation that is interpolated into $additivesregexp.
# "xiii" was previously missing because "xii" was listed twice.
my $roman_numerals = "i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|xiv|xv";
my $additivesregexp
= '(\d{3}|\d{4})(( |-|\.)?([abcdefgh]))?(( |-|\.)?((' . $roman_numerals . ')|\((' . $roman_numerals . ')\)))?';
# Build the regex fragment matching an additive number with its optional variants:
#   3 or 4 digits, optional letter variant (a to h), optional roman numeral variant
#   (bare or in parentheses), each variant optionally preceded by a separator.
#
# Special cases: the word "and" ($and) is " a ", " e " or " i " in some languages,
# which conflicts with variants such as E470a, E472e, E451i or E451(i).
# In these cases, the conflicting variant is captured only if there is no space before it:
# E470a -> variant captured, E470 a -> not captured, E470 a, -> not captured (text stays "e470 a,")
# E451i -> variant captured, E451 i -> not captured, E451 i, -> not captured (text stays "e451 i,")
#
# Only the separator of the conflicting variant loses its space alternative; the other
# separator keeps the default ( |-|\.) so that the number of capture groups, and the
# patterns accepted for the non-conflicting variant, are the same in every branch.
# (The " a " / " e " branch previously used ( |,|.) for the numeral separator: the
# unescaped "." matched any character.)
my $letter_variant_separator = '( |-|\.)';
my $numeral_variant_separator = '( |-|\.)';
if ($and eq " a " || $and eq " e ") {
	# no space before abcdefgh
	$letter_variant_separator = '(-|\.)';
}
elsif ($and eq " i ") {
	# no space before i
	$numeral_variant_separator = '(-|\.)';
}

my $additivesregexp
	= '(\d{3}|\d{4})('
	. $letter_variant_separator
	. '?([abcdefgh]))?('
	. $numeral_variant_separator . '?(('
	. $roman_numerals . ')|\(('
	. $roman_numerals
	. ')\)))?';

$text
=~ s/\b(e|ins|sin|i-n-s|s-i-n|i\.n\.s\.?|s\.i\.n\.?)(:|\(|\[| | n| nb|#|°)+((($additivesregexp)( |\/| \/ | - |,|, |$and))+($additivesregexp))\b(\s?(\)|\]))?/normalize_additives_enumeration($product_lc,$3)/ieg;
Expand All @@ -4878,6 +4903,10 @@ sub preparse_ingredients_text ($product_lc, $text) {

# Canonicalize additives to remove the dash that can make further parsing break
# Match E + number + letter a to h + i to xv, followed by a space or separator
# $3 would be either \d{3} or \d{4} in $additivesregexp
# $6 would be ([abcdefgh]) in $additivesregexp
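# $9 would be ((' . $roman_numerals . ')|\((' . $roman_numerals . ')\)) in $additivesregexp: the roman numeral, bare or in parentheses, without the optional separator before it (the group that includes the separator is $7)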
# $12 would be (\b|\s|,|\.|;|\/|-|\\|\)|\]|$)
$text =~ s/(\b)e( |-|\.)?$additivesregexp(\b|\s|,|\.|;|\/|-|\\|\)|\]|$)/replace_additive($3,$6,$9) . $12/ieg;

# E100 et E120 -> E100, E120
Expand Down
27 changes: 26 additions & 1 deletion tests/unit/ingredients_parsing.t
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,32 @@ my @lists = (
["ru", "масло растительное (подсолнечное, соевое)", "масло растительное подсолнечное, масло растительное соевое"],

# grammes -> g
["fr", "Teneur en fruits: 50gr pour 100 grammes", "Teneur en fruits: 50g pour 100 g"]
["fr", "Teneur en fruits: 50gr pour 100 grammes", "Teneur en fruits: 50g pour 100 g"],

# test conflicts between the word "and" in some languages and additives variants. With letters i or e or a.
[
"hr",
"bojilo: E 150a, tvari za rahljenje: E 500 i E 503, sol.",
"bojilo: e150a, tvari za rahljenje: e500, e503, sol."
],
[
"hr",
"bojilo: E 150a, tvari za rahljenje: E 500 i, E 503, sol.",
"bojilo: e150a, tvari za rahljenje: e500 i, e503, sol."
],
[
"hr",
"bojilo: E 150a, tvari za rahljenje: E 500(i), E 503, sol.",
"bojilo: e150a, tvari za rahljenje: e500i, e503, sol."
],
[
"hr",
"bojilo: E 150a, tvari za rahljenje: E 500i, E 503, sol.",
"bojilo: e150a, tvari za rahljenje: e500i, e503, sol."
],
["it", "formaggio, E 472 e, E470a.", "formaggio, e472 e, e470a."],
["it", "formaggio, E 472 e E470a.", "formaggio, e472, e470a."],
["sk", "syr, E470 a E470a, mlieko.", "syr, e470, e470a, mlieko."]
);

foreach my $test_ref (@lists) {
Expand Down

0 comments on commit c8d6e44

Please sign in to comment.