Skip to content

Commit

Permalink
fix: improve ingredients extraction (#8942)
Browse files Browse the repository at this point in the history
  • Loading branch information
benbenben2 committed Sep 8, 2023
1 parent bacf2a7 commit ddd8177
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ unit_test:
@echo "🥫 unit tests success"

integration_test:
@echo "🥫 Running unit tests …"
@echo "🥫 Running integration tests …"
# we launch the server and run tests within same container
# we also need dynamicfront for some assets to exists
# this is the place where variables are important
Expand Down
22 changes: 16 additions & 6 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3977,7 +3977,9 @@ my %phrases_after_ingredients_list = (
],

en => [
'adds a trivial amount', # e.g. adds a trivial amount of added sugars per serving
'after opening',
#'Best before',
'nutrition(al)? (as sold|facts|information|typical|value[s]?)',
# "nutrition advice" seems to appear before ingredients rather than after.
# "nutritional" on its own would match the ingredient "nutritional yeast" etc.
Expand All @@ -3988,7 +3990,6 @@ my %phrases_after_ingredients_list = (
'once opened[,]? (consume|keep|refrigerate|store|use)',
'(Storage( instructions)?[: ]+)?Store in a cool[,]? dry place',
'(dist(\.)?|distributed|sold)(\&|and|sold| )* (by|exclusively)',
#'Best before',
#'See bottom of tin',
],

Expand Down Expand Up @@ -4236,10 +4237,10 @@ my %prefixes_before_dash = (fr => ['demi', 'saint',],);
my %ignore_phrases = (
de => [
'\d\d?\s?%\sFett\si(\.|,)\s?Tr(\.|,)?', # 45 % Fett i.Tr.
"inklusive",
'inklusive',
],
en => ["na|n/a|not applicable",],
fr => ["non applicable|non concerné",],
en => ['not applicable',],
fr => ['non applicable|non concerné',],

);

Expand Down Expand Up @@ -4444,7 +4445,7 @@ sub cut_ingredients_text_for_lang ($text, $language) {
if (defined $phrases_after_ingredients_list{$language}) {

foreach my $regexp (@{$phrases_after_ingredients_list{$language}}) {
if ($text =~ /\s*\b$regexp\b(.*)$/is) {
if ($text =~ /\*?\s*\b$regexp\b(.*)$/is) {
$text = $`;
$log->debug("removed phrases_after_ingredients_list", {removed => $1, kept => $text, regexp => $regexp})
if $log->is_debug();
Expand All @@ -4459,7 +4460,16 @@ sub cut_ingredients_text_for_lang ($text, $language) {
if (defined $ignore_phrases{$language}) {

foreach my $regexp (@{$ignore_phrases{$language}}) {
$text =~ s/^\s*($regexp)(\.)?\s*$//is;
# remove every occurrence of the ignored phrase
$text =~ s/\s*\b(?:$regexp)\s*/ /gi;
# rm opened-closed parenthesis
$text =~ s/\(\s?\)//g;
# rm double commas
$text =~ s/\s?,\s?,/,/g;
# rm double spaces
$text =~ s/\s+/ /g;
# rm space before comma
$text =~ s/\s,\s?/, /g;
}
}

Expand Down
3 changes: 3 additions & 0 deletions taxonomies/ingredients.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16569,6 +16569,9 @@ ciqual_food_name:en:Soy oil
ciqual_food_name:fr:Huile de soja
# ingredient/soya-oil has 41836 products 1in 28 languages @2021-08-16

<en:soya oil
en:non-gmo soybean oil

<en:soya oil
en:refined soya oil
pl:rafinowany olej sojowy
Expand Down
2 changes: 1 addition & 1 deletion taxonomies/labels.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4376,7 +4376,7 @@ nl:Niet geschikt voor kinderen onder 1 jaar
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#en:description:Labels used for identifying products (and ingredients) that have been grown organically.

en:Organic, organically grown, organically produced, ingredient produced organically, from organic farming, From Organic Agriculture
en:Organic, organically grown, organically produced, ingredient produced organically, from organic farming, From Organic Agriculture, organic ingredients
bg:Био, биологично земеделие, биологично
ca:Orgànic,de cultiu ecologic
cs:Bio
Expand Down
13 changes: 4 additions & 9 deletions tests/unit/ingredients_clean.t
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ my @tests = (
[
"fr",
"lait 98 % ,sel,ferments lactiques,coagulant Valeurs nutritionnelles Pour 100 g 1225 kj 295 kcal pour 22g 270 kJ 65 kcal Matières grasses dont acides gras saturés pour 100g 23g/ 15,5g pour 22g 5,1g/ 3,4g Glucides dont sucres traces Protéines pour 100g 22 g pour 22g 4,8 g Sel pour 100g 1,8 g pour 22g 0,40g Calcium pour 100g 680 mg(85 % ) pour 22g 150 mg(19 % ) Afin d'éviter les risques d'étouffement pour les enfants de moins de 4 ans, coupez en petites bouchées. AQR: Apports Quotidiens de Référence А conserver au froid après achat.",
"lait 98 % ,sel,ferments lactiques,coagulant"
"lait 98 %, sel,ferments lactiques,coagulant"
],

[
Expand Down Expand Up @@ -63,14 +63,9 @@ my @tests = (
],

[
"fr", "Ingrédients :
Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon,
crème fraîche
5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde.
",
"Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon,
crème fraîche
5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde."
"fr",
"Ingrédients : Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon, crème fraîche 5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde.",
"Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon, crème fraîche 5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde."
],

[
Expand Down
42 changes: 42 additions & 0 deletions tests/unit/ingredients_extract.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/perl -w

# Tests of Ingredients::cut_ingredients_text_for_lang()

use Modern::Perl '2017';
use utf8;

use Test::More;
use Log::Any::Adapter 'TAP';

use ProductOpener::Products qw/:all/;
use ProductOpener::Tags qw/:all/;
use ProductOpener::TagsEntries qw/:all/;
use ProductOpener::Ingredients qw/:all/;

# Test cases for cut_ingredients_text_for_lang().
# Each entry is: [ language code, input ingredients text, expected text after cutting ].
my @lists = (
# en phrases_after_ingredients_list: the trailing "*adds a trivial amount ..."
# sentence (including its leading asterisk) is expected to be cut off
[
"en",
"carrots, green peas, corn, scallion. *adds a trivial amount of added sugars per serving.",
"carrots, green peas, corn, scallion.",
],
# en ignore_phrases: "not applicable" is expected to be removed from the middle
# of the list, along with the parentheses that are left empty around it
[
"en",
"Egg White, Xanthan Gum (not applicable), Salt, Glucono-delta-lactone.",
"Egg White, Xanthan Gum, Salt, Glucono-delta-lactone.",
],
);

# Run cut_ingredients_text_for_lang() on every case in @lists and compare the
# result with the expected text. The comparison is case-insensitive (both sides
# are lowercased before being compared).
foreach my $test_ref (@lists) {
	# Each case is [ language code, input text, expected text ]
	my ($lc, $ingredients_text_from_image, $expected) = @{$test_ref};
	my $cut_ingredients_text_from_image = cut_ingredients_text_for_lang($ingredients_text_from_image, $lc);

	# Use note() rather than printing to STDERR directly, so that the output goes
	# through the test harness and is only displayed in verbose runs
	note("input from the picture extraction (ingredients list ($lc)): $ingredients_text_from_image");
	note("cut_ingredients_text_from_image (result from sub routine): $cut_ingredients_text_from_image");

	# Give every assertion a description so that a failure identifies its case
	is(lc($cut_ingredients_text_from_image), lc($expected), "cut_ingredients_text_for_lang ($lc): $expected")
		or diag("Original ingredients: $ingredients_text_from_image ($lc)");
}

done_testing();

0 comments on commit ddd8177

Please sign in to comment.