Patch: match ingredient origins written without a "from" marker (e.g. "Sucre France.").
  * lib/ProductOpener/Ingredients.pm: export the origin helpers, add the marker-less branch to
    match_ingredient_origin, normalize typographic apostrophes in parse_origins_from_text.
  * lib/ProductOpener/Test.pm: pass the test description to is_deeply.
  * tests/unit: new unit tests and their expected results.
Review note: the blob ids on the "index" lines predate the review edits made to Ingredients.pm
and tests/unit/parse_origins_from_text.t.

diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm
index 53492e832b41f..2b3b969ca6dda 100644
--- a/lib/ProductOpener/Ingredients.pm
+++ b/lib/ProductOpener/Ingredients.pm
@@ -93,6 +93,10 @@ BEGIN {
 
 		&has_specific_ingredient_property
 
+		&init_origins_regexps
+		&match_ingredient_origin
+		&parse_origins_from_text
+
 	);    # symbols to export on request
 	%EXPORT_TAGS = (all => [@EXPORT_OK]);
 }
@@ -1014,6 +1018,38 @@ sub match_ingredient_origin ($product_lc, $text_ref, $matched_ingredient_ref) {
 
 		return 1;
 	}
+	# Try to match without a "from" marker (e.g. "Strawberry France")
+	elsif ($$text_ref
+		=~ /\s*([^,.;:]+)\s+((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i)
+	{
+		# Note: $1 is the candidate ingredient, $2 is one or more origins separated by commas or $and_or.
+		# Every listed origin has to be matched by $origins_regexp, so free text that follows a comma
+		# (e.g. "Sucre France, some other mention.") is left out of the match and stays in the text.
+
+		$matched_ingredient_ref->{ingredient} = $1;
+		$matched_ingredient_ref->{origins} = $2;
+		$matched_ingredient_ref->{matched_text} = $&;
+
+		# keep the matched ingredient only if it is a known ingredient in the taxonomy, in order to avoid false positives
+		# e.g. "something made in France" should not be turned into ingredient "something made in" + origin "France"
+		if (
+			not(
+				exists_taxonomy_tag(
+					"ingredients",
+					canonicalize_taxonomy_tag($product_lc, "ingredients", $matched_ingredient_ref->{ingredient})
+				)
+			)
+			)
+		{
+			%$matched_ingredient_ref = ();    # empty the caller's hash; assigning {} would only rebind our local copy
+		}
+		else {
+			# Remove the matched text
+			$$text_ref = $` . ' ' . $';
+
+			return 1;
+		}
+	}
 
 	return 0;
 }
@@ -1082,6 +1118,9 @@ sub parse_origins_from_text ($product_ref, $text) {
 
 	my $product_lc = $product_ref->{lc};
 
+	# Normalize single quotes
+	$text =~ s/’/'/g;
+
 	# Go through the ingredient lists multiple times
 	# as long as we have one match
 	my $matched_ingredient = "start";
diff --git a/lib/ProductOpener/Test.pm b/lib/ProductOpener/Test.pm
index af99bef7ca855..a6be075be5439 100644
--- a/lib/ProductOpener/Test.pm
+++ b/lib/ProductOpener/Test.pm
@@ -295,6 +295,11 @@ sub compare_to_expected_results ($object_ref, $expected_results_file, $update_ex
 
 	my $json = JSON->new->allow_nonref->canonical;
 
+	my $desc = undef;
+	if (defined $test_ref) {
+		$desc = $test_ref->{desc} // $test_ref->{id};
+	}
+
 	if ($update_expected_results) {
 		open(my $result, ">:encoding(UTF-8)", $expected_results_file)
 			or confess("Could not create $expected_results_file: $!");
@@ -309,7 +314,7 @@ sub compare_to_expected_results ($object_ref, $expected_results_file, $update_ex
 
 			local $/;    #Enable 'slurp' mode
 			my $expected_object_ref = $json->decode(<$expected_result>);
-			is_deeply($object_ref, $expected_object_ref) or diag(explain $test_ref, explain $object_ref);
+			is_deeply($object_ref, $expected_object_ref, $desc) or diag(explain $test_ref, explain $object_ref);
 		}
 		else {
 			fail("could not load $expected_results_file");
diff --git a/tests/unit/expected_test_results/parse_origins_from_text/empty.json b/tests/unit/expected_test_results/parse_origins_from_text/empty.json
new file mode 100644
index 0000000000000..b19b86b407625
--- /dev/null
+++ b/tests/unit/expected_test_results/parse_origins_from_text/empty.json
@@ -0,0 +1,4 @@
+{
+   "lc" : "en",
+   "origin_en" : ""
+}
diff --git a/tests/unit/expected_test_results/parse_origins_from_text/just-a-country.json b/tests/unit/expected_test_results/parse_origins_from_text/just-a-country.json
new file mode 100644
index 0000000000000..69e52daddf327
--- /dev/null
+++ b/tests/unit/expected_test_results/parse_origins_from_text/just-a-country.json
@@ -0,0 +1,4 @@
+{
+   "lc" : "en",
+   "origin_en" : "Italy"
+}
diff --git a/tests/unit/expected_test_results/parse_origins_from_text/real-list-fr.json b/tests/unit/expected_test_results/parse_origins_from_text/real-list-fr.json
new file mode 100644
index 0000000000000..d0819826d352a
--- /dev/null
+++ b/tests/unit/expected_test_results/parse_origins_from_text/real-list-fr.json
@@ -0,0 +1,48 @@
+{
+   "lc" : "fr",
+   "origin_fr" : "amandes d’Espagne. Chocolat noir de Côte d’Ivoire. Huile de noisette d’Italie. sucre de France. noisettes d’Italie. fèves de cacao de Côte d’Ivoire. Framboises lyophilisées d’Espagne. arôme naturel de framboise fabriqué en France.",
+   "specific_ingredients" : [
+      {
+         "id" : "en:almond",
+         "ingredient" : "amandes",
+         "origins" : "en:spain",
+         "text" : "amandes d'Espagne."
+      },
+      {
+         "id" : "en:dark-chocolate",
+         "ingredient" : "Chocolat noir",
+         "origins" : "en:cote-d-ivoire",
+         "text" : "Chocolat noir de Côte d'Ivoire."
+      },
+      {
+         "id" : "en:hazelnut-oil",
+         "ingredient" : "Huile de noisette",
+         "origins" : "en:italy",
+         "text" : "Huile de noisette d'Italie."
+      },
+      {
+         "id" : "en:sugar",
+         "ingredient" : "sucre",
+         "origins" : "en:france",
+         "text" : "sucre de France."
+      },
+      {
+         "id" : "en:hazelnut",
+         "ingredient" : "noisettes",
+         "origins" : "en:italy",
+         "text" : "noisettes d'Italie."
+      },
+      {
+         "id" : "en:cocoa-bean",
+         "ingredient" : "fèves de cacao",
+         "origins" : "en:cote-d-ivoire",
+         "text" : "fèves de cacao de Côte d'Ivoire."
+      },
+      {
+         "id" : "en:freeze-dried-raspberries",
+         "ingredient" : "Framboises lyophilisées",
+         "origins" : "en:spain",
+         "text" : "Framboises lyophilisées d'Espagne."
+      }
+   ]
+}
diff --git a/tests/unit/expected_test_results/parse_origins_from_text/rubish-entry.json b/tests/unit/expected_test_results/parse_origins_from_text/rubish-entry.json
new file mode 100644
index 0000000000000..e569a44d8227e
--- /dev/null
+++ b/tests/unit/expected_test_results/parse_origins_from_text/rubish-entry.json
@@ -0,0 +1,4 @@
+{
+   "lc" : "en",
+   "origin_en" : "NNSTeia nauns"
+}
diff --git a/tests/unit/expected_test_results/parse_origins_from_text/simple-extraction-en.json b/tests/unit/expected_test_results/parse_origins_from_text/simple-extraction-en.json
new file mode 100644
index 0000000000000..c9ef59e7f37e2
--- /dev/null
+++ b/tests/unit/expected_test_results/parse_origins_from_text/simple-extraction-en.json
@@ -0,0 +1,12 @@
+{
+   "lc" : "en",
+   "origin_en" : "Sugar from Italy.",
+   "specific_ingredients" : [
+      {
+         "id" : "en:sugar",
+         "ingredient" : "Sugar",
+         "origins" : "en:italy",
+         "text" : "Sugar from Italy."
+      }
+   ]
+}
diff --git a/tests/unit/expected_test_results/parse_origins_from_text/simple-extraction-fr.json b/tests/unit/expected_test_results/parse_origins_from_text/simple-extraction-fr.json
new file mode 100644
index 0000000000000..e2e3e3029086d
--- /dev/null
+++ b/tests/unit/expected_test_results/parse_origins_from_text/simple-extraction-fr.json
@@ -0,0 +1,12 @@
+{
+   "lc" : "fr",
+   "origin_fr" : "Sucre France.",
+   "specific_ingredients" : [
+      {
+         "id" : "en:sugar",
+         "ingredient" : "Sucre",
+         "origins" : "en:france",
+         "text" : "Sucre France."
+      }
+   ]
+}
diff --git a/tests/unit/match_ingredient_origin.t b/tests/unit/match_ingredient_origin.t
new file mode 100644
index 0000000000000..35be57a5df2f0
--- /dev/null
+++ b/tests/unit/match_ingredient_origin.t
@@ -0,0 +1,125 @@
+use ProductOpener::PerlStandards;
+
+use Test::More;
+use Log::Any::Adapter 'TAP';
+
+use ProductOpener::Ingredients qw/match_ingredient_origin init_origins_regexps/;
+
+my @tests = (
+
+	{
+		desc => "Empty",
+		lc => "en",
+		text => "",
+		expected => [],
+	},
+	{
+		desc => "Just a country",
+		lc => "en",
+		text => "Italy",
+		expected => [],
+	},
+	{
+		desc => "Rubish entry",
+		lc => "en",
+		text => "NNSTeia nauns",
+		expected => [],
+	},
+	{
+		desc => "simple en extraction",
+		lc => "en",
+		text => "Sugar from Italy.",
+		expected => [
+			{
+				ingredient => 'Sugar',
+				matched_text => 'Sugar from Italy.',
+				origins => 'Italy'
+			}
+		],
+	},
+	{
+		desc => "simple fr extraction",
+		lc => "fr",
+		text => "Sucre France.",
+		expected => [
+			{
+				ingredient => 'Sucre',
+				matched_text => 'Sucre France.',
+				origins => 'France'
+			}
+		],
+	},
+	{
+		desc => "Real well written case in fr",
+		lc => "fr",
+		text =>
+			"amandes d'Espagne. Chocolat noir de Côte d'Ivoire. Huile de noisette d'Italie. sucre de France. noisettes d'Italie. fèves de cacao de Côte d'Ivoire. Framboises lyophilisées d'Espagne. arôme naturel de framboise fabriqué en France.",
+		expected => [
+			{
+				'ingredient' => 'amandes',
+				'matched_text' => 'amandes d\'Espagne.',
+				'origins' => 'Espagne'
+			},
+			{
+				'ingredient' => 'Chocolat noir',
+				'matched_text' => " Chocolat noir de Côte d'Ivoire.",
+				'origins' => "Côte d'Ivoire"
+			},
+			{
+				'ingredient' => 'Huile de noisette',
+				'matched_text' => ' Huile de noisette d\'Italie.',
+				'origins' => 'Italie'
+			},
+			{
+				'ingredient' => 'sucre',
+				'matched_text' => ' sucre de France.',
+				'origins' => 'France'
+			},
+			{
+				'ingredient' => 'noisettes',
+				'matched_text' => ' noisettes d\'Italie.',
+				'origins' => 'Italie'
+			},
+			{
+				'ingredient' => "fèves de cacao",
+				'matched_text' => " fèves de cacao de Côte d'Ivoire.",
+				'origins' => "Côte d'Ivoire"
+			},
+			{
+				'ingredient' => "Framboises lyophilisées",
+				'matched_text' => " Framboises lyophilisées d'Espagne.",
+				'origins' => 'Espagne'
+			}
+		],
+	},
+);
+
+init_origins_regexps();
+
+foreach my $test_ref (@tests) {
+	my $matched_ingredients_ref = [];
+	my $result = 1;
+	my $input_text = $test_ref->{text};
+	while ($result) {
+		my $matched_ingredient_ref = {};
+		$result = match_ingredient_origin($test_ref->{lc}, \$test_ref->{text}, $matched_ingredient_ref);
+		if ($result) {
+			push @$matched_ingredients_ref, $matched_ingredient_ref;
+		}
+	}
+	my $expected = $test_ref->{expected};
+	is_deeply($matched_ingredients_ref, $expected, $test_ref->{desc})
+		|| diag(
+		explain(
+			{
+				lc => $test_ref->{lc},
+				input_text => $input_text,
+				remaining_text => $test_ref->{text},
+				matched => $matched_ingredients_ref,
+				expected => $expected
+			}
+		)
+		);
+}
+
+done_testing();
diff --git a/tests/unit/parse_origins_from_text.t b/tests/unit/parse_origins_from_text.t
new file mode 100644
index 0000000000000..5e4883c7c4f41
--- /dev/null
+++ b/tests/unit/parse_origins_from_text.t
@@ -0,0 +1,78 @@
+use ProductOpener::PerlStandards;
+
+use Test::More;
+use Log::Any::Adapter 'TAP';
+
+use ProductOpener::Test qw/:all/;
+
+use ProductOpener::Ingredients qw/parse_origins_from_text init_origins_regexps/;
+
+my ($test_id, $test_dir, $expected_result_dir, $update_expected_results) = (init_expected_results(__FILE__));
+
+my @tests = (
+	{
+		id => "empty",
+		desc => "Empty",
+		product => {
+			lc => "en",
+			origin_en => "",
+		}
+	},
+	{
+		id => "just-a-country",
+		desc => "Just a country",
+		product => {
+			lc => "en",
+			origin_en => "Italy",
+		}
+	},
+	{
+		id => "rubish-entry",
+		desc => "Rubish entry",
+		product => {
+			lc => "en",
+			origin_en => "NNSTeia nauns",
+		}
+	},
+	{
+		id => "simple-extraction-en",
+		desc => "simple en extraction",
+		product => {
+			lc => "en",
+			origin_en => "Sugar from Italy.",
+		}
+	},
+	{
+		id => "simple-extraction-fr",
+		desc => "simple fr extraction",
+		product => {
+			lc => "fr",
+			origin_fr => "Sucre France."
+		},
+	},
+	{
+		id => "real-list-fr",
+		desc => "Real well written case in fr",
+		product => {
+			lc => "fr",
+			origin_fr =>
+				"amandes d’Espagne. Chocolat noir de Côte d’Ivoire. Huile de noisette d’Italie. sucre de France. noisettes d’Italie. fèves de cacao de Côte d’Ivoire. Framboises lyophilisées d’Espagne. arôme naturel de framboise fabriqué en France.",
+		},
+	}
+);
+
+init_origins_regexps();
+
+foreach my $test_ref (@tests) {
+
+	my $testid = $test_ref->{id};
+	my $product_ref = $test_ref->{product};
+	my $text = $product_ref->{"origin_" . $product_ref->{lc}};
+
+	parse_origins_from_text($product_ref, $text);
+
+	compare_to_expected_results($product_ref, "$expected_result_dir/$testid.json", $update_expected_results, $test_ref);
+
+}
+
+done_testing();