Skip to content

Commit

Permalink
test: match_ingredient_origin unit test (#8174)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexgarel committed Mar 13, 2023
1 parent e4085c7 commit aae0385
Show file tree
Hide file tree
Showing 10 changed files with 334 additions and 1 deletion.
39 changes: 39 additions & 0 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ BEGIN {
&has_specific_ingredient_property
&init_origins_regexps
&match_ingredient_origin
&parse_origins_from_text
); # symbols to export on request
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
Expand Down Expand Up @@ -1014,6 +1018,38 @@ sub match_ingredient_origin ($product_lc, $text_ref, $matched_ingredient_ref) {

return 1;
}
# Try to match without a "from" marker (e.g. "Strawberry France")
elsif ($$text_ref
=~ /\s*([^,.;:]+)\s+((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i)
{
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
# in order to not overmatch something like "Origin of milk: UK, some other mention."
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.

$matched_ingredient_ref->{ingredient} = $1;
$matched_ingredient_ref->{origins} = $2;
$matched_ingredient_ref->{matched_text} = $&;

# keep the matched ingredient only if it is a known ingredient in the taxonomy, in order to avoid false positives
# e.g. "something made in France" should not be turned into ingredient "something made in" + origin "France"
if (
not(
exists_taxonomy_tag(
"ingredients",
canonicalize_taxonomy_tag($product_lc, "ingredients", $matched_ingredient_ref->{ingredient})
)
)
)
{
$matched_ingredient_ref = {};
}
else {
# Remove the matched text
$$text_ref = $` . ' ' . $';

return 1;
}
}
return 0;
}

Expand Down Expand Up @@ -1082,6 +1118,9 @@ sub parse_origins_from_text ($product_ref, $text) {

my $product_lc = $product_ref->{lc};

# Normalize single quotes
$text =~ s//'/g;

# Go through the ingredient lists multiple times
# as long as we have one match
my $matched_ingredient = "start";
Expand Down
7 changes: 6 additions & 1 deletion lib/ProductOpener/Test.pm
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,11 @@ sub compare_to_expected_results ($object_ref, $expected_results_file, $update_ex

my $json = JSON->new->allow_nonref->canonical;

my $desc = undef;
if (defined $test_ref) {
$desc = $test_ref->{desc} // $test_ref->{id};
}

if ($update_expected_results) {
open(my $result, ">:encoding(UTF-8)", $expected_results_file)
or confess("Could not create $expected_results_file: $!");
Expand All @@ -309,7 +314,7 @@ sub compare_to_expected_results ($object_ref, $expected_results_file, $update_ex

local $/; #Enable 'slurp' mode
my $expected_object_ref = $json->decode(<$expected_result>);
is_deeply($object_ref, $expected_object_ref) or diag(explain $test_ref, explain $object_ref);
is_deeply($object_ref, $expected_object_ref, $desc) or diag(explain $test_ref, explain $object_ref);
}
else {
fail("could not load $expected_results_file");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"lc" : "en",
"origin_en" : ""
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"lc" : "en",
"origin_en" : "Italy"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"lc" : "fr",
"origin_fr" : "amandes d’Espagne. Chocolat noir de Côte d’Ivoire. Huile de noisette d’Italie. sucre de France. noisettes d’Italie. fèves de cacao de Côte d’Ivoire. Framboises lyophilisées d’Espagne. arôme naturel de framboise fabriqué en France.",
"specific_ingredients" : [
{
"id" : "en:almond",
"ingredient" : "amandes",
"origins" : "en:spain",
"text" : "amandes d'Espagne."
},
{
"id" : "en:dark-chocolate",
"ingredient" : "Chocolat noir",
"origins" : "en:cote-d-ivoire",
"text" : "Chocolat noir de Côte d'Ivoire."
},
{
"id" : "en:hazelnut-oil",
"ingredient" : "Huile de noisette",
"origins" : "en:italy",
"text" : "Huile de noisette d'Italie."
},
{
"id" : "en:sugar",
"ingredient" : "sucre",
"origins" : "en:france",
"text" : "sucre de France."
},
{
"id" : "en:hazelnut",
"ingredient" : "noisettes",
"origins" : "en:italy",
"text" : "noisettes d'Italie."
},
{
"id" : "en:cocoa-bean",
"ingredient" : "fèves de cacao",
"origins" : "en:cote-d-ivoire",
"text" : "fèves de cacao de Côte d'Ivoire."
},
{
"id" : "en:freeze-dried-raspberries",
"ingredient" : "Framboises lyophilisées",
"origins" : "en:spain",
"text" : "Framboises lyophilisées d'Espagne."
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"lc" : "en",
"origin_en" : "NNSTeia nauns"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"lc" : "en",
"origin_en" : "Sugar from Italy.",
"specific_ingredients" : [
{
"id" : "en:sugar",
"ingredient" : "Sugar",
"origins" : "en:italy",
"text" : "Sugar from Italy."
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"lc" : "fr",
"origin_fr" : "Sucre France.",
"specific_ingredients" : [
{
"id" : "en:sugar",
"ingredient" : "Sucre",
"origins" : "en:france",
"text" : "Sucre France."
}
]
}
125 changes: 125 additions & 0 deletions tests/unit/match_ingredient_origin.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
use ProductOpener::PerlStandards;

use Test::More;
use Log::Any::Adapter 'TAP';

use ProductOpener::Ingredients qw/match_ingredient_origin init_origins_regexps/;

# Table of test cases for match_ingredient_origin().
# Each case is a hash with:
#   desc     - test name reported by is_deeply()
#   lc       - language code passed as first argument to match_ingredient_origin()
#   text     - input text; a reference to it is passed to the function, so it is modified in place
#   expected - list of the match hashes (ingredient / matched_text / origins) collected
#              over successive calls, in order; [] when nothing is expected to match
my @tests = (

# No text at all: nothing to match
{
desc => "Empty",
lc => "en",
text => "",
expected => [],
},
# An origin with no ingredient in front of it is not a match
{
desc => "Just a country",
lc => "en",
text => "Italy",
expected => [],
},
# Text containing neither an ingredient nor an origin
{
desc => "Rubish entry",
lc => "en",
text => "NNSTeia nauns",
expected => [],
},
# Ingredient and origin linked by an explicit "from" marker
{
desc => "simple en extraction",
lc => "en",
text => "Sugar from Italy.",
expected => [
{
ingredient => 'Sugar',
matched_text => 'Sugar from Italy.',
origins => 'Italy'
}
],
},
# Ingredient directly followed by the origin, without any "from" marker
{
desc => "simple fr extraction",
lc => "fr",
text => "Sucre France.",
expected => [
{
ingredient => 'Sucre',
matched_text => 'Sucre France.',
origins => 'France'
}
],
},
# Several sentences in a row: one match is expected per call.
# Note that matched_text starts with a space for every entry after the first,
# and that the last sentence ("arôme naturel de framboise fabriqué en France.")
# is not expected to produce a match.
{
desc => "Real well written case in fr",
lc => "fr",
text =>
"amandes d'Espagne. Chocolat noir de Côte d'Ivoire. Huile de noisette d'Italie. sucre de France. noisettes d'Italie. fèves de cacao de Côte d'Ivoire. Framboises lyophilisées d'Espagne. arôme naturel de framboise fabriqué en France.",
expected => [
{
'ingredient' => 'amandes',
'matched_text' => 'amandes d\'Espagne.',
'origins' => 'Espagne'
},
{
'ingredient' => 'Chocolat noir',
'matched_text' => " Chocolat noir de Côte d'Ivoire.",
'origins' => "Côte d'Ivoire"
},
{
'ingredient' => 'Huile de noisette',
'matched_text' => ' Huile de noisette d\'Italie.',
'origins' => 'Italie'
},
{
'ingredient' => 'sucre',
'matched_text' => ' sucre de France.',
'origins' => 'France'
},
{
'ingredient' => 'noisettes',
'matched_text' => ' noisettes d\'Italie.',
'origins' => 'Italie'
},
{
'ingredient' => "fèves de cacao",
'matched_text' => " fèves de cacao de Côte d'Ivoire.",
'origins' => "Côte d'Ivoire"
},
{
'ingredient' => "Framboises lyophilisées",
'matched_text' => " Framboises lyophilisées d'Espagne.",
'origins' => 'Espagne'
}
],
},
);

# Must run before any call to match_ingredient_origin()
# (presumably builds the per-language origin regexps the matcher relies on).
init_origins_regexps();

# Run every case: call match_ingredient_origin() until it stops matching,
# then compare the collected matches with the expected list.
for my $case_ref (@tests) {

	# Keep a copy of the untouched input for diagnostics:
	# match_ingredient_origin() rewrites the text it is given through the reference.
	my $original_text = $case_ref->{text};
	my @found;

	while (1) {
		# A fresh hash for each call, so that every match is stored separately
		my $match_ref = {};
		last unless match_ingredient_origin($case_ref->{lc}, \$case_ref->{text}, $match_ref);
		push @found, $match_ref;
	}

	my $wanted_ref = $case_ref->{expected};
	unless (is_deeply(\@found, $wanted_ref, $case_ref->{desc})) {
		# On failure, dump the full context, including what is left of the text
		diag(
			explain(
				{
					lc => $case_ref->{lc},
					input_text => $original_text,
					remaining_text => $case_ref->{text},
					matched => \@found,
					expected => $wanted_ref
				}
			)
		);
	}
}

done_testing();
80 changes: 80 additions & 0 deletions tests/unit/parse_origins_from_text.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
use ProductOpener::PerlStandards;

use Test::More;
use Log::Any::Adapter 'TAP';

use ProductOpener::Test qw/:all/;

use ProductOpener::Ingredients qw/parse_origins_from_text init_origins_regexps/;

# init_expected_results() is given the path of this test file and returns four values;
# only $expected_result_dir and $update_expected_results are used further down in this script.
my ($test_id, $test_dir, $expected_result_dir, $update_expected_results) = (init_expected_results(__FILE__));

# Table of test cases for parse_origins_from_text().
# Each case is a hash with:
#   id      - used to build the name of the expected results file ("<id>.json")
#   desc    - human readable description of the case
#   product - product hash passed to parse_origins_from_text(); it holds the language code (lc)
#             and the text to parse under the key "origin_<lc>"
my @tests = (
# No text at all
{
id => "empty",
desc => "Empty",
product => {
lc => "en",
origin_en => "",
}
},
# An origin with no ingredient in front of it
{
id => "just-a-country",
desc => "Just a country",
product => {
lc => "en",
origin_en => "Italy",
}
},
# Text containing neither an ingredient nor an origin
{
id => "rubish-entry",
desc => "Rubish entry",
product => {
lc => "en",
origin_en => "NNSTeia nauns",
}
},
# Ingredient and origin linked by an explicit "from" marker
{
id => "simple-extraction-en",
desc => "simple en extraction",
product => {
lc => "en",
origin_en => "Sugar from Italy.",
}
},
# Ingredient directly followed by the origin, without any "from" marker
{
id => "simple-extraction-fr",
desc => "simple fr extraction",
product => {
lc => "fr",
origin_fr => "Sucre France."
},
},
# Several sentences in a row; note that this input uses typographic apostrophes (’)
{
id => "real-list-fr",
desc => "Real well written case in fr",
product => {
lc => "fr",
origin_fr =>
"amandes d’Espagne. Chocolat noir de Côte d’Ivoire. Huile de noisette d’Italie. sucre de France. noisettes d’Italie. fèves de cacao de Côte d’Ivoire. Framboises lyophilisées d’Espagne. arôme naturel de framboise fabriqué en France.",
},
}
);

# Must run before any call to parse_origins_from_text()
# (presumably builds the per-language origin regexps the parser relies on).
init_origins_regexps();

# Run every case: parse the origins text of the product, then compare the whole
# product hash with the stored expected results for that case.
foreach my $test_ref (@tests) {

	my $testid = $test_ref->{id};
	my $product_ref = $test_ref->{product};

	# The text to parse is stored in the product under "origin_<lc>"
	my $text = $product_ref->{"origin_" . $product_ref->{lc}};

	# The product hash itself is what gets compared afterwards,
	# so anything the parser adds to it is part of the check.
	parse_origins_from_text($product_ref, $text);

	# $test_ref is passed along so that the comparison can report which case failed
	compare_to_expected_results($product_ref, "$expected_result_dir/$testid.json", $update_expected_results, $test_ref);

}

done_testing();

0 comments on commit aae0385

Please sign in to comment.