Skip to content

Commit

Permalink
feat: parse origins of ingredients field (#6995)
Browse files Browse the repository at this point in the history
* feat: parse origins of ingredients field #4461

* feat: parse origins of ingredients field #4461

* feat: parse origins of ingredients field #4461
  • Loading branch information
stephanegigandet committed Jul 7, 2022
1 parent 881791a commit 79fa9ae
Show file tree
Hide file tree
Showing 4 changed files with 583 additions and 11 deletions.
183 changes: 172 additions & 11 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1016,13 +1016,27 @@ sub add_specific_ingredients_from_labels($) {
}


=head2 parse_specific_ingredients_from_text ( product_ref, $text )
=head2 parse_specific_ingredients_from_text ( product_ref, $text, $percent_regexp )
Lists of ingredients sometimes include extra mentions for specific ingredients
at the end of the ingredients list. e.g. "Prepared with 50g of fruits for 100g of finished product".
This function extracts those mentions and adds them to the specific_ingredients structure.
This function is also used to parse the origins of ingredients field.
=head3 Arguments
=head4 product_ref
=head4 text $text
=head4 percent regular expression $percent_regexp
Used to find % values, language specific.
Pass undef in order to skip % recognition. This is useful if we know the text is only for the origins of ingredients.
=head3 Return values
=head4 specific_ingredients structure
Expand Down Expand Up @@ -1062,7 +1076,8 @@ sub parse_specific_ingredients_from_text($$$) {
# examples:
# Total Milk Content 73%.

if ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) {
if ((defined $percent_regexp)
and ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) {
$percent = $2; # $percent_regexp
$ingredient = $1;
$matched_text = $&;
Expand All @@ -1071,7 +1086,7 @@ sub parse_specific_ingredients_from_text($$$) {
}

# Origin of the milk: United Kingdom
elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
# in order to not overmatch something like "Origin of milk: UK, some other mention."
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
Expand All @@ -1081,7 +1096,6 @@ sub parse_specific_ingredients_from_text($$$) {
# Remove the matched text
$text = $` . ' ' . $';
}

}
elsif ($product_lc eq "fr") {

Expand All @@ -1090,7 +1104,8 @@ sub parse_specific_ingredients_from_text($$$) {
# Teneur en lactose < 0,01 g/100 g.
# Préparée avec 50 g de fruits pour 100 g de produit fini.

if ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) {
if ((defined $percent_regexp)
and ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) {
$percent = $1; # $percent_regexp
$ingredient = $2;
$matched_text = $&;
Expand All @@ -1100,7 +1115,8 @@ sub parse_specific_ingredients_from_text($$$) {

# Teneur totale en sucres : 60 g pour 100 g de produit fini.
# Teneur en citron de 100%
elsif ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i) {
elsif ((defined $percent_regexp)
and ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i)) {
$percent = $2; # $percent_regexp
$ingredient = $1;
$matched_text = $&;
Expand All @@ -1109,7 +1125,7 @@ sub parse_specific_ingredients_from_text($$$) {
}

# Origine du Cacao: Pérou
elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
# in order to not overmatch something like "Origin of milk: UK, some other mention."
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
Expand All @@ -1118,6 +1134,8 @@ sub parse_specific_ingredients_from_text($$$) {
$matched_text = $&;
# Remove the matched text
$text = $` . ' ' . $';
# Remove extra spaces
$ingredient =~ s/\s+$//;
}

}
Expand Down Expand Up @@ -1145,6 +1163,136 @@ sub parse_specific_ingredients_from_text($$$) {
}


=head2 parse_origins_from_text ( product_ref, $text)
This function parses the origins of ingredients field to extract the origins of specific ingredients.
The origins are stored in the specific_ingredients structure of the product.
Note: this function is similar to parse_specific_ingredients_from_text() that operates on ingredients lists.
The difference is that parse_specific_ingredients_from_text() only extracts and recognizes text that is
an extra mention at the end of an ingredient list (e.g. "Origin of strawberries: Spain"),
while parse_origins_from_text() will also recognize text like "Strawberries: Spain".
=head3 Arguments
=head4 product_ref
=head4 text $text
=head3 Return values
=head4 specific_ingredients structure
Array of specific ingredients.
=head4
=cut

# parse_origins_from_text ( $product_ref, $text )
#
# Extracts "ingredient: origin" mentions from the origins of ingredients field
# and appends one entry per mention to $product_ref->{specific_ingredients}.
#
# Arguments:
# - $product_ref: product hash reference; its "lc" field selects the language
#   specific patterns (only "en" and "fr" are currently handled).
# - $text: text of the origins of ingredients field.
#
# Returns the text with every recognized mention replaced by a single space.
# The text is returned unchanged when it is undefined, when the language is
# not supported, or when nothing matches.
sub parse_origins_from_text($$) {

	my $product_ref = shift;
	my $text = shift;

	# Nothing to parse
	return $text if not defined $text;

	# Avoid "uninitialized value" warnings for products without a language
	my $product_lc = defined $product_ref->{lc} ? $product_ref->{lc} : "";

	# Language specific prefixes that introduce an explicit origin mention.
	# Note: use non-capturing groups (starting with (?: ) for all groups,
	# except groups that capture actual data: ingredient name, origins
	my %origin_prefix_for = (
		# Origin of the milk: United Kingdom.
		en => qr/origin of (?:the )?/i,
		# Origine du Cacao: Pérou
		fr => qr/origine (?:de |du |de la |des |de l')/i,
	);

	my $origin_prefix = $origin_prefix_for{$product_lc};

	# Unsupported language: leave the text untouched
	return $text if not defined $origin_prefix;

	# Regexps match until we reach a . ; " - " or the end of the text.
	#
	# Note: the regexps below do not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
	# in order to not overmatch something like "Origin of milk: UK, some other mention."
	# In the future, we could try to be smarter and match more if we can recognize
	# the next words exist in the origins taxonomy.

	# "Origin of the milk: United Kingdom" / "Origine du Cacao: Pérou"
	my $origin_of_regexp = qr/\s*(?:$origin_prefix)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i;

	# "Strawberries: Spain" / "Cacao: Pérou"
	my $ingredient_origin_regexp = qr/\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i;

	# TODO (fr):
	# Fraises de Bretagne
	# Filet de dinde de Vendée

	# Go through the text multiple times, as long as we have one match.
	# The explicit "origin of" form is tried first on the whole text, then the generic form.
	# Each match removes at least 3 characters and adds back 1, so the loop terminates.
	while (($text =~ $origin_of_regexp) or ($text =~ $ingredient_origin_regexp)) {

		# Copy the match variables right away, before any other regexp clobbers them
		my ($ingredient, $origins, $matched_text, $before, $after) = ($1, $2, $&, $`, $');

		# Remove the matched text
		$text = $before . ' ' . $after;

		# Remove extra spaces (e.g. "milk : UK" captures "milk "), for all languages
		$ingredient =~ s/\s+$//;
		$matched_text =~ s/^\s+//;

		# A mention without an ingredient name (e.g. " : Spain") is not useful:
		# skip it, but keep parsing the rest of the text
		next if $ingredient eq "";

		my $specific_ingredients_ref = {
			id => canonicalize_taxonomy_tag($product_lc, "ingredients", $ingredient),
			ingredient => $ingredient,
			text => $matched_text,
		};

		if (defined $origins) {
			my @origins_tags = map { canonicalize_taxonomy_tag($product_lc, "origins", $_) } split(/,/, $origins);
			$specific_ingredients_ref->{origins} = join(",", @origins_tags);
		}

		push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
	}

	return $text;
}


=head2 parse_ingredients_text ( product_ref )
Parse the ingredients_text field to extract individual ingredients.
Expand Down Expand Up @@ -2165,15 +2313,28 @@ sub extract_ingredients_from_text($) {

delete $product_ref->{ingredients_percent_analysis};

# Parse the ingredients list to extract individual ingredients and sub-ingredients
# to create the ingredients array with nested sub-ingredients arrays
# The specific ingredients array will contain indications regarding the percentage,
# origins, labels etc. of specific ingredients. This information may come from:
# - the origin of ingredients field ("origin")
# - labels (e.g. "British eggs")
# - the end of the list of the ingredients. e.g. "Origin of the rice: Thailand"

$product_ref->{specific_ingredients} = [];

parse_ingredients_text($product_ref);
# Ingredients origins may be listed in the origin field
# e.g. "Origin of the rice: Thailand."
my $product_lc = $product_ref->{lc};
if (defined $product_ref->{"origin_" . $product_lc}) {
parse_origins_from_text($product_ref, $product_ref->{"origin_" . $product_lc});
}

# Add specific ingredients from labels
add_specific_ingredients_from_labels($product_ref);
add_specific_ingredients_from_labels($product_ref);

# Parse the ingredients list to extract individual ingredients and sub-ingredients
# to create the ingredients array with nested sub-ingredients arrays

parse_ingredients_text($product_ref);

if (defined $product_ref->{ingredients}) {

Expand Down
Loading

0 comments on commit 79fa9ae

Please sign in to comment.