feat: parse origins of ingredients field (#6995)

* feat: parse origins of ingredients field #4461 * feat: parse origins of ingredients field #4461 * feat: parse origins of ingredients field #4461
openfoodfacts · Jul 7, 2022 · 79fa9ae · 79fa9ae
1 parent 881791a
commit 79fa9ae
Show file tree

Hide file tree

Showing 4 changed files with 583 additions and 11 deletions.
diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm
@@ -1016,13 +1016,27 @@ sub add_specific_ingredients_from_labels($) {
 }
 
 
-=head2 parse_specific_ingredients_from_text ( product_ref, $text )
+=head2 parse_specific_ingredients_from_text ( product_ref, $text, $percent_regexp )
 
 Lists of ingredients sometime include extra mentions for specific ingredients
 at the end of the ingredients list. e.g. "Prepared with 50g of fruits for 100g of finished product".
 
 This function extracts those mentions and adds them to the specific_ingredients structure.
 
+This function is also used to parse the origins of ingredients field.
+
+=head3 Arguments
+
+=head4 product_ref
+
+=head4 text $text
+
+=head4 percent regular expression $percent_regexp
+
+Used to find % values, language specific.
+
+Pass undef in order to skip % recognition. This is useful if we know the text is only for the origins of ingredients.
+
 =head3 Return values
 
 =head4 specific_ingredients structure
@@ -1062,7 +1076,8 @@ sub parse_specific_ingredients_from_text($$$) {
 			# examples:
 			# Total Milk Content 73%.
 
-			if ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) {
+			if ((defined $percent_regexp)
+				and ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) {
 				$percent = $2;	# $percent_regexp
 				$ingredient = $1;
 				$matched_text = $&;
@@ -1071,7 +1086,7 @@ sub parse_specific_ingredients_from_text($$$) {
 			}
 
 			# Origin of the milk: United Kingdom
-			elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
+			elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
 				# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
 				# in order to not overmatch something like "Origin of milk: UK, some other mention."
 				# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
@@ -1081,7 +1096,6 @@ sub parse_specific_ingredients_from_text($$$) {
 				# Remove the matched text
 				$text = $` . ' ' . $';
 			}
-
 		}
 		elsif ($product_lc eq "fr") {
 
@@ -1090,7 +1104,8 @@ sub parse_specific_ingredients_from_text($$$) {
 			# Teneur en lactose < 0,01 g/100 g.
 			# Préparée avec 50 g de fruits pour 100 g de produit fini.
 
-			if ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) {
+			if ((defined $percent_regexp)
+				and ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) {
 				$percent = $1;	# $percent_regexp
 				$ingredient = $2;
 				$matched_text = $&;
@@ -1100,7 +1115,8 @@ sub parse_specific_ingredients_from_text($$$) {
 
 			# Teneur totale en sucres : 60 g pour 100 g de produit fini.
 			# Teneur en citron de 100%
-			elsif ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i) {
+			elsif ((defined $percent_regexp)
+				and ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i)) {
 				$percent = $2;	# $percent_regexp
 				$ingredient = $1;
 				$matched_text = $&;
@@ -1109,7 +1125,7 @@ sub parse_specific_ingredients_from_text($$$) {
 			}
 
 			# Origine du Cacao: Pérou
-			elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
+			elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
 				# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
 				# in order to not overmatch something like "Origin of milk: UK, some other mention."
 				# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
@@ -1118,6 +1134,8 @@ sub parse_specific_ingredients_from_text($$$) {
 				$matched_text = $&;
 				# Remove the matched text
 				$text = $` . ' ' . $';
+				# Remove extra spaces
+				$ingredient =~ s/\s+$//;
 			}
 
 		}
@@ -1145,6 +1163,136 @@ sub parse_specific_ingredients_from_text($$$) {
 }
 
 
+=head2 parse_origins_from_text ( product_ref, $text)
+
+This function parses the origins of ingredients field to extract the origins of specific ingredients.
+The origins are stored in the specific_ingredients structure of the product.
+
+Note: this function is similar to parse_specific_ingredients_from_text() that operates on ingredients lists.
+The difference is that parse_specific_ingredients_from_text() only extracts and recognizes text that is
+an extra mention at the end of an ingredient list (e.g. "Origin of strawberries: Spain"),
+while parse_origins_from_text() will also recognize text like "Strawberries: Spain".
+
+=head3 Arguments
+
+=head4 product_ref
+
+=head4 text $text
+
+=head3 Return values
+
+=head4 specific_ingredients structure
+
+Array of specific ingredients.
+
+=head4 remaining text: $text with the recognized origin mentions removed (this is the value returned)
+
+=cut
+
+sub parse_origins_from_text($$) {
+	# Arguments, in order: the product hash reference, then the origins text to parse.
+	my $product_ref = shift;
+	my $text = shift;
+	# Language code of the product: it selects which set of regexps is tried below.
+	my $product_lc = $product_ref->{lc};
+
+	# Scan the text repeatedly: each pass extracts at most one mention and removes it
+	# from $text. The loop ends after the first pass that does not match anything.
+	my $ingredient = "start";	# dummy true value so that the first pass runs
+
+	while ($ingredient) {
+
+		# Reset the per-pass values: $ingredient stays undef unless a regexp matches below
+		$ingredient = undef;
+		my $matched_text;
+		my $origins;
+
+		# Note: in regular expressions below, use non-capturing groups (starting with (?: )
+		# for all groups, except groups that capture actual data: ingredient name, origins
+
+		# Regexps should match until we reach a ; a . a " - " separator or the end of the text
+
+		if ($product_lc eq "en") {
+			# NOTE(review): unlike the "fr" branch, trailing spaces are not removed from $ingredient here - confirm this is intended
+			# Origin of the milk: United Kingdom.
+			if ($text =~ /\s*(?:origin of (?:the )?)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
+				# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
+				# in order to not overmatch something like "Origin of milk: UK, some other mention."
+				# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
+				$origins = $2;
+				$ingredient = $1;
+				$matched_text = $&;
+				# Remove the matched text, leaving a single space in its place
+				$text = $` . ' ' . $';
+			}
+			# Strawberries: Spain
+			elsif ($text =~ /\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i) {
+				# Generic form without the "Origin of" prefix: everything before the colon is taken as the ingredient.
+				# Same limitation as the regexp above: multiple origins separated by commas (e.g. "Milk: UK, UE") are not matched,
+				# in order to not overmatch something like "Milk: UK, some other mention."
+				$origins = $2;
+				$ingredient = $1;
+				$matched_text = $&;
+				# Remove the matched text, leaving a single space in its place
+				$text = $` . ' ' . $';
+			}
+		}
+		elsif ($product_lc eq "fr") {
+
+			# Origine du Cacao: Pérou
+			if ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
+				# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
+				# in order to not overmatch something like "Origin of milk: UK, some other mention."
+				# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
+				$origins = $2;
+				$ingredient = $1;
+				$matched_text = $&;
+				# Remove the matched text, leaving a single space in its place
+				$text = $` . ' ' . $';
+				# Remove trailing spaces from the ingredient name
+				$ingredient =~ s/\s+$//;
+			}
+			# Cacao: Pérou
+			elsif ($text =~ /\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i) {
+				# Generic form without the "Origine de" prefix: everything before the colon is taken as the ingredient.
+				# Same limitation as the regexp above: multiple origins separated by commas are not matched,
+				# in order to not overmatch a trailing mention that is not an origin.
+				$origins = $2;
+				$ingredient = $1;
+				$matched_text = $&;
+				# Remove the matched text, leaving a single space in its place
+				$text = $` . ' ' . $';
+				# Remove trailing spaces from the ingredient name
+				$ingredient =~ s/\s+$//;
+			}
+			# TODO:
+			# Fraises de Bretagne
+			# Filet de dinde de Vendée
+
+		}
+
+		# If we found an ingredient, save it in specific_ingredients
+		if (defined $ingredient) {
+			my $ingredient_id = canonicalize_taxonomy_tag($product_lc, "ingredients", $ingredient);
+			# The regexps start with \s*, so the matched text can begin with whitespace
+			$matched_text =~ s/^\s+//;
+
+			my $specific_ingredients_ref = {
+				id => $ingredient_id,
+				ingredient => $ingredient,
+				text => $matched_text,
+			};
+			# Each comma-separated origin goes through canonicalize_taxonomy_tag(), then they are re-joined with commas
+			defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origins ));
+
+			push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
+		}
+	}
+	# Return what is left of the text after the recognized mentions have been removed
+	return $text;
+}
+
+
 =head2 parse_ingredients_text ( product_ref )
 
 Parse the ingredients_text field to extract individual ingredients.
@@ -2165,15 +2313,28 @@ sub extract_ingredients_from_text($) {
 
 	delete $product_ref->{ingredients_percent_analysis};
 
-	# Parse the ingredients list to extract individual ingredients and sub-ingredients
-	# to create the ingredients array with nested sub-ingredients arrays
+	# The specific ingredients array will contain indications regarding the percentage,
+	# origins, labels etc. of specific ingredients. This information may come from:
+	# - the origin of ingredients field ("origin")
+	# - labels (e.g. "British eggs")
+	# - the end of the list of the ingredients. e.g. "Origin of the rice: Thailand"
 
 	$product_ref->{specific_ingredients} = [];
 
-	parse_ingredients_text($product_ref);
+	# Ingredients origins may be listed in the origin field
+	# e.g. "Origin of the rice: Thailand."
+	my $product_lc = $product_ref->{lc};
+	if (defined $product_ref->{"origin_" . $product_lc}) {
+		parse_origins_from_text($product_ref, $product_ref->{"origin_" . $product_lc});
+	}
 
 	# Add specific ingredients from labels
-	add_specific_ingredients_from_labels($product_ref);	
+	add_specific_ingredients_from_labels($product_ref);		
+
+	# Parse the ingredients list to extract individual ingredients and sub-ingredients
+	# to create the ingredients array with nested sub-ingredients arrays	
+
+	parse_ingredients_text($product_ref);
 
 	if (defined $product_ref->{ingredients}) {