feat: Packaging import through producers platform (#8207)

This PR enables producers to send us detailed packaging data through CSV / Excel files uploaded on the producers platform.

By default, producers send fields like "packaging 1 shape", "packaging 1 material" etc. for each packaging component, with separate columns for each component. At least one big producer (Les Mousquetaires / Intermarché) is sending us data with multiple lines for one product (one line for each packaging component), so we now have a mechanism to support this as well.

Also included in this PR:

- Extended packaging shapes and materials taxonomies, to support values sent by some producers
- New packaging-shapes, packaging-materials and packaging-recycling facets, which are very useful to check whether we can correctly map producer data to our taxonomies. They are populated from the packagings data structure.
- New feature in Tags.pm: canonicalize_taxonomy_tag() now recognizes entries like "Parent / Child" and "Synonym 1 / Synonym 2" (respectively mapped to the child, and to the entry that matches both synonyms)
- Removed the import of packaging data from GS1 (we only had one single shape for the whole product, and the data is often incorrect; GS1 now has a new, much improved format for packaging data, that we can add support for)
- Fix for #8197
- Some refactoring (e.g. deduplicating regular expressions used to process imported data)
- A lot of tests

---------

Co-authored-by: Alex Garel <alex@garel.org>
openfoodfacts · Apr 4, 2023 · bfc1fe2 · bfc1fe2
1 parent ceb524b
commit bfc1fe2
Show file tree

Hide file tree

Showing 333 changed files with 4,526,659 additions and 572 deletions.
diff --git a/cgi/product_multilingual.pl b/cgi/product_multilingual.pl
@@ -118,8 +118,10 @@ ($product_ref)
 
 		my $input_packaging_ref = {};
 		my $prefix = "packaging_" . $packaging_id . "_";
-		foreach
-			my $property ("number_of_units", "shape", "material", "recycling", "quantity_per_unit", "weight_measured")
+		foreach my $property (
+			"number_of_units", "shape", "material", "recycling",
+			"quantity_per_unit", "weight_measured", "weight_specified"
+			)
 		{
 			$input_packaging_ref->{$property} = remove_tags_and_quote(decode utf8 => single_param($prefix . $property));
 		}

diff --git a/cpanfile b/cpanfile
@@ -68,7 +68,7 @@ requires 'Data::DeepAccess';
 requires 'XML::XML2JSON';
 requires 'Redis';
 requires 'Digest::SHA1';
-
+requires 'Data::Difference';
 
 # Mojolicious/Minion
 requires 'Mojolicious::Lite';

diff --git a/lib/ProductOpener/APIProductWrite.pm b/lib/ProductOpener/APIProductWrite.pm
@@ -140,8 +140,13 @@ sub update_packagings ($request_ref, $product_ref, $field, $is_addition, $value)
 				$input_packaging_ref, $response_ref);
 
 			if (defined $packaging_ref) {
-				# Add or combine with the existing packagings components array
-				add_or_combine_packaging_component_data($product_ref, $packaging_ref, $response_ref);
+				if (not $is_addition) {
+					push @{$product_ref->{packagings}}, $packaging_ref;
+				}
+				else {
+					# Add or combine with the existing packagings components array
+					add_or_combine_packaging_component_data($product_ref, $packaging_ref, $response_ref);
+				}
 			}
 		}
 	}

diff --git a/lib/ProductOpener/Ecoscore.pm b/lib/ProductOpener/Ecoscore.pm
@@ -497,7 +497,7 @@ sub load_ecoscore_data_packaging() {
 				target_shape => "bottle",
 				target_material => "rpet",
 				source_shape => "bottle",
-				source_material => "transparent pet"
+				source_material => "transparent rpet"
 			},
 			{
 				target_material => "plastic",
@@ -851,9 +851,13 @@ sub compute_ecoscore ($product_ref) {
 					$product_ref->{ecoscore_data}{"scores"}{$cc} = 79;
 				}
 
-				$log->debug("compute_ecoscore - final score and grade",
-					{score => $product_ref->{"scores"}{$cc}, grade => $product_ref->{"grades"}{$cc}})
-					if $log->is_debug();
+				$log->debug(
+					"compute_ecoscore - final score and grade",
+					{
+						score => $product_ref->{ecoscore_data}{"scores"}{$cc},
+						grade => $product_ref->{ecoscore_data}{"grades"}{$cc}
+					}
+				) if $log->is_debug();
 			}
 
 			# The following values correspond to the Eco-Score for France.

diff --git a/lib/ProductOpener/GS1.pm b/lib/ProductOpener/GS1.pm
@@ -744,19 +744,22 @@ my %gs1_product_to_off = (
 									},
 								],
 
-								[
-									"packaging_information:packagingInformationModule",
-									{
-										fields => [
-											[
-												"packaging",
-												{
-													fields => [["packagingTypeCode", "+packaging%packagingTypeCode"],],
-												},
-											],
-										],
-									},
-								],
+								# 20230328: this packaging field is too imprecise, and the packaging field is deprecated,
+								# as we have a new packagings components structure
+								#
+								#								[
+								#									"packaging_information:packagingInformationModule",
+								#									{
+								#										fields => [
+								#											[
+								#												"packaging",
+								#												{
+								#													fields => [["packagingTypeCode", "+packaging%packagingTypeCode"],],
+								#												},
+								#											],
+								#										],
+								#									},
+								#								],
 
 								[
 									"packaging_marking:packagingMarkingModule",

diff --git a/lib/ProductOpener/Import.pm b/lib/ProductOpener/Import.pm
@@ -66,6 +66,8 @@ BEGIN {
 	use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
 	@EXPORT_OK = qw(
 
+		$IMPORT_MAX_PACKAGING_COMPONENTS
+
 		&import_csv_file
 		&import_products_categories_from_public_database
 
@@ -89,7 +91,7 @@ use ProductOpener::Ingredients qw/:all/;
 use ProductOpener::Images qw/:all/;
 use ProductOpener::DataQuality qw/:all/;
 use ProductOpener::Data qw/:all/;
-use ProductOpener::ImportConvert qw/clean_fields clean_weights assign_quantity_from_field/;
+use ProductOpener::ImportConvert qw/:all/;
 use ProductOpener::Users qw/:all/;
 use ProductOpener::Orgs qw/:all/;
 use ProductOpener::Data qw/:all/;
@@ -111,6 +113,9 @@ use DateTime::Format::ISO8601;
 use URI;
 use Digest::MD5 qw(md5_hex);
 use LWP::UserAgent;
+use Data::Difference qw(data_diff);
+
+$IMPORT_MAX_PACKAGING_COMPONENTS = 10;
 
 # private function to import images from dir
 # args:
@@ -270,7 +275,7 @@ sub deduped_colnames ($columns_ref) {
 Do some pre-processing on input field values:
 
 - Fields suffixed with _if_not_existing are loaded only if the product does not have an existing value
-- Tags fields have special behaviours:
+- Special handling of tags fields:
 	- Empty values are skipped
 	- For labels and categories, we can have columns like labels:Bio with values like 1, Y, Yes
 	- [tags type]_if_match_in_taxonomy : contains candidate values that we import only if we have a matching taxonomy entry
@@ -292,7 +297,7 @@ sub preprocess_field ($imported_product_ref, $product_ref, $field, $yes_regexp,
 			. $imported_product_ref->{$field . "_if_not_existing"} . "\n";
 		$imported_product_ref->{$field} = $imported_product_ref->{$field . "_if_not_existing"};
 	}
-	# if it is a field with a tag behaviour (taxonomized or not)
+	# if it is a tag field (taxonomized or not)
 	# (see %tags_fields in Tags.pm)
 	if (defined $tags_fields{$field}) {
 		foreach my $subfield (sort keys %{$imported_product_ref}) {
@@ -527,10 +532,8 @@ sub set_field_value (
 
 			my $tagid;
 
-			next if $tag =~ /^(\s|,|-|\%|;|_|°)*$/;
 			next
-				if $tag
-				=~ /^\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseigné|non applicable|nr|n\/r)\s*$/i;
+				if $tag =~ /^\s*($empty_regexp|$unknown_regexp|$not_applicable_regexp)\s*$/i;
 
 			$tag =~ s/^\s+//;
 			$tag =~ s/\s+$//;
@@ -1107,6 +1110,99 @@ sub set_nutrition_data_per_fields ($args_ref, $imported_product_ref, $product_re
 	return;
 }
 
+# Import the packaging components of one product from the imported (CSV) data.
+#
+# Input columns are named packaging_[n]_[field], with n going from 1 to $IMPORT_MAX_PACKAGING_COMPONENTS
+# and [field] being one of: number_of_units, shape, material, recycling, quantity_per_unit,
+# weight_specified, weight_measured.
+#
+# - If at least one input component has all key fields (number of units, shape, material,
+#   and a specified or measured weight), the input components replace the existing components.
+# - Otherwise, each input component is added to / combined with the existing components.
+#
+# If the resulting packagings structure differs from the original one, the products_packagings_*
+# entries of $stats_ref are set for the product code, $$modified_ref is incremented
+# and $packagings_edited_ref->{$code} is incremented.
+#
+# args:
+# $imported_product_ref - imported data, read for the code, lc and packaging_[n]_[field] keys
+# $product_ref - product to update (its packagings array is replaced or extended)
+# $stats_ref, $modified_ref, $packagings_edited_ref - updated when the packagings changed
+# $args_ref, $modified_fields_ref, $differing_ref, $differing_fields_ref, $time - currently not used here
+#
+# Returns nothing.
+sub import_packaging_components (
+	$args_ref, $imported_product_ref, $product_ref, $stats_ref,
+	$modified_ref, $modified_fields_ref, $differing_ref, $differing_fields_ref,
+	$packagings_edited_ref, $time
+	)
+{
+
+	my $code = $imported_product_ref->{code};
+
+	# keep a deep copy of the existing packaging components, so that we can check if the resulting components are different
+	my $original_packagings_ref = dclone($product_ref->{packagings} || []);
+
+	# build a list of input packaging components
+	my @input_packagings = ();
+	my $data_is_complete = 0;
+
+	# packaging data is specified in the CSV file in columns named like packaging_1_number_of_units
+	# we search up to $IMPORT_MAX_PACKAGING_COMPONENTS components
+
+	foreach my $i (1 .. $IMPORT_MAX_PACKAGING_COMPONENTS) {
+		my $input_packaging_ref = {};
+		foreach
+			my $field (qw(number_of_units shape material recycling quantity_per_unit weight_specified weight_measured))
+		{
+			$input_packaging_ref->{$field} = $imported_product_ref->{"packaging_${i}_${field}"};
+		}
+		$log->debug("input_packaging_ref", {i => $i, input_packaging_ref => $input_packaging_ref}) if $log->is_debug();
+
+		# Taxonomize the input packaging component data
+		# Only keep components for which we got a result, so that we never store
+		# or try to combine an undefined component (same guard as the API write code)
+		my $packaging_ref
+			= get_checked_and_taxonomized_packaging_component_data($imported_product_ref->{lc}, $input_packaging_ref,
+			{});
+		if (defined $packaging_ref) {
+			push @input_packagings, $packaging_ref;
+		}
+
+		# Record if we have complete input data, with all key fields (for at least 1 component)
+		# not considered a key field (and thus may be lost): recycling instruction, quantity per unit
+		if (
+				(defined $input_packaging_ref->{number_of_units})
+			and (defined $input_packaging_ref->{shape})
+			and (defined $input_packaging_ref->{material})
+			and
+			((defined $input_packaging_ref->{weight_specified}) or (defined $input_packaging_ref->{weight_measured}))
+			)
+		{
+			$data_is_complete = 1;
+		}
+	}
+
+	if ($data_is_complete) {
+		# We seem to have complete data, replace existing data
+		$product_ref->{packagings} = \@input_packagings;
+	}
+	else {
+		# We have partial data, that may be missing fields like number of units, weight etc.
+		# In that case, we try to merge the input components with the existing components
+		# so that we don't lose user entered data such as weights
+		# This may result in some components being duplicated, if the existing component and
+		# the input component have incompatible fields (e.g. if one is a "tray" and the other a "box",
+		# even though they refer to the same thing)
+
+		foreach my $input_packaging_ref (@input_packagings) {
+			add_or_combine_packaging_component_data($product_ref, $input_packaging_ref, {});
+		}
+	}
+
+	# Check if the packagings data has changed
+	my @diffs = data_diff($original_packagings_ref, $product_ref->{packagings});
+	if (scalar @diffs > 0) {
+		$log->debug(
+			"packagings diff",
+			{
+				original_packagings => $original_packagings_ref,
+				input_packagings => \@input_packagings,
+				new_packagings => $product_ref->{packagings},
+				data_is_complete => $data_is_complete,
+				diffs => \@diffs
+			}
+		) if $log->is_debug();
+		$stats_ref->{products_packagings_updated}{$code} = 1;
+		if (scalar @$original_packagings_ref == 0) {
+			$stats_ref->{products_packagings_created}{$code} = 1;
+		}
+		else {
+			$stats_ref->{products_packagings_changed}{$code} = 1;
+		}
+		$$modified_ref++;
+		$packagings_edited_ref->{$code}++;
+		# NOTE: packagings changes are not (yet) recorded in $modified_fields_ref
+	}
+
+	# TODO: update the packagings_complete field
+
+	return;
+}
+
 =head2 import_csv_file ( ARGUMENTS )
 
 C<import_csv_file()> imports product data in the Open Food Facts CSV format
@@ -1315,7 +1411,11 @@ sub import_csv_file ($args_ref) {
 
 	$log->debug("importing products", {}) if $log->is_debug();
 
-	open(my $io, '<:encoding(UTF-8)', $args_ref->{csv_file}) or die("Could not open " . $args_ref->{csv_file} . ": $!");
+	my $io;
+	if (not open($io, '<:encoding(UTF-8)', $args_ref->{csv_file})) {
+		$stats_ref->{error} = "Could not open " . $args_ref->{csv_file} . ": $!";
+		return $stats_ref;
+	}
 
 	# first line contains headers
 	my $columns_ref = $csv->getline($io);
@@ -1330,6 +1430,7 @@ sub import_csv_file ($args_ref) {
 	my @edited = ();
 	my %edited = ();
 	my %nutrients_edited = ();
+	my %packagings_edited = ();
 	my $skip_not_existing = 0;
 	my $skip_no_images = 0;
 
@@ -1344,7 +1445,7 @@ sub import_csv_file ($args_ref) {
 		$i++;
 
 		# By default, use the orgid passed in the arguments
-		# it may be overrode later on a per product basis
+		# it may be overridden later on a per product basis
 		my $org_id = $args_ref->{org_id};
 		my $org_ref;
 
@@ -1573,7 +1674,7 @@ sub import_csv_file ($args_ref) {
 		$Owner_id = get_owner_id($User_id, $Org_id, $args_ref->{owner_id});
 		my $product_id = product_id_for_owner($Owner_id, $code);
 
-		# The userid can be overrode on a per product basis
+		# The userid can be overridden on a per product basis
 		# when we import data from the producers platform to the public platform
 		# we use the orgid as the userid
 		my $user_id = $args_ref->{user_id};
@@ -1982,6 +2083,16 @@ sub import_csv_file ($args_ref) {
 
 		set_nutrition_data_per_fields($args_ref, $imported_product_ref, $product_ref, $stats_ref, \$modified,);
 
+		# Packaging data
+
+		import_packaging_components(
+			$args_ref, $imported_product_ref, $product_ref, $stats_ref,
+			\$modified, \@modified_fields, \$differing, \%differing_fields,
+			\%packagings_edited, $time,
+		);
+
+		# Compute extra stats
+
 		if ((defined $stats_ref->{products_info_added}{$code}) or (defined $stats_ref->{products_info_changed}{$code}))
 		{
 			$stats_ref->{products_info_updated}{$code} = 1;

diff --git a/lib/ProductOpener/ImportConvert.pm b/lib/ProductOpener/ImportConvert.pm
@@ -55,6 +55,12 @@ BEGIN {
 	use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
 	@EXPORT_OK = qw(
 
+		$empty_regexp
+		$unknown_regexp
+		$not_applicable_regexp
+		$none_regexp
+		$empty_unknown_not_applicable_or_none_regexp
+
 		%fields
 		@fields
 		%products
@@ -103,6 +109,7 @@ use ProductOpener::Products qw/:all/;
 use ProductOpener::Ingredients qw/:all/;
 use ProductOpener::Food qw/:all/;
 use ProductOpener::Units qw/:all/;
+use ProductOpener::Text qw/:all/;
 
 use CGI qw/:cgi :form escapeHTML/;
 use URI::Escape::XS;
@@ -122,6 +129,15 @@ use XML::Rules;
 
 my $mode = "append";
 
+# Regular expressions that can be combined to match specific inputs
+# They are plain strings (not qr// objects): they are meant to be interpolated in a pattern,
+# inside a group, and the flags (e.g. /i) are set by the pattern that uses them.
+# Warning: in a single quoted string, \\ yields only ONE backslash, so a literal backslash
+# in the regexp must be written \\\\ (otherwise \\| becomes \| and matches a literal pipe character)
+$empty_regexp = '(?:,|\%|;|_|°|-|\/|\\\\|\.|\s)*';
+$unknown_regexp = 'unknown|inconnu|inconnue|non renseigné(?:e)?(?:s)?|nr|n\/r';
+$not_applicable_regexp = 'n(?:\/|\\\\|\.|-)?a(?:\.)?|(?:not|non)(?: |-)applicable|no aplica';
+$none_regexp = 'none|aucun|aucune|aucun\(e\)';
+
+$empty_unknown_not_applicable_or_none_regexp
+	= join('|', ($empty_regexp, $unknown_regexp, $not_applicable_regexp, $none_regexp));
+
 =head1 FUNCTIONS
 
 =cut
@@ -852,7 +868,7 @@ sub clean_fields ($product_ref) {
 					$brand =~ s/^\s+//;
 					$brand =~ s/\s+$//;
 					# we may get brands with quantifiers like * + ? etc. we need to escape them
-					$brand =~ s/(\*|\+|\?|\(|\)|\[|\]|\{|\}|\$|\^|\\)/\\$1/g;
+					$brand = regexp_escape($brand);
 
 					# dashes/dots/spaces -> allow matching dashes/dot/spaces
 					# e.g. "bons.mayennais" matches "bons mayennais"
@@ -1124,12 +1140,11 @@ sub clean_fields ($product_ref) {
 
 		# remove N, N/A, NA etc.
 		# but not "no", "none" that are useful values (e.g. for specific labels "organic:no", allergens : "none")
-		$product_ref->{$field}
-			=~ s/(^|,)\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseigné|non applicable|no aplica|nr|n\/r)\s*(,|$)//ig;
+		$product_ref->{$field} =~ s/(^|,)\s*($unknown_regexp|$not_applicable_regexp)\s*(,|$)//ig;
 
 		# remove none except for allergens and traces
 		if ($field !~ /allergens|traces/) {
-			$product_ref->{$field} =~ s/(^|,)\s*(none|aucun|aucune|aucun\(e\))\s*(,|$)//ig;
+			$product_ref->{$field} =~ s/(^|,)\s*($none_regexp)\s*(,|$)//ig;
 		}
 
 		if (   ($field =~ /_fr/)