Skip to content

Commit

Permalink
feat: Packaging import through producers platform (#8207)
Browse files Browse the repository at this point in the history
This PR enables producers to send us detailed packaging import data through CSV / Excel files uploaded on the producers platform.

The default is for producers to send fields like "packaging 1 shape", "packaging 1 material" etc. for each packaging component, with separate columns for each component.

At least one big producer (Les Mousquetaires / Intermarché) is sending us data with multiple lines for one product (one for each packaging component), so we now have a mechanism to support this as well.

Also included changes:
- Extended packaging shapes and materials taxonomies, to support values sent from some producers
- New packaging-shapes, packaging-materials and packaging-recycling facets, that are very useful to see if we can correctly map producer data to our taxonomies. Those are populated from the packagings data structure.
- New feature in Tags.pm: canonicalize_taxonomy_tag() now recognizes entries like "Parent / Child" and "Synonym 1 / Synonym 2" (mapped respectively to the child, and to the entry that matches both synonyms)
- Remove the import of packaging data from GS1 (we only had one single shape for all of the product, the data is often incorrect. GS1 now has a new much improved format for packaging data, that we can add support for)
- Fix for #8197 
- Some refactoring (e.g. deduplicating regular expressions used to process imported data)
- A lot of tests

---------

Co-authored-by: Alex Garel <alex@garel.org>
  • Loading branch information
stephanegigandet and alexgarel committed Apr 4, 2023
1 parent ceb524b commit bfc1fe2
Show file tree
Hide file tree
Showing 333 changed files with 4,526,659 additions and 572 deletions.
6 changes: 4 additions & 2 deletions cgi/product_multilingual.pl
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,10 @@ ($product_ref)

my $input_packaging_ref = {};
my $prefix = "packaging_" . $packaging_id . "_";
foreach
my $property ("number_of_units", "shape", "material", "recycling", "quantity_per_unit", "weight_measured")
foreach my $property (
"number_of_units", "shape", "material", "recycling",
"quantity_per_unit", "weight_measured", "weight_specified"
)
{
$input_packaging_ref->{$property} = remove_tags_and_quote(decode utf8 => single_param($prefix . $property));
}
Expand Down
2 changes: 1 addition & 1 deletion cpanfile
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ requires 'Data::DeepAccess';
requires 'XML::XML2JSON';
requires 'Redis';
requires 'Digest::SHA1';

requires 'Data::Difference';

# Mojolicious/Minion
requires 'Mojolicious::Lite';
Expand Down
9 changes: 7 additions & 2 deletions lib/ProductOpener/APIProductWrite.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,13 @@ sub update_packagings ($request_ref, $product_ref, $field, $is_addition, $value)
$input_packaging_ref, $response_ref);

if (defined $packaging_ref) {
# Add or combine with the existing packagings components array
add_or_combine_packaging_component_data($product_ref, $packaging_ref, $response_ref);
if (not $is_addition) {
push @{$product_ref->{packagings}}, $packaging_ref;
}
else {
# Add or combine with the existing packagings components array
add_or_combine_packaging_component_data($product_ref, $packaging_ref, $response_ref);
}
}
}
}
Expand Down
12 changes: 8 additions & 4 deletions lib/ProductOpener/Ecoscore.pm
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ sub load_ecoscore_data_packaging() {
target_shape => "bottle",
target_material => "rpet",
source_shape => "bottle",
source_material => "transparent pet"
source_material => "transparent rpet"
},
{
target_material => "plastic",
Expand Down Expand Up @@ -851,9 +851,13 @@ sub compute_ecoscore ($product_ref) {
$product_ref->{ecoscore_data}{"scores"}{$cc} = 79;
}

$log->debug("compute_ecoscore - final score and grade",
{score => $product_ref->{"scores"}{$cc}, grade => $product_ref->{"grades"}{$cc}})
if $log->is_debug();
$log->debug(
"compute_ecoscore - final score and grade",
{
score => $product_ref->{ecoscore_data}{"scores"}{$cc},
grade => $product_ref->{ecoscore_data}{"grades"}{$cc}
}
) if $log->is_debug();
}

# The following values correspond to the Eco-Score for France.
Expand Down
29 changes: 16 additions & 13 deletions lib/ProductOpener/GS1.pm
Original file line number Diff line number Diff line change
Expand Up @@ -744,19 +744,22 @@ my %gs1_product_to_off = (
},
],

[
"packaging_information:packagingInformationModule",
{
fields => [
[
"packaging",
{
fields => [["packagingTypeCode", "+packaging%packagingTypeCode"],],
},
],
],
},
],
# 20230328: this packaging field is too imprecise, and the packaging field is deprecated,
# as we have a new packagings components structure
#
# [
# "packaging_information:packagingInformationModule",
# {
# fields => [
# [
# "packaging",
# {
# fields => [["packagingTypeCode", "+packaging%packagingTypeCode"],],
# },
# ],
# ],
# },
# ],

[
"packaging_marking:packagingMarkingModule",
Expand Down
129 changes: 120 additions & 9 deletions lib/ProductOpener/Import.pm
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ BEGIN {
use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
@EXPORT_OK = qw(
$IMPORT_MAX_PACKAGING_COMPONENTS
&import_csv_file
&import_products_categories_from_public_database
Expand All @@ -89,7 +91,7 @@ use ProductOpener::Ingredients qw/:all/;
use ProductOpener::Images qw/:all/;
use ProductOpener::DataQuality qw/:all/;
use ProductOpener::Data qw/:all/;
use ProductOpener::ImportConvert qw/clean_fields clean_weights assign_quantity_from_field/;
use ProductOpener::ImportConvert qw/:all/;
use ProductOpener::Users qw/:all/;
use ProductOpener::Orgs qw/:all/;
use ProductOpener::Data qw/:all/;
Expand All @@ -111,6 +113,9 @@ use DateTime::Format::ISO8601;
use URI;
use Digest::MD5 qw(md5_hex);
use LWP::UserAgent;
use Data::Difference qw(data_diff);

$IMPORT_MAX_PACKAGING_COMPONENTS = 10;

# private function to import images from dir
# args:
Expand Down Expand Up @@ -270,7 +275,7 @@ sub deduped_colnames ($columns_ref) {
Do some pre-processing on input field values:
- Fields suffixed with _if_not_existing are loaded only if the product does not have an existing value
- Tags fields have special behaviours:
- Special handling of tags fields:
- Empty values are skipped
- For labels and categories, we can have columns like labels:Bio with values like 1, Y, Yes
- [tags type]_if_match_in_taxonomy : contains candidate values that we import only if we have a matching taxonomy entry
Expand All @@ -292,7 +297,7 @@ sub preprocess_field ($imported_product_ref, $product_ref, $field, $yes_regexp,
. $imported_product_ref->{$field . "_if_not_existing"} . "\n";
$imported_product_ref->{$field} = $imported_product_ref->{$field . "_if_not_existing"};
}
# if it is a field with a tag behaviour (taxonomized or not)
# if it is a tag field (taxonomized or not)
# (see %tags_fields in Tags.pm)
if (defined $tags_fields{$field}) {
foreach my $subfield (sort keys %{$imported_product_ref}) {
Expand Down Expand Up @@ -527,10 +532,8 @@ sub set_field_value (

my $tagid;

next if $tag =~ /^(\s|,|-|\%|;|_|掳)*$/;
next
if $tag
=~ /^\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseign茅|non applicable|nr|n\/r)\s*$/i;
if $tag =~ /^\s*($empty_regexp|$unknown_regexp|$not_applicable_regexp)\s*$/i;

$tag =~ s/^\s+//;
$tag =~ s/\s+$//;
Expand Down Expand Up @@ -1107,6 +1110,99 @@ sub set_nutrition_data_per_fields ($args_ref, $imported_product_ref, $product_re
return;
}

=head2 import_packaging_components ( $args_ref, $imported_product_ref, $product_ref, $stats_ref, $modified_ref, $modified_fields_ref, $differing_ref, $differing_fields_ref, $packagings_edited_ref, $time )

Import packaging components from an imported product row into
$product_ref->{packagings}.

Components are read from columns named packaging_[n]_[field], with n going from 1
to $IMPORT_MAX_PACKAGING_COMPONENTS, and [field] being one of: number_of_units,
shape, material, recycling, quantity_per_unit, weight_specified, weight_measured.

If at least one input component has a defined number of units, shape, material and
weight (specified or measured), the input is considered complete and replaces the
existing components. Otherwise each input component is added to, or combined with,
the existing components.

=head3 Arguments

=over

=item * $args_ref - import arguments (not used by this function at the moment)

=item * $imported_product_ref - imported row: the code, lc and packaging_[n]_[field] keys are read

=item * $product_ref - product to update: its packagings array is replaced or extended

=item * $stats_ref - the products_packagings_updated, products_packagings_created and
products_packagings_changed hashes get an entry for the product code when the components changed

=item * $modified_ref - reference to a counter, incremented when the components changed

=item * $modified_fields_ref, $differing_ref, $differing_fields_ref, $time - not used by this function at the moment

=item * $packagings_edited_ref - hash whose entry for the product code is incremented when the components changed

=back

=head3 Return value

None.

=cut

sub import_packaging_components (
    $args_ref, $imported_product_ref, $product_ref, $stats_ref,
    $modified_ref, $modified_fields_ref, $differing_ref, $differing_fields_ref,
    $packagings_edited_ref, $time
    )
{

    my $code = $imported_product_ref->{code};

    # Keep a deep copy of the existing packaging components, so that we can check
    # afterwards if the resulting components are different
    my $original_packagings_ref = dclone($product_ref->{packagings} || []);

    # Build a list of input packaging components
    my @input_packagings = ();
    my $data_is_complete = 0;

    my @component_fields
        = qw(number_of_units shape material recycling quantity_per_unit weight_specified weight_measured);

    # Packaging data is specified in the CSV file in columns named like packaging_1_number_of_units
    # We search up to $IMPORT_MAX_PACKAGING_COMPONENTS components
    foreach my $i (1 .. $IMPORT_MAX_PACKAGING_COMPONENTS) {

        my $input_packaging_ref = {};
        foreach my $field (@component_fields) {
            $input_packaging_ref->{$field} = $imported_product_ref->{"packaging_${i}_${field}"};
        }
        $log->debug("input_packaging_ref", {i => $i, input_packaging_ref => $input_packaging_ref})
            if $log->is_debug();

        # Taxonomize the input packaging component data
        my $packaging_ref
            = get_checked_and_taxonomized_packaging_component_data($imported_product_ref->{lc},
            $input_packaging_ref, {});

        # Never store an undefined component in the packagings array
        # NOTE(review): presumably the helper returns nothing for a component without usable data
        # - confirm against its implementation
        if (defined $packaging_ref) {
            push @input_packagings, $packaging_ref;
        }

        # Record if we have complete input data, with all key fields (for at least 1 component)
        # Not considered key fields (and thus may be lost): recycling instruction, quantity per unit
        my $has_weight = (defined $input_packaging_ref->{weight_specified})
            || (defined $input_packaging_ref->{weight_measured});

        if (    (defined $input_packaging_ref->{number_of_units})
            and (defined $input_packaging_ref->{shape})
            and (defined $input_packaging_ref->{material})
            and $has_weight)
        {
            $data_is_complete = 1;
        }
    }

    if ($data_is_complete) {
        # We seem to have complete data, replace existing data
        $product_ref->{packagings} = \@input_packagings;
    }
    else {
        # We have partial data, that may be missing fields like number of units, weight etc.
        # In that case, we try to merge the input components with the existing components
        # so that we don't lose user entered data such as weights.
        # This may result in some components being duplicated, if the existing component and
        # the input component have incompatible fields (e.g. if one is a "tray" and the other a "box",
        # even though they refer to the same thing)
        foreach my $input_packaging_ref (@input_packagings) {
            add_or_combine_packaging_component_data($product_ref, $input_packaging_ref, {});
        }
    }

    # Check if the packagings data has changed
    my @diffs = data_diff($original_packagings_ref, $product_ref->{packagings});

    if (scalar @diffs > 0) {
        $log->debug(
            "packagings diff",
            {
                original_packagings => $original_packagings_ref,
                input_packagings => \@input_packagings,
                new_packagings => $product_ref->{packagings},
                data_is_complete => $data_is_complete,
                diffs => \@diffs
            }
        ) if $log->is_debug();

        $stats_ref->{products_packagings_updated}{$code} = 1;

        # Distinguish products that had no components at all from products with changed components
        my $stats_key
            = (scalar @$original_packagings_ref == 0)
            ? "products_packagings_created"
            : "products_packagings_changed";
        $stats_ref->{$stats_key}{$code} = 1;

        $$modified_ref++;
        $packagings_edited_ref->{$code}++;
        # TODO: packaging changes are not recorded in @$modified_fields_ref
    }

    # TODO: update the packagings_complete field (not done by this function)

    return;
}

=head2 import_csv_file ( ARGUMENTS )
C<import_csv_file()> imports product data in the Open Food Facts CSV format
Expand Down Expand Up @@ -1315,7 +1411,11 @@ sub import_csv_file ($args_ref) {

$log->debug("importing products", {}) if $log->is_debug();

open(my $io, '<:encoding(UTF-8)', $args_ref->{csv_file}) or die("Could not open " . $args_ref->{csv_file} . ": $!");
my $io;
if (not open($io, '<:encoding(UTF-8)', $args_ref->{csv_file})) {
$stats_ref->{error} = "Could not open " . $args_ref->{csv_file} . ": $!";
return $stats_ref;
}

# first line contains headers
my $columns_ref = $csv->getline($io);
Expand All @@ -1330,6 +1430,7 @@ sub import_csv_file ($args_ref) {
my @edited = ();
my %edited = ();
my %nutrients_edited = ();
my %packagings_edited = ();
my $skip_not_existing = 0;
my $skip_no_images = 0;

Expand All @@ -1344,7 +1445,7 @@ sub import_csv_file ($args_ref) {
$i++;

# By default, use the orgid passed in the arguments
# it may be overrode later on a per product basis
# it may be overridden later on a per product basis
my $org_id = $args_ref->{org_id};
my $org_ref;

Expand Down Expand Up @@ -1573,7 +1674,7 @@ sub import_csv_file ($args_ref) {
$Owner_id = get_owner_id($User_id, $Org_id, $args_ref->{owner_id});
my $product_id = product_id_for_owner($Owner_id, $code);

# The userid can be overrode on a per product basis
# The userid can be overridden on a per product basis
# when we import data from the producers platform to the public platform
# we use the orgid as the userid
my $user_id = $args_ref->{user_id};
Expand Down Expand Up @@ -1982,6 +2083,16 @@ sub import_csv_file ($args_ref) {

set_nutrition_data_per_fields($args_ref, $imported_product_ref, $product_ref, $stats_ref, \$modified,);

# Packaging data

import_packaging_components(
$args_ref, $imported_product_ref, $product_ref, $stats_ref,
\$modified, \@modified_fields, \$differing, \%differing_fields,
\%packagings_edited, $time,
);

# Compute extra stats

if ((defined $stats_ref->{products_info_added}{$code}) or (defined $stats_ref->{products_info_changed}{$code}))
{
$stats_ref->{products_info_updated}{$code} = 1;
Expand Down
23 changes: 19 additions & 4 deletions lib/ProductOpener/ImportConvert.pm
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ BEGIN {
use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
@EXPORT_OK = qw(
$empty_regexp
$unknown_regexp
$not_applicable_regexp
$none_regexp
$empty_unknown_not_applicable_or_none_regexp
%fields
@fields
%products
Expand Down Expand Up @@ -103,6 +109,7 @@ use ProductOpener::Products qw/:all/;
use ProductOpener::Ingredients qw/:all/;
use ProductOpener::Food qw/:all/;
use ProductOpener::Units qw/:all/;
use ProductOpener::Text qw/:all/;

use CGI qw/:cgi :form escapeHTML/;
use URI::Escape::XS;
Expand All @@ -122,6 +129,15 @@ use XML::Rules;

my $mode = "append";

# Regular expressions that can be combined to match specific inputs.
#
# They are plain strings containing top-level alternations, so they must be interpolated
# inside a group of a bigger pattern, e.g. /^\s*($unknown_regexp|$none_regexp)\s*$/i
#
# Inside a single-quoted string, \\ yields ONE backslash: a literal backslash in the
# resulting regular expression therefore has to be written \\\\ here.
# (with only \\ the sequence \\|\. became the regex \|\. which matches the 2 characters "|."
# instead of offering "\" and "." as 2 separate alternatives, so "n.a." was not matched)

# values made only of separators / punctuation / spaces (also matches the empty string)
$empty_regexp = '(?:,|\%|;|_|°|-|\/|\\\\|\.|\s)*';
$unknown_regexp = 'unknown|inconnu|inconnue|non renseigné(?:e)?(?:s)?|nr|n\/r';
# na, n/a, n\a, n.a, n-a (with an optional trailing dot), not applicable, etc.
$not_applicable_regexp = 'n(?:\/|\\\\|\.|-)?a(?:\.)?|(?:not|non)(?: |-)applicable|no aplica';
$none_regexp = 'none|aucun|aucune|aucun\(e\)';

# Union of all the patterns above
$empty_unknown_not_applicable_or_none_regexp
    = join('|', ($empty_regexp, $unknown_regexp, $not_applicable_regexp, $none_regexp));

=head1 FUNCTIONS
=cut
Expand Down Expand Up @@ -852,7 +868,7 @@ sub clean_fields ($product_ref) {
$brand =~ s/^\s+//;
$brand =~ s/\s+$//;
# we may get brands with quantifiers like * + ? etc. we need to escape them
$brand =~ s/(\*|\+|\?|\(|\)|\[|\]|\{|\}|\$|\^|\\)/\\$1/g;
$brand = regexp_escape($brand);

# dashes/dots/spaces -> allow matching dashes/dot/spaces
# e.g. "bons.mayennais" matches "bons mayennais"
Expand Down Expand Up @@ -1124,12 +1140,11 @@ sub clean_fields ($product_ref) {

# remove N, N/A, NA etc.
# but not "no", "none" that are useful values (e.g. for specific labels "organic:no", allergens : "none")
$product_ref->{$field}
=~ s/(^|,)\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseign茅|non applicable|no aplica|nr|n\/r)\s*(,|$)//ig;
$product_ref->{$field} =~ s/(^|,)\s*($unknown_regexp|$not_applicable_regexp)\s*(,|$)//ig;

# remove none except for allergens and traces
if ($field !~ /allergens|traces/) {
$product_ref->{$field} =~ s/(^|,)\s*(none|aucun|aucune|aucun\(e\))\s*(,|$)//ig;
$product_ref->{$field} =~ s/(^|,)\s*($none_regexp)\s*(,|$)//ig;
}

if ( ($field =~ /_fr/)
Expand Down

0 comments on commit bfc1fe2

Please sign in to comment.