From e1304de45393305f27ea80da47bcfacf44f6f90f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Thu, 23 Mar 2023 14:39:08 +0100 Subject: [PATCH] feat: Match synonyms and xx: entries when computing taxonomy suggestions (#8190) * taxonomy: add packaging material code in canonical name * refactor memcached caching * feat: match synonyms for taxonomy suggestions, fixes: #8002 and cache suggestions --------- Co-authored-by: Alex Garel --- lib/ProductOpener/Cache.pm | 40 ++++- lib/ProductOpener/Display.pm | 30 +--- lib/ProductOpener/Ecoscore.pm | 113 ++++++++---- lib/ProductOpener/Index.pm | 11 -- lib/ProductOpener/Orgs.pm | 1 - lib/ProductOpener/Tags.pm | 97 ++++++++++- lib/ProductOpener/TaxonomySuggestions.pm | 164 +++++++++++++----- lib/ProductOpener/Users.pm | 1 - taxonomies/packaging_materials.txt | 151 ++++++++-------- taxonomies/test.txt | 2 + .../integration/api_v3_taxonomy_suggestions.t | 2 +- .../categories-string-strawberry.json | 1 + .../categories-term-fr-frais.json | 6 +- .../categories-term-fr-fraise.json | 4 +- .../categories-term-strawberry.json | 1 + .../patch-packagings-quantity-and-weight.json | 4 +- ...kagings-weights-as-strings-with-units.json | 4 +- .../patch-packagings-weights-as-strings.json | 4 +- .../patch-properties-with-lc-name.json | 4 +- .../categories-string-fr-frais.json | 6 +- .../categories-string-fr-fraise.json | 4 +- .../categories-string-strawberry.json | 1 + .../categories-term-strawberry.json | 1 + .../packaging-materials-01.json | 4 +- .../packaging-materials-1-pet.json | 2 + .../packaging-materials-1.json | 28 +-- ...als-cc-fr-categories-yaourt-shape-pot.json | 12 +- .../packaging-materials-cc-fr-shape-pot.json | 12 +- .../packaging-materials-cc-fr.json | 14 +- .../packaging-materials-pet-1.json | 4 +- .../packaging-materials.json | 14 +- .../packaging-recycling-fr-recy.json | 3 +- .../packaging-shapes-string-fr-po.json | 13 +- .../packaging-shapes-string-po.json | 3 +- .../export/5601009974337.json | 2 +- 
.../export_more_fields/5601009974337.json | 4 +- .../ecoscore/energy-drink.json | 4 +- .../grade-a-with-recyclable-label.json | 4 +- .../expected_test_results/ecoscore/milk.json | 12 +- .../ecoscore/packaging-en-pet-bottle.json | 6 +- .../ecoscore/uk-milk.json | 12 +- .../packaging/en-1-pet-plastic-bottle.json | 2 +- .../packaging/fr-comma-inside-a-number.json | 2 +- .../packaging_text_fr_bouteille_pet.json | 2 +- ...aging_text_fr_bouteille_plastique_pet.json | 2 +- .../packaging_text_fr_line_feeds.json | 4 +- .../packaging_text_fr_unknown_shape.json | 4 +- .../packaging_text_nl_statiegeldfles.json | 2 +- tests/unit/tags.t | 11 ++ tests/unit/taxonomy_suggestions.t | 97 +++++++++++ 50 files changed, 638 insertions(+), 293 deletions(-) create mode 100644 tests/unit/taxonomy_suggestions.t diff --git a/lib/ProductOpener/Cache.pm b/lib/ProductOpener/Cache.pm index e203c7283929a..4f4113e951fe1 100644 --- a/lib/ProductOpener/Cache.pm +++ b/lib/ProductOpener/Cache.pm @@ -20,14 +20,14 @@ package ProductOpener::Cache; -use utf8; -use Modern::Perl '2017'; +use ProductOpener::PerlStandards; use Exporter qw< import >; BEGIN { use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); @EXPORT_OK = qw( $memd + &generate_cache_key ); # symbols to export on request %EXPORT_TAGS = (all => [@EXPORT_OK]); } @@ -38,6 +38,8 @@ use ProductOpener::Store qw/:all/; use ProductOpener::Config qw/:all/; use Cache::Memcached::Fast; +use JSON; +use Digest::MD5 qw(md5_hex); use Log::Any qw($log); # Initialize exported variables @@ -46,7 +48,41 @@ $memd = Cache::Memcached::Fast->new( { 'servers' => $memd_servers, 'utf8' => 1, + compress_threshold => 10000, } ); +my $json = JSON->new->utf8->allow_nonref->canonical; + +=head1 FUNCTIONS + +=head2 generate_cache_key($name, $context_ref) + +Generate a key to use for caching, that depends on the content of the $context_ref object. +The key is prepended with the name of the variable we want to store, so that we can set multiple variables for the same context +(e.g. 
a count of search results + the search results themselves) + +=head3 Arguments + +=head4 $name Name of the variable we want to cache. + +=head4 $context_ref Reference to all the context / parameters etc. that have an influence on what we want to cache + +=head3 Return values + +The cache key: the server domain, ':', the name, '/', and the MD5 of the canonical JSON encoding of the context. + +=cut + +sub generate_cache_key ($name, $context_ref) { + + # We generate a sorted JSON so that we always have the same key for the context object + # even if it contains hashes (Storable::freeze may not have the same order of keys) + my $context_json = $json->encode($context_ref); + my $key = $server_domain . ':' . $name . '/' . md5_hex($context_json); + $log->debug("generate_cache_key", {context_ref => $context_ref, context_json => $context_json, key => $key}) + if $log->is_debug(); + return $key; +} + 1; diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index 358754d8a8465..a68c95ccaa5f3 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -169,8 +169,8 @@ use ProductOpener::PackagerCodes qw(:all); use ProductOpener::Export qw(:all); use ProductOpener::API qw(:all); use ProductOpener::Units qw/:all/; +use ProductOpener::Cache qw/:all/; -use Cache::Memcached::Fast; use Encode; use URI::Escape::XS; use CGI qw(:cgi :cgi-lib :form escapeHTML'); @@ -187,7 +187,6 @@ use CLDR::Number; use CLDR::Number::Format::Decimal; use CLDR::Number::Format::Percent; use Storable qw(dclone freeze); -use Digest::MD5 qw(md5_hex); use boolean; use Excel::Writer::XLSX; use Template; @@ -291,13 +290,6 @@ $tt = Template->new( ); # Initialize exported variables -$memd = Cache::Memcached::Fast->new( - { - 'servers' => $memd_servers, - 'utf8' => 1, - compress_threshold => 10000, - } -); $default_request_ref = {page => 1,}; @@ -1434,9 +1426,8 @@ sub query_list_of_tags ($request_ref, $query_ref) { } #get cache results for aggregate query - my $key = $server_domain . "/" . 
freeze($aggregate_parameters); + my $key = generate_cache_key("aggregate", $aggregate_parameters); $log->debug("MongoDB query key", {key => $key}) if $log->is_debug(); - $key = md5_hex($key); my $results = get_cache_results($key, $request_ref); if ((not defined $results) or (ref($results) ne "ARRAY") or (not defined $results->[0])) { @@ -1514,9 +1505,8 @@ sub query_list_of_tags ($request_ref, $query_ref) { else { #get total count for aggregate (without limit) and put result in cache - my $key_count = $server_domain . "/" . freeze($aggregate_count_parameters); + my $key_count = generate_cache_key("aggregate_count", $aggregate_count_parameters); $log->debug("MongoDB aggregate count query key", {key => $key_count}) if $log->is_debug(); - $key_count = md5_hex($key_count); my $results_count = get_cache_results($key_count, $request_ref); if (not defined $results_count) { @@ -4880,14 +4870,9 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa skip => $skip ]; - # Sort the keys of hashes - my $json = JSON::PP->new->utf8->canonical->encode($mongodb_query_ref); - - my $key = $server_domain . "/" . $json; - - $log->debug("MongoDB query key - search-products", {key => $key}) if $log->is_debug(); + my $key = generate_cache_key("search_products", $mongodb_query_ref); - $key = "search-products-" . md5_hex($key); + $log->debug("MongoDB query key - search_products", {key => $key}) if $log->is_debug(); $request_ref->{structured_response} = get_cache_results($key, $request_ref); @@ -4930,9 +4915,8 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa } elsif (keys %{$query_ref} > 0) { #check if count results is in cache - my $key_count = $server_domain . "/" . freeze($query_ref); - $log->debug("MongoDB query key - search-count", {key => $key_count}) if $log->is_debug(); - $key_count = "search-count-" . 
md5_hex($key_count); + my $key_count = generate_cache_key("search_products_count", $query_ref); + $log->debug("MongoDB query key - search_products_count", {key => $key_count}) if $log->is_debug(); my $results_count = get_cache_results($key_count, $request_ref); if (not defined $results_count) { diff --git a/lib/ProductOpener/Ecoscore.pm b/lib/ProductOpener/Ecoscore.pm index be4786f2c6b40..23ba2caebff51 100644 --- a/lib/ProductOpener/Ecoscore.pm +++ b/lib/ProductOpener/Ecoscore.pm @@ -229,9 +229,10 @@ sub load_ecoscore_data_origins_of_ingredients_distances() { next if ((not defined $origin) or ($origin eq "")); - my $origin_id = canonicalize_taxonomy_tag("en", "origins", $origin); + my $origin_id_exists_in_taxonomy; + my $origin_id = canonicalize_taxonomy_tag("en", "origins", $origin, \$origin_id_exists_in_taxonomy); - if (not exists_taxonomy_tag("origins", $origin_id)) { + if (not $origin_id_exists_in_taxonomy) { $log->error("ecoscore origin does not exist in taxonomy", {origin => $origin, origin_id => $origin_id}) if $log->is_error(); @@ -310,29 +311,30 @@ sub load_ecoscore_data_origins_of_ingredients() { next if ((not defined $origin) or ($origin eq "")); - my $origin_id = canonicalize_taxonomy_tag("fr", "origins", $origin); + my $origin_id_exists_in_taxonomy; + my $origin_id = canonicalize_taxonomy_tag("fr", "origins", $origin, \$origin_id_exists_in_taxonomy); - if (not exists_taxonomy_tag("origins", $origin_id)) { + if (not $origin_id_exists_in_taxonomy) { # Eco-Score entries like "Macedonia [FYROM]": remove the [..] 
part # but keep it in the first try, as it is needed to distinguish "Congo [DRC]" and "Congo [Republic]" if ($origin =~ /^(.*)\[(.*)\]/) { - $origin_id = canonicalize_taxonomy_tag("fr", "origins", $1); - if (not exists_taxonomy_tag("origins", $origin_id)) { - $origin_id = canonicalize_taxonomy_tag("fr", "origins", $2); + $origin_id = canonicalize_taxonomy_tag("fr", "origins", $1, \$origin_id_exists_in_taxonomy); + if (not $origin_id_exists_in_taxonomy) { + $origin_id = canonicalize_taxonomy_tag("fr", "origins", $2, \$origin_id_exists_in_taxonomy); } } } # La Guyane Française -> Guyane Française - if (not exists_taxonomy_tag("origins", $origin_id)) { + if (not $origin_id_exists_in_taxonomy) { if ($origin =~ /^(la|les|l'|le)\s?(.*)$/i) { - $origin_id = canonicalize_taxonomy_tag("fr", "origins", $2); + $origin_id = canonicalize_taxonomy_tag("fr", "origins", $2, \$origin_id_exists_in_taxonomy); } } - if (not exists_taxonomy_tag("origins", $origin_id)) { + if (not $origin_id_exists_in_taxonomy) { $log->error("ecoscore origin does not exist in taxonomy", {origin => $origin, origin_id => $origin_id}) if $log->is_error(); @@ -426,9 +428,11 @@ sub load_ecoscore_data_packaging() { $material = $'; } - my $material_id = canonicalize_taxonomy_tag("fr", "packaging_materials", $material); + my $material_id_exists_in_taxonomy; + my $material_id + = canonicalize_taxonomy_tag("fr", "packaging_materials", $material, \$material_id_exists_in_taxonomy); - if (not exists_taxonomy_tag("packaging_materials", $material_id)) { + if (not $material_id_exists_in_taxonomy) { $log->error( "ecoscore material does not exist in taxonomy", {material => $material, material_id => $material_id} @@ -472,24 +476,67 @@ sub load_ecoscore_data_packaging() { # "Bouteille PET Biosourcé",75 # "Bouteille rPET transparente (100%)",100 - $ecoscore_data{packaging_materials}{"en:opaque-pet.en:bottle"} - = $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"}; - 
$properties{"packaging_materials"}{"en:opaque-pet.en:bottle"}{"ecoscore_score:en"} - = $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"}{score}; - $ecoscore_data{packaging_materials}{"en:pet-polyethylene-terephthalate.en:bottle"} - = $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"}; - $properties{"packaging_materials"}{"en:pet-polyethylene-terephthalate.en:bottle"}{"ecoscore_score:en"} - = $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"}{score}; - - # Assign transparent rPET bottle score to rPET - $ecoscore_data{packaging_materials}{"en:rpet-recycled-polyethylene-terephthalate"} - = $ecoscore_data{packaging_materials}{"en:transparent-rpet.en:bottle"}; - $properties{"packaging_materials"}{"en:rpet-recycled-polyethylene-terephthalate"}{"ecoscore_score:en"} - = $ecoscore_data{packaging_materials}{"en:transparent-rpet.en:bottle"}{score}; - - $ecoscore_data{packaging_materials}{"en:plastic"} = $ecoscore_data{packaging_materials}{"en:other-plastics"}; - $properties{"packaging_materials"}{"en:plastic"}{"ecoscore_score:en"} - = $ecoscore_data{packaging_materials}{"en:plastic"}{score}; + # We assign the same score to some target material.shape as a source material.shape + # Use English names for source / target shapes and materials + # they will be canonicalized with the taxonomies + my @assignments = ( + { + target_shape => "bottle", + target_material => "opaque pet", + source_shape => "bottle", + source_material => "colored pet" + }, + { + target_shape => "bottle", + target_material => "polyethylene terephthalate", + source_shape => "bottle", + source_material => "colored pet" + }, + # Assign transparent rPET bottle score to rPET + { + target_shape => "bottle", + target_material => "rpet", + source_shape => "bottle", + source_material => "transparent pet" + }, + { + target_material => "plastic", + source_material => "other plastics" + }, + ); + + foreach my $assignment_ref (@assignments) { + + # We canonicalize the names given 
in the assignments, as the taxonomies can change over time, including the canonical names + my $target_material + = canonicalize_taxonomy_tag_or_die("en", "packaging_materials", $assignment_ref->{target_material},); + + my $source_material + = canonicalize_taxonomy_tag_or_die("en", "packaging_materials", $assignment_ref->{source_material},); + + my $target = $target_material; + my $source = $source_material; + + if (defined $assignment_ref->{target_shape}) { + my $target_shape + = canonicalize_taxonomy_tag_or_die("en", "packaging_shapes", $assignment_ref->{target_shape},); + + my $source_shape + = canonicalize_taxonomy_tag_or_die("en", "packaging_shapes", $assignment_ref->{source_shape},); + + $target .= '.' . $target_shape; + $source .= '.' . $source_shape; + } + + if (defined $ecoscore_data{packaging_materials}{$source}) { + $ecoscore_data{packaging_materials}{$target} = $ecoscore_data{packaging_materials}{$source}; + $properties{packaging_materials}{$target}{"ecoscore_score:en"} + = $ecoscore_data{packaging_materials}{$source}{"score"}; + } + else { + die("source of assignement $source does not have Eco-Score data"); + } + } } else { die("Could not open ecoscore materials CSV $csv_file: $!"); @@ -532,15 +579,17 @@ sub load_ecoscore_data_packaging() { # skip ondulated cardboard (should be a material) next if ($shape eq "Carton ondulé"); - my $shape_id = canonicalize_taxonomy_tag("fr", "packaging_shapes", $shape); + my $shape_id_exists_in_taxonomy; + my $shape_id = canonicalize_taxonomy_tag("fr", "packaging_shapes", $shape, \$shape_id_exists_in_taxonomy); # Handle special cases that are not recognized by the packaging shapes taxonomy # conserve is used in preservation taxonomy, but it may be a packaging if ($shape_id =~ /^fr:conserve/i) { $shape_id = "en:can"; + $shape_id_exists_in_taxonomy = 1; } - if (not exists_taxonomy_tag("packaging_shapes", $shape_id)) { + if (not $shape_id_exists_in_taxonomy) { $log->error("ecoscore shape does not exist in taxonomy", {shape 
=> $shape, shape_id => $shape_id}) if $log->is_error(); $errors++; diff --git a/lib/ProductOpener/Index.pm b/lib/ProductOpener/Index.pm index 4c5b2e026903f..51f2f8d2d3a08 100644 --- a/lib/ProductOpener/Index.pm +++ b/lib/ProductOpener/Index.pm @@ -32,7 +32,6 @@ BEGIN { &normalize - $memd $lang_dir %texts @@ -47,7 +46,6 @@ use ProductOpener::Config qw/:all/; use CGI qw/:standard escape unescape/; use Time::Local; -use Cache::Memcached::Fast; use Digest::MD5 qw(md5); use URI::Escape; use URI::Escape::XS; @@ -65,15 +63,6 @@ use HTML::Entities qw(decode_entities); #setlocale(LC_CTYPE, "fr_FR"); # May need to be changed depending on system # -> setting a locale makes unac_string fail to unaccent... :-( -# Initialize exported variables - -$memd = Cache::Memcached::Fast->new( - { - 'servers' => ["127.0.0.1:11211"], - 'utf8' => 1, - } -); - # Load the texts from the /lang directory # The /lang directory is not present in the openfoodfacts-server repository, diff --git a/lib/ProductOpener/Orgs.pm b/lib/ProductOpener/Orgs.pm index 870a1fca3ae08..93a749d3eed54 100644 --- a/lib/ProductOpener/Orgs.pm +++ b/lib/ProductOpener/Orgs.pm @@ -68,7 +68,6 @@ use ProductOpener::Store qw/:all/; use ProductOpener::Config qw/:all/; use ProductOpener::Mail qw/:all/; use ProductOpener::Lang qw/:all/; -use ProductOpener::Cache qw/:all/; use ProductOpener::Display qw/:all/; use ProductOpener::Tags qw/:all/; diff --git a/lib/ProductOpener/Tags.pm b/lib/ProductOpener/Tags.pm index c9b651386dd4e..94fb040a1578d 100644 --- a/lib/ProductOpener/Tags.pm +++ b/lib/ProductOpener/Tags.pm @@ -76,6 +76,7 @@ BEGIN { &list_taxonomy_tags_in_language &canonicalize_taxonomy_tag + &canonicalize_taxonomy_tag_or_die &canonicalize_taxonomy_tag_linkeddata &canonicalize_taxonomy_tag_weblink &canonicalize_taxonomy_tag_link @@ -1788,7 +1789,7 @@ sub build_tags_taxonomy ($tagtype, $publish) { =head2 build_all_taxonomies ( $pubish) -Build all taxonomies +Build all taxonomies, including the test taxonomy =head3 Parameters 
@@ -1797,7 +1798,7 @@ Build all taxonomies =cut sub build_all_taxonomies ($publish) { - foreach my $taxonomy (@taxonomy_fields) { + foreach my $taxonomy (@taxonomy_fields, "test") { # traces and data_quality_xxx are not real taxonomy per se # (but built from allergens and data_quality) if ($taxonomy ne "traces" and rindex($taxonomy, 'data_quality_', 0) != 0) { @@ -2991,11 +2992,84 @@ sub get_taxonomyurl ($tag_lc, $tagid) { } } -# Return the canonical id of a tag string in a specific language +=head2 canonicalize_taxonomy_tag_or_die ($tag_lc, $tagtype, $tag) -sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) { +Canonicalize a string to check if it matches an entry in a taxonomy, and die otherwise. + +This function is used during initialization, to check that some initialization data has matching entries in taxonomies. + +=head3 Arguments + +=head4 $tag_lc + +The language of the string. + +=head4 $tagtype + +The type of the tag (e.g. categories, labels, allergens) + +=head4 $tag + +The string that we want to match to a tag. + +=head4 $exists_in_taxonomy_ref + +A reference to a variable that will be assigned 1 if we found a matching taxonomy entry, or 0 otherwise. + +=head3 Return value + +If the string could be matched to an existing taxonomy entry, the canonical id for the entry is returned. + +Otherwise, the function dies. + +=cut + +sub canonicalize_taxonomy_tag_or_die ($tag_lc, $tagtype, $tag) { + + my $exists_in_taxonomy; + my $tagid = canonicalize_taxonomy_tag($tag_lc, $tagtype, $tag, \$exists_in_taxonomy); + if (not $exists_in_taxonomy) { + die("$tag ($tag_lc) could not be matched to an entry in the $tagtype taxonomy"); + } + return $tagid; +} + +=head2 canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag, $exists_in_taxonomy_ref = undef) + +Canonicalize a string to check if it matches an entry in a taxonomy + +=head3 Arguments + +=head4 $tag_lc + +The language of the string. + +=head4 $tagtype + +The type of the tag (e.g. 
categories, labels, allergens) + +=head4 $tag + +The string that we want to match to a tag. + +=head4 $exists_in_taxonomy_ref + +A reference to a variable that will be assigned 1 if we found a matching taxonomy entry, or 0 otherwise. + +=head3 Return value + +If the string could be matched to an existing taxonomy entry, the canonical id for the entry is returned. + +Otherwise, we return the string prepended with the language code (e.g. en:An unknown entry) + +=cut + +sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag, $exists_in_taxonomy_ref = undef) { if (not defined $tag) { + if (defined $exists_in_taxonomy_ref) { + $$exists_in_taxonomy_ref = 0; + } return ""; } @@ -3042,7 +3116,7 @@ sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) { $additive_tagid = $2; } if (defined $name) { - my $name_id = canonicalize_taxonomy_tag($tag_lc, "additives", $name); + my $name_id = canonicalize_taxonomy_tag($tag_lc, "additives", $name, $exists_in_taxonomy_ref); # caramelo e150c -> name_id is e150 if (("en:" . $additive_tagid) =~ /^$name_id/) { return "en:" . $additive_tagid; @@ -3142,8 +3216,14 @@ sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) { $tagid = $tag_lc . ':' . 
$tagid; - if ((defined $translations_from{$tagtype}) and (defined $translations_from{$tagtype}{$tagid})) { + my $exists_in_taxonomy = 0; + + if ( (defined $translations_from{$tagtype}) + and (defined $translations_from{$tagtype}{$tagid}) + and not((exists $just_synonyms{$tagtype}) and (exists $just_synonyms{$tagtype}{$tagid}))) + { $tagid = $translations_from{$tagtype}{$tagid}; + $exists_in_taxonomy = 1; } elsif (defined $tag) { # no translation available, tag is not in known taxonomy @@ -3154,8 +3234,11 @@ sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) { $tagid = ""; } - return $tagid; + if (defined $exists_in_taxonomy_ref) { + $$exists_in_taxonomy_ref = $exists_in_taxonomy; + } + return $tagid; } sub canonicalize_taxonomy_tag_linkeddata ($tagtype, $tag) { diff --git a/lib/ProductOpener/TaxonomySuggestions.pm b/lib/ProductOpener/TaxonomySuggestions.pm index f514707573bed..5f98e7d6677f9 100644 --- a/lib/ProductOpener/TaxonomySuggestions.pm +++ b/lib/ProductOpener/TaxonomySuggestions.pm @@ -49,6 +49,7 @@ use ProductOpener::Display qw/:all/; use ProductOpener::Lang qw/:all/; use ProductOpener::Tags qw/:all/; use ProductOpener::PackagerCodes qw/:all/; +use ProductOpener::Cache qw/:all/; use List::Util qw/min/; use Data::DeepAccess qw(deep_exists deep_get); @@ -97,17 +98,56 @@ Hash of fields that can be taken into account to generate relevant suggestions - categories: comma separated list of categories (tags ids or strings in the $search_lc language) - shape: packaging shape (tag id or string in the $search_lc language) +=head3 Note + +The results of this function are cached for 1 day using memcached. +Restart memcached if you want fresh results (e.g. when taxonomies or category stats change). 
+ =cut sub get_taxonomy_suggestions ($tagtype, $search_lc, $string, $context_ref, $options_ref) { - $log->debug("get_taxonomy_suggestions_api", - {tagtype => $tagtype, search_lc => $search_lc, context_ref => $context_ref, options_ref => $options_ref}) - if $log->is_debug(); + $log->debug( + "get_taxonomy_suggestions - start", + { + tagtype => $tagtype, + search_lc => $search_lc, + string => $string, + context_ref => $context_ref, + options_ref => $options_ref + } + ) if $log->is_debug(); + + # Check if we have cached suggestions + my $key = generate_cache_key( + "get_taxonomy_suggestions", + { + tagtype => $tagtype, + search_lc => $search_lc, + string => $string, + context_ref => $context_ref, + options_ref => $options_ref + } + ); + + my $results_ref = $memd->get($key); - my @tags = generate_sorted_list_of_taxonomy_entries($tagtype, $search_lc, $context_ref); + if (not defined $results_ref) { + $log->debug("suggestions are not cached", {key => $key}) if $log->is_debug(); - return filter_suggestions_matching_string(\@tags, $tagtype, $search_lc, $string, $options_ref); + my @tags = generate_sorted_list_of_taxonomy_entries($tagtype, $search_lc, $context_ref); + + my @filtered_tags = filter_suggestions_matching_string(\@tags, $tagtype, $search_lc, $string, $options_ref); + $results_ref = \@filtered_tags; + + $log->debug("storing suggestions in cache", {key => $key}) if $log->is_debug(); + $memd->set($key, $results_ref, 24 * 3600); # Cache suggestions for 1 day + } + else { + $log->debug("got suggestions from cache", {key => $key}) if $log->is_debug(); + } + + return @$results_ref; } =head2 generate_sorted_list_of_taxonomy_entries($tagtype, $search_lc, $context_ref) @@ -245,6 +285,50 @@ sub add_sorted_entries_to_tags ($tags_ref, $seen_tags_ref, $entries_ref, $tagtyp return; } +# Match the normalized form of a tag synonym to the normalized input of an user + +sub match_stringids ($stringid, $fuzzystringid, $synonymid) { + + $log->debug("match string ids", {stringid => 
$stringid, fuzzystringid => $fuzzystringid, synonymid => $synonymid}) + if $log->is_debug(); + + # matching at start, best matches + if ($synonymid =~ /^$stringid/) { + return "start"; + } + # matching inside + elsif ($synonymid =~ /$stringid/) { + return "inside"; + } + # fuzzy match + elsif ($synonymid =~ /$fuzzystringid/) { + return "fuzzy"; + } + + return "none"; +} + +# best_match is used to see how well matches the best matching synonym + +sub best_match ($stringid, $fuzzystringid, $synonyms_ids_ref) { + + my $best_match = "none"; + + foreach my $synonymid (@$synonyms_ids_ref) { + my $match = match_stringids($stringid, $fuzzystringid, $synonymid); + if ($match eq "start") { + # Best match, we can return without looking at the other synonyms + return "start"; + } + elsif (($match eq "inside") + or (($match eq "fuzzy") and ($best_match eq "none"))) + { + $best_match = $match; + } + } + return $best_match; +} + =head2 filter_suggestions_matching_string ($tags_ref, $tagtype, $search_lc, $string, $options_ref) Filter a list of potential taxonomy suggestions matching a string. @@ -333,45 +417,45 @@ sub filter_suggestions_matching_string ($tags_ref, $tagtype, $search_lc, $string # just_synonyms are not real entries next if defined $just_synonyms{$tagtype}{$canon_tagid}; - my $tag; # this is the content string - my $tagid; # this is the tag - - # search if the tag exists in target language - if (defined $translations_to{$tagtype}{$canon_tagid}{$search_lc}) { - - $tag = $translations_to{$tagtype}{$canon_tagid}{$search_lc}; - # TODO: explain why $tagid can be different from $canon_tagid - $tagid = get_string_id_for_lang($search_lc, $tag); - - # add language prefix if we are not searching current interface language - if (not($search_lc eq $original_lc)) { - $tag = $search_lc . ":" . 
$tag; + # We will match synonyms in the search language, and in the wildcard xx: language + my $tag = display_taxonomy_tag($search_lc, $tagtype, $canon_tagid); + my $tag_xx = display_taxonomy_tag("xx", $tagtype, $canon_tagid); + + # Build a list of normalized synonyms in the search language and the wildcard xx: language + my @synonyms_ids = map {get_string_id_for_lang($search_lc, $_)} ( + @{deep_get(\%synonyms_for, $tagtype, $search_lc, get_string_id_for_lang($search_lc, $tag)) || []}, + @{deep_get(\%synonyms_for, $tagtype, "xx", get_string_id_for_lang("xx", $tag_xx)) || []} + ); + + # check how well the synonyms match the input string + my $best_match = best_match($stringid, $fuzzystringid, \@synonyms_ids); + + $log->debug( + "synonyms_ids for canon_tagid", + { + tagtype => $tagtype, + canon_tagid => $canon_tagid, + tag => $tag, + synonym_ids => \@synonyms_ids, + best_match => $best_match } - } - # also search for special language code "xx" which is universal - elsif (defined $translations_to{$tagtype}{$canon_tagid}{xx}) { - $tag = $translations_to{$tagtype}{$canon_tagid}{xx}; - $tagid = get_string_id_for_lang("xx", $tag); - } + ) if $log->is_debug(); - if (defined $tag) { - # matching at start, best matches - if ($tagid =~ /^$stringid/) { - push @suggestions, $tag; - # only matches at start are considered - $suggestions_count++; - } - # matching inside - elsif ($tagid =~ /$stringid/) { - push @suggestions_c, $tag; - } - # fuzzy match - elsif ($tagid =~ /$fuzzystringid/) { - push @suggestions_f, $tag; - } - # end as soon as we got enough + # matching at start, best matches + if ($best_match eq "start") { + push @suggestions, $tag; + # count matches at start so that we can return only if we have enough matches + $suggestions_count++; last if $suggestions_count >= $limit; } + # matching inside + elsif ($best_match eq "inside") { + push @suggestions_c, $tag; + } + # fuzzy match + elsif ($best_match eq "fuzzy") { + push @suggestions_f, $tag; + } } } diff --git 
a/lib/ProductOpener/Users.pm b/lib/ProductOpener/Users.pm index a64346bf14a08..d0bf666c1a460 100644 --- a/lib/ProductOpener/Users.pm +++ b/lib/ProductOpener/Users.pm @@ -78,7 +78,6 @@ use ProductOpener::Store qw/:all/; use ProductOpener::Config qw/:all/; use ProductOpener::Mail qw/:all/; use ProductOpener::Lang qw/:all/; -use ProductOpener::Cache qw/:all/; use ProductOpener::Display qw/:all/; use ProductOpener::Orgs qw/:all/; use ProductOpener::Products qw/:all/; diff --git a/taxonomies/packaging_materials.txt b/taxonomies/packaging_materials.txt index 69c5d306c4c61..dd822c5f16658 100644 --- a/taxonomies/packaging_materials.txt +++ b/taxonomies/packaging_materials.txt @@ -100,7 +100,8 @@ pt:Plástico reciclado # Plastics: # use the following conventions for plastic: -# in each language, for the main name, use "abbreviation + name", and put the name as the first synonym +# in each language, for the main name, use "abbreviation + resin code + ' - ' + name" +# + abbreviation + name, and name as synonyms # all abbreviations, symbols etc. go in the xx: entry, and do not need to be added to the translations in other languages # (unless the abbreviation is different: e.g. 
"PEBD" = "Polyéthylène basse densité" in French instead of PELD) @@ -109,26 +110,26 @@ en:Mixed plastics fr:Plastiques mixtes 'packaging-shapes-string-fr-po', method => 'GET', - path => '/api/v3/taxonomy_suggestions?tagtype=packaging_shapes&string=po', + path => '/api/v3/taxonomy_suggestions?tagtype=packaging_shapes&string=po&lc=fr', expected_status_code => 200, }, # Packaging shape suggestions can be specific to a country and categories, and shape diff --git a/tests/integration/expected_test_results/api_cgi_suggest/categories-string-strawberry.json b/tests/integration/expected_test_results/api_cgi_suggest/categories-string-strawberry.json index e7953e7b6e429..a56e72dc24c0b 100644 --- a/tests/integration/expected_test_results/api_cgi_suggest/categories-string-strawberry.json +++ b/tests/integration/expected_test_results/api_cgi_suggest/categories-string-strawberry.json @@ -1,4 +1,5 @@ [ + "Mint-flavoured syrup with sugar diluted in water", "Strawberry and blueberry compotes", "Strawberry applesauces", "Strawberry biscuits", diff --git a/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-frais.json b/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-frais.json index e9c2088ca0e3b..5f1b0dcb35f9d 100644 --- a/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-frais.json +++ b/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-frais.json @@ -15,13 +15,13 @@ "Ail frais", "Aliments à base de plantes frais", "Ananas frais", + "Aneth fraîche", "Artichauts frais", "Barquettes à la fraise", "Barre de chocolat au lait frais", "Barre de chocolat au lait frais avec génoise", "Barres de céréales aux fraises", + "Basilic fraîche", "Biscuits à la fraise", - "Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D", - "Boudin noir rayon frais", - "Brocolis frais" + "Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé 
enrichie à la vitamine D" ] diff --git a/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-fraise.json b/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-fraise.json index 245de2a9df7bb..8637bded5fb74 100644 --- a/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-fraise.json +++ b/tests/integration/expected_test_results/api_cgi_suggest/categories-term-fr-fraise.json @@ -13,6 +13,7 @@ "Barres de céréales aux fraises", "Biscuits à la fraise", "Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D", + "Boisson préparée sucrée à partir de sirop à la menthe à diluer dans l'eau", "Cheesecakes à la fraise", "Compotes de fraise", "Compotes fraise groseille", @@ -22,6 +23,5 @@ "Coulis de fraise", "Crêpes fourrées aux fraises", "Cônes vanille fraise", - "Glaces à la fraise", - "Jus de fraise" + "Glaces à la fraise" ] diff --git a/tests/integration/expected_test_results/api_cgi_suggest/categories-term-strawberry.json b/tests/integration/expected_test_results/api_cgi_suggest/categories-term-strawberry.json index e7953e7b6e429..a56e72dc24c0b 100644 --- a/tests/integration/expected_test_results/api_cgi_suggest/categories-term-strawberry.json +++ b/tests/integration/expected_test_results/api_cgi_suggest/categories-term-strawberry.json @@ -1,4 +1,5 @@ [ + "Mint-flavoured syrup with sugar diluted in water", "Strawberry and blueberry compotes", "Strawberry applesauces", "Strawberry biscuits", diff --git a/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-quantity-and-weight.json b/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-quantity-and-weight.json index 3157bc73b2771..10fa5d9629213 100644 --- a/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-quantity-and-weight.json +++ b/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-quantity-and-weight.json 
@@ -27,8 +27,8 @@ "packagings" : [ { "material" : { - "id" : "en:pet-polyethylene-terephthalate", - "lc_name" : "PET - Polyethylene terephthalate" + "id" : "en:pet-1-polyethylene-terephthalate", + "lc_name" : "PET 1 - Polyethylene terephthalate" }, "number_of_units" : 6, "quantity_per_unit" : "25cl", diff --git a/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings-with-units.json b/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings-with-units.json index 3498af35b3ff0..55fab98912f43 100644 --- a/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings-with-units.json +++ b/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings-with-units.json @@ -27,8 +27,8 @@ "packagings" : [ { "material" : { - "id" : "en:pet-polyethylene-terephthalate", - "lc_name" : "PET - Polyethylene terephthalate" + "id" : "en:pet-1-polyethylene-terephthalate", + "lc_name" : "PET 1 - Polyethylene terephthalate" }, "number_of_units" : 6, "quantity_per_unit" : "25cl", diff --git a/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings.json b/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings.json index 1eb4a0d533721..c33e099f99de7 100644 --- a/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings.json +++ b/tests/integration/expected_test_results/api_v3_product_write/patch-packagings-weights-as-strings.json @@ -27,8 +27,8 @@ "packagings" : [ { "material" : { - "id" : "en:pet-polyethylene-terephthalate", - "lc_name" : "PET - Polyethylene terephthalate" + "id" : "en:pet-1-polyethylene-terephthalate", + "lc_name" : "PET 1 - Polyethylene terephthalate" }, "number_of_units" : 6, "quantity_per_unit" : "25cl", diff --git 
a/tests/integration/expected_test_results/api_v3_product_write/patch-properties-with-lc-name.json b/tests/integration/expected_test_results/api_v3_product_write/patch-properties-with-lc-name.json index c6d26f97363c5..ce84db1a538d6 100644 --- a/tests/integration/expected_test_results/api_v3_product_write/patch-properties-with-lc-name.json +++ b/tests/integration/expected_test_results/api_v3_product_write/patch-properties-with-lc-name.json @@ -5,8 +5,8 @@ "packagings" : [ { "material" : { - "id" : "en:pet-polyethylene-terephthalate", - "lc_name" : "PET - Polyethylene terephthalate" + "id" : "en:pet-1-polyethylene-terephthalate", + "lc_name" : "PET 1 - Polyethylene terephthalate" }, "number_of_units" : 2, "recycling" : { diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-frais.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-frais.json index b0842b752499d..d028085639c2b 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-frais.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-frais.json @@ -18,15 +18,15 @@ "Ail frais", "Aliments à base de plantes frais", "Ananas frais", + "Aneth fraîche", "Artichauts frais", "Barquettes à la fraise", "Barre de chocolat au lait frais", "Barre de chocolat au lait frais avec génoise", "Barres de céréales aux fraises", + "Basilic fraîche", "Biscuits à la fraise", - "Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D", - "Boudin noir rayon frais", - "Brocolis frais" + "Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-fraise.json 
b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-fraise.json index f1e259f15db7c..cb37e6a175647 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-fraise.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-fr-fraise.json @@ -16,6 +16,7 @@ "Barres de céréales aux fraises", "Biscuits à la fraise", "Boisson lactée aromatisée à la fraise sucrée au lait partiellement écrémé enrichie à la vitamine D", + "Boisson préparée sucrée à partir de sirop à la menthe à diluer dans l'eau", "Cheesecakes à la fraise", "Compotes de fraise", "Compotes fraise groseille", @@ -25,8 +26,7 @@ "Coulis de fraise", "Crêpes fourrées aux fraises", "Cônes vanille fraise", - "Glaces à la fraise", - "Jus de fraise" + "Glaces à la fraise" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-strawberry.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-strawberry.json index 5a364909eb108..6364a344431b7 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-strawberry.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-string-strawberry.json @@ -2,6 +2,7 @@ "errors" : [], "status" : "success", "suggestions" : [ + "Mint-flavoured syrup with sugar diluted in water", "Strawberry and blueberry compotes", "Strawberry applesauces", "Strawberry biscuits", diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-term-strawberry.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-term-strawberry.json index 5a364909eb108..6364a344431b7 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-term-strawberry.json +++ 
b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/categories-term-strawberry.json @@ -2,6 +2,7 @@ "errors" : [], "status" : "success", "suggestions" : [ + "Mint-flavoured syrup with sugar diluted in water", "Strawberry and blueberry compotes", "Strawberry applesauces", "Strawberry biscuits", diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-01.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-01.json index 59999eb888630..90a16a037d766 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-01.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-01.json @@ -1,6 +1,8 @@ { "errors" : [], "status" : "success", - "suggestions" : [], + "suggestions" : [ + "PET 1 - Polyethylene terephthalate" + ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1-pet.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1-pet.json index ee841f3bfa464..4006feee5997f 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1-pet.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1-pet.json @@ -2,6 +2,8 @@ "errors" : [], "status" : "success", "suggestions" : [ + "PET 1 - Polyethylene terephthalate", + "Paper and plastic", "81 C/PET", "81 C/PETmet", "81 C/rPET", diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1.json index 601b68523907b..0037d1f6170fe 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1.json +++ 
b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-1.json @@ -2,6 +2,19 @@ "errors" : [], "status" : "success", "suggestions" : [ + "Recycled cardboard", + "Recycled paper", + "Lithium battery", + "Nickel–cadmium battery", + "Nickel–metal hydride battery", + "PET 1 - Polyethylene terephthalate", + "Silver-oxide battery", + "Zinc–carbon battery", + "Aluminium", + "Non-corrugated cardboard", + "Cork", + "Paper and plastic", + "Green Glass", "81 C/PAP", "91 C/FE", "81 C/PP", @@ -13,20 +26,7 @@ "81 C/LLDPE", "81 C/MDPE", "81 C/OPP", - "81 C/PA", - "81 C/PC", - "81 C/PET", - "81 C/PETmet", - "81 C/PEmet", - "81 C/PLA", - "81 C/PVC", - "81 C/PVDC", - "81 C/rPET", - "91 C/ABS", - "91 C/CPP", - "91 C/EVOH", - "91 C/HDPE", - "91 C/LDPE" + "81 C/PA" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-categories-yaourt-shape-pot.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-categories-yaourt-shape-pot.json index 6207303caeed8..87d1a5d18cf9c 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-categories-yaourt-shape-pot.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-categories-yaourt-shape-pot.json @@ -5,15 +5,10 @@ "Plastic", "Glass", "Cardboard", - "PP - Polypropylene", "Aluminium", - "PS - Polystyrene", "Metal", "Paper", - "PET - Polyethylene terephthalate", "Clear Glass", - "HDPE - High-density polyethylene", - "Other plastics", "Steel", "80 C/ALU", "80 C/FE", @@ -26,7 +21,12 @@ "81 C/LLDPE", "81 C/MDPE", "81 C/OPP", - "81 C/PA" + "81 C/PA", + "81 C/PAP", + "81 C/PC", + "81 C/PET", + "81 C/PETmet", + "81 C/PEmet" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-shape-pot.json 
b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-shape-pot.json index c9ee7b6209ec4..d4d3ac7cb8791 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-shape-pot.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr-shape-pot.json @@ -6,14 +6,9 @@ "Glass", "Cardboard", "Metal", - "PP - Polypropylene", "Aluminium", - "PET - Polyethylene terephthalate", - "PS - Polystyrene", "Paper", "Clear Glass", - "HDPE - High-density polyethylene", - "Other plastics", "Steel", "80 C/ALU", "80 C/FE", @@ -26,7 +21,12 @@ "81 C/LLDPE", "81 C/MDPE", "81 C/OPP", - "81 C/PA" + "81 C/PA", + "81 C/PAP", + "81 C/PC", + "81 C/PET", + "81 C/PETmet", + "81 C/PEmet" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr.json index 216dfd89907ab..77794ceb33b06 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-cc-fr.json @@ -8,25 +8,25 @@ "Metal", "Paper", "Aluminium", - "PP - Polypropylene", - "PET - Polyethylene terephthalate", "Tetra Pak", "Steel", "Wood", "Paperboard", - "HDPE - High-density polyethylene", - "LDPE - Low-density polyethylene", "Non-corrugated cardboard", - "PS - Polystyrene", "Clear Glass", - "Other plastics", "Cork", "Multilayer composite", "Kraft paper", "Tetra Brik", "Tetra Brik Aseptic", "FSC cardboard", - "Brown Glass" + "Brown Glass", + "Paper and plastic", + "Light aluminium", + "Elopak", + "Recycled plastic", + "84 C/PAP", + "Fabric" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-pet-1.json 
b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-pet-1.json index 59999eb888630..90a16a037d766 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-pet-1.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials-pet-1.json @@ -1,6 +1,8 @@ { "errors" : [], "status" : "success", - "suggestions" : [], + "suggestions" : [ + "PET 1 - Polyethylene terephthalate" + ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials.json index 49864bd2df733..c42f80c2393c9 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-materials.json @@ -7,18 +7,12 @@ "Glass", "Metal", "Paper", - "PET - Polyethylene terephthalate", - "PP - Polypropylene", "Aluminium", "Paperboard", "Tetra Pak", - "HDPE - High-density polyethylene", - "LDPE - Low-density polyethylene", "Steel", - "Other plastics", "Non-corrugated cardboard", "Clear Glass", - "PS - Polystyrene", "Wood", "Multilayer composite", "Tetra Brik", @@ -26,7 +20,13 @@ "84 C/PAP", "Cork", "Elopak", - "Brown Glass" + "Brown Glass", + "Paper and plastic", + "FSC cardboard", + "Green Glass", + "Kraft paper", + "Transparent PET", + "SIG" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-recycling-fr-recy.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-recycling-fr-recy.json index aa3e4c135e7de..38a45870b319f 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-recycling-fr-recy.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-recycling-fr-recy.json @@ 
-8,7 +8,8 @@ "Recycle in paper bin", "Recycle with drink cartons", "Recycle with plastics", - "Recycle with plastics - metal and bricks" + "Recycle with plastics - metal and bricks", + "Discard" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-fr-po.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-fr-po.json index 2cd8b06971020..ebc7d9e68398f 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-fr-po.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-fr-po.json @@ -2,15 +2,14 @@ "errors" : [], "status" : "success", "suggestions" : [ + "Sachet", "Pot", - "Pouch flask", - "Terrine pot", - "Individual pot", - "Spoon", - "Spout", + "Gourde", + "Pot individuel", + "Support", + "Flacon à pompe", "Ampoule", - "Tablespoon", - "Teaspoon" + "Sac de transport" ], "warnings" : [] } diff --git a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-po.json b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-po.json index 2cd8b06971020..1b7a3aa63edec 100644 --- a/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-po.json +++ b/tests/integration/expected_test_results/api_v3_taxonomy_suggestions/packaging-shapes-string-po.json @@ -4,10 +4,11 @@ "suggestions" : [ "Pot", "Pouch flask", + "Spout", "Terrine pot", + "Backing", "Individual pot", "Spoon", - "Spout", "Ampoule", "Tablespoon", "Teaspoon" diff --git a/tests/integration/expected_test_results/export/5601009974337.json b/tests/integration/expected_test_results/export/5601009974337.json index 8a0c6985457a2..d8eb145ee6b53 100644 --- a/tests/integration/expected_test_results/export/5601009974337.json +++ b/tests/integration/expected_test_results/export/5601009974337.json @@ -51,7 +51,7 
@@ "obsolete" : "0", "origin_fr" : "", "origins" : "Portugal", - "packaging" : "Plástico, PE, HDPE - Polietileno de alta densidade", + "packaging" : "Plástico, PE, HDPE 2 - Polietileno de alta densidade", "polyunsaturated-fat_unit" : "", "polyunsaturated-fat_value" : "", "potassium_unit" : "", diff --git a/tests/integration/expected_test_results/export_more_fields/5601009974337.json b/tests/integration/expected_test_results/export_more_fields/5601009974337.json index 156f32e01733b..14a3da2c599a0 100644 --- a/tests/integration/expected_test_results/export_more_fields/5601009974337.json +++ b/tests/integration/expected_test_results/export_more_fields/5601009974337.json @@ -74,8 +74,8 @@ "origin_fr" : "", "origins" : "Portugal", "origins_tags" : "en:portugal", - "packaging" : "Plástico, PE, HDPE - Polietileno de alta densidade", - "packaging_tags" : "en:plastic,en:pe-polyethylene,en:hdpe-high-density-polyethylene", + "packaging" : "Plástico, PE, HDPE 2 - Polietileno de alta densidade", + "packaging_tags" : "en:plastic,en:pe-7-polyethylene,en:hdpe-2-high-density-polyethylene", "polyunsaturated-fat_unit" : "", "polyunsaturated-fat_value" : "", "potassium_unit" : "", diff --git a/tests/unit/expected_test_results/ecoscore/energy-drink.json b/tests/unit/expected_test_results/ecoscore/energy-drink.json index 2b87c01c3be4f..be7938ac8a77b 100644 --- a/tests/unit/expected_test_results/ecoscore/energy-drink.json +++ b/tests/unit/expected_test_results/ecoscore/energy-drink.json @@ -77,12 +77,12 @@ "packaging_text" : "1 bouteille en plastique PET, 1 bouchon PEHD", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "number_of_units" : 1, "shape" : "en:bottle" }, { - "material" : "en:hdpe-high-density-polyethylene", + "material" : "en:hdpe-2-high-density-polyethylene", "number_of_units" : 1, "shape" : "en:bottle-cap" } diff --git a/tests/unit/expected_test_results/ecoscore/grade-a-with-recyclable-label.json 
b/tests/unit/expected_test_results/ecoscore/grade-a-with-recyclable-label.json index 713d1336cdbcb..c7053137802a6 100644 --- a/tests/unit/expected_test_results/ecoscore/grade-a-with-recyclable-label.json +++ b/tests/unit/expected_test_results/ecoscore/grade-a-with-recyclable-label.json @@ -234,7 +234,7 @@ { "ecoscore_material_score" : 21, "ecoscore_shape_ratio" : 0.1, - "material" : "en:pp-polypropylene", + "material" : "en:pp-5-polypropylene", "non_recyclable_and_non_biodegradable" : "no", "number_of_units" : 1, "recycling" : "en:discard", @@ -526,7 +526,7 @@ "shape" : "en:lid" }, { - "material" : "en:pp-polypropylene", + "material" : "en:pp-5-polypropylene", "number_of_units" : 1, "recycling" : "en:discard", "shape" : "en:label" diff --git a/tests/unit/expected_test_results/ecoscore/milk.json b/tests/unit/expected_test_results/ecoscore/milk.json index 8def44cdf4dea..64daa1a636c75 100644 --- a/tests/unit/expected_test_results/ecoscore/milk.json +++ b/tests/unit/expected_test_results/ecoscore/milk.json @@ -218,8 +218,8 @@ { "ecoscore_material_score" : 50, "ecoscore_shape_ratio" : 1, - "material" : "en:pet-polyethylene-terephthalate", - "material_shape" : "en:pet-polyethylene-terephthalate.en:bottle", + "material" : "en:pet-1-polyethylene-terephthalate", + "material_shape" : "en:pet-1-polyethylene-terephthalate.en:bottle", "non_recyclable_and_non_biodegradable" : "no", "number_of_units" : 1, "shape" : "en:bottle" @@ -227,8 +227,8 @@ { "ecoscore_material_score" : 50, "ecoscore_shape_ratio" : 0.1, - "material" : "en:hdpe-high-density-polyethylene", - "material_shape" : "en:hdpe-high-density-polyethylene.en:bottle-cap", + "material" : "en:hdpe-2-high-density-polyethylene", + "material_shape" : "en:hdpe-2-high-density-polyethylene.en:bottle-cap", "non_recyclable_and_non_biodegradable" : "no", "number_of_units" : 1, "shape" : "en:bottle-cap" @@ -473,12 +473,12 @@ "packaging_text" : "1 bouteille en plastique PET, 1 bouchon PEHD", "packagings" : [ { - "material" : 
"en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "number_of_units" : 1, "shape" : "en:bottle" }, { - "material" : "en:hdpe-high-density-polyethylene", + "material" : "en:hdpe-2-high-density-polyethylene", "number_of_units" : 1, "shape" : "en:bottle-cap" } diff --git a/tests/unit/expected_test_results/ecoscore/packaging-en-pet-bottle.json b/tests/unit/expected_test_results/ecoscore/packaging-en-pet-bottle.json index 59e45340d26b7..9138f09463099 100644 --- a/tests/unit/expected_test_results/ecoscore/packaging-en-pet-bottle.json +++ b/tests/unit/expected_test_results/ecoscore/packaging-en-pet-bottle.json @@ -220,8 +220,8 @@ { "ecoscore_material_score" : 50, "ecoscore_shape_ratio" : 1, - "material" : "en:pet-polyethylene-terephthalate", - "material_shape" : "en:pet-polyethylene-terephthalate.en:bottle", + "material" : "en:pet-1-polyethylene-terephthalate", + "material_shape" : "en:pet-1-polyethylene-terephthalate.en:bottle", "non_recyclable_and_non_biodegradable" : "no", "shape" : "en:bottle" } @@ -423,7 +423,7 @@ "packaging_text" : "PET bottle", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "shape" : "en:bottle" } ] diff --git a/tests/unit/expected_test_results/ecoscore/uk-milk.json b/tests/unit/expected_test_results/ecoscore/uk-milk.json index 1b5b2c9358cad..248416ce86c86 100644 --- a/tests/unit/expected_test_results/ecoscore/uk-milk.json +++ b/tests/unit/expected_test_results/ecoscore/uk-milk.json @@ -218,8 +218,8 @@ { "ecoscore_material_score" : 50, "ecoscore_shape_ratio" : 1, - "material" : "en:pet-polyethylene-terephthalate", - "material_shape" : "en:pet-polyethylene-terephthalate.en:bottle", + "material" : "en:pet-1-polyethylene-terephthalate", + "material_shape" : "en:pet-1-polyethylene-terephthalate.en:bottle", "non_recyclable_and_non_biodegradable" : "no", "number_of_units" : 1, "shape" : "en:bottle" @@ -227,8 +227,8 @@ { 
"ecoscore_material_score" : 50, "ecoscore_shape_ratio" : 0.1, - "material" : "en:hdpe-high-density-polyethylene", - "material_shape" : "en:hdpe-high-density-polyethylene.en:bottle-cap", + "material" : "en:hdpe-2-high-density-polyethylene", + "material_shape" : "en:hdpe-2-high-density-polyethylene.en:bottle-cap", "non_recyclable_and_non_biodegradable" : "no", "number_of_units" : 1, "shape" : "en:bottle-cap" @@ -475,12 +475,12 @@ "packaging_text" : "1 PET plastic bottle, 1 PEHD bottle cap", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "number_of_units" : 1, "shape" : "en:bottle" }, { - "material" : "en:hdpe-high-density-polyethylene", + "material" : "en:hdpe-2-high-density-polyethylene", "number_of_units" : 1, "shape" : "en:bottle-cap" } diff --git a/tests/unit/expected_test_results/packaging/en-1-pet-plastic-bottle.json b/tests/unit/expected_test_results/packaging/en-1-pet-plastic-bottle.json index fc9bad19e1266..a5c4bc3b13b63 100644 --- a/tests/unit/expected_test_results/packaging/en-1-pet-plastic-bottle.json +++ b/tests/unit/expected_test_results/packaging/en-1-pet-plastic-bottle.json @@ -8,7 +8,7 @@ "packaging_text" : "1 PET plastic bottle", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "number_of_units" : 1, "shape" : "en:bottle" } diff --git a/tests/unit/expected_test_results/packaging/fr-comma-inside-a-number.json b/tests/unit/expected_test_results/packaging/fr-comma-inside-a-number.json index 5b17eb4fff271..c5715b67c70c4 100644 --- a/tests/unit/expected_test_results/packaging/fr-comma-inside-a-number.json +++ b/tests/unit/expected_test_results/packaging/fr-comma-inside-a-number.json @@ -8,7 +8,7 @@ "packaging_text" : "6 bouteilles en plastique transparent PET de 1,5 L à recycler", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", 
"number_of_units" : 6, "quantity_per_unit" : "1,5 l", "quantity_per_unit_unit" : "l", diff --git a/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_pet.json b/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_pet.json index b5ea0fa757d95..ca90907e7c1b6 100644 --- a/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_pet.json +++ b/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_pet.json @@ -8,7 +8,7 @@ "packaging_text" : "bouteille PET", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "shape" : "en:bottle" } ] diff --git a/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_plastique_pet.json b/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_plastique_pet.json index f00e856ea4eca..7dc335a76f686 100644 --- a/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_plastique_pet.json +++ b/tests/unit/expected_test_results/packaging/packaging_text_fr_bouteille_plastique_pet.json @@ -8,7 +8,7 @@ "packaging_text" : "bouteille plastique PET", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "shape" : "en:bottle" } ] diff --git a/tests/unit/expected_test_results/packaging/packaging_text_fr_line_feeds.json b/tests/unit/expected_test_results/packaging/packaging_text_fr_line_feeds.json index 3031e604cdc12..cb1188891b813 100644 --- a/tests/unit/expected_test_results/packaging/packaging_text_fr_line_feeds.json +++ b/tests/unit/expected_test_results/packaging/packaging_text_fr_line_feeds.json @@ -8,7 +8,7 @@ "packaging_text" : "1 bouteille en plastique opaque PE-HD de 1L à recycler\n1 bouchon en plastique opaque PE-HD à recycler\n1 opercule en métal à recycler\n1 étiquette en papier à recycler", "packagings" : [ { - "material" : "en:hdpe-high-density-polyethylene", + "material" : 
"en:hdpe-2-high-density-polyethylene", "number_of_units" : 1, "quantity_per_unit" : "1l", "quantity_per_unit_unit" : "l", @@ -17,7 +17,7 @@ "shape" : "en:bottle" }, { - "material" : "en:hdpe-high-density-polyethylene", + "material" : "en:hdpe-2-high-density-polyethylene", "number_of_units" : 1, "recycling" : "en:recycle", "shape" : "en:bottle-cap" diff --git a/tests/unit/expected_test_results/packaging/packaging_text_fr_unknown_shape.json b/tests/unit/expected_test_results/packaging/packaging_text_fr_unknown_shape.json index 98b3552d14f56..bba48fbef73f0 100644 --- a/tests/unit/expected_test_results/packaging/packaging_text_fr_unknown_shape.json +++ b/tests/unit/expected_test_results/packaging/packaging_text_fr_unknown_shape.json @@ -8,7 +8,7 @@ "packaging_text" : "1 bouteille en plastique opaque PE-HD de 1L à recycler\n1 bouchon en plastique opaque PE-HD à recycler\n1 opercule à recycler\n1 machin en papier à recycler", "packagings" : [ { - "material" : "en:hdpe-high-density-polyethylene", + "material" : "en:hdpe-2-high-density-polyethylene", "number_of_units" : 1, "quantity_per_unit" : "1l", "quantity_per_unit_unit" : "l", @@ -17,7 +17,7 @@ "shape" : "en:bottle" }, { - "material" : "en:hdpe-high-density-polyethylene", + "material" : "en:hdpe-2-high-density-polyethylene", "number_of_units" : 1, "recycling" : "en:recycle", "shape" : "en:bottle-cap" diff --git a/tests/unit/expected_test_results/packaging/packaging_text_nl_statiegeldfles.json b/tests/unit/expected_test_results/packaging/packaging_text_nl_statiegeldfles.json index 3c631c46ed47f..6139e3b08104b 100644 --- a/tests/unit/expected_test_results/packaging/packaging_text_nl_statiegeldfles.json +++ b/tests/unit/expected_test_results/packaging/packaging_text_nl_statiegeldfles.json @@ -8,7 +8,7 @@ "packaging_text" : "statiegeldfles", "packagings" : [ { - "material" : "en:pet-polyethylene-terephthalate", + "material" : "en:pet-1-polyethylene-terephthalate", "recycling" : "en:return-pet-bottle-to-store", "shape" : 
"en:unknown" } diff --git a/tests/unit/tags.t b/tests/unit/tags.t index 7363893246b2d..7b39c9e619ec8 100755 --- a/tests/unit/tags.t +++ b/tests/unit/tags.t @@ -672,6 +672,17 @@ is( "fr:un label français inconnu, Ecológico, en:A New English label, Missing language prefix, Comercio justo, en:one-percent-for-the-planet" ); +# canonicalize_taxonomy_tag can now return 0 or 1 to indicate if the tag matched an existing taxonomy entry + +my $exists; + +is(canonicalize_taxonomy_tag("fr", "test", "Yaourts au citron", \$exists), "en:lemon-yogurts"); +is($exists, 1); + +is(canonicalize_taxonomy_tag("fr", "test", "Yaourts au citron qui n'existe pas", \$exists), + "fr:Yaourts au citron qui n'existe pas"); +is($exists, 0); + is(canonicalize_taxonomy_tag('fr', 'categories', 'café'), "en:coffees"); # Tests to verify we match the xx:Ä Märket entry diff --git a/tests/unit/taxonomy_suggestions.t b/tests/unit/taxonomy_suggestions.t new file mode 100644 index 0000000000000..2e40d65845fc1 --- /dev/null +++ b/tests/unit/taxonomy_suggestions.t @@ -0,0 +1,97 @@ +#!/usr/bin/perl -w + +use Modern::Perl '2017'; +use utf8; + +use Test::More; +use Log::Any::Adapter 'TAP'; + +use ProductOpener::Tags qw/:all/; +use ProductOpener::TaxonomySuggestions qw/:all/; + +ProductOpener::Tags::retrieve_tags_taxonomy("test"); + +# Filtering suggestions matching strings + +my $tags_ref = ["en:banana-yogurts", "en:yogurts", "en:soup", "en:vegetable", "en:non-existent-entry",]; + +my @filter_tests = ( + { + desc => 'Empty string', + tags => $tags_ref, + tagtype => "test", + lc => "en", + string => "", + expected => ['Banana yogurts', 'Yogurts', 'Soup', 'Vegetable'], + }, + { + desc => 'Match at start', + tags => $tags_ref, + tagtype => "test", + lc => "en", + string => "ba", + expected => ['Banana yogurts'], + }, + { + desc => 'Match at start and inside, return start first', + tags => $tags_ref, + tagtype => "test", + lc => "en", + string => "yog", + expected => ['Yogurts', 'Banana yogurts'], + }, + { + desc => 
'No match', + tags => $tags_ref, + tagtype => "test", + lc => "en", + string => "xyz", + expected => [], + }, + { + desc => 'match an xx: synonym', + tags => $tags_ref, + tagtype => "test", + lc => "en", + string => "something else", + expected => ["Soup"], + }, +); + +foreach my $test_ref (@filter_tests) { + my @results = ProductOpener::TaxonomySuggestions::filter_suggestions_matching_string($test_ref->{tags}, + $test_ref->{tagtype}, $test_ref->{lc}, $test_ref->{string}, {}); + if (not is_deeply(\@results, $test_ref->{expected})) { + diag explain($test_ref, \@results); + } +} + +# Complete suggestion generation + +my @suggest_tests = ( + { + desc => 'Match at start', + tagtype => "test", + lc => "en", + string => "ba", + expected => ['Banana yogurts'], + }, + { + desc => 'Match at start and inside, return start first', + tagtype => "test", + lc => "en", + string => "yog", + expected => ['Yogurts', 'Banana yogurts', 'Lemon yogurts', 'Passion fruit yogurts'], + }, + +); + +foreach my $test_ref (@suggest_tests) { + my @results = ProductOpener::TaxonomySuggestions::get_taxonomy_suggestions($test_ref->{tagtype}, $test_ref->{lc}, + $test_ref->{string}, {}, {}); + if (not is_deeply(\@results, $test_ref->{expected})) { + diag explain($test_ref, \@results); + } +} + +done_testing();