Skip to content

Commit

Permalink
feat: Match synonyms and xx: entries when computing taxonomy suggesti…
Browse files Browse the repository at this point in the history
…ons (#8190)

* taxonomy: add packaging material code in canonical name
* refactor memcached caching
* feat: match synonyms for taxonomy suggestions, fixes: #8002 and cache suggestions

---------

Co-authored-by: Alex Garel <alex@garel.org>
  • Loading branch information
stephanegigandet and alexgarel committed Mar 23, 2023
1 parent cd2f65f commit e1304de
Show file tree
Hide file tree
Showing 50 changed files with 638 additions and 293 deletions.
40 changes: 38 additions & 2 deletions lib/ProductOpener/Cache.pm
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@

package ProductOpener::Cache;

use utf8;
use Modern::Perl '2017';
use ProductOpener::PerlStandards;
use Exporter qw< import >;

BEGIN {
# Declare the package variables read by Exporter (imported above via "use Exporter qw< import >")
use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);
# Nothing is exported by default: callers must request these symbols explicitly.
# $memd is the Cache::Memcached::Fast handle initialized below in this module,
# generate_cache_key is the cache key helper defined at the end of this module.
@EXPORT_OK = qw(
$memd
&generate_cache_key
); # symbols to export on request
# The "all" tag lets callers import everything at once: use ProductOpener::Cache qw/:all/;
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
Expand All @@ -38,6 +38,8 @@ use ProductOpener::Store qw/:all/;
use ProductOpener::Config qw/:all/;

use Cache::Memcached::Fast;
use JSON;
use Digest::MD5 qw(md5_hex);
use Log::Any qw($log);

# Initialize exported variables
Expand All @@ -46,7 +48,41 @@ $memd = Cache::Memcached::Fast->new(
{
'servers' => $memd_servers,
'utf8' => 1,
compress_threshold => 10000,
}
);

my $json = JSON->new->utf8->allow_nonref->canonical;

=head1 FUNCTIONS
=head2 generate_cache_key($name, $context_ref)
Generate a key to use for caching, that depends on the content of the $context_ref object.
The key is prefixed with the name of the variable we want to store, so that we can set multiple variables for the same context
(e.g. a count of search results + the search results themselves)
=head3 Arguments
=head4 $name Name of the variable we want to cache.
=head4 $context_ref Reference to all the context / parameters etc. that have an influence on what we want to cache
=head3 Return values
The cache key: the server domain and $name, followed by the MD5 hex digest of the JSON-encoded $context_ref.
=cut

sub generate_cache_key ($name, $context_ref) {

	# Serialize the context with the module-level canonical JSON encoder: keys are sorted,
	# so two equal contexts always give the same string whatever the hash key order
	# (Storable::freeze does not give that guarantee)
	my $serialized_context = $json->encode($context_ref);

	# Cache key layout: <server domain>:<variable name>/<MD5 hex digest of the serialized context>
	my $cache_key = join('', $server_domain, ':', $name, '/', md5_hex($serialized_context));

	# Only build the debug payload when debug logging is enabled
	if ($log->is_debug()) {
		$log->debug("generate_cache_key",
			{context_ref => $context_ref, context_json => $serialized_context, key => $cache_key});
	}

	return $cache_key;
}

1;
30 changes: 7 additions & 23 deletions lib/ProductOpener/Display.pm
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,8 @@ use ProductOpener::PackagerCodes qw(:all);
use ProductOpener::Export qw(:all);
use ProductOpener::API qw(:all);
use ProductOpener::Units qw/:all/;
use ProductOpener::Cache qw/:all/;

use Cache::Memcached::Fast;
use Encode;
use URI::Escape::XS;
use CGI qw(:cgi :cgi-lib :form escapeHTML');
Expand All @@ -187,7 +187,6 @@ use CLDR::Number;
use CLDR::Number::Format::Decimal;
use CLDR::Number::Format::Percent;
use Storable qw(dclone freeze);
use Digest::MD5 qw(md5_hex);
use boolean;
use Excel::Writer::XLSX;
use Template;
Expand Down Expand Up @@ -291,13 +290,6 @@ $tt = Template->new(
);

# Initialize exported variables
$memd = Cache::Memcached::Fast->new(
{
'servers' => $memd_servers,
'utf8' => 1,
compress_threshold => 10000,
}
);

$default_request_ref = {page => 1,};

Expand Down Expand Up @@ -1434,9 +1426,8 @@ sub query_list_of_tags ($request_ref, $query_ref) {
}

#get cache results for aggregate query
my $key = $server_domain . "/" . freeze($aggregate_parameters);
my $key = generate_cache_key("aggregate", $aggregate_parameters);
$log->debug("MongoDB query key", {key => $key}) if $log->is_debug();
$key = md5_hex($key);
my $results = get_cache_results($key, $request_ref);

if ((not defined $results) or (ref($results) ne "ARRAY") or (not defined $results->[0])) {
Expand Down Expand Up @@ -1514,9 +1505,8 @@ sub query_list_of_tags ($request_ref, $query_ref) {
else {

#get total count for aggregate (without limit) and put result in cache
my $key_count = $server_domain . "/" . freeze($aggregate_count_parameters);
my $key_count = generate_cache_key("aggregate_count", $aggregate_count_parameters);
$log->debug("MongoDB aggregate count query key", {key => $key_count}) if $log->is_debug();
$key_count = md5_hex($key_count);
my $results_count = get_cache_results($key_count, $request_ref);

if (not defined $results_count) {
Expand Down Expand Up @@ -4880,14 +4870,9 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa
skip => $skip
];

# Sort the keys of hashes
my $json = JSON::PP->new->utf8->canonical->encode($mongodb_query_ref);

my $key = $server_domain . "/" . $json;

$log->debug("MongoDB query key - search-products", {key => $key}) if $log->is_debug();
my $key = generate_cache_key("search_products", $mongodb_query_ref);

$key = "search-products-" . md5_hex($key);
$log->debug("MongoDB query key - search_products", {key => $key}) if $log->is_debug();

$request_ref->{structured_response} = get_cache_results($key, $request_ref);

Expand Down Expand Up @@ -4930,9 +4915,8 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa
}
elsif (keys %{$query_ref} > 0) {
#check if count results is in cache
my $key_count = $server_domain . "/" . freeze($query_ref);
$log->debug("MongoDB query key - search-count", {key => $key_count}) if $log->is_debug();
$key_count = "search-count-" . md5_hex($key_count);
my $key_count = generate_cache_key("search_products_count", $query_ref);
$log->debug("MongoDB query key - search_products_count", {key => $key_count}) if $log->is_debug();
my $results_count = get_cache_results($key_count, $request_ref);
if (not defined $results_count) {

Expand Down
113 changes: 81 additions & 32 deletions lib/ProductOpener/Ecoscore.pm
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,10 @@ sub load_ecoscore_data_origins_of_ingredients_distances() {

next if ((not defined $origin) or ($origin eq ""));

my $origin_id = canonicalize_taxonomy_tag("en", "origins", $origin);
my $origin_id_exists_in_taxonomy;
my $origin_id = canonicalize_taxonomy_tag("en", "origins", $origin, \$origin_id_exists_in_taxonomy);

if (not exists_taxonomy_tag("origins", $origin_id)) {
if (not $origin_id_exists_in_taxonomy) {

$log->error("ecoscore origin does not exist in taxonomy", {origin => $origin, origin_id => $origin_id})
if $log->is_error();
Expand Down Expand Up @@ -310,29 +311,30 @@ sub load_ecoscore_data_origins_of_ingredients() {

next if ((not defined $origin) or ($origin eq ""));

my $origin_id = canonicalize_taxonomy_tag("fr", "origins", $origin);
my $origin_id_exists_in_taxonomy;
my $origin_id = canonicalize_taxonomy_tag("fr", "origins", $origin, \$origin_id_exists_in_taxonomy);

if (not exists_taxonomy_tag("origins", $origin_id)) {
if (not $origin_id_exists_in_taxonomy) {

# Eco-Score entries like "Macedonia [FYROM]": remove the [..] part
# but keep it in the first try, as it is needed to distinguish "Congo [DRC]" and "Congo [Republic]"
if ($origin =~ /^(.*)\[(.*)\]/) {
$origin_id = canonicalize_taxonomy_tag("fr", "origins", $1);
if (not exists_taxonomy_tag("origins", $origin_id)) {
$origin_id = canonicalize_taxonomy_tag("fr", "origins", $2);
$origin_id = canonicalize_taxonomy_tag("fr", "origins", $1, \$origin_id_exists_in_taxonomy);
if (not $origin_id_exists_in_taxonomy) {
$origin_id = canonicalize_taxonomy_tag("fr", "origins", $2, \$origin_id_exists_in_taxonomy);
}
}
}

# La Guyane Française -> Guyane Française
if (not exists_taxonomy_tag("origins", $origin_id)) {
if (not $origin_id_exists_in_taxonomy) {

if ($origin =~ /^(la|les|l'|le)\s?(.*)$/i) {
$origin_id = canonicalize_taxonomy_tag("fr", "origins", $2);
$origin_id = canonicalize_taxonomy_tag("fr", "origins", $2, \$origin_id_exists_in_taxonomy);
}
}

if (not exists_taxonomy_tag("origins", $origin_id)) {
if (not $origin_id_exists_in_taxonomy) {

$log->error("ecoscore origin does not exist in taxonomy", {origin => $origin, origin_id => $origin_id})
if $log->is_error();
Expand Down Expand Up @@ -426,9 +428,11 @@ sub load_ecoscore_data_packaging() {
$material = $';
}

my $material_id = canonicalize_taxonomy_tag("fr", "packaging_materials", $material);
my $material_id_exists_in_taxonomy;
my $material_id
= canonicalize_taxonomy_tag("fr", "packaging_materials", $material, \$material_id_exists_in_taxonomy);

if (not exists_taxonomy_tag("packaging_materials", $material_id)) {
if (not $material_id_exists_in_taxonomy) {
$log->error(
"ecoscore material does not exist in taxonomy",
{material => $material, material_id => $material_id}
Expand Down Expand Up @@ -472,24 +476,67 @@ sub load_ecoscore_data_packaging() {
# "Bouteille PET Biosourcé",75
# "Bouteille rPET transparente (100%)",100

$ecoscore_data{packaging_materials}{"en:opaque-pet.en:bottle"}
= $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"};
$properties{"packaging_materials"}{"en:opaque-pet.en:bottle"}{"ecoscore_score:en"}
= $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"}{score};
$ecoscore_data{packaging_materials}{"en:pet-polyethylene-terephthalate.en:bottle"}
= $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"};
$properties{"packaging_materials"}{"en:pet-polyethylene-terephthalate.en:bottle"}{"ecoscore_score:en"}
= $ecoscore_data{packaging_materials}{"en:colored-pet.en:bottle"}{score};

# Assign transparent rPET bottle score to rPET
$ecoscore_data{packaging_materials}{"en:rpet-recycled-polyethylene-terephthalate"}
= $ecoscore_data{packaging_materials}{"en:transparent-rpet.en:bottle"};
$properties{"packaging_materials"}{"en:rpet-recycled-polyethylene-terephthalate"}{"ecoscore_score:en"}
= $ecoscore_data{packaging_materials}{"en:transparent-rpet.en:bottle"}{score};

$ecoscore_data{packaging_materials}{"en:plastic"} = $ecoscore_data{packaging_materials}{"en:other-plastics"};
$properties{"packaging_materials"}{"en:plastic"}{"ecoscore_score:en"}
= $ecoscore_data{packaging_materials}{"en:plastic"}{score};
# We assign the same score to some target material.shape as a source material.shape
# Use English names for source / target shapes and materials
# they will be canonicalized with the taxonomies
my @assignments = (
{
target_shape => "bottle",
target_material => "opaque pet",
source_shape => "bottle",
source_material => "colored pet"
},
{
target_shape => "bottle",
target_material => "polyethylene terephthalate",
source_shape => "bottle",
source_material => "colored pet"
},
# Assign transparent rPET bottle score to rPET
{
target_shape => "bottle",
target_material => "rpet",
source_shape => "bottle",
source_material => "transparent pet"
},
{
target_material => "plastic",
source_material => "other plastics"
},
);

foreach my $assignment_ref (@assignments) {

# We canonicalize the names given in the assignments, as the taxonomies can change over time, including the canonical names
my $target_material
= canonicalize_taxonomy_tag_or_die("en", "packaging_materials", $assignment_ref->{target_material},);

my $source_material
= canonicalize_taxonomy_tag_or_die("en", "packaging_materials", $assignment_ref->{source_material},);

my $target = $target_material;
my $source = $source_material;

if (defined $assignment_ref->{target_shape}) {
my $target_shape
= canonicalize_taxonomy_tag_or_die("en", "packaging_shapes", $assignment_ref->{target_shape},);

my $source_shape
= canonicalize_taxonomy_tag_or_die("en", "packaging_shapes", $assignment_ref->{source_shape},);

$target .= '.' . $target_shape;
$source .= '.' . $source_shape;
}

if (defined $ecoscore_data{packaging_materials}{$source}) {
$ecoscore_data{packaging_materials}{$target} = $ecoscore_data{packaging_materials}{$source};
$properties{packaging_materials}{$target}{"ecoscore_score:en"}
= $ecoscore_data{packaging_materials}{$source}{"score"};
}
else {
die("source of assignement $source does not have Eco-Score data");
}
}
}
else {
die("Could not open ecoscore materials CSV $csv_file: $!");
Expand Down Expand Up @@ -532,15 +579,17 @@ sub load_ecoscore_data_packaging() {
# skip ondulated cardboard (should be a material)
next if ($shape eq "Carton ondulé");

my $shape_id = canonicalize_taxonomy_tag("fr", "packaging_shapes", $shape);
my $shape_id_exists_in_taxonomy;
my $shape_id = canonicalize_taxonomy_tag("fr", "packaging_shapes", $shape, \$shape_id_exists_in_taxonomy);

# Handle special cases that are not recognized by the packaging shapes taxonomy
# conserve is used in preservation taxonomy, but it may be a packaging
if ($shape_id =~ /^fr:conserve/i) {
$shape_id = "en:can";
$shape_id_exists_in_taxonomy = 1;
}

if (not exists_taxonomy_tag("packaging_shapes", $shape_id)) {
if (not $shape_id_exists_in_taxonomy) {
$log->error("ecoscore shape does not exist in taxonomy", {shape => $shape, shape_id => $shape_id})
if $log->is_error();
$errors++;
Expand Down
11 changes: 0 additions & 11 deletions lib/ProductOpener/Index.pm
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ BEGIN {
&normalize
$memd
$lang_dir
%texts
Expand All @@ -47,7 +46,6 @@ use ProductOpener::Config qw/:all/;

use CGI qw/:standard escape unescape/;
use Time::Local;
use Cache::Memcached::Fast;
use Digest::MD5 qw(md5);
use URI::Escape;
use URI::Escape::XS;
Expand All @@ -65,15 +63,6 @@ use HTML::Entities qw(decode_entities);
#setlocale(LC_CTYPE, "fr_FR"); # May need to be changed depending on system
# -> setting a locale makes unac_string fail to unaccent... :-(

# Initialize exported variables

$memd = Cache::Memcached::Fast->new(
{
'servers' => ["127.0.0.1:11211"],
'utf8' => 1,
}
);

# Load the texts from the /lang directory

# The /lang directory is not present in the openfoodfacts-server repository,
Expand Down
1 change: 0 additions & 1 deletion lib/ProductOpener/Orgs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ use ProductOpener::Store qw/:all/;
use ProductOpener::Config qw/:all/;
use ProductOpener::Mail qw/:all/;
use ProductOpener::Lang qw/:all/;
use ProductOpener::Cache qw/:all/;
use ProductOpener::Display qw/:all/;
use ProductOpener::Tags qw/:all/;

Expand Down

0 comments on commit e1304de

Please sign in to comment.