Skip to content

Commit

Permalink
fix: Ingredient parsing improvement for additives (#6569)
Browse files Browse the repository at this point in the history
* emulsifying as synonym for emulsifier
  • Loading branch information
borrokk committed Apr 29, 2022
1 parent fe05d3d commit f994a08
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 35 deletions.
119 changes: 119 additions & 0 deletions t/expected_test_results/ingredients/en-emulsifier-synonyms.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
{
"ingredients" : [
{
"id": "en:emulsifier",
"ingredients": [
{
"from_palm_oil":"maybe",
"id" : "en:e471",
"percent_estimate" : 50,
"percent_max" : 100,
"percent_min" : 25,
"text" : "e471",
"vegan" : "maybe",
"vegetarian" : "maybe"
},
{
"id" : "en:e477",
"percent_estimate" : 25,
"percent_max" : 50,
"percent_min" : 0,
"text" : "e477",
"vegan" : "maybe",
"vegetarian" : "maybe"
}
],
"percent_estimate" : 75,
"percent_max" : 100,
"percent_min" : 50,
"text" : "Emulsifying"
},
{
"id" : "en:stabiliser",
"ingredients" : [
{
"id" : "en:e412",
"percent_estimate" : 12.5,
"percent_max" : 50,
"percent_min" : 0,
"text" : "e412",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "en:e410",
"percent_estimate" : 12.5,
"percent_max" : 25,
"percent_min" : 0,
"text" : "e410",
"vegan" : "yes",
"vegetarian" : "yes"
}
],
"percent_estimate" : 25,
"percent_max" : 50,
"percent_min" : 0,
"text" : "Stabilizing Agents"
}
],
"ingredients_analysis" : {
"en:may-contain-palm-oil" : [
"en:e471"
],
"en:maybe-vegan" : [
"en:e471",
"en:e477"
],
"en:maybe-vegetarian" : [
"en:e471",
"en:e477"
]
},
"ingredients_analysis_tags" : [
"en:may-contain-palm-oil",
"en:maybe-vegan",
"en:maybe-vegetarian"
],
"ingredients_hierarchy" : [
"en:emulsifier",
"en:stabiliser",
"en:e471",
"en:e477",
"en:e412",
"en:e410"
],
"ingredients_n" : 6,
"ingredients_n_tags" : [
"6",
"1-10"
],
"ingredients_original_tags" : [
"en:emulsifier",
"en:stabiliser",
"en:e471",
"en:e477",
"en:e412",
"en:e410"
],
"ingredients_percent_analysis" : 1,
"ingredients_tags" : [
"en:emulsifier",
"en:stabiliser",
"en:e471",
"en:e477",
"en:e412",
"en:e410"
],
"ingredients_text" : "Emulsifying (INS 471, INS 477) & Stabilizing Agents (INS 412, INS 410)",
"ingredients_with_specified_percent_n" : 0,
"ingredients_with_specified_percent_sum" : 0,
"ingredients_with_unspecified_percent_n" : 4,
"ingredients_with_unspecified_percent_sum" : 100,
"known_ingredients_n" : 6,
"lc" : "en",
"nutriments" : {
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0,
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0
},
"unknown_ingredients_n" : 0
}
71 changes: 39 additions & 32 deletions t/ingredients.t
Original file line number Diff line number Diff line change
Expand Up @@ -34,31 +34,31 @@ my $resultsdir;

GetOptions ("results=s" => \$resultsdir)
or die("Error in command line arguments.\n\n" . $usage);

if ((defined $resultsdir) and (! -e $resultsdir)) {
mkdir($resultsdir, 0755) or die("Could not create $resultsdir directory: $!\n");
}

my @tests = (

# FR

[
'fr-chocolate-cake',
{
lc => "fr",
ingredients_text => "farine (12%), chocolat (beurre de cacao (15%), sucre [10%], protéines de lait, oeuf 1%) - émulsifiants : E463, E432 et E472 - correcteurs d'acidité : E322/E333 E474-E475, acidifiant (acide citrique, acide phosphorique) - sel"
}
],
],

[
'fr-palm-kernel-fat',
{
lc => "fr",
ingredients_text => "graisse de palmiste"
}
],
],

[
'fr-marmelade',
{
Expand All @@ -75,7 +75,14 @@ my @tests = (
ingredients_text => "Natural orange flavor, Lemon flavouring"
}
],

# test synonyms for emulsifier/emulsifying - also checks that synonym matching is case-insensitive (capitalised input, lowercase taxonomy entries)
[
'en-emulsifier-synonyms',
{
lc => "en",
ingredients_text => "Emulsifying (INS 471, INS 477) & Stabilizing Agents (INS 412, INS 410)"
}
],
# FR * label
[
"fr-starred-label",
Expand All @@ -101,7 +108,7 @@ my @tests = (
lc => "fr",
ingredients_text => "Fraise 12,3% ; Orange 6.5%, Pomme (3,5%)",
}
],
],

# FR origins labels
[
Expand All @@ -120,7 +127,7 @@ my @tests = (
ingredients_text => "80% jus de pomme biologique, 20% de coing biologique, sel marin, 98% chlorure de sodium (France, Italie)",
}
],

[
"fr-percents-origins-2",
{
Expand All @@ -136,7 +143,7 @@ my @tests = (
lc => "fr",
ingredients_text => "mono - et diglycérides d'acides gras d'origine végétale, huile d'origine végétale, gélatine (origine végétale)",
}
],
],

# from vegetal origin
[
Expand All @@ -145,7 +152,7 @@ my @tests = (
lc => "en",
ingredients_text => "Gelatin (vegetal), Charcoal (not from animals), ferments (from plants), non-animal rennet, flavours (derived from plants)",
}
],
],

# FR labels
[
Expand All @@ -154,31 +161,31 @@ my @tests = (
lc => "fr",
ingredients_text => "jus d'orange (sans conservateur), saumon (msc), sans gluten",
}
],
],

# Processing

[
"fr-processing-multi",
{
lc => "fr",
ingredients_text => "tomates pelées cuites, rondelle de citron, dés de courgette, lait cru, aubergines crues, jambon cru en tranches",
}
],
],

# Bugs #3827, #3706, #3826 - truncated purée

[
"fr-truncated-puree",
{
lc => "fr",
ingredients_text =>
"19% purée de tomate, 90% boeuf, 100% pur jus de fruit, 45% de matière grasses",
}
],
],

# FI additives, percent

[
"fi-additives-percents",
{
Expand All @@ -188,7 +195,7 @@ my @tests = (
],

# FI percents

[
"fi-percents",
{
Expand All @@ -206,22 +213,22 @@ my @tests = (
ingredients_text => "hyytelöimisaine (pektiinit)",
}
],

[
"fi-origins",
{
lc => "fi",
ingredients_text => "Mansikka alkuperä Suomi, Mustaherukka (alkuperä Etelä-Afrikka), Vadelma (alkuperä : Ruotsi), Appelsiini (luomu), kaakao ja kaakaovoi (reilu kauppa)",
}
],

[
"fi-additives-origins",
{
lc => "fi",
ingredients_text => "emulgointiaine : auringonkukkalesitiini, aromi)(EU), vehnäjauho 33% (Ranska), sokeri",
}
],
],

# FI labels
[
Expand Down Expand Up @@ -267,7 +274,7 @@ my @tests = (
ingredients_text => "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,0,1,2,3,4,5,6,7,8,9,10,100,1000,vt,leaf,something(bio),somethingelse(u)",
}
],

# Origins with regions
[
"en-origins",
Expand Down Expand Up @@ -306,7 +313,7 @@ my @tests = (
ingredients_text => "emmental (Allemagne, France, Pays-Bas, contient lait)",
}
],

# ES percent, too many loops

[
Expand All @@ -316,17 +323,17 @@ my @tests = (
ingredients_text => "Tomate, pimiento (12%), atún (10%), aceite de oliva virgen extra (4%), huevo (3%), cebolla (3%), azúcar, almidón de maíz, sal y acidulante: ácido cítrico.",
}
],

# Ingredient that is also an existing label - https://github.com/openfoodfacts/openfoodfacts-server/issues/4907

[
"fr-huile-de-palme-certifiee-durable",
{
lc => "fr",
ingredients_text => "huiles végétales non hydrogénées (huile de palme certifiée durable, huile de colza)",
},
],

# Russian oil parsing
[
"ru-russian-oil",
Expand Down Expand Up @@ -418,25 +425,25 @@ foreach my $test_ref (@tests) {

my $testid = $test_ref->[0];
my $product_ref = $test_ref->[1];

# Run the test

if (defined $product_ref->{labels}) {
compute_field_tags($product_ref, $product_ref->{lc}, "labels");
}

extract_ingredients_from_text($product_ref);

# Save the result

if (defined $resultsdir) {
open (my $result, ">:encoding(UTF-8)", "$resultsdir/$testid.json") or die("Could not create $resultsdir/$testid.json: $!\n");
print $result $json->pretty->encode($product_ref);
close ($result);
}

# Compare the result with the expected result

if (open (my $expected_result, "<:encoding(UTF-8)", "$expected_dir/$testdir/$testid.json")) {

local $/; #Enable 'slurp' mode
Expand Down
2 changes: 1 addition & 1 deletion taxonomies/additives_classes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ description:fr:Additif alimentaire qui stabilise, retient ou intensifie la coule


# nova:en:emulsifier
en: Emulsifier, clouding agent, crystallization inhibitor, density adjustment agent, dispersing agent, emulsifier blend, emulsifying agent, plasticizer, surface active agent, suspension agent
en: Emulsifier, clouding agent, crystallization inhibitor, density adjustment agent, dispersing agent, emulsifier blend, emulsifying agent, emulsifying, plasticizer, surface active agent, suspension agent
bg: Емулгатор, емулгатори
ca:Emulsionant, emulgent, emulgents, emulsiu, agent dispersant, agents dispersants
cs: Emulgátor
Expand Down
4 changes: 2 additions & 2 deletions taxonomies/allergens.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ jp:魚,サバ,サケ
lt:žuvis
lv:zivis
mt:ħut
nb:fisk, sild, ansjos, brisling, kryddersild, kryddersildefilèt, eddiksild, pollock, hyse, sei, torsk, hvitlaks, kvitlaks, vassild
nb:fisk, sild, ansjos, brisling, kryddersild, kryddersildefilèt, eddiksild, pollock, hyse, sei, torsk, hvitlaks, kvitlaks, vassild
nl:vis
nl_be:vis
pl:ryba
Expand Down Expand Up @@ -220,7 +220,7 @@ ar:حليبب, لاكتوز
bg:суроватка, млечна, мляко, лактоза
cs:mléko, laktózy
da:mælk, laktose, mælkepulver, skummetmælk, skummetmælkspulver, sødmælk, sødmælkspulver, komælk, fløde, piskefløde, mælkefedt, mælkeproteiner, mælkesukker, vallepulver, kærnemælk, letmælk, ost, emmentaler, cheddarost, ostepuler
de:Milch, Bergkäse, Blauschimmelkäse, Butter, Buttermilch, Buttermilchpulver, Butterreinfett, Camembert, Cheddar, Edamer, Edamer-Käse, Emmentaler, Frischkäse, Gorgonzola, Gouda, Hartkäse, Joghurt, Joghurtpulver, Käse, Käsepulver, Kondensmilch, Kuhmilch, Kuhvollmilch, Laktose, Rohmilch, Milchzucker, Milcheiweiß, Milchpulver, Milcherzeugnisse, Magermilchpulver, Magermilchjoghurt, Magermilchkonzentrat, Magermilchjoghurtpulver, Magermilch, Magerquark, Molkenpulver, Molkenerzeugnis, Molkeneiweißkäse, Milchschokolade, Mozzarella, Pecorino, Quark, Ricotta, Speisequark, Sahne, Sahnepulver, Sahnejoghurt, Sauermilchquark, Sauerrahm, Rahm, Rahmjoghurt, Sauermilch, Sauerrahmpulver, Schlagsahne, Süssmolkenpulver, Süßmolkenpulver, Vollmilch, Vollmilchschokolade, Vollmilchpulver, Ziegenmilch, Süßmolke, Süssmolke, Butterfett, Kondensmagermilch, Joghurterzeugnis, Schafsmilch, Schafmilch, Käsereimilch, Milchfett, Milchprotein, Molke, Sauermolke, Molkeneiweiß, Molkeneiweiss, Molkepulver, Milcheiweißerzeugnis, Milcheiweisserzeugnis, Magermilchpulverzusatz, Molkenprotein, Buttereinfett, Süßmolkepulver, Süssmolkepulver, Vollmilchschokoladeüberzug
de:Milch, Bergkäse, Blauschimmelkäse, Butter, Buttermilch, Buttermilchpulver, Butterreinfett, Camembert, Cheddar, Edamer, Edamer-Käse, Emmentaler, Frischkäse, Gorgonzola, Gouda, Hartkäse, Joghurt, Joghurtpulver, Käse, Käsepulver, Kondensmilch, Kuhmilch, Kuhvollmilch, Laktose, Rohmilch, Milchzucker, Milcheiweiß, Milchpulver, Milcherzeugnisse, Magermilchpulver, Magermilchjoghurt, Magermilchkonzentrat, Magermilchjoghurtpulver, Magermilch, Magerquark, Molkenpulver, Molkenerzeugnis, Molkeneiweißkäse, Milchschokolade, Mozzarella, Pecorino, Quark, Ricotta, Speisequark, Sahne, Sahnepulver, Sahnejoghurt, Sauermilchquark, Sauerrahm, Rahm, Rahmjoghurt, Sauermilch, Sauerrahmpulver, Schlagsahne, Süssmolkenpulver, Süßmolkenpulver, Vollmilch, Vollmilchschokolade, Vollmilchpulver, Ziegenmilch, Süßmolke, Süssmolke, Butterfett, Kondensmagermilch, Joghurterzeugnis, Schafsmilch, Schafmilch, Käsereimilch, Milchfett, Milchprotein, Molke, Sauermolke, Molkeneiweiß, Molkeneiweiss, Molkepulver, Milcheiweißerzeugnis, Milcheiweisserzeugnis, Magermilchpulverzusatz, Molkenprotein, Buttereinfett, Süßmolkepulver, Süssmolkepulver, Vollmilchschokoladeüberzug
el:γάλα, λακτόζης
es:leche, lactosa, lácteos, derivados lácteos, productos lácteos, leche y derivados, queso, proteína de leche, leche y sus derivados
et:laktoos, lõssi, piim, piima, piimast
Expand Down

0 comments on commit f994a08

Please sign in to comment.