feat: store raw stopwords and rewrite them at export (#358)

* store raw stopwords and rewrite them at export
* Apply suggestions from code review

Co-authored-by: Alex Garel <alex@garel.org>

---------

Co-authored-by: Alex Garel <alex@garel.org>
openfoodfacts · Jan 24, 2024 · 444f76d · 444f76d
1 parent b18ac03
commit 444f76d
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 15 deletions.
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -243,15 +243,20 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                     id = "stopwords:" + str(index_stopwords)
                     data = self._set_data_id(data, id, line_number)
                     index_stopwords += 1
+                    # remove "stopwords:" part
+                    line = line[10:]
+                    # compute raw values outside _get_lc_value as it removes stop words !
+                    tags = [words.strip() for words in line[3:].split(",")]
                     try:
-                        lc, value = self._get_lc_value(line[10:])
+                        lc, value = self._get_lc_value(line)
                     except ValueError:
                         self.parser_logger.error(
                             f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
                         )
                     else:
-                        data.tags["tags_" + lc] = value
-                        # add the list with its lc
+                        data.tags["tags_" + lc] = tags
+                        data.tags["tags_ids_" + lc] = value
+                        # add the normalized list with its lc
                         self.stopwords[lc] = value
                 elif line.startswith("synonyms"):
                     # general synonyms definition for a language

diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -39,14 +39,9 @@ def get_all_nodes(self, project_label):
     def list_tags_lc(self, node):
         """return an ordered list of the language codes (lc) used in a node"""
         lc_list = []
-        if "stopwords" in node["id"]:
-            # stopwords node only have a tags_lc property
-            key = "tags_"
-            # number of dashes to split on to get language code
-            dash_before_lc = 1
-        else:
-            key = "tags_ids_"
-            dash_before_lc = 2
+        key = "tags_ids_"
+        # number of dashes to split on to get language code
+        dash_before_lc = 2
 
         for property in node:
             if property.startswith(key):

diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt
@@ -1,6 +1,6 @@
 # test taxonomy
 
-stopwords:fr: aux,au,de,le,du,la,a,et
+stopwords:fr: aux,au,de,le,du,la,a,et,test normalisation
 
 synonyms:en:passion fruit, passionfruit
 

diff --git a/parser/tests/integration/test_parse_unparse_integration.py b/parser/tests/integration/test_parse_unparse_integration.py
@@ -56,7 +56,7 @@ def test_round_trip(neo4j):
     for line in original_lines:
         # first tweak: spaces between stopwords
         if line.startswith("stopwords:fr: aux"):
-            line = "stopwords:fr:aux, au, de, le, du, la, a, et"
+            line = "stopwords:fr:aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
         elif line.startswith("<fr:yaourts fruit de la passion"):
             line = "<en:Passion fruit yogurts"
@@ -98,7 +98,7 @@ def test_two_branch_round_trip(neo4j):
     for line in original_lines:
         # first tweak: spaces between stopwords
         if line.startswith("stopwords:fr: aux"):
-            line = "stopwords:fr:aux, au, de, le, du, la, a, et"
+            line = "stopwords:fr:aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
         elif line.startswith("<fr:yaourts fruit de la passion"):
             line = "<en:Passion fruit yogurts"

diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py
@@ -69,7 +69,8 @@ def test_calling(neo4j):
         results = session.run(query)
         expected_stopwords = {
             "id": "stopwords:0",
-            "tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et"],
+            "tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et", "test normalisation"],
+            "tags_ids_fr": ["aux", "au", "de", "le", "du", "la", "a", "et", "test-normalisation"],
             "preceding_lines": [],
         }
         for result in results: