feat(parser,frontend,backend): conserve comments order and sort properties at export #364

Merged: 11 commits, Feb 7, 2024
41 changes: 26 additions & 15 deletions backend/editor/entries.py
@@ -286,6 +286,12 @@ def is_valid_branch_name(self):
"""
return parser_utils.normalize_text(self.branch_name, char="_") == self.branch_name

def is_tag_key(self, key):
"""
Helper function to check if a key is a tag key (tags_XX)
"""
return key.startswith("tags_") and not ("_ids_" in key or key.endswith("_comments"))
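
For illustration, the predicate can be exercised standalone; a minimal sketch mirroring the method body above (the example keys are hypothetical):

def is_tag_key(key: str) -> bool:
    # same test as the method: a tag key, but not a generated id key or a comment key
    return key.startswith("tags_") and not ("_ids_" in key or key.endswith("_comments"))

assert is_tag_key("tags_en")                # plain tag list for a language
assert not is_tag_key("tags_ids_en")        # normalized ids are generated, not edited
assert not is_tag_key("tags_en_comments")   # comment keys travel with their tag key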

async def list_projects(self, status=None):
"""
Helper function for listing all existing projects created in Taxonomy Editor
@@ -494,12 +500,21 @@ async def update_node(self, label, entry, new_node):
curr_node = result[0]["n"]
deleted_keys = set(curr_node.keys()) - set(new_node.keys())

# Check for deleted keys having associated comments and delete them too
comments_keys_to_delete = set()
for key in deleted_keys:
comments_key = key + "_comments"
if new_node.get(comments_key) is not None:
comments_keys_to_delete.add(comments_key)
deleted_keys = deleted_keys.union(comments_keys_to_delete)

# Check for keys having null/empty values
for key in new_node.keys():
if ((new_node[key] == []) or (new_node[key] is None)) and key != "preceding_lines":
deleted_keys.add(key)
deleted_keys.add(key + "_comments")
# Delete tags_ids if we delete tags of a language
if key.startswith("tags_") and "_ids_" not in key:
if self.is_tag_key(key):
deleted_keys.add("tags_ids_" + key.split("_", 1)[1])

# Build query
@@ -515,20 +530,16 @@ async def update_node(self, label, entry, new_node):
normalised_new_node = {}
stopwords = await self.get_stopwords_dict()
for key in set(new_node.keys()) - deleted_keys:
if key.startswith("tags_"):
if "_ids_" not in key:
keys_language_code = key.split("_", 1)[1]
normalised_value = []
for value in new_node[key]:
normalised_value.append(
parser_utils.normalize_text(
value, keys_language_code, stopwords=stopwords
)
)
normalised_new_node[key] = new_node[key]
normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
else:
pass # We generate tags_ids, and ignore the one sent
if self.is_tag_key(key):
# Normalise tags and store them in tags_ids
keys_language_code = key.split("_", 1)[1]
normalised_value = []
for value in new_node[key]:
normalised_value.append(
parser_utils.normalize_text(value, keys_language_code, stopwords=stopwords)
)
normalised_new_node[key] = new_node[key]
normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
else:
# No need to normalise
normalised_new_node[key] = new_node[key]
9 changes: 5 additions & 4 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -66,16 +66,17 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
# we don't know in advance which properties, tags and comments
# we will encounter in the batch
# so we accumulate them in this set
seen_properties_and_tags = set()
seen_properties_and_tags_and_comments = set()

for entry_node in entry_nodes:
if entry_node.get_node_type() != NodeType.ENTRY:
raise ValueError("Only ENTRY nodes should be passed to this function")
seen_properties_and_tags.update(entry_node.tags)
seen_properties_and_tags.update(entry_node.properties)
seen_properties_and_tags_and_comments.update(entry_node.tags)
seen_properties_and_tags_and_comments.update(entry_node.properties)
seen_properties_and_tags_and_comments.update(entry_node.comments)

additional_properties_queries = [
f"{key} : entry_node.{key}" for key in seen_properties_and_tags
f"{key} : entry_node.{key}" for key in seen_properties_and_tags_and_comments
]
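
For example, a hedged sketch of the fragments this comprehension produces (keys are hypothetical; sorted here only to make the output deterministic):

seen = {"tags_en", "prop_vegan_en", "prop_vegan_en_comments"}
fragments = [f"{key} : entry_node.{key}" for key in sorted(seen)]
assert fragments == [
    "prop_vegan_en : entry_node.prop_vegan_en",
    "prop_vegan_en_comments : entry_node.prop_vegan_en_comments",
    "tags_en : entry_node.tags_en",
]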

base_properties_query = f"""
89 changes: 75 additions & 14 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,4 +1,5 @@
import collections
import copy
import logging
import re
import sys
@@ -25,10 +26,16 @@ class NodeData:
is_before: str | None = None
main_language: str | None = None
preceding_lines: list[str] = field(default_factory=list)
parent_tag: list[tuple[str, int]] = field(default_factory=list)
parent_tags: list[tuple[str, int]] = field(default_factory=list)
src_position: int | None = None
properties: dict[str, str] = field(default_factory=dict)
tags: dict[str, list[str]] = field(default_factory=dict)
comments: dict[str, list[str]] = field(default_factory=dict)
# comments_stack is a list of tuples (line_number, comment),
# to keep track of comments just above the current line
# during parsing of an entry, to be able to add them
# to the right property or tag when possible
comments_stack: list[tuple[int, str]] = field(default_factory=list)

def to_dict(self):
return {
@@ -38,6 +45,7 @@ def to_dict(self):
"src_position": self.src_position,
**self.properties,
**self.tags,
**self.comments,
}

def get_node_type(self):
@@ -151,12 +159,12 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:

def _entry_end(self, line: str, data: NodeData) -> bool:
"""Return True if the block ended"""
# stopwords and synonyms are one-liners, entries are separated by a blank line
if line.startswith("stopwords") or line.startswith("synonyms") or not line:
# can be the end of a block or just an additional line separator;
# file_iter() always ends with ''
if data.id: # to be sure that it's an end
return True
if data.id.startswith("stopwords") or data.id.startswith("synonyms"):
# stopwords and synonyms are one-liners; if the id is set, it's the end
return True
if not line and data.id:
# entries are separated by a blank line
return True
return False

def _remove_separating_line(self, data: NodeData) -> NodeData:
@@ -186,6 +194,43 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
data.preceding_lines.pop(0)
return data

def _get_node_data_with_comments_above_key(
self, data: NodeData, line_number: int, key: str
) -> NodeData:
"""Returns the updated node data with comments above the given
key stored in the {key}_comments property."""
new_data = copy.deepcopy(data)

# Get comments just above the given line
comments_above = []
current_line = line_number - 1
while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
comments_above.append(new_data.comments_stack.pop()[1])
current_line -= 1
if comments_above:
new_data.comments[key + "_comments"] = comments_above[::-1]

return new_data
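
As a sketch of the pop loop above, with hypothetical line numbers: two contiguous comment lines sit right above a tag on line 13, so both are claimed for that key.

comments_stack = [(11, "# first comment"), (12, "# second comment")]
key_line = 13
comments_above = []
current_line = key_line - 1
while comments_stack and comments_stack[-1][0] == current_line:
    comments_above.append(comments_stack.pop()[1])
    current_line -= 1
assert comments_above[::-1] == ["# first comment", "# second comment"]
assert comments_stack == []  # non-contiguous comments would have stayed on the stack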

def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
"""Returns the updated node data with parent and end comments"""
new_data = copy.deepcopy(data)

# Get parent comments (part of an entry block, just above/between the parent lines)
parent_comments = []
while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
parent_comments.append(new_data.preceding_lines.pop())
if parent_comments:
new_data.comments["parent_comments"] = parent_comments[::-1]

# Get end comments (part of an entry block after the last tag/prop
# and before the separating blank line)
end_comments = [comment[1] for comment in new_data.comments_stack]
if end_comments:
new_data.comments["end_comments"] = end_comments

return new_data
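
A companion sketch for this method, with hypothetical leftovers: a comment above the parent lines ends up in parent_comments, and any comments still on the stack become end_comments.

preceding_lines = ["", "# about the parents"]   # blank line separates earlier blocks
comments_stack = [(42, "# trailing note")]      # no tag or property claimed this one
parent_comments = []
while preceding_lines and preceding_lines[-1] != "":
    parent_comments.append(preceding_lines.pop())
end_comments = [comment[1] for comment in comments_stack]
assert parent_comments[::-1] == ["# about the parents"]
assert end_comments == ["# trailing note"]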

def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
"""Transform data from file to dictionary"""
saved_nodes = []
@@ -215,14 +260,21 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
self.parser_logger.error(msg)
else:
data = self._remove_separating_line(data)
if data.get_node_type() == NodeType.ENTRY:
data = self._get_node_data_with_parent_and_end_comments(data)
yield data # another function will use this dictionary to create a node
saved_nodes.append(data.id)
data = NodeData(is_before=data.id)

# harvest the line
if not (line) or line[0] == "#":
# comment or blank
data.preceding_lines.append(line)
# comment or blank line
if data.id:
# we are within the node definition
data.comments_stack.append((line_number, line))
else:
# we are before the actual node
data.preceding_lines.append(line)
else:
line = line.rstrip(",")
if not data.src_position:
@@ -234,7 +286,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
index_stopwords += 1
# remove "stopwords:" part
line = line[10:]
# compute raw values outside _get_lc_value as it removes stop words !
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
@@ -252,7 +304,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
id = "synonyms:" + str(index_synonyms)
data = self._set_data_id(data, id, line_number)
index_synonyms += 1
# remove "synonyms:" part
line = line[9:]
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
@@ -265,7 +319,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
data.tags["tags_ids_" + lc] = value
elif line[0] == "<":
# parent definition
data.parent_tag.append((self._add_line(line[1:]), line_number + 1))
data.parent_tags.append((self._add_line(line[1:]), line_number + 1))
elif language_code_prefix.match(line):
# synonyms definition
if not data.id:
@@ -286,6 +340,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
tagsids_list.append(word_normalized)
data.tags["tags_" + lang] = tags_list
data.tags["tags_ids_" + lang] = tagsids_list
data = self._get_node_data_with_comments_above_key(
data, line_number, "tags_" + lang
)
else:
# property definition
property_name = None
@@ -306,7 +363,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'"
)
if property_name:
data.properties["prop_" + property_name + "_" + lc] = property_value
prop_key = "prop_" + property_name + "_" + lc
data.properties[prop_key] = property_value
data = self._get_node_data_with_comments_above_key(
data, line_number, prop_key
)

data.id = "__footer__"
data.preceding_lines.pop(0)
@@ -395,8 +456,8 @@ def _create_taxonomy(self, filename: str) -> Taxonomy:
other_nodes.append(entry)
if entry.is_before:
previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id))
if entry.parent_tag:
for position, (parent, line_position) in enumerate(entry.parent_tag):
if entry.parent_tags:
for position, (parent, line_position) in enumerate(entry.parent_tags):
raw_child_links.append(
ChildLink(
parent_id=parent,
15 changes: 10 additions & 5 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -55,10 +55,10 @@ def get_tags_line(self, node, lc):
return lc + ":" + line

def list_property_and_lc(self, node):
"""return a ordered list of properties with their language code (lc)"""
"""return an ordered list of properties with their language code (lc)"""
# there is no rule for the order of properties
# properties will be arranged in alphabetical order
return [property[5:] for property in node if property.startswith("prop_")]
return sorted([property[5:] for property in node if property.startswith("prop_") and not property.endswith("_comments")])
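
A quick sketch of the resulting order, using property names from the test fixture (the node keys here are hypothetical):

node = ["tags_en", "prop_vegan_en", "prop_vegan_en_comments",
        "prop_carbon_footprint_fr_foodges_value_fr"]
props = [p[5:] for p in node if p.startswith("prop_") and not p.endswith("_comments")]
assert sorted(props) == ["carbon_footprint_fr_foodges_value_fr", "vegan_en"]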

def get_property_line(self, node, property):
"""return a string that should look like the original property line"""
@@ -90,28 +90,33 @@ def iter_lines(self, project_label):
if add_blank:
yield ""
# comments
if node["preceding_lines"]:
yield from node["preceding_lines"]
yield from node.get("preceding_lines", [])
if has_content:
tags_lc = self.list_tags_lc(node)
if node["id"].startswith("stopwords"):
yield "stopwords:" + self.get_tags_line(node, tags_lc[0])
elif node["id"].startswith("synonyms"):
yield "synonyms:" + self.get_tags_line(node, tags_lc[0])
else:
# synonyms line
# parents
yield from node.get("parent_comments", [])
yield from self.get_parents_lines(parents)
# main language synonyms first
main_language = node.pop("main_language")
tags_lc.remove(main_language)
yield from node.get("tags_" + main_language + "_comments", [])
yield self.get_tags_line(node, main_language)
# more synonyms after
for lc in tags_lc:
yield from node.get("tags_" + lc + "_comments", [])
yield self.get_tags_line(node, lc)
# properties
properties_list = self.list_property_and_lc(node)
for property in properties_list:
yield from node.get("prop_" + property + "_comments", [])
yield self.get_property_line(node, property)
# final comments
yield from node.get("end_comments", [])
previous_block_id = node["id"]
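
Putting the ordering together, the en:meat entry from the test fixture would be re-serialized roughly like this (a sketch: preceding comment first, then synonyms, then properties in sorted order):

# meat

en:meat
carbon_footprint_fr_foodges_value:fr:10
vegan:en:no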

def rewrite_file(self, filename, lines):
2 changes: 1 addition & 1 deletion parser/tests/data/test.txt
@@ -23,8 +23,8 @@ fr:yaourts au fruit de la passion allégés
# meat

en:meat
vegan:en:no
carbon_footprint_fr_foodges_value:fr:10
vegan:en:no

<en:meat
en:fake-meat
2 changes: 1 addition & 1 deletion parser/tests/unit/test_parser_unit.py
@@ -25,7 +25,7 @@ def test_fileiter(neo4j):
file_iterator = taxonomy_parser._file_iter(TEST_TAXONOMY_TXT)
for counter, (_, line) in enumerate(file_iterator):
assert line == "" or line[0] == "#" or ":" in line
if counter == 26:
if counter == 25:
assert line == "carbon_footprint_fr_foodges_value:fr:10"
assert counter == 37

@@ -1,4 +1,4 @@
import { Box, Grid, Paper, TextField, Typography } from "@mui/material";
import { Box, Grid, Paper, Typography } from "@mui/material";
import MaterialTable, { MTableToolbar } from "@material-table/core";
import { useEffect, useState } from "react";

@@ -14,7 +14,11 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {
// UUID of properties will have a "_uuid" suffix
// Ex: prop_vegan_en_uuid

if (key.startsWith("prop") && key.endsWith("uuid")) {
if (
key.startsWith("prop") &&
key.endsWith("uuid") &&
!key.endsWith("_comments_uuid")
) {
const uuid = nodeObject[key][0]; // UUID
// Removing "prop_" prefix from key to render only the name
const property_name = key.split("_").slice(1, -1).join("_");
@@ -33,13 +37,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {
setData(renderedProperties);
}, [nodeObject]);

// Helper function used for changing comments from node
const changeCommentData = (value) => {
const newNodeObject = { ...nodeObject };
newNodeObject["preceding_lines"] = value;
setNodeObject(newNodeObject);
};

// Helper function used for changing properties of node
const changePropertyData = (key, value) => {
setNodeObject((prevState) => {
@@ -60,21 +57,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {

return (
<Box>
{/* Comments */}
<Typography sx={{ ml: 4, mt: 2, mb: 1 }} variant="h5">
Comments
</Typography>
<TextField
sx={{ ml: 8, mt: 1, width: 250 }}
minRows={4}
multiline
onChange={(event) => {
changeCommentData(event.target.value.split("\n"));
}}
value={nodeObject?.preceding_lines.join("\n")}
variant="outlined"
/>

{/* Properties */}
<Box sx={{ width: "50%", ml: 4 }}>
<MaterialTable