feat(parser,frontend,backend): conserve comments order and sort properties at export #364

Merged: 11 commits, Feb 7, 2024
41 changes: 26 additions & 15 deletions backend/editor/entries.py
@@ -286,6 +286,12 @@ def is_valid_branch_name(self):
"""
return parser_utils.normalize_text(self.branch_name, char="_") == self.branch_name

def is_tag_key(self, key):
"""
Helper function to check if a key is a tag key (tags_XX)
"""
return key.startswith("tags_") and not ("_ids_" in key or key.endswith("_comments"))
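
For illustration, the predicate can be exercised standalone; a minimal sketch mirroring the method body above (the example keys are hypothetical):

def is_tag_key(key: str) -> bool:
    # same test as the method: a tag key, but not a generated id key or a comment key
    return key.startswith("tags_") and not ("_ids_" in key or key.endswith("_comments"))

assert is_tag_key("tags_en")                # plain tag list for a language
assert not is_tag_key("tags_ids_en")        # normalized ids are generated, not edited
assert not is_tag_key("tags_en_comments")   # comment keys travel with their tag key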

async def list_projects(self, status=None):
"""
Helper function for listing all existing projects created in Taxonomy Editor
@@ -494,12 +500,21 @@ async def update_node(self, label, entry, new_node):
curr_node = result[0]["n"]
deleted_keys = set(curr_node.keys()) - set(new_node.keys())

# Check for deleted keys having associated comments and delete them too
comments_keys_to_delete = set()
for key in deleted_keys:
comments_key = key + "_comments"
if new_node.get(comments_key) is not None:
comments_keys_to_delete.add(comments_key)
deleted_keys = deleted_keys.union(comments_keys_to_delete)

# Check for keys having null/empty values
for key in new_node.keys():
if ((new_node[key] == []) or (new_node[key] is None)) and key != "preceding_lines":
deleted_keys.add(key)
deleted_keys.add(key + "_comments")
# Delete tags_ids if we delete tags of a language
if key.startswith("tags_") and "_ids_" not in key:
if self.is_tag_key(key):
deleted_keys.add("tags_ids_" + key.split("_", 1)[1])

# Build query
@@ -515,20 +530,16 @@ async def update_node(self, label, entry, new_node):
normalised_new_node = {}
stopwords = await self.get_stopwords_dict()
for key in set(new_node.keys()) - deleted_keys:
if key.startswith("tags_"):
if "_ids_" not in key:
keys_language_code = key.split("_", 1)[1]
normalised_value = []
for value in new_node[key]:
normalised_value.append(
parser_utils.normalize_text(
value, keys_language_code, stopwords=stopwords
)
)
normalised_new_node[key] = new_node[key]
normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
else:
pass # We generate tags_ids, and ignore the one sent
if self.is_tag_key(key):
# Normalise tags and store them in tags_ids
keys_language_code = key.split("_", 1)[1]
normalised_value = []
for value in new_node[key]:
normalised_value.append(
parser_utils.normalize_text(value, keys_language_code, stopwords=stopwords)
)
normalised_new_node[key] = new_node[key]
normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
else:
# No need to normalise
normalised_new_node[key] = new_node[key]
9 changes: 5 additions & 4 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -66,16 +66,17 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
# we don't know in advance which properties, tags and comments
# we will encounter in the batch
# so we accumulate them in this set
seen_properties_and_tags = set()
seen_properties_and_tags_and_comments = set()

for entry_node in entry_nodes:
if entry_node.get_node_type() != NodeType.ENTRY:
raise ValueError("Only ENTRY nodes should be passed to this function")
seen_properties_and_tags.update(entry_node.tags)
seen_properties_and_tags.update(entry_node.properties)
seen_properties_and_tags_and_comments.update(entry_node.tags)
seen_properties_and_tags_and_comments.update(entry_node.properties)
seen_properties_and_tags_and_comments.update(entry_node.comments)

additional_properties_queries = [
f"{key} : entry_node.{key}" for key in seen_properties_and_tags
f"{key} : entry_node.{key}" for key in seen_properties_and_tags_and_comments
]
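
For example, a hedged sketch of the fragments this comprehension produces (keys are hypothetical; sorted here only to make the output deterministic):

seen = {"tags_en", "prop_vegan_en", "prop_vegan_en_comments"}
fragments = [f"{key} : entry_node.{key}" for key in sorted(seen)]
assert fragments == [
    "prop_vegan_en : entry_node.prop_vegan_en",
    "prop_vegan_en_comments : entry_node.prop_vegan_en_comments",
    "tags_en : entry_node.tags_en",
]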

base_properties_query = f"""
89 changes: 75 additions & 14 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,4 +1,5 @@
import collections
import copy
import logging
import re
import sys
@@ -25,10 +26,16 @@ class NodeData:
is_before: str | None = None
main_language: str | None = None
preceding_lines: list[str] = field(default_factory=list)
parent_tag: list[tuple[str, int]] = field(default_factory=list)
parent_tags: list[tuple[str, int]] = field(default_factory=list)
src_position: int | None = None
properties: dict[str, str] = field(default_factory=dict)
tags: dict[str, list[str]] = field(default_factory=dict)
comments: dict[str, list[str]] = field(default_factory=dict)
# comments_stack is a list of tuples (line_number, comment),
# to keep track of comments just above the current line
# during parsing of an entry, to be able to add them
# to the right property or tag when possible
comments_stack: list[tuple[int, str]] = field(default_factory=list)

def to_dict(self):
return {
@@ -38,6 +45,7 @@ def to_dict(self):
"src_position": self.src_position,
**self.properties,
**self.tags,
**self.comments,
}

def get_node_type(self):
@@ -151,12 +159,12 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:

def _entry_end(self, line: str, data: NodeData) -> bool:
"""Return True if the block ended"""
# stopwords and synonyms are one-liners, entries are separated by a blank line
if line.startswith("stopwords") or line.startswith("synonyms") or not line:
# can be the end of a block or just an additional line separator;
# file_iter() always ends with ''
if data.id: # to be sure that it's an end
return True
if data.id.startswith("stopwords") or data.id.startswith("synonyms"):
# stopwords and synonyms are one-liners; if the id is set, it's the end
return True
if not line and data.id:
# entries are separated by a blank line
return True
return False

def _remove_separating_line(self, data: NodeData) -> NodeData:
@@ -186,6 +194,43 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
data.preceding_lines.pop(0)
return data

def _get_node_data_with_comments_above_key(
self, data: NodeData, line_number: int, key: str
) -> NodeData:
"""Returns the updated node data with comments above the given
key stored in the {key}_comments property."""
new_data = copy.deepcopy(data)

# Get comments just above the given line
comments_above = []
current_line = line_number - 1
while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
comments_above.append(new_data.comments_stack.pop()[1])
current_line -= 1
if comments_above:
new_data.comments[key + "_comments"] = comments_above[::-1]

return new_data
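
As a sketch of the pop loop above, with hypothetical line numbers: two contiguous comment lines sit right above a tag on line 13, so both are claimed for that key.

comments_stack = [(11, "# first comment"), (12, "# second comment")]
key_line = 13
comments_above = []
current_line = key_line - 1
while comments_stack and comments_stack[-1][0] == current_line:
    comments_above.append(comments_stack.pop()[1])
    current_line -= 1
assert comments_above[::-1] == ["# first comment", "# second comment"]
assert comments_stack == []  # non-contiguous comments would have stayed on the stack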

def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
"""Returns the updated node data with parent and end comments"""
new_data = copy.deepcopy(data)

# Get parent comments (part of an entry block, just above/between the parent lines)
parent_comments = []
while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
parent_comments.append(new_data.preceding_lines.pop())
if parent_comments:
new_data.comments["parent_comments"] = parent_comments[::-1]

# Get end comments (part of an entry block after the last tag/prop
# and before the separating blank line)
end_comments = [comment[1] for comment in new_data.comments_stack]
if end_comments:
new_data.comments["end_comments"] = end_comments

return new_data
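
A companion sketch for this method, with hypothetical leftovers: a comment above the parent lines ends up in parent_comments, and any comments still on the stack become end_comments.

preceding_lines = ["", "# about the parents"]   # blank line separates earlier blocks
comments_stack = [(42, "# trailing note")]      # no tag or property claimed this one
parent_comments = []
while preceding_lines and preceding_lines[-1] != "":
    parent_comments.append(preceding_lines.pop())
end_comments = [comment[1] for comment in comments_stack]
assert parent_comments[::-1] == ["# about the parents"]
assert end_comments == ["# trailing note"]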

def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
"""Transform data from file to dictionary"""
saved_nodes = []
@@ -215,14 +260,21 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
self.parser_logger.error(msg)
else:
data = self._remove_separating_line(data)
if data.get_node_type() == NodeType.ENTRY:
data = self._get_node_data_with_parent_and_end_comments(data)
yield data # another function will use this dictionary to create a node
saved_nodes.append(data.id)
data = NodeData(is_before=data.id)

# harvest the line
if not (line) or line[0] == "#":
# comment or blank
data.preceding_lines.append(line)
# comment or blank line
if data.id:
# we are within the node definition
data.comments_stack.append((line_number, line))
else:
# we are before the actual node
data.preceding_lines.append(line)
else:
line = line.rstrip(",")
if not data.src_position:
@@ -234,7 +286,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
index_stopwords += 1
# remove "stopwords:" part
line = line[10:]
# compute raw values outside _get_lc_value as it removes stop words !
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
@@ -252,7 +304,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
id = "synonyms:" + str(index_synonyms)
data = self._set_data_id(data, id, line_number)
index_synonyms += 1
# remove "synonyms:" part
line = line[9:]
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
@@ -265,7 +319,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
data.tags["tags_ids_" + lc] = value
elif line[0] == "<":
# parent definition
data.parent_tag.append((self._add_line(line[1:]), line_number + 1))
data.parent_tags.append((self._add_line(line[1:]), line_number + 1))
elif language_code_prefix.match(line):
# synonyms definition
if not data.id:
@@ -286,6 +340,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
tagsids_list.append(word_normalized)
data.tags["tags_" + lang] = tags_list
data.tags["tags_ids_" + lang] = tagsids_list
data = self._get_node_data_with_comments_above_key(
data, line_number, "tags_" + lang
)
else:
# property definition
property_name = None
@@ -306,7 +363,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'"
)
if property_name:
data.properties["prop_" + property_name + "_" + lc] = property_value
prop_key = "prop_" + property_name + "_" + lc
data.properties[prop_key] = property_value
data = self._get_node_data_with_comments_above_key(
data, line_number, prop_key
)

data.id = "__footer__"
data.preceding_lines.pop(0)
@@ -395,8 +456,8 @@ def _create_taxonomy(self, filename: str) -> Taxonomy:
other_nodes.append(entry)
if entry.is_before:
previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id))
if entry.parent_tag:
for position, (parent, line_position) in enumerate(entry.parent_tag):
if entry.parent_tags:
for position, (parent, line_position) in enumerate(entry.parent_tags):
raw_child_links.append(
ChildLink(
parent_id=parent,
15 changes: 10 additions & 5 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -55,10 +55,10 @@ def get_tags_line(self, node, lc):
return lc + ":" + line

def list_property_and_lc(self, node):
"""return a ordered list of properties with their language code (lc)"""
"""return an ordered list of properties with their language code (lc)"""
# there is no rule for the order of properties
# properties will be arranged in alphabetical order
return [property[5:] for property in node if property.startswith("prop_")]
return sorted([property[5:] for property in node if property.startswith("prop_") and not property.endswith("_comments")])
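
A quick sketch of the resulting order, using property names from the test fixture (the node keys here are hypothetical):

node = ["tags_en", "prop_vegan_en", "prop_vegan_en_comments",
        "prop_carbon_footprint_fr_foodges_value_fr"]
props = [p[5:] for p in node if p.startswith("prop_") and not p.endswith("_comments")]
assert sorted(props) == ["carbon_footprint_fr_foodges_value_fr", "vegan_en"]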

def get_property_line(self, node, property):
"""return a string that should look like the original property line"""
@@ -90,28 +90,33 @@ def iter_lines(self, project_label):
if add_blank:
yield ""
# comments
if node["preceding_lines"]:
yield from node["preceding_lines"]
yield from node.get("preceding_lines", [])
if has_content:
tags_lc = self.list_tags_lc(node)
if node["id"].startswith("stopwords"):
yield "stopwords:" + self.get_tags_line(node, tags_lc[0])
elif node["id"].startswith("synonyms"):
yield "synonyms:" + self.get_tags_line(node, tags_lc[0])
else:
# synonyms line
# parents
yield from node.get("parent_comments", [])
yield from self.get_parents_lines(parents)
# main language synonyms first
main_language = node.pop("main_language")
tags_lc.remove(main_language)
yield from node.get("tags_" + main_language + "_comments", [])
yield self.get_tags_line(node, main_language)
# more synonyms after
for lc in tags_lc:
yield from node.get("tags_" + lc + "_comments", [])
yield self.get_tags_line(node, lc)
# properties
properties_list = self.list_property_and_lc(node)
for property in properties_list:
yield from node.get("prop_" + property + "_comments", [])
yield self.get_property_line(node, property)
# final comments
yield from node.get("end_comments", [])
previous_block_id = node["id"]
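
Putting the ordering together, the en:meat entry from the test fixture would be re-serialized roughly like this (a sketch: preceding comment first, then synonyms, then properties in sorted order):

# meat

en:meat
carbon_footprint_fr_foodges_value:fr:10
vegan:en:no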

def rewrite_file(self, filename, lines):
2 changes: 1 addition & 1 deletion parser/tests/data/test.txt
@@ -23,8 +23,8 @@ fr:yaourts au fruit de la passion allégés
# meat

en:meat
vegan:en:no
carbon_footprint_fr_foodges_value:fr:10
vegan:en:no

<en:meat
en:fake-meat
2 changes: 1 addition & 1 deletion parser/tests/unit/test_parser_unit.py
@@ -25,7 +25,7 @@ def test_fileiter(neo4j):
file_iterator = taxonomy_parser._file_iter(TEST_TAXONOMY_TXT)
for counter, (_, line) in enumerate(file_iterator):
assert line == "" or line[0] == "#" or ":" in line
if counter == 26:
if counter == 25:
assert line == "carbon_footprint_fr_foodges_value:fr:10"
assert counter == 37

@@ -1,4 +1,4 @@
import { Box, Grid, Paper, TextField, Typography } from "@mui/material";
import { Box, Grid, Paper, Typography } from "@mui/material";
import MaterialTable, { MTableToolbar } from "@material-table/core";
import { useEffect, useState } from "react";

@@ -14,7 +14,11 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {
// UUID of properties will have a "_uuid" suffix
// Ex: prop_vegan_en_uuid

if (key.startsWith("prop") && key.endsWith("uuid")) {
if (
key.startsWith("prop") &&
key.endsWith("uuid") &&
!key.endsWith("_comments_uuid")
) {
const uuid = nodeObject[key][0]; // UUID
// Removing "prop_" prefix from key to render only the name
const property_name = key.split("_").slice(1, -1).join("_");
@@ -33,13 +37,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {
setData(renderedProperties);
}, [nodeObject]);

// Helper function used for changing comments from node
const changeCommentData = (value) => {
const newNodeObject = { ...nodeObject };
newNodeObject["preceding_lines"] = value;
setNodeObject(newNodeObject);
};

// Helper function used for changing properties of node
const changePropertyData = (key, value) => {
setNodeObject((prevState) => {
@@ -60,21 +57,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {

return (
<Box>
{/* Comments */}
<Typography sx={{ ml: 4, mt: 2, mb: 1 }} variant="h5">
Comments
</Typography>
<TextField
sx={{ ml: 8, mt: 1, width: 250 }}
minRows={4}
multiline
onChange={(event) => {
changeCommentData(event.target.value.split("\n"));
}}
value={nodeObject?.preceding_lines.join("\n")}
variant="outlined"
/>

{/* Properties */}
<Box sx={{ width: "50%", ml: 4 }}>
<MaterialTable