Skip to content

Commit

Permalink
feat: Search functionality - Backend (#91)
Browse files Browse the repository at this point in the history
* add full text search index in neo4j
* implement search API
* search for id as well as tags
  • Loading branch information
aadarsh-ram committed Oct 4, 2022
1 parent 46bc840 commit 3317fae
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 17 deletions.
7 changes: 6 additions & 1 deletion backend/editor/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

# DB helper imports
from .entries import initialize_db, shutdown_db
from .entries import get_all_nodes, get_nodes, get_children, get_parents, get_label
from .entries import get_all_nodes, get_nodes, get_children, get_parents, get_label, full_text_search
from .entries import update_nodes, update_node_children
from .entries import create_node, add_node_to_end, add_node_to_beginning, delete_node
#------------------------------------------------------------------------#
Expand Down Expand Up @@ -187,6 +187,11 @@ async def findFooter(response: Response):
footer = list(result)
return footer[0]

@app.get("/search")
async def searchNode(response: Response, query: str):
    """Return taxonomy nodes whose id or tags match the search `query`."""
    return full_text_search(query)

# Post methods

@app.post("/nodes")
Expand Down
49 changes: 49 additions & 0 deletions backend/editor/entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from neo4j import GraphDatabase # Interface with Neo4J
from . import settings # Neo4J settings
from .normalizer import normalizing # Normalizing tags

def initialize_db():
"""
Expand Down Expand Up @@ -242,3 +243,51 @@ def update_node_children(entry, new_children_ids):
result = session.run(query, {"id": entry, "child": child})

return result

def full_text_search(text):
    """
    Search the taxonomy for nodes whose id or tags match `text`.

    Runs fuzzy and wildcard Lucene queries against the two full-text
    indexes built by the parser ("nodeSearchIds" and "nodeSearchTags")
    and returns the matching nodes ordered by aggregated score.
    """
    # Strip characters that Lucene would interpret as query syntax,
    # and build the id-normalized form used against the id index.
    sanitized = re.sub(r"[^A-Za-z0-9_]", r" ", text)
    id_form = normalizing(text)

    params = {
        "text_query_fuzzy": sanitized + "~",
        "text_query_exact": "*" + sanitized + "*",
        "text_id_query_fuzzy": id_form + "~",
        "text_id_query_exact": "*" + id_form + "*",
    }

    # Four sub-queries, combined by UNION: fuzzy (~) and wildcard (*)
    # lookups over both indexes. Fuzzy hits are boosted (ids x3, tags x5)
    # so close matches outrank plain substring matches; scores for the
    # same node id are summed before ordering.
    query = """
        CALL {
                CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_fuzzy)
                yield node, score as score_
                return node, score_ * 3 as score
            UNION
                CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_fuzzy)
                yield node, score as score_
                return node, score_ * 5 as score
            UNION
                CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_exact)
                yield node, score as score_
                return node, score_ as score
            UNION
                CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_exact)
                yield node, score as score_
                return node, score_ as score
        }
        with node.id as node, score
        RETURN node, sum(score) as score
        ORDER BY score DESC
    """
    return [record["node"] for record in session.run(query, params)]
30 changes: 30 additions & 0 deletions backend/editor/normalizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import re
import unicodedata
import unidecode

def normalizing(line, lang="default"):
    """Turn *line* into a normalized taxonomy id for language code *lang*.

    Applies NFC normalization, folds accents for a fixed set of
    Latin-script languages, lower-cases, and replaces unwanted
    characters with "-".
    """
    text = unicodedata.normalize("NFC", line)

    # accent folding, only for these Latin-script languages
    if lang in ["fr", "ca", "es", "it", "nl", "pt", "sk", "en"]:
        text = re.sub(r"[¢£¤¥§©ª®°²³µ¶¹º¼½¾×‰€™]", "-", text)
        text = unidecode.unidecode(text)

    # lower case unless the language is in the (currently empty) exception list
    if lang not in []:
        text = text.lower()

    # replace unwanted characters with "-", in three passes:
    # control chars / ASCII punctuation below "(", HTML entities, then
    # the remaining punctuation and symbol characters
    for pattern in (
        r"[\u0000-\u0027\u200b]",
        r"&\w+;",
        r"[\s!\"#\$%&'()*+,\/:;<=>?@\[\\\]^_`{\|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×ˆ˜–—‘’‚“”„†‡•…‰‹›€™\t]",  # noqa: E501
    ):
        text = re.sub(pattern, "-", text)

    # collapse runs of "-" and trim them from both ends
    return re.sub(r"-+", "-", text).strip("-")
11 changes: 11 additions & 0 deletions backend/sample/test-neo4j.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@
"main_language": "en",
"tags_en":["yogurts", "yoghurts"],
"tags_ids_en":["yogurts", "yoghurts"],
"tags_en_str":"yogurts yoghurts",
"tags_fr": ["yaourts", "yoghourts", "yogourts"],
"tags_ids_fr": ["yaourts", "yoghourts", "yogourts"],
"tags_fr_str":"yaourts yoghourts yogourts",
"preceding_lines" : [],
"src_position": 9
},
Expand All @@ -46,8 +48,10 @@
"main_language": "en",
"tags_en":["banana yogurts"],
"tags_ids_en":["banana-yogurts"],
"tags_en_str":"banana yogurts",
"tags_fr": ["yaourts à la banane"],
"tags_ids_fr": ["yaourts-banane"],
"tags_fr_str":"yaourts à la banane",
"preceding_lines" : [],
"src_position": 12
},
Expand All @@ -57,8 +61,10 @@
"main_language": "en",
"tags_en":["Passion fruit yogurts"],
"tags_ids_en":["passion-fruit-yogurts"],
"tags_en_str":"Passion fruit yogurts",
"tags_fr": ["yaourts au fruit de la passion"],
"tags_ids_fr": ["yaourts-fruit-passion"],
"tags_fr_str":"yaourts au fruit de la passion",
"preceding_lines" : [],
"src_position": 16
},
Expand All @@ -68,6 +74,7 @@
"main_language": "fr",
"tags_fr": ["yaourts au fruit de la passion allégés"],
"tags_ids_fr": ["yaourts-fruit-passion-alleges"],
"tags_fr_str":"yaourts au fruit de la passion allégés",
"preceding_lines" : [],
"src_position": 20
},
Expand All @@ -77,6 +84,7 @@
"main_language": "en",
"tags_en": ["meat"],
"tags_ids_en": ["meat"],
"tags_en_str": "meat",
"prop_vegan_en": "no",
"prop_carbon_footprint_fr_foodges_value_fr": "10",
"preceding_lines": ["# meat", ""],
Expand All @@ -88,6 +96,7 @@
"main_language": "en",
"tags_en": ["fake-meat"],
"tags_ids_en": ["fake-meat"],
"tags_en_str":"fake-meat",
"prop_vegan_en": "yes",
"preceding_lines" : [],
"src_position": 29
Expand All @@ -98,6 +107,7 @@
"main_language": "en",
"tags_en": ["fake-stuff"],
"tags_ids_en": ["fake-stuff"],
"tags_en_str":"fake-stuff",
"preceding_lines" : [],
"src_position": 33
},
Expand All @@ -107,6 +117,7 @@
"main_language": "en",
"tags_en": ["fake-duck-meat"],
"tags_ids_en": ["fake-duck-meat"],
"tags_en_str":"fake-duck-meat",
"preceding_lines" : [],
"src_position": 35
},
Expand Down
37 changes: 24 additions & 13 deletions parser/openfoodfacts_taxonomy_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
import re
import sys
import unicodedata

import iso639
import unidecode
from neo4j import GraphDatabase

Expand Down Expand Up @@ -58,11 +60,7 @@ def create_node(self, data):
entry_query += " SET n." + key + " = $" + key + "\n"

query = id_query + entry_query + position_query
self.session.run(
query,
data,
is_before=self.is_before,
)
self.session.run(query, data, is_before=self.is_before)

def normalized_filename(self, filename):
"""add the .txt extension if it is missing in the filename"""
Expand Down Expand Up @@ -323,6 +321,7 @@ def harvest(self, filename):
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
data["tags_" + lang] = tags_list
data["tags_" + lang + "_str"] = " ".join(tags_list)
data["tags_ids_" + lang] = tagsids_list
else:
# property definition
Expand Down Expand Up @@ -388,11 +387,7 @@ def create_previous_link(self):
id_previous,
)
elif not relation[0]:
logging.error(
"link not created between %s and %s",
id,
id_previous,
)
logging.error("link not created between %s and %s", id, id_previous)

def parent_search(self):
"""Get the parent and the child to link"""
Expand Down Expand Up @@ -423,18 +418,34 @@ def delete_used_properties(self):
query = "MATCH (n) SET n.is_before = null, n.parents = null"
self.session.run(query)

def create_fulltext_index(self):
query = """
CREATE FULLTEXT INDEX nodeSearchIds FOR (n:ENTRY) ON EACH [n.id]
OPTIONS {indexConfig: {`fulltext.analyzer`: 'keyword'}}
"""
self.session.run(query)

language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""]
tags_prefixed_lc = ["n.tags_" + lc + "_str" for lc in language_codes]
tags_prefixed_lc = ", ".join(tags_prefixed_lc)
query = (
f"""CREATE FULLTEXT INDEX nodeSearchTags FOR (n:ENTRY) ON EACH [{tags_prefixed_lc}]"""
)
self.session.run(query)

    def __call__(self, filename):
        """Process the taxonomy file end to end.

        Creates the nodes from `filename`, links children to parents,
        links each node to its predecessor in the file, then builds the
        full-text search indexes over the created nodes.
        """
        self.create_nodes(filename)
        self.create_child_link()
        self.create_previous_link()
        self.create_fulltext_index()
        # self.delete_used_properties()  # intentionally disabled — presumably kept for debugging; confirm before enabling


if __name__ == "__main__":
    # NOTE: `sys` is already imported at module level, so the redundant
    # function-local `import sys` has been removed.
    # Log to parser.log (UTF-8); FileHandler is used because
    # basicConfig(filename=..., encoding=...) requires Python 3.9+.
    logging.basicConfig(
        handlers=[logging.FileHandler(filename="parser.log", encoding="utf-8")], level=logging.INFO
    )
    # Taxonomy name to parse, defaulting to the bundled "test" taxonomy.
    filename = sys.argv[1] if len(sys.argv) > 1 else "test"
    parse = Parser()
    parse(filename)
3 changes: 1 addition & 2 deletions parser/requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,4 @@ py==1.11.0
pyparsing==3.0.9
pytest==7.1.2
pytz==2022.1
tomli==2.0.1
Unidecode==1.3.4
tomli==2.0.1
3 changes: 2 additions & 1 deletion parser/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
neo4j==4.4.5
pytz==2022.1
Unidecode==1.3.4
Unidecode==1.3.4
iso-639==0.4.5

0 comments on commit 3317fae

Please sign in to comment.