Skip to content

Commit

Permalink
feat: Search functionality - Backend (#91)
Browse files Browse the repository at this point in the history
* add full text search index in neo4j
* implement search API
* search for id as well as tags
  • Loading branch information
aadarsh-ram committed Oct 4, 2022
1 parent 46bc840 commit 3317fae
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 17 deletions.
7 changes: 6 additions & 1 deletion backend/editor/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

# DB helper imports
from .entries import initialize_db, shutdown_db
from .entries import get_all_nodes, get_nodes, get_children, get_parents, get_label
from .entries import get_all_nodes, get_nodes, get_children, get_parents, get_label, full_text_search
from .entries import update_nodes, update_node_children
from .entries import create_node, add_node_to_end, add_node_to_beginning, delete_node
#------------------------------------------------------------------------#
Expand Down Expand Up @@ -187,6 +187,11 @@ async def findFooter(response: Response):
footer = list(result)
return footer[0]

@app.get("/search")
async def searchNode(response: Response, query: str):
    """Return taxonomy nodes whose id or tags match the search `query`."""
    return full_text_search(query)

# Post methods

@app.post("/nodes")
Expand Down
49 changes: 49 additions & 0 deletions backend/editor/entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from neo4j import GraphDatabase # Interface with Neo4J
from . import settings # Neo4J settings
from .normalizer import normalizing # Normalizing tags

def initialize_db():
"""
Expand Down Expand Up @@ -242,3 +243,51 @@ def update_node_children(entry, new_children_ids):
result = session.run(query, {"id": entry, "child": child})

return result

def full_text_search(text):
    """
    Search the taxonomy for nodes whose id or tags match `text`.

    Runs fuzzy and wildcard Lucene queries against the two full-text
    indexes built by the parser ("nodeSearchIds" and "nodeSearchTags")
    and returns the matching nodes ordered by aggregated score.
    """
    # Strip characters that Lucene would interpret as query syntax,
    # and build the id-normalized form used against the id index.
    sanitized = re.sub(r"[^A-Za-z0-9_]", r" ", text)
    id_form = normalizing(text)

    params = {
        "text_query_fuzzy": sanitized + "~",
        "text_query_exact": "*" + sanitized + "*",
        "text_id_query_fuzzy": id_form + "~",
        "text_id_query_exact": "*" + id_form + "*",
    }

    # Four sub-queries, combined by UNION: fuzzy (~) and wildcard (*)
    # lookups over both indexes. Fuzzy hits are boosted (ids x3, tags x5)
    # so close matches outrank plain substring matches; scores for the
    # same node id are summed before ordering.
    query = """
        CALL {
                CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_fuzzy)
                yield node, score as score_
                return node, score_ * 3 as score
            UNION
                CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_fuzzy)
                yield node, score as score_
                return node, score_ * 5 as score
            UNION
                CALL db.index.fulltext.queryNodes("nodeSearchIds", $text_id_query_exact)
                yield node, score as score_
                return node, score_ as score
            UNION
                CALL db.index.fulltext.queryNodes("nodeSearchTags", $text_query_exact)
                yield node, score as score_
                return node, score_ as score
        }
        with node.id as node, score
        RETURN node, sum(score) as score
        ORDER BY score DESC
    """
    return [record["node"] for record in session.run(query, params)]
30 changes: 30 additions & 0 deletions backend/editor/normalizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import re
import unicodedata
import unidecode

def normalizing(line, lang="default"):
    """Turn *line* into a normalized taxonomy id for language code *lang*.

    Applies NFC normalization, folds accents for a fixed set of
    Latin-script languages, lower-cases, and replaces unwanted
    characters with "-".
    """
    text = unicodedata.normalize("NFC", line)

    # accent folding, only for these Latin-script languages
    if lang in ["fr", "ca", "es", "it", "nl", "pt", "sk", "en"]:
        text = re.sub(r"[¢£¤¥§©ª®°²³µ¶¹º¼½¾×‰€™]", "-", text)
        text = unidecode.unidecode(text)

    # lower case unless the language is in the (currently empty) exception list
    if lang not in []:
        text = text.lower()

    # replace unwanted characters with "-", in three passes:
    # control chars / ASCII punctuation below "(", HTML entities, then
    # the remaining punctuation and symbol characters
    for pattern in (
        r"[\u0000-\u0027\u200b]",
        r"&\w+;",
        r"[\s!\"#\$%&'()*+,\/:;<=>?@\[\\\]^_`{\|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×ˆ˜–—‘’‚“”„†‡•…‰‹›€™\t]",  # noqa: E501
    ):
        text = re.sub(pattern, "-", text)

    # collapse runs of "-" and trim them from both ends
    return re.sub(r"-+", "-", text).strip("-")
11 changes: 11 additions & 0 deletions backend/sample/test-neo4j.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@
"main_language": "en",
"tags_en":["yogurts", "yoghurts"],
"tags_ids_en":["yogurts", "yoghurts"],
"tags_en_str":"yogurts yoghurts",
"tags_fr": ["yaourts", "yoghourts", "yogourts"],
"tags_ids_fr": ["yaourts", "yoghourts", "yogourts"],
"tags_fr_str":"yaourts yoghourts yogourts",
"preceding_lines" : [],
"src_position": 9
},
Expand All @@ -46,8 +48,10 @@
"main_language": "en",
"tags_en":["banana yogurts"],
"tags_ids_en":["banana-yogurts"],
"tags_en_str":"banana yogurts",
"tags_fr": ["yaourts à la banane"],
"tags_ids_fr": ["yaourts-banane"],
"tags_fr_str":"yaourts à la banane",
"preceding_lines" : [],
"src_position": 12
},
Expand All @@ -57,8 +61,10 @@
"main_language": "en",
"tags_en":["Passion fruit yogurts"],
"tags_ids_en":["passion-fruit-yogurts"],
"tags_en_str":"Passion fruit yogurts",
"tags_fr": ["yaourts au fruit de la passion"],
"tags_ids_fr": ["yaourts-fruit-passion"],
"tags_fr_str":"yaourts au fruit de la passion",
"preceding_lines" : [],
"src_position": 16
},
Expand All @@ -68,6 +74,7 @@
"main_language": "fr",
"tags_fr": ["yaourts au fruit de la passion allégés"],
"tags_ids_fr": ["yaourts-fruit-passion-alleges"],
"tags_fr_str":"yaourts au fruit de la passion allégés",
"preceding_lines" : [],
"src_position": 20
},
Expand All @@ -77,6 +84,7 @@
"main_language": "en",
"tags_en": ["meat"],
"tags_ids_en": ["meat"],
"tags_en_str": "meat",
"prop_vegan_en": "no",
"prop_carbon_footprint_fr_foodges_value_fr": "10",
"preceding_lines": ["# meat", ""],
Expand All @@ -88,6 +96,7 @@
"main_language": "en",
"tags_en": ["fake-meat"],
"tags_ids_en": ["fake-meat"],
"tags_en_str":"fake-meat",
"prop_vegan_en": "yes",
"preceding_lines" : [],
"src_position": 29
Expand All @@ -98,6 +107,7 @@
"main_language": "en",
"tags_en": ["fake-stuff"],
"tags_ids_en": ["fake-stuff"],
"tags_en_str":"fake-stuff",
"preceding_lines" : [],
"src_position": 33
},
Expand All @@ -107,6 +117,7 @@
"main_language": "en",
"tags_en": ["fake-duck-meat"],
"tags_ids_en": ["fake-duck-meat"],
"tags_en_str":"fake-duck-meat",
"preceding_lines" : [],
"src_position": 35
},
Expand Down
37 changes: 24 additions & 13 deletions parser/openfoodfacts_taxonomy_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
import re
import sys
import unicodedata

import iso639
import unidecode
from neo4j import GraphDatabase

Expand Down Expand Up @@ -58,11 +60,7 @@ def create_node(self, data):
entry_query += " SET n." + key + " = $" + key + "\n"

query = id_query + entry_query + position_query
self.session.run(
query,
data,
is_before=self.is_before,
)
self.session.run(query, data, is_before=self.is_before)

def normalized_filename(self, filename):
"""add the .txt extension if it is missing in the filename"""
Expand Down Expand Up @@ -323,6 +321,7 @@ def harvest(self, filename):
# in case 2 normalized synonyms are the same
tagsids_list.append(word_normalized)
data["tags_" + lang] = tags_list
data["tags_" + lang + "_str"] = " ".join(tags_list)
data["tags_ids_" + lang] = tagsids_list
else:
# property definition
Expand Down Expand Up @@ -388,11 +387,7 @@ def create_previous_link(self):
id_previous,
)
elif not relation[0]:
logging.error(
"link not created between %s and %s",
id,
id_previous,
)
logging.error("link not created between %s and %s", id, id_previous)

def parent_search(self):
"""Get the parent and the child to link"""
Expand Down Expand Up @@ -423,18 +418,34 @@ def delete_used_properties(self):
query = "MATCH (n) SET n.is_before = null, n.parents = null"
self.session.run(query)

def create_fulltext_index(self):
query = """
CREATE FULLTEXT INDEX nodeSearchIds FOR (n:ENTRY) ON EACH [n.id]
OPTIONS {indexConfig: {`fulltext.analyzer`: 'keyword'}}
"""
self.session.run(query)

language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""]
tags_prefixed_lc = ["n.tags_" + lc + "_str" for lc in language_codes]
tags_prefixed_lc = ", ".join(tags_prefixed_lc)
query = (
f"""CREATE FULLTEXT INDEX nodeSearchTags FOR (n:ENTRY) ON EACH [{tags_prefixed_lc}]"""
)
self.session.run(query)

    def __call__(self, filename):
        """Process the taxonomy file end to end.

        Creates the nodes from `filename`, links children to parents,
        links each node to its predecessor in the file, then builds the
        full-text search indexes over the created nodes.
        """
        self.create_nodes(filename)
        self.create_child_link()
        self.create_previous_link()
        self.create_fulltext_index()
        # self.delete_used_properties()  # intentionally disabled — presumably kept for debugging; confirm before enabling


if __name__ == "__main__":
    # NOTE: `sys` is already imported at module level, so the redundant
    # function-local `import sys` has been removed.
    # Log to parser.log (UTF-8); FileHandler is used because
    # basicConfig(filename=..., encoding=...) requires Python 3.9+.
    logging.basicConfig(
        handlers=[logging.FileHandler(filename="parser.log", encoding="utf-8")], level=logging.INFO
    )
    # Taxonomy name to parse, defaulting to the bundled "test" taxonomy.
    filename = sys.argv[1] if len(sys.argv) > 1 else "test"
    parse = Parser()
    parse(filename)
3 changes: 1 addition & 2 deletions parser/requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,4 @@ py==1.11.0
pyparsing==3.0.9
pytest==7.1.2
pytz==2022.1
tomli==2.0.1
Unidecode==1.3.4
tomli==2.0.1
3 changes: 2 additions & 1 deletion parser/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
neo4j==4.4.5
pytz==2022.1
Unidecode==1.3.4
Unidecode==1.3.4
iso-639==0.4.5

0 comments on commit 3317fae

Please sign in to comment.