diff --git a/learning_resources_search/constants.py b/learning_resources_search/constants.py index 6cf1ad993c..78757dad6d 100644 --- a/learning_resources_search/constants.py +++ b/learning_resources_search/constants.py @@ -90,13 +90,13 @@ class FilterConfig: ENGLISH_TEXT_FIELD = { "type": "text", - "fields": {"english": {"type": "text", "analyzer": "english"}}, + "fields": {"english": {"type": "text", "analyzer": "custom_english"}}, } ENGLISH_TEXT_FIELD_WITH_SUGGEST = { "type": "text", "fields": { - "english": {"type": "text", "analyzer": "english"}, + "english": {"type": "text", "analyzer": "custom_english"}, "trigram": {"type": "text", "analyzer": "trigram"}, }, } @@ -429,3 +429,13 @@ class FilterConfig: }, **LEARNING_RESOURCE_SORTBY_OPTIONS, } + +SYNONYMS = [ + "ai, ml, artificial intelligence, machine learning", + "math, mathematics", + "chem, chemistry", + "bio, biology", + "econ, economics", + "natural language processing, nlp", + "large language model, llm", +] diff --git a/learning_resources_search/indexing_api.py b/learning_resources_search/indexing_api.py index 0b8803dcdd..6904db5b9a 100644 --- a/learning_resources_search/indexing_api.py +++ b/learning_resources_search/indexing_api.py @@ -26,6 +26,7 @@ COURSE_TYPE, MAPPING, PERCOLATE_INDEX_TYPE, + SYNONYMS, IndexestoUpdate, ) from learning_resources_search.exceptions import ReindexError @@ -143,26 +144,41 @@ def clear_and_create_index(*, index_name=None, skip_mapping=False, object_type=N }, "analysis": { "analyzer": { - "folding": { + "trigram": { "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "synonyms_filter", "shingle"], + }, + "custom_english": { "tokenizer": "standard", "filter": [ + "english_possessive_stemmer", "lowercase", - "asciifolding", # remove accents if we use folding analyzer + "synonyms_filter", + "english_stop", + "english_keywords", + "english_stemmer", ], }, - "trigram": { - "type": "custom", - "tokenizer": "standard", - "filter": ["lowercase", "shingle"], - }, }, "filter": { "shingle": { "type": "shingle", "min_shingle_size": 2, "max_shingle_size": 3, - } + }, + "english_stop": {"type": "stop", "stopwords": "_english_"}, + "english_keywords": {"type": "keyword_marker", "keywords": []}, + "english_stemmer": {"type": "stemmer", "language": "english"}, + "english_possessive_stemmer": { + "type": "stemmer", + "language": "possessive_english", + }, + "synonyms_filter": { + "type": "synonym_graph", + "synonyms": SYNONYMS, + "expand": "true", + }, }, }, }