78 changes: 30 additions & 48 deletions learning_resources_search/api.py
@@ -13,14 +13,13 @@
from learning_resources.models import LearningResource
from learning_resources_search.connection import (
get_default_alias_name,
get_vector_model_id,
)
from learning_resources_search.constants import (
COMBINED_INDEX,
CONTENT_FILE_TYPE,
COURSE_QUERY_FIELDS,
COURSE_TYPE,
DEPARTMENT_QUERY_FIELDS,
HYBRID_COMBINED_INDEX,
HYBRID_SEARCH_MODE,
LEARNING_RESOURCE,
LEARNING_RESOURCE_QUERY_FIELDS,
@@ -55,6 +54,23 @@
"-created_on",
]

HYBRID_SEARCH_KNN_K_VALUE = 5
HYBRID_SEARCH_PAGINATION_DEPTH = 10
HYBRID_SEARCH_POST_PROCESSOR = {
"description": "Post processor for hybrid search",
"phase_results_processors": [
{
"normalization-processor": {
"normalization": {"technique": "min_max"},
"combination": {
"technique": "arithmetic_mean",
"parameters": {"weights": [0.8, 0.2]},
},
}
}
],
}
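
As a quick illustration of what the `[0.8, 0.2]` weights in this constant do (the sample scores below are made up, not taken from the project):

```python
# Illustrative only: min_max normalization followed by a weighted arithmetic
# mean, mirroring the shape of HYBRID_SEARCH_POST_PROCESSOR above.

def min_max(scores):
    lo, hi = min(scores), max(scores)
    return [(s - lo) / (hi - lo) if hi > lo else 1.0 for s in scores]

bm25_scores = [12.4, 7.1, 3.3]   # hypothetical text_query (BM25) scores
knn_scores = [0.91, 0.42, 0.37]  # hypothetical knn scores for the same docs
weights = [0.8, 0.2]

combined = [
    weights[0] * b + weights[1] * k
    for b, k in zip(min_max(bm25_scores), min_max(knn_scores))
]
print(combined)  # -> [1.0, ~0.35, 0.0]: the lexical score dominates the blend
```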


def gen_content_file_id(content_file_id):
"""
Expand Down Expand Up @@ -86,7 +102,7 @@ def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search):
if endpoint == CONTENT_FILE_TYPE:
return [get_default_alias_name(COURSE_TYPE)]
elif use_hybrid_search:
return [get_default_alias_name(COMBINED_INDEX)]
return [get_default_alias_name(HYBRID_COMBINED_INDEX)]

if aggregations and "resource_type" in aggregations:
return map(get_default_alias_name, LEARNING_RESOURCE_TYPES)
@@ -652,41 +668,22 @@ def add_text_query_to_search(
text_query = {"bool": {"must": [text_query], "filter": query_type_query}}

if use_hybrid_search:
vector_model_id = get_vector_model_id()
if not vector_model_id:
log.error("Vector model not found. Cannot perform hybrid search.")
error_message = "Vector model not found."
raise ValueError(error_message)

vector_query_description = {
"neural": {
"description_embedding": {
"query_text": text,
"model_id": vector_model_id,
"min_score": 0.015,
},
}
}

vector_query_title = {
"neural": {
"title_embedding": {
"query_text": text,
"model_id": vector_model_id,
"min_score": 0.015,
},
encoder = dense_encoder()
query_vector = encoder.embed_query(text)
vector_query = {
"knn": {
"vector_embedding": {
"vector": query_vector,
"k": HYBRID_SEARCH_KNN_K_VALUE,
}
}
}

search = search.extra(
query={
"hybrid": {
"pagination_depth": 10,
"queries": [
text_query,
vector_query_description,
vector_query_title,
],
"pagination_depth": HYBRID_SEARCH_PAGINATION_DEPTH,
"queries": [text_query, vector_query],
}
}
)
@@ -803,22 +800,7 @@ def execute_learn_search(search_params):
search = construct_search(search_params)

if search_params.get("search_mode") == HYBRID_SEARCH_MODE:
search = search.extra(
search_pipeline={
"description": "Post processor for hybrid search",
"phase_results_processors": [
{
"normalization-processor": {
"normalization": {"technique": "min_max"},
"combination": {
"technique": "arithmetic_mean",
"parameters": {"weights": [0.6, 0.2, 0.2]},
},
}
}
],
}
)
search = search.extra(search_pipeline=HYBRID_SEARCH_POST_PROCESSOR)

results = search.execute().to_dict()
if results.get("_shards", {}).get("failures"):
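
For context on the api.py changes above, here is a minimal standalone sketch of the request this path now sends to OpenSearch: a `hybrid` query wrapping the lexical text query and a `knn` clause, scored through a temporary search pipeline supplied inline in the body (which is what `search.extra(search_pipeline=...)` amounts to). The connection settings, index alias, and lexical sub-query are placeholders, not the project's actual values:

```python
# Sketch only: the hybrid BM25 + k-NN request shape produced by the new code.
from opensearchpy import OpenSearch

client = OpenSearch(hosts=["http://localhost:9200"])  # placeholder connection

# In api.py the vector comes from dense_encoder().embed_query(text).
query_vector = [0.1, 0.2, 0.3]

body = {
    "query": {
        "hybrid": {
            "pagination_depth": 10,  # HYBRID_SEARCH_PAGINATION_DEPTH
            "queries": [
                # Stand-in for the project's text_query (multi_match + filters).
                {"multi_match": {"query": "math", "fields": ["title", "description"]}},
                # The new knn clause against the single vector_embedding field.
                {"knn": {"vector_embedding": {"vector": query_vector, "k": 5}}},
            ],
        }
    },
    # Temporary per-request pipeline, same shape as HYBRID_SEARCH_POST_PROCESSOR.
    "search_pipeline": {
        "phase_results_processors": [
            {
                "normalization-processor": {
                    "normalization": {"technique": "min_max"},
                    "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": {"weights": [0.8, 0.2]},
                    },
                }
            }
        ]
    },
}

response = client.search(index="combined_hybrid_default", body=body)  # alias name is illustrative
```

One simplification the diff makes: instead of two neural sub-queries against `title_embedding` and `description_embedding` (each with its own `min_score`), there is now a single knn sub-query against `vector_embedding`, so the pipeline only needs two weights instead of three.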
42 changes: 9 additions & 33 deletions learning_resources_search/api_test.py
@@ -1946,8 +1946,7 @@ def test_execute_learn_search_for_learning_resource_query(opensearch):
"content",
"summary",
"flashcards",
"description_embedding",
"title_embedding",
"vector_embedding",
]
},
}
@@ -2395,8 +2394,7 @@ def test_execute_learn_search_with_script_score(
"content",
"summary",
"flashcards",
"description_embedding",
"title_embedding",
"vector_embedding",
]
},
}
@@ -2417,10 +2415,8 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):

settings.DEFAULT_SEARCH_MODE = "best_fields"

mocker.patch(
"learning_resources_search.api.get_vector_model_id",
return_value="vector_model_id",
)
mock_encoder = mocker.patch("learning_resources_search.api.dense_encoder")()
mock_encoder.embed_query.return_value = [0.1, 0.2, 0.3]

search_params = {
"aggregations": ["offered_by"],
@@ -2727,24 +2723,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
"filter": {"exists": {"field": "resource_type"}},
}
},
{
"neural": {
"description_embedding": {
"query_text": "math",
"model_id": "vector_model_id",
"min_score": 0.015,
}
}
},
{
"neural": {
"title_embedding": {
"query_text": "math",
"model_id": "vector_model_id",
"min_score": 0.015,
}
}
},
{"knn": {"vector_embedding": {"vector": [0.1, 0.2, 0.3], "k": 5}}},
],
}
},
@@ -2805,7 +2784,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
"normalization": {"technique": "min_max"},
"combination": {
"technique": "arithmetic_mean",
"parameters": {"weights": [0.6, 0.2, 0.2]},
"parameters": {"weights": [0.8, 0.2]},
},
}
}
@@ -2824,8 +2803,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
"content",
"summary",
"flashcards",
"description_embedding",
"title_embedding",
"vector_embedding",
]
},
}
@@ -3217,8 +3195,7 @@ def test_execute_learn_search_with_min_score(mocker, settings, opensearch):
"content",
"summary",
"flashcards",
"description_embedding",
"title_embedding",
"vector_embedding",
]
},
}
@@ -3396,8 +3373,7 @@ def test_execute_learn_search_for_content_file_query(opensearch):
"content",
"summary",
"flashcards",
"description_embedding",
"title_embedding",
"vector_embedding",
]
},
}
30 changes: 0 additions & 30 deletions learning_resources_search/connection.py
@@ -135,33 +135,3 @@ def refresh_index(index):
"""
conn = get_conn()
conn.indices.refresh(index)


def get_vector_model_id():
"""
Get the model ID for the currently loaded vector model
"""
conn = get_conn()
model_name = settings.OPENSEARCH_VECTOR_MODEL_NAME
body = {"query": {"term": {"name.keyword": model_name}}}
models = conn.transport.perform_request(
"GET", "/_plugins/_ml/models/_search", body=body
)

if len(models.get("hits", {}).get("hits", [])) > 0:
return models["hits"]["hits"][0]["_source"]["model_id"]

return None


def get_vector_model_info():
"""
Get information about the currently loaded vector model
"""

conn = get_conn()
model_id = get_vector_model_id()
if not model_id:
return None

return conn.transport.perform_request("GET", f"/_plugins/_ml/models/{model_id}")
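
With `get_vector_model_id`/`get_vector_model_info` removed, query embeddings are no longer produced by the OpenSearch ML plugin; api.py only needs an object exposing `embed_query(text) -> list[float]`. A hypothetical stand-in for `dense_encoder()`, assuming a sentence-transformers backend (the project's real helper is defined elsewhere and may differ):

```python
# Hypothetical stand-in for dense_encoder(); the real implementation lives
# elsewhere in the project and may use a different backend or model.
from sentence_transformers import SentenceTransformer


class DenseEncoder:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # Loads a local embedding model instead of calling the OpenSearch ML plugin.
        self.model = SentenceTransformer(model_name)

    def embed_query(self, text: str) -> list[float]:
        # Returns a plain list so it can be serialized into the knn clause.
        return self.model.encode(text).tolist()


def dense_encoder() -> DenseEncoder:
    return DenseEncoder()
```

Whatever the backend, the query-time encoder has to match the one used at indexing time, since the knn clause compares the query vector against vectors already stored in `vector_embedding`.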
29 changes: 4 additions & 25 deletions learning_resources_search/constants.py
@@ -22,8 +22,7 @@
CURRENT_INDEX = "current_index"
REINDEXING_INDEX = "reindexing_index"
BOTH_INDEXES = "all_indexes"
COMBINED_INDEX = "combined_hybrid"

HYBRID_COMBINED_INDEX = "combined_hybrid"
LEARNING_RESOURCE = "learning_resource"
HYBRID_SEARCH_MODE = "hybrid"

@@ -49,7 +48,7 @@ class IndexestoUpdate(Enum):
)


BASE_INDEXES = (PERCOLATE_INDEX_TYPE, COMBINED_INDEX)
BASE_INDEXES = (PERCOLATE_INDEX_TYPE, HYBRID_COMBINED_INDEX)

ALL_INDEX_TYPES = BASE_INDEXES + LEARNING_RESOURCE_TYPES

@@ -323,26 +322,7 @@ class FilterConfig:
"max_weekly_hours": {"type": "integer"},
}

EMBEDDING_FIELDS = {
"title_embedding": {
"type": "knn_vector",
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {},
},
},
"description_embedding": {
"type": "knn_vector",
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {},
},
},
}
EMBEDDING_FIELDS = {"vector_embedding": {"type": "knn_vector"}}


CONTENT_FILE_MAP = {
@@ -471,8 +451,7 @@ class FilterConfig:
"content",
"summary",
"flashcards",
"description_embedding",
"title_embedding",
"vector_embedding",
]

LEARNING_RESOURCE_SEARCH_SORTBY_OPTIONS = {
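
The new `EMBEDDING_FIELDS` entry only declares the field type; a `knn_vector` mapping also needs a `dimension`, and the index needs k-NN enabled, which presumably gets filled in where the index is created. For orientation, a standalone mapping might look like the sketch below; the dimension value and index name are placeholders:

```python
# Illustrative only: a standalone knn_vector index mapping. The dimension must
# match the length of the vectors produced by dense_encoder() at indexing time.
index_body = {
    "settings": {"index": {"knn": True}},  # enable the k-NN plugin for this index
    "mappings": {
        "properties": {
            "vector_embedding": {
                "type": "knn_vector",
                "dimension": 384,  # e.g. len(dense_encoder().embed_query(text))
            },
            "title": {"type": "text"},
            "description": {"type": "text"},
        }
    },
}

# client.indices.create(index="combined_hybrid_example", body=index_body)
```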