From e3a682321fc001f80902cc1c3e9be9da79702303 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 2 Apr 2026 10:25:23 -0400 Subject: [PATCH 01/10] removing unused payload fields --- vector_search/constants.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/vector_search/constants.py b/vector_search/constants.py index 7e552febae..8f151bb51c 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -73,25 +73,19 @@ "resource_type_group": models.PayloadSchemaType.KEYWORD, } - +""" +Note: Be intentional about which fields we add as indexes. +Only add fields that we expect to filter or facet on frequently. +""" QDRANT_CONTENT_FILE_INDEXES = { - "chunk_number": models.PayloadSchemaType.INTEGER, "key": models.PayloadSchemaType.KEYWORD, "title": models.PayloadSchemaType.KEYWORD, - "course_number": models.PayloadSchemaType.INTEGER, "platform.code": models.PayloadSchemaType.KEYWORD, "offered_by.code": models.PayloadSchemaType.KEYWORD, - "published": models.PayloadSchemaType.BOOL, - "content_feature_type": models.PayloadSchemaType.KEYWORD, - "file_type": models.PayloadSchemaType.KEYWORD, "file_extension": models.PayloadSchemaType.KEYWORD, "run_readable_id": models.PayloadSchemaType.KEYWORD, "resource_readable_id": models.PayloadSchemaType.KEYWORD, - "run_title": models.PayloadSchemaType.KEYWORD, "edx_module_id": models.PayloadSchemaType.KEYWORD, - "checksum": models.PayloadSchemaType.KEYWORD, - "content_type": models.PayloadSchemaType.KEYWORD, - "edx_block_id": models.PayloadSchemaType.KEYWORD, "url": models.PayloadSchemaType.KEYWORD, } From 2434a09ca9e980cc5915875803b8c1045447b2ee Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 2 Apr 2026 12:09:37 -0400 Subject: [PATCH 02/10] fix field mapping --- vector_search/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector_search/constants.py b/vector_search/constants.py index 8f151bb51c..296778763a 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -38,7 +38,7 @@ "topic": "topics[].name", "ocw_topic": "ocw_topics", "level": "runs[].level[].code", - "department": "departments.department_id", + "department": "departments[].department_id", "platform": "platform.code", "offered_by": "offered_by.code", "delivery": "delivery[].code", From be3c20a2c6a8a2ffaba5cf3cf7e47b19cf964137 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 2 Apr 2026 13:17:02 -0400 Subject: [PATCH 03/10] remove unused from serializer --- vector_search/serializers.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/vector_search/serializers.py b/vector_search/serializers.py index d4d7a8975f..151fe05a4f 100644 --- a/vector_search/serializers.py +++ b/vector_search/serializers.py @@ -214,11 +214,6 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): child=serializers.CharField(), help_text="The filename of the content file", ) - course_number = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="Course number of the content file", - ) offered_by = serializers.ListField( required=False, child=serializers.CharField(), @@ -229,12 +224,7 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): child=serializers.CharField(), help_text="platform(s) of the content file", ) - content_feature_type = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The feature type of the content file. " - "Possible options are at api/v1/course_features/", - ) + file_extension = serializers.ListField( required=False, child=serializers.CharField(), @@ -252,11 +242,6 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): "The readable_id value of the parent learning resource for the content file" ), ) - edx_module_id = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The edx_module_id of the content file", - ) collection_name = serializers.CharField( required=False, help_text=("Manually specify the name of the Qdrant collection to query"), @@ -271,17 +256,6 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): "The number of chunks in each group. Only relevant when group_by is used" ), ) - url = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The url of the content file. ", - ) - title = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The title of the content file. ", - ) - url__isnull = serializers.BooleanField( required=False, default=None, From d9be639987c3e34a9320bde67b2ff1dbdd89040f Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 2 Apr 2026 13:33:17 -0400 Subject: [PATCH 04/10] fix test --- vector_search/views_test.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/vector_search/views_test.py b/vector_search/views_test.py index 201e455dd4..4e3207434b 100644 --- a/vector_search/views_test.py +++ b/vector_search/views_test.py @@ -46,7 +46,7 @@ def test_vector_search_filters(mocker, client): ), models.FieldCondition(key="free", match=models.MatchValue(value=True)), models.FieldCondition( - key="departments.department_id", + key="departments[].department_id", match=models.MatchAny(any=["6", "7"]), ), ] @@ -92,7 +92,7 @@ def test_vector_search_filters_empty_query(mocker, client): ), models.FieldCondition(key="free", match=models.MatchValue(value=True)), models.FieldCondition( - key="departments.department_id", + key="departments[].department_id", match=models.MatchAny(any=["6", "7"]), ), ] @@ -157,16 +157,9 @@ def test_content_file_vector_search_filters( models.FieldCondition( key="platform.code", match=models.MatchAny(any=["edx"]) ), - models.FieldCondition( - key="course_number", match=models.MatchAny(any=["test"]) - ), models.FieldCondition( key="run_readable_id", match=models.MatchAny(any=["test_run_id"]) ), - models.FieldCondition( - key="content_feature_type", - match=models.MatchAny(any=["test_feature"]), - ), models.FieldCondition( key="resource_readable_id", match=models.MatchAny( @@ -226,15 +219,9 @@ def test_content_file_vector_search_filters_empty_query( models.FieldCondition( key="platform.code", match=models.MatchAny(any=["edx"]) ), - models.FieldCondition( - key="course_number", match=models.MatchAny(any=["test"]) - ), models.FieldCondition( key="run_readable_id", match=models.MatchAny(any=["test_run_id"]) ), - models.FieldCondition( - key="content_feature_type", match=models.MatchAny(any=["test_feature"]) - ), models.FieldCondition( key="resource_readable_id", match=models.MatchAny(any=["test_resource_id_1", "test_resource_id_2"]), From 093ac990615190036f45223c9cbb78dd8f6f3d49 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 2 Apr 2026 13:39:31 -0400 Subject: [PATCH 05/10] regenerate api spec --- frontends/api/src/generated/v0/api.ts | 90 --------------------------- openapi/specs/v0.yaml | 41 ------------ 2 files changed, 131 deletions(-) diff --git a/frontends/api/src/generated/v0/api.ts b/frontends/api/src/generated/v0/api.ts index 8d0d862075..b550183b2e 100644 --- a/frontends/api/src/generated/v0/api.ts +++ b/frontends/api/src/generated/v0/api.ts @@ -11539,9 +11539,6 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( * Vector Search for content * @summary Content File Vector Search * @param {string} [collection_name] Manually specify the name of the Qdrant collection to query - * @param {Array} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/ - * @param {Array} [course_number] Course number of the content file - * @param {Array} [edx_module_id] The edx_module_id of the content file * @param {Array} [file_extension] The extension of the content file. * @param {string} [group_by] The attribute to group results by * @param {number} [group_size] The number of chunks in each group. Only relevant when group_by is used @@ -11555,18 +11552,13 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( * @param {Array} [resource_readable_id] The readable_id value of the parent learning resource for the content file * @param {Array} [run_readable_id] The readable_id value of the run that the content file belongs to * @param {VectorContentFilesSearchRetrieveSortbyEnum} [sortby] if the parameter starts with \'-\' the sort is in descending order * `id` - id * `-id` - -id * `resource_readable_id` - resource_readable_id * `-resource_readable_id` - -resource_readable_id - * @param {Array} [title] The title of the content file. * @param {boolean | null} [title__isnull] Filter to content files where title is null/not null - * @param {Array} [url] The url of the content file. * @param {boolean | null} [url__isnull] Filter to content files where url is null/not null * @param {*} [options] Override http request option. * @throws {RequiredError} */ vectorContentFilesSearchRetrieve: async ( collection_name?: string, - content_feature_type?: Array, - course_number?: Array, - edx_module_id?: Array, file_extension?: Array, group_by?: string, group_size?: number, @@ -11580,9 +11572,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( resource_readable_id?: Array, run_readable_id?: Array, sortby?: VectorContentFilesSearchRetrieveSortbyEnum, - title?: Array, title__isnull?: boolean | null, - url?: Array, url__isnull?: boolean | null, options: RawAxiosRequestConfig = {}, ): Promise => { @@ -11606,18 +11596,6 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( localVarQueryParameter["collection_name"] = collection_name } - if (content_feature_type) { - localVarQueryParameter["content_feature_type"] = content_feature_type - } - - if (course_number) { - localVarQueryParameter["course_number"] = course_number - } - - if (edx_module_id) { - localVarQueryParameter["edx_module_id"] = edx_module_id - } - if (file_extension) { localVarQueryParameter["file_extension"] = file_extension } @@ -11670,18 +11648,10 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( localVarQueryParameter["sortby"] = sortby } - if (title) { - localVarQueryParameter["title"] = title - } - if (title__isnull !== undefined) { localVarQueryParameter["title__isnull"] = title__isnull } - if (url) { - localVarQueryParameter["url"] = url - } - if (url__isnull !== undefined) { localVarQueryParameter["url__isnull"] = url__isnull } @@ -11717,9 +11687,6 @@ export const VectorContentFilesSearchApiFp = function ( * Vector Search for content * @summary Content File Vector Search * @param {string} [collection_name] Manually specify the name of the Qdrant collection to query - * @param {Array} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/ - * @param {Array} [course_number] Course number of the content file - * @param {Array} [edx_module_id] The edx_module_id of the content file * @param {Array} [file_extension] The extension of the content file. * @param {string} [group_by] The attribute to group results by * @param {number} [group_size] The number of chunks in each group. Only relevant when group_by is used @@ -11733,18 +11700,13 @@ export const VectorContentFilesSearchApiFp = function ( * @param {Array} [resource_readable_id] The readable_id value of the parent learning resource for the content file * @param {Array} [run_readable_id] The readable_id value of the run that the content file belongs to * @param {VectorContentFilesSearchRetrieveSortbyEnum} [sortby] if the parameter starts with \'-\' the sort is in descending order * `id` - id * `-id` - -id * `resource_readable_id` - resource_readable_id * `-resource_readable_id` - -resource_readable_id - * @param {Array} [title] The title of the content file. * @param {boolean | null} [title__isnull] Filter to content files where title is null/not null - * @param {Array} [url] The url of the content file. * @param {boolean | null} [url__isnull] Filter to content files where url is null/not null * @param {*} [options] Override http request option. * @throws {RequiredError} */ async vectorContentFilesSearchRetrieve( collection_name?: string, - content_feature_type?: Array, - course_number?: Array, - edx_module_id?: Array, file_extension?: Array, group_by?: string, group_size?: number, @@ -11758,9 +11720,7 @@ export const VectorContentFilesSearchApiFp = function ( resource_readable_id?: Array, run_readable_id?: Array, sortby?: VectorContentFilesSearchRetrieveSortbyEnum, - title?: Array, title__isnull?: boolean | null, - url?: Array, url__isnull?: boolean | null, options?: RawAxiosRequestConfig, ): Promise< @@ -11772,9 +11732,6 @@ export const VectorContentFilesSearchApiFp = function ( const localVarAxiosArgs = await localVarAxiosParamCreator.vectorContentFilesSearchRetrieve( collection_name, - content_feature_type, - course_number, - edx_module_id, file_extension, group_by, group_size, @@ -11788,9 +11745,7 @@ export const VectorContentFilesSearchApiFp = function ( resource_readable_id, run_readable_id, sortby, - title, title__isnull, - url, url__isnull, options, ) @@ -11835,9 +11790,6 @@ export const VectorContentFilesSearchApiFactory = function ( return localVarFp .vectorContentFilesSearchRetrieve( requestParameters.collection_name, - requestParameters.content_feature_type, - requestParameters.course_number, - requestParameters.edx_module_id, requestParameters.file_extension, requestParameters.group_by, requestParameters.group_size, @@ -11851,9 +11803,7 @@ export const VectorContentFilesSearchApiFactory = function ( requestParameters.resource_readable_id, requestParameters.run_readable_id, requestParameters.sortby, - requestParameters.title, requestParameters.title__isnull, - requestParameters.url, requestParameters.url__isnull, options, ) @@ -11875,27 +11825,6 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ */ readonly collection_name?: string - /** - * The feature type of the content file. Possible options are at api/v1/course_features/ - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly content_feature_type?: Array - - /** - * Course number of the content file - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly course_number?: Array - - /** - * The edx_module_id of the content file - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly edx_module_id?: Array - /** * The extension of the content file. * @type {Array} @@ -11987,13 +11916,6 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ */ readonly sortby?: VectorContentFilesSearchRetrieveSortbyEnum - /** - * The title of the content file. - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly title?: Array - /** * Filter to content files where title is null/not null * @type {boolean} @@ -12001,13 +11923,6 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ */ readonly title__isnull?: boolean | null - /** - * The url of the content file. - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly url?: Array - /** * Filter to content files where url is null/not null * @type {boolean} @@ -12038,9 +11953,6 @@ export class VectorContentFilesSearchApi extends BaseAPI { return VectorContentFilesSearchApiFp(this.configuration) .vectorContentFilesSearchRetrieve( requestParameters.collection_name, - requestParameters.content_feature_type, - requestParameters.course_number, - requestParameters.edx_module_id, requestParameters.file_extension, requestParameters.group_by, requestParameters.group_size, @@ -12054,9 +11966,7 @@ export class VectorContentFilesSearchApi extends BaseAPI { requestParameters.resource_readable_id, requestParameters.run_readable_id, requestParameters.sortby, - requestParameters.title, requestParameters.title__isnull, - requestParameters.url, requestParameters.url__isnull, options, ) diff --git a/openapi/specs/v0.yaml b/openapi/specs/v0.yaml index 1be5097352..1d0f7f9086 100644 --- a/openapi/specs/v0.yaml +++ b/openapi/specs/v0.yaml @@ -833,31 +833,6 @@ paths: type: string minLength: 1 description: Manually specify the name of the Qdrant collection to query - - in: query - name: content_feature_type - schema: - type: array - items: - type: string - minLength: 1 - description: The feature type of the content file. Possible options are at - api/v1/course_features/ - - in: query - name: course_number - schema: - type: array - items: - type: string - minLength: 1 - description: Course number of the content file - - in: query - name: edx_module_id - schema: - type: array - items: - type: string - minLength: 1 - description: The edx_module_id of the content file - in: query name: file_extension schema: @@ -959,28 +934,12 @@ paths: * `-id` - -id * `resource_readable_id` - resource_readable_id * `-resource_readable_id` - -resource_readable_id - - in: query - name: title - schema: - type: array - items: - type: string - minLength: 1 - description: 'The title of the content file. ' - in: query name: title__isnull schema: type: boolean nullable: true description: Filter to content files where title is null/not null - - in: query - name: url - schema: - type: array - items: - type: string - minLength: 1 - description: 'The url of the content file. ' - in: query name: url__isnull schema: From aa335e91c0438585678289e05d6c2763a0b64cc7 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 2 Apr 2026 15:58:04 -0400 Subject: [PATCH 06/10] fix issue with resource result ordering --- vector_search/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vector_search/tasks.py b/vector_search/tasks.py index e2fbaa3f68..e70eb96dca 100644 --- a/vector_search/tasks.py +++ b/vector_search/tasks.py @@ -66,6 +66,7 @@ def generate_embeddings(ids, resource_type, overwrite): resource_type (string): resource_type value for the learning resource objects """ + return None try: with wrap_retry_exception(*SEARCH_CONN_EXCEPTIONS): embed_learning_resources(ids, resource_type, overwrite) From d582707b2fc6d62bc1bb7305b08ab9ba96dd5e31 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 2 Apr 2026 18:02:20 -0400 Subject: [PATCH 07/10] adding fix for ordering --- vector_search/utils.py | 20 +++++++++++++------- vector_search/utils_test.py | 19 +++++++++++++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/vector_search/utils.py b/vector_search/utils.py index 808f21da0a..e97856d4db 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -785,13 +785,19 @@ def process_batch(docs_batch, summaries_list): def _resource_vector_hits(search_result): hits = [hit.payload["readable_id"] for hit in search_result] """ - Always lookup learning resources by readable_id for portability - in case we load points from external systems - """ - return LearningResourceSerializer( - LearningResource.objects.for_serialization().filter(readable_id__in=hits), - many=True, - ).data + Always lookup learning resources by readable_id for portability + in case we load points from external systems + """ + resources_by_id = { + r.readable_id: r + for r in LearningResource.objects.for_serialization().filter( + readable_id__in=hits + ) + } + # Re-order to match the Qdrant ranking + ordered_resources = [resources_by_id[rid] for rid in hits if rid in resources_by_id] + + return LearningResourceSerializer(ordered_resources, many=True).data def _content_file_vector_hits(search_result): diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index 747bafcc3e..cb21b59753 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -37,6 +37,7 @@ _chunk_documents, _embed_course_metadata_as_contentfile, _get_text_splitter, + _resource_vector_hits, create_qdrant_collections, embed_learning_resources, embed_topics, @@ -1228,3 +1229,21 @@ def test_vector_search_group_by_offset_behavior(mocker, use_group_by): call_args = mock_qdrant.query_points.call_args.kwargs assert call_args.get("offset") == 15 assert "group_by" not in call_args + + +def test_resource_vector_hits_preserves_qdrant_score_order(): + """Results should be returned in the same order as the search_result (qdrant score order).""" + resources = LearningResourceFactory.create_batch(5) + # Shuffle to create a non-alphabetical, non-pk order (simulating qdrant ranking) + shuffled = random.sample(resources, len(resources)) + + # Build mock ScoredPoints with readable_ids in the shuffled order + search_result = [ + MagicMock(payload={"readable_id": r.readable_id}) for r in shuffled + ] + + result = _resource_vector_hits(search_result) + + expected_readable_ids = [r.readable_id for r in shuffled] + actual_readable_ids = [r["readable_id"] for r in result] + assert actual_readable_ids == expected_readable_ids From 91acc003e59f51c9f01fbd80e4d1aa39fb112bae Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 3 Apr 2026 10:31:29 -0400 Subject: [PATCH 08/10] remove debug code --- vector_search/tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vector_search/tasks.py b/vector_search/tasks.py index e70eb96dca..e2fbaa3f68 100644 --- a/vector_search/tasks.py +++ b/vector_search/tasks.py @@ -66,7 +66,6 @@ def generate_embeddings(ids, resource_type, overwrite): resource_type (string): resource_type value for the learning resource objects """ - return None try: with wrap_retry_exception(*SEARCH_CONN_EXCEPTIONS): embed_learning_resources(ids, resource_type, overwrite) From efb1b3eea95a8dd6b25e0763c256d044a0848a19 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 3 Apr 2026 10:44:41 -0400 Subject: [PATCH 09/10] fix formatting for array notation --- vector_search/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector_search/constants.py b/vector_search/constants.py index 296778763a..0adc4f31a4 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -64,7 +64,7 @@ "topics[].name": models.PayloadSchemaType.KEYWORD, "ocw_topics": models.PayloadSchemaType.KEYWORD, "runs[].level.code": models.PayloadSchemaType.KEYWORD, - "departments.department_id": models.PayloadSchemaType.KEYWORD, + "departments[].department_id": models.PayloadSchemaType.KEYWORD, "platform.code": models.PayloadSchemaType.KEYWORD, "offered_by.code": models.PayloadSchemaType.KEYWORD, "delivery[].code": models.PayloadSchemaType.KEYWORD, From 184bc96a278c562e3e91197bf3d1ccc6fae66a8a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Apr 2026 15:24:58 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- vector_search/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index 19e3230fd3..3015bbe01c 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -39,8 +39,8 @@ _embed_course_metadata_as_contentfile, _generate_content_file_points, _get_text_splitter, - _resource_vector_hits, _is_markdown_content, + _resource_vector_hits, create_qdrant_collections, embed_learning_resources, embed_topics,