diff --git a/frontends/api/src/generated/v0/api.ts b/frontends/api/src/generated/v0/api.ts index 4961849eae..368c911b52 100644 --- a/frontends/api/src/generated/v0/api.ts +++ b/frontends/api/src/generated/v0/api.ts @@ -11533,9 +11533,6 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( * Vector Search for content * @summary Content File Vector Search * @param {string} [collection_name] Manually specify the name of the Qdrant collection to query - * @param {Array} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/ - * @param {Array} [course_number] Course number of the content file - * @param {Array} [edx_module_id] The edx_module_id of the content file * @param {Array} [file_extension] The extension of the content file. * @param {string} [group_by] The attribute to group results by * @param {number} [group_size] The number of chunks in each group. Only relevant when group_by is used @@ -11549,18 +11546,13 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( * @param {Array} [resource_readable_id] The readable_id value of the parent learning resource for the content file * @param {Array} [run_readable_id] The readable_id value of the run that the content file belongs to * @param {VectorContentFilesSearchRetrieveSortbyEnum} [sortby] if the parameter starts with \'-\' the sort is in descending order * `id` - id * `-id` - -id * `resource_readable_id` - resource_readable_id * `-resource_readable_id` - -resource_readable_id - * @param {Array} [title] The title of the content file. * @param {boolean | null} [title__isnull] Filter to content files where title is null/not null - * @param {Array} [url] The url of the content file. * @param {boolean | null} [url__isnull] Filter to content files where url is null/not null * @param {*} [options] Override http request option. * @throws {RequiredError} */ vectorContentFilesSearchRetrieve: async ( collection_name?: string, - content_feature_type?: Array, - course_number?: Array, - edx_module_id?: Array, file_extension?: Array, group_by?: string, group_size?: number, @@ -11574,9 +11566,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( resource_readable_id?: Array, run_readable_id?: Array, sortby?: VectorContentFilesSearchRetrieveSortbyEnum, - title?: Array, title__isnull?: boolean | null, - url?: Array, url__isnull?: boolean | null, options: RawAxiosRequestConfig = {}, ): Promise => { @@ -11600,18 +11590,6 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( localVarQueryParameter["collection_name"] = collection_name } - if (content_feature_type) { - localVarQueryParameter["content_feature_type"] = content_feature_type - } - - if (course_number) { - localVarQueryParameter["course_number"] = course_number - } - - if (edx_module_id) { - localVarQueryParameter["edx_module_id"] = edx_module_id - } - if (file_extension) { localVarQueryParameter["file_extension"] = file_extension } @@ -11664,18 +11642,10 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function ( localVarQueryParameter["sortby"] = sortby } - if (title) { - localVarQueryParameter["title"] = title - } - if (title__isnull !== undefined) { localVarQueryParameter["title__isnull"] = title__isnull } - if (url) { - localVarQueryParameter["url"] = url - } - if (url__isnull !== undefined) { localVarQueryParameter["url__isnull"] = url__isnull } @@ -11711,9 +11681,6 @@ export const VectorContentFilesSearchApiFp = function ( * Vector Search for content * @summary Content File Vector Search * @param {string} [collection_name] Manually specify the name of the Qdrant collection to query - * @param {Array} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/ - * @param {Array} [course_number] Course number of the content file - * @param {Array} [edx_module_id] The edx_module_id of the content file * @param {Array} [file_extension] The extension of the content file. * @param {string} [group_by] The attribute to group results by * @param {number} [group_size] The number of chunks in each group. Only relevant when group_by is used @@ -11727,18 +11694,13 @@ export const VectorContentFilesSearchApiFp = function ( * @param {Array} [resource_readable_id] The readable_id value of the parent learning resource for the content file * @param {Array} [run_readable_id] The readable_id value of the run that the content file belongs to * @param {VectorContentFilesSearchRetrieveSortbyEnum} [sortby] if the parameter starts with \'-\' the sort is in descending order * `id` - id * `-id` - -id * `resource_readable_id` - resource_readable_id * `-resource_readable_id` - -resource_readable_id - * @param {Array} [title] The title of the content file. * @param {boolean | null} [title__isnull] Filter to content files where title is null/not null - * @param {Array} [url] The url of the content file. * @param {boolean | null} [url__isnull] Filter to content files where url is null/not null * @param {*} [options] Override http request option. * @throws {RequiredError} */ async vectorContentFilesSearchRetrieve( collection_name?: string, - content_feature_type?: Array, - course_number?: Array, - edx_module_id?: Array, file_extension?: Array, group_by?: string, group_size?: number, @@ -11752,9 +11714,7 @@ export const VectorContentFilesSearchApiFp = function ( resource_readable_id?: Array, run_readable_id?: Array, sortby?: VectorContentFilesSearchRetrieveSortbyEnum, - title?: Array, title__isnull?: boolean | null, - url?: Array, url__isnull?: boolean | null, options?: RawAxiosRequestConfig, ): Promise< @@ -11766,9 +11726,6 @@ export const VectorContentFilesSearchApiFp = function ( const localVarAxiosArgs = await localVarAxiosParamCreator.vectorContentFilesSearchRetrieve( collection_name, - content_feature_type, - course_number, - edx_module_id, file_extension, group_by, group_size, @@ -11782,9 +11739,7 @@ export const VectorContentFilesSearchApiFp = function ( resource_readable_id, run_readable_id, sortby, - title, title__isnull, - url, url__isnull, options, ) @@ -11829,9 +11784,6 @@ export const VectorContentFilesSearchApiFactory = function ( return localVarFp .vectorContentFilesSearchRetrieve( requestParameters.collection_name, - requestParameters.content_feature_type, - requestParameters.course_number, - requestParameters.edx_module_id, requestParameters.file_extension, requestParameters.group_by, requestParameters.group_size, @@ -11845,9 +11797,7 @@ export const VectorContentFilesSearchApiFactory = function ( requestParameters.resource_readable_id, requestParameters.run_readable_id, requestParameters.sortby, - requestParameters.title, requestParameters.title__isnull, - requestParameters.url, requestParameters.url__isnull, options, ) @@ -11869,27 +11819,6 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ */ readonly collection_name?: string - /** - * The feature type of the content file. Possible options are at api/v1/course_features/ - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly content_feature_type?: Array - - /** - * Course number of the content file - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly course_number?: Array - - /** - * The edx_module_id of the content file - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly edx_module_id?: Array - /** * The extension of the content file. * @type {Array} @@ -11981,13 +11910,6 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ */ readonly sortby?: VectorContentFilesSearchRetrieveSortbyEnum - /** - * The title of the content file. - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly title?: Array - /** * Filter to content files where title is null/not null * @type {boolean} @@ -11995,13 +11917,6 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ */ readonly title__isnull?: boolean | null - /** - * The url of the content file. - * @type {Array} - * @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve - */ - readonly url?: Array - /** * Filter to content files where url is null/not null * @type {boolean} @@ -12032,9 +11947,6 @@ export class VectorContentFilesSearchApi extends BaseAPI { return VectorContentFilesSearchApiFp(this.configuration) .vectorContentFilesSearchRetrieve( requestParameters.collection_name, - requestParameters.content_feature_type, - requestParameters.course_number, - requestParameters.edx_module_id, requestParameters.file_extension, requestParameters.group_by, requestParameters.group_size, @@ -12048,9 +11960,7 @@ export class VectorContentFilesSearchApi extends BaseAPI { requestParameters.resource_readable_id, requestParameters.run_readable_id, requestParameters.sortby, - requestParameters.title, requestParameters.title__isnull, - requestParameters.url, requestParameters.url__isnull, options, ) diff --git a/openapi/specs/v0.yaml b/openapi/specs/v0.yaml index 5070e49e65..4bb97c19c2 100644 --- a/openapi/specs/v0.yaml +++ b/openapi/specs/v0.yaml @@ -833,31 +833,6 @@ paths: type: string minLength: 1 description: Manually specify the name of the Qdrant collection to query - - in: query - name: content_feature_type - schema: - type: array - items: - type: string - minLength: 1 - description: The feature type of the content file. Possible options are at - api/v1/course_features/ - - in: query - name: course_number - schema: - type: array - items: - type: string - minLength: 1 - description: Course number of the content file - - in: query - name: edx_module_id - schema: - type: array - items: - type: string - minLength: 1 - description: The edx_module_id of the content file - in: query name: file_extension schema: @@ -959,28 +934,12 @@ paths: * `-id` - -id * `resource_readable_id` - resource_readable_id * `-resource_readable_id` - -resource_readable_id - - in: query - name: title - schema: - type: array - items: - type: string - minLength: 1 - description: 'The title of the content file. ' - in: query name: title__isnull schema: type: boolean nullable: true description: Filter to content files where title is null/not null - - in: query - name: url - schema: - type: array - items: - type: string - minLength: 1 - description: 'The url of the content file. ' - in: query name: url__isnull schema: diff --git a/vector_search/constants.py b/vector_search/constants.py index 7e552febae..0adc4f31a4 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -38,7 +38,7 @@ "topic": "topics[].name", "ocw_topic": "ocw_topics", "level": "runs[].level[].code", - "department": "departments.department_id", + "department": "departments[].department_id", "platform": "platform.code", "offered_by": "offered_by.code", "delivery": "delivery[].code", @@ -64,7 +64,7 @@ "topics[].name": models.PayloadSchemaType.KEYWORD, "ocw_topics": models.PayloadSchemaType.KEYWORD, "runs[].level.code": models.PayloadSchemaType.KEYWORD, - "departments.department_id": models.PayloadSchemaType.KEYWORD, + "departments[].department_id": models.PayloadSchemaType.KEYWORD, "platform.code": models.PayloadSchemaType.KEYWORD, "offered_by.code": models.PayloadSchemaType.KEYWORD, "delivery[].code": models.PayloadSchemaType.KEYWORD, @@ -73,25 +73,19 @@ "resource_type_group": models.PayloadSchemaType.KEYWORD, } - +""" +Note: Be intentional about which fields we add as indexes. +Only add fields that we expect to filter or facet on frequently. +""" QDRANT_CONTENT_FILE_INDEXES = { - "chunk_number": models.PayloadSchemaType.INTEGER, "key": models.PayloadSchemaType.KEYWORD, "title": models.PayloadSchemaType.KEYWORD, - "course_number": models.PayloadSchemaType.INTEGER, "platform.code": models.PayloadSchemaType.KEYWORD, "offered_by.code": models.PayloadSchemaType.KEYWORD, - "published": models.PayloadSchemaType.BOOL, - "content_feature_type": models.PayloadSchemaType.KEYWORD, - "file_type": models.PayloadSchemaType.KEYWORD, "file_extension": models.PayloadSchemaType.KEYWORD, "run_readable_id": models.PayloadSchemaType.KEYWORD, "resource_readable_id": models.PayloadSchemaType.KEYWORD, - "run_title": models.PayloadSchemaType.KEYWORD, "edx_module_id": models.PayloadSchemaType.KEYWORD, - "checksum": models.PayloadSchemaType.KEYWORD, - "content_type": models.PayloadSchemaType.KEYWORD, - "edx_block_id": models.PayloadSchemaType.KEYWORD, "url": models.PayloadSchemaType.KEYWORD, } diff --git a/vector_search/serializers.py b/vector_search/serializers.py index 6672d79541..c4c2e7a56a 100644 --- a/vector_search/serializers.py +++ b/vector_search/serializers.py @@ -208,11 +208,6 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): child=serializers.CharField(), help_text="The filename of the content file", ) - course_number = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="Course number of the content file", - ) offered_by = serializers.ListField( required=False, child=serializers.CharField(), @@ -223,12 +218,7 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): child=serializers.CharField(), help_text="platform(s) of the content file", ) - content_feature_type = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The feature type of the content file. " - "Possible options are at api/v1/course_features/", - ) + file_extension = serializers.ListField( required=False, child=serializers.CharField(), @@ -246,11 +236,6 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): "The readable_id value of the parent learning resource for the content file" ), ) - edx_module_id = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The edx_module_id of the content file", - ) collection_name = serializers.CharField( required=False, help_text=("Manually specify the name of the Qdrant collection to query"), @@ -265,17 +250,6 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer): "The number of chunks in each group. Only relevant when group_by is used" ), ) - url = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The url of the content file. ", - ) - title = serializers.ListField( - required=False, - child=serializers.CharField(), - help_text="The title of the content file. ", - ) - url__isnull = serializers.BooleanField( required=False, default=None, diff --git a/vector_search/utils.py b/vector_search/utils.py index 84be216582..d21bfbf988 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -852,13 +852,19 @@ def process_batch(docs_batch, summaries_list): def _resource_vector_hits(search_result): hits = [hit.payload["readable_id"] for hit in search_result] """ - Always lookup learning resources by readable_id for portability - in case we load points from external systems - """ - return LearningResourceSerializer( - LearningResource.objects.for_serialization().filter(readable_id__in=hits), - many=True, - ).data + Always lookup learning resources by readable_id for portability + in case we load points from external systems + """ + resources_by_id = { + r.readable_id: r + for r in LearningResource.objects.for_serialization().filter( + readable_id__in=hits + ) + } + # Re-order to match the Qdrant ranking + ordered_resources = [resources_by_id[rid] for rid in hits if rid in resources_by_id] + + return LearningResourceSerializer(ordered_resources, many=True).data def _content_file_vector_hits(search_result): diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index a1f232cc49..3015bbe01c 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -40,6 +40,7 @@ _generate_content_file_points, _get_text_splitter, _is_markdown_content, + _resource_vector_hits, create_qdrant_collections, embed_learning_resources, embed_topics, @@ -1434,3 +1435,21 @@ def test_vector_search_group_by_offset_behavior(mocker, use_group_by): call_args = mock_qdrant.query_points.call_args.kwargs assert call_args.get("offset") == 15 assert "group_by" not in call_args + + +def test_resource_vector_hits_preserves_qdrant_score_order(): + """Results should be returned in the same order as the search_result (qdrant score order).""" + resources = LearningResourceFactory.create_batch(5) + # Shuffle to create a non-alphabetical, non-pk order (simulating qdrant ranking) + shuffled = random.sample(resources, len(resources)) + + # Build mock ScoredPoints with readable_ids in the shuffled order + search_result = [ + MagicMock(payload={"readable_id": r.readable_id}) for r in shuffled + ] + + result = _resource_vector_hits(search_result) + + expected_readable_ids = [r.readable_id for r in shuffled] + actual_readable_ids = [r["readable_id"] for r in result] + assert actual_readable_ids == expected_readable_ids diff --git a/vector_search/views_test.py b/vector_search/views_test.py index 201e455dd4..4e3207434b 100644 --- a/vector_search/views_test.py +++ b/vector_search/views_test.py @@ -46,7 +46,7 @@ def test_vector_search_filters(mocker, client): ), models.FieldCondition(key="free", match=models.MatchValue(value=True)), models.FieldCondition( - key="departments.department_id", + key="departments[].department_id", match=models.MatchAny(any=["6", "7"]), ), ] @@ -92,7 +92,7 @@ def test_vector_search_filters_empty_query(mocker, client): ), models.FieldCondition(key="free", match=models.MatchValue(value=True)), models.FieldCondition( - key="departments.department_id", + key="departments[].department_id", match=models.MatchAny(any=["6", "7"]), ), ] @@ -157,16 +157,9 @@ def test_content_file_vector_search_filters( models.FieldCondition( key="platform.code", match=models.MatchAny(any=["edx"]) ), - models.FieldCondition( - key="course_number", match=models.MatchAny(any=["test"]) - ), models.FieldCondition( key="run_readable_id", match=models.MatchAny(any=["test_run_id"]) ), - models.FieldCondition( - key="content_feature_type", - match=models.MatchAny(any=["test_feature"]), - ), models.FieldCondition( key="resource_readable_id", match=models.MatchAny( @@ -226,15 +219,9 @@ def test_content_file_vector_search_filters_empty_query( models.FieldCondition( key="platform.code", match=models.MatchAny(any=["edx"]) ), - models.FieldCondition( - key="course_number", match=models.MatchAny(any=["test"]) - ), models.FieldCondition( key="run_readable_id", match=models.MatchAny(any=["test_run_id"]) ), - models.FieldCondition( - key="content_feature_type", match=models.MatchAny(any=["test_feature"]) - ), models.FieldCondition( key="resource_readable_id", match=models.MatchAny(any=["test_resource_id_1", "test_resource_id_2"]),