From 9b31d88781e81fb1b5fb190d4f11e4369c67af30 Mon Sep 17 00:00:00 2001 From: gagik Date: Mon, 27 Oct 2025 13:50:50 +0100 Subject: [PATCH 1/8] chore: simplify --- src/tools/mongodb/create/insertMany.ts | 187 +++++++++++++++++-- tests/accuracy/insertMany.embeddings.test.ts | 46 +++++ 2 files changed, 222 insertions(+), 11 deletions(-) create mode 100644 tests/accuracy/insertMany.embeddings.test.ts diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index fa3fc3651..cfc1efee3 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -3,6 +3,10 @@ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { type ToolArgs, type OperationType, formatUntrustedData } from "../../tool.js"; import { zEJSON } from "../../args.js"; +import { type Document } from "bson"; +import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js"; +import { ErrorCodes, MongoDBError } from "../../../common/errors.js"; +import type { VectorFieldIndexDefinition } from "../../../common/search/vectorSearchEmbeddingsManager.js"; export class InsertManyTool extends MongoDBToolBase { public name = "insert-many"; @@ -12,7 +16,12 @@ export class InsertManyTool extends MongoDBToolBase { documents: z .array(zEJSON().describe("An individual MongoDB document")) .describe( - "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()" + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany(). For fields that have vector search indexes, you can provide raw text strings that will be automatically converted to embeddings if embeddingParameters is provided." + ), + embeddingParameters: zSupportedEmbeddingParameters + .optional() + .describe( + "The embedding model and its parameters to use to generate embeddings for fields that have vector search indexes. When a field has a vector search index and contains a plain text string in the document, embeddings will be automatically generated from that string value. Note to LLM: If unsure which embedding model to use, ask the user before providing one." ), }; public operationType: OperationType = "create"; @@ -21,23 +30,34 @@ export class InsertManyTool extends MongoDBToolBase { database, collection, documents, + embeddingParameters, }: ToolArgs): Promise { const provider = await this.ensureConnected(); - const embeddingValidations = new Set( - ...(await Promise.all( - documents.flatMap((document) => - this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings( - { database, collection }, - document - ) - ) - )) + // Get vector search indexes for the collection + const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({ + database, + collection, + }); + + // Process documents to replace raw string values with generated embeddings + documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({ + database, + collection, + documents, + vectorIndexes, + embeddingParameters, + }); + + const embeddingValidationPromises = documents.map((document) => + this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings({ database, collection }, document) ); + const embeddingValidationResults = await Promise.all(embeddingValidationPromises); + const embeddingValidations = new Set(embeddingValidationResults.flat()); if (embeddingValidations.size > 0) { // tell the LLM what happened - const embeddingValidationMessages = [...embeddingValidations].map( + const embeddingValidationMessages = Array.from(embeddingValidations).map( (validation) => `- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` + ` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` + @@ -63,4 +83,149 @@ export class InsertManyTool extends MongoDBToolBase { content, }; } + + private async replaceRawValuesWithEmbeddingsIfNecessary({ + database, + collection, + documents, + vectorIndexes, + embeddingParameters, + }: { + database: string; + collection: string; + documents: Document[]; + vectorIndexes: VectorFieldIndexDefinition[]; + embeddingParameters?: z.infer; + }): Promise { + // If no vector indexes, return documents as-is + if (vectorIndexes.length === 0) { + return documents; + } + + const processedDocuments: Document[] = []; + + for (let i = 0; i < documents.length; i++) { + const document = documents[i]; + if (!document) { + continue; + } + const processedDoc = await this.processDocumentForEmbeddings( + database, + collection, + document, + vectorIndexes, + embeddingParameters + ); + processedDocuments.push(processedDoc); + } + + return processedDocuments; + } + + private async processDocumentForEmbeddings( + database: string, + collection: string, + document: Document, + vectorIndexes: VectorFieldIndexDefinition[], + embeddingParameters?: z.infer + ): Promise { + // Find all fields in the document that match vector search indexed fields and need embeddings + const fieldsNeedingEmbeddings: Array<{ + path: string; + rawValue: string; + indexDef: VectorFieldIndexDefinition; + }> = []; + + for (const indexDef of vectorIndexes) { + // Check if the field exists in the document and is a string (raw text) + const fieldValue = this.getFieldValue(document, indexDef.path); + if (typeof fieldValue === "string") { + fieldsNeedingEmbeddings.push({ + path: indexDef.path, + rawValue: fieldValue, + indexDef, + }); + } + } + + // If no fields need embeddings, return document as-is + if (fieldsNeedingEmbeddings.length === 0) { + return document; + } + + // Check if embeddingParameters is provided + if (!embeddingParameters) { + const fieldPaths = fieldsNeedingEmbeddings.map((f) => f.path).join(", "); + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + `Fields [${fieldPaths}] have vector search indexes and contain raw text strings. The embeddingParameters parameter is required to generate embeddings for these fields.` + ); + } + + // Generate embeddings for all fields + const embeddingsMap = new Map(); + + for (const field of fieldsNeedingEmbeddings) { + const embeddings = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ + database, + collection, + path: field.path, + rawValues: [field.rawValue], + embeddingParameters, + inputType: "document", + }); + + if (embeddings.length > 0 && Array.isArray(embeddings[0])) { + embeddingsMap.set(field.path, embeddings[0] as number[]); + } + } + + // Replace raw string values with generated embeddings + const processedDoc = { ...document }; + + for (const field of fieldsNeedingEmbeddings) { + const embedding = embeddingsMap.get(field.path); + if (embedding) { + this.setFieldValue(processedDoc, field.path, embedding); + } + } + + return processedDoc; + } + + private getFieldValue(document: Document, path: string): unknown { + const parts = path.split("."); + let current: unknown = document; + + for (const part of parts) { + if (current && typeof current === "object" && part in current) { + current = (current as Record)[part]; + } else { + return undefined; + } + } + + return current; + } + + private setFieldValue(document: Document, path: string, value: unknown): void { + const parts = path.split("."); + let current: Record = document; + + for (let i = 0; i < parts.length - 1; i++) { + const part = parts[i]; + if (!part) { + continue; + } + if (!(part in current) || typeof current[part] !== "object") { + current[part] = {}; + } + current = current[part] as Record; + } + + const lastPart = parts[parts.length - 1]; + if (lastPart) { + current[lastPart] = value; + } + } } diff --git a/tests/accuracy/insertMany.embeddings.test.ts b/tests/accuracy/insertMany.embeddings.test.ts new file mode 100644 index 000000000..3216664ef --- /dev/null +++ b/tests/accuracy/insertMany.embeddings.test.ts @@ -0,0 +1,46 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +const embeddingParameters = { + model: "voyage-3", + outputDimension: Matcher.anyOf( + Matcher.undefined, + Matcher.number((n) => n === 1024) + ), + outputDType: Matcher.anyOf(Matcher.undefined, Matcher.value("float")), +}; + +/** + * Accuracy tests for inserting documents with automatic vector embeddings generation. + * Tests scenarios where raw text strings are provided and automatically converted to embeddings. + */ +describeAccuracyTests( + [ + { + prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and a plotSummary field with the text 'A computer hacker learns about the true nature of his reality'. Use the plot summary to generate the 'plotEmbeddings' field using the voyage-3 model.", + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + title: "The Matrix", + plotSummary: "A computer hacker learns about the true nature of his reality", + plotEmbeddings: "A computer hacker learns about the true nature of his reality", + }, + ], + embeddingParameters, + }, + }, + ], + }, + ], + { + userConfig: { voyageApiKey: "valid-key" }, + clusterConfig: { + search: true, + }, + } +); From b8a37ef204d39910a9addf0ac4247074cff74cbb Mon Sep 17 00:00:00 2001 From: gagik Date: Tue, 28 Oct 2025 13:50:25 +0100 Subject: [PATCH 2/8] chore: inside-field --- src/tools/mongodb/create/insertMany.ts | 2 +- tests/accuracy/insertMany.embeddings.test.ts | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index cfc1efee3..e56ea8d37 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -16,7 +16,7 @@ export class InsertManyTool extends MongoDBToolBase { documents: z .array(zEJSON().describe("An individual MongoDB document")) .describe( - "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany(). For fields that have vector search indexes, you can provide raw text strings that will be automatically converted to embeddings if embeddingParameters is provided." + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany(). If you are asked to generate a embedding for a field, you have to explicitly specify the field name with a raw text string value of the field and an embedding will be generated if embeddingParameters is provided." ), embeddingParameters: zSupportedEmbeddingParameters .optional() diff --git a/tests/accuracy/insertMany.embeddings.test.ts b/tests/accuracy/insertMany.embeddings.test.ts index 3216664ef..9ddb614a9 100644 --- a/tests/accuracy/insertMany.embeddings.test.ts +++ b/tests/accuracy/insertMany.embeddings.test.ts @@ -17,7 +17,7 @@ const embeddingParameters = { describeAccuracyTests( [ { - prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and a plotSummary field with the text 'A computer hacker learns about the true nature of his reality'. Use the plot summary to generate the 'plotEmbeddings' field using the voyage-3 model.", + prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and a plotSummary field with the text 'A computer hacker learns about the true nature of his reality' and a 'plotSummaryEmbeddings' field which should be generated using the voyage-3.5 model.", expectedToolCalls: [ { toolName: "insert-many", @@ -28,10 +28,12 @@ describeAccuracyTests( { title: "The Matrix", plotSummary: "A computer hacker learns about the true nature of his reality", - plotEmbeddings: "A computer hacker learns about the true nature of his reality", + plotSummaryEmbeddings: "A computer hacker learns about the true nature of his reality", }, ], - embeddingParameters, + embeddingParameters: { + model: "voyage-3.5", + }, }, }, ], From 00a2b6ba4da73d92359a4f347a4a7d6f09b8af15 Mon Sep 17 00:00:00 2001 From: gagik Date: Wed, 29 Oct 2025 14:22:58 +0100 Subject: [PATCH 3/8] chore: simplify, cleanup, add tests --- .gitignore | 2 + CONTRIBUTING.md | 4 + src/common/errors.ts | 1 + .../search/vectorSearchEmbeddingsManager.ts | 55 ++- src/tools/mongodb/create/insertMany.ts | 210 +++------ src/tools/mongodb/read/aggregate.ts | 19 +- tests/accuracy/insertMany.embeddings.test.ts | 157 ++++++- tests/accuracy/sdk/matcher.ts | 10 + .../tools/mongodb/create/insertMany.test.ts | 432 +++++++++++++++++- 9 files changed, 714 insertions(+), 176 deletions(-) diff --git a/.gitignore b/.gitignore index 49550e277..1029194e2 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ tests/tmp coverage # Generated assets by accuracy runs .accuracy + +.DS_Store \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 34fe72f74..77cd3457f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -76,6 +76,10 @@ npm test -- path/to/test/file.test.ts npm test -- path/to/directory ``` +#### Accuracy Tests and colima + +If you use [colima](https://github.com/abiosoft/colima) to run Docker on Mac, you will need to apply [additional configuration](https://node.testcontainers.org/supported-container-runtimes/#colima) to ensure the accuracy tests run correctly. + ## Troubleshooting ### Restart Server diff --git a/src/common/errors.ts b/src/common/errors.ts index 5880eb781..e44a72724 100644 --- a/src/common/errors.ts +++ b/src/common/errors.ts @@ -7,6 +7,7 @@ export enum ErrorCodes { NoEmbeddingsProviderConfigured = 1_000_005, AtlasVectorSearchIndexNotFound = 1_000_006, AtlasVectorSearchInvalidQuery = 1_000_007, + Unexpected = 1_000_008, } export class MongoDBError extends Error { diff --git a/src/common/search/vectorSearchEmbeddingsManager.ts b/src/common/search/vectorSearchEmbeddingsManager.ts index fc8c53beb..f1cf9ce37 100644 --- a/src/common/search/vectorSearchEmbeddingsManager.ts +++ b/src/common/search/vectorSearchEmbeddingsManager.ts @@ -103,7 +103,28 @@ export class VectorSearchEmbeddingsManager { return definition; } - async findFieldsWithWrongEmbeddings( + async assertFieldsHaveCorrectEmbeddings( + { database, collection }: { database: string; collection: string }, + documents: Document[] + ): Promise { + const embeddingValidationResults = await Promise.all( + documents.map((document) => this.findFieldsWithWrongEmbeddings({ database, collection }, document)) + ); + const embeddingValidations = new Set(embeddingValidationResults.flat()); + + if (embeddingValidations.size > 0) { + const embeddingValidationMessages = Array.from(embeddingValidations).map( + (validation) => + `- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` + + ` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` + + `actual quantization: ${validation.actualQuantization}. Error: ${validation.error}` + ); + + throw new MongoDBError(ErrorCodes.AtlasVectorSearchInvalidQuery, embeddingValidationMessages.join("\n")); + } + } + + private async findFieldsWithWrongEmbeddings( { database, collection, @@ -239,21 +260,34 @@ export class VectorSearchEmbeddingsManager { return undefined; } - public async generateEmbeddings({ + public async assertVectorSearchIndexExists({ database, collection, path, - rawValues, - embeddingParameters, - inputType, }: { database: string; collection: string; path: string; + }): Promise { + const embeddingInfoForCollection = await this.embeddingsForNamespace({ database, collection }); + const embeddingInfoForPath = embeddingInfoForCollection.find((definition) => definition.path === path); + if (!embeddingInfoForPath) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchIndexNotFound, + `No Vector Search index found for path "${path}" in namespace "${database}.${collection}"` + ); + } + } + + public async generateEmbeddings({ + rawValues, + embeddingParameters, + inputType, + }: { rawValues: string[]; embeddingParameters: SupportedEmbeddingParameters; inputType: EmbeddingParameters["inputType"]; - }): Promise { + }): Promise { const provider = await this.atlasSearchEnabledProvider(); if (!provider) { throw new MongoDBError( @@ -275,15 +309,6 @@ export class VectorSearchEmbeddingsManager { }); } - const embeddingInfoForCollection = await this.embeddingsForNamespace({ database, collection }); - const embeddingInfoForPath = embeddingInfoForCollection.find((definition) => definition.path === path); - if (!embeddingInfoForPath) { - throw new MongoDBError( - ErrorCodes.AtlasVectorSearchIndexNotFound, - `No Vector Search index found for path "${path}" in namespace "${database}.${collection}"` - ); - } - return await embeddingsProvider.embed(embeddingParameters.model, rawValues, { inputType, ...embeddingParameters, diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index e56ea8d37..304f3c992 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -6,7 +6,14 @@ import { zEJSON } from "../../args.js"; import { type Document } from "bson"; import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js"; import { ErrorCodes, MongoDBError } from "../../../common/errors.js"; -import type { VectorFieldIndexDefinition } from "../../../common/search/vectorSearchEmbeddingsManager.js"; + +const zSupportedEmbeddingParametersWithInput = zSupportedEmbeddingParameters.extend({ + input: z + .array(z.object({}).passthrough()) + .describe( + "Array of objects with vector search index fields as keys (in dot notation) and the raw text values to generate embeddings for as values. The index of each object corresponds to the index of the document in the documents array." + ), +}); export class InsertManyTool extends MongoDBToolBase { public name = "insert-many"; @@ -16,9 +23,9 @@ export class InsertManyTool extends MongoDBToolBase { documents: z .array(zEJSON().describe("An individual MongoDB document")) .describe( - "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany(). If you are asked to generate a embedding for a field, you have to explicitly specify the field name with a raw text string value of the field and an embedding will be generated if embeddingParameters is provided." + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()." ), - embeddingParameters: zSupportedEmbeddingParameters + embeddingParameters: zSupportedEmbeddingParametersWithInput .optional() .describe( "The embedding model and its parameters to use to generate embeddings for fields that have vector search indexes. When a field has a vector search index and contains a plain text string in the document, embeddings will be automatically generated from that string value. Note to LLM: If unsure which embedding model to use, ask the user before providing one." @@ -34,45 +41,14 @@ export class InsertManyTool extends MongoDBToolBase { }: ToolArgs): Promise { const provider = await this.ensureConnected(); - // Get vector search indexes for the collection - const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({ - database, - collection, - }); - // Process documents to replace raw string values with generated embeddings documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({ database, collection, documents, - vectorIndexes, embeddingParameters, }); - const embeddingValidationPromises = documents.map((document) => - this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings({ database, collection }, document) - ); - const embeddingValidationResults = await Promise.all(embeddingValidationPromises); - const embeddingValidations = new Set(embeddingValidationResults.flat()); - - if (embeddingValidations.size > 0) { - // tell the LLM what happened - const embeddingValidationMessages = Array.from(embeddingValidations).map( - (validation) => - `- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` + - ` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` + - `actual quantization: ${validation.actualQuantization}. Error: ${validation.error}` - ); - - return { - content: formatUntrustedData( - "There were errors when inserting documents. No document was inserted.", - ...embeddingValidationMessages - ), - isError: true, - }; - } - const result = await provider.insertMany(database, collection, documents); const content = formatUntrustedData( "Documents were inserted successfully.", @@ -88,144 +64,84 @@ export class InsertManyTool extends MongoDBToolBase { database, collection, documents, - vectorIndexes, embeddingParameters, }: { database: string; collection: string; documents: Document[]; - vectorIndexes: VectorFieldIndexDefinition[]; - embeddingParameters?: z.infer; + embeddingParameters?: z.infer; }): Promise { - // If no vector indexes, return documents as-is - if (vectorIndexes.length === 0) { + // If no embedding parameters or no input specified, return documents as-is + if (!embeddingParameters?.input || embeddingParameters.input.length === 0) { return documents; } - const processedDocuments: Document[] = []; - - for (let i = 0; i < documents.length; i++) { - const document = documents[i]; - if (!document) { - continue; - } - const processedDoc = await this.processDocumentForEmbeddings( - database, - collection, - document, - vectorIndexes, - embeddingParameters - ); - processedDocuments.push(processedDoc); - } - - return processedDocuments; - } - - private async processDocumentForEmbeddings( - database: string, - collection: string, - document: Document, - vectorIndexes: VectorFieldIndexDefinition[], - embeddingParameters?: z.infer - ): Promise { - // Find all fields in the document that match vector search indexed fields and need embeddings - const fieldsNeedingEmbeddings: Array<{ - path: string; - rawValue: string; - indexDef: VectorFieldIndexDefinition; - }> = []; + // Get vector search indexes for the collection + const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({ + database, + collection, + }); - for (const indexDef of vectorIndexes) { - // Check if the field exists in the document and is a string (raw text) - const fieldValue = this.getFieldValue(document, indexDef.path); - if (typeof fieldValue === "string") { - fieldsNeedingEmbeddings.push({ - path: indexDef.path, - rawValue: fieldValue, - indexDef, - }); + // Ensure for inputted fields, the vector search index exists. + for (const input of embeddingParameters.input) { + for (const fieldPath of Object.keys(input)) { + if (!vectorIndexes.some((index) => index.path === fieldPath)) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + `Field '${fieldPath}' does not have a vector search index in collection ${database}.${collection}. Only fields with vector search indexes can have embeddings generated.` + ); + } } } - // If no fields need embeddings, return document as-is - if (fieldsNeedingEmbeddings.length === 0) { - return document; - } - - // Check if embeddingParameters is provided - if (!embeddingParameters) { - const fieldPaths = fieldsNeedingEmbeddings.map((f) => f.path).join(", "); - throw new MongoDBError( - ErrorCodes.AtlasVectorSearchInvalidQuery, - `Fields [${fieldPaths}] have vector search indexes and contain raw text strings. The embeddingParameters parameter is required to generate embeddings for these fields.` - ); - } - - // Generate embeddings for all fields - const embeddingsMap = new Map(); - - for (const field of fieldsNeedingEmbeddings) { - const embeddings = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ - database, - collection, - path: field.path, - rawValues: [field.rawValue], - embeddingParameters, - inputType: "document", - }); + // We make one call to generate embeddings for all documents at once to avoid making too many API calls. + const flattenedEmbeddingsInput = embeddingParameters.input.flatMap((documentInput, index) => + Object.entries(documentInput).map(([fieldPath, rawTextValue]) => ({ + fieldPath, + rawTextValue, + documentIndex: index, + })) + ); - if (embeddings.length > 0 && Array.isArray(embeddings[0])) { - embeddingsMap.set(field.path, embeddings[0] as number[]); - } - } + const generatedEmbeddings = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ + rawValues: flattenedEmbeddingsInput.map(({ rawTextValue }) => rawTextValue) as string[], + embeddingParameters, + inputType: "document", + }); - // Replace raw string values with generated embeddings - const processedDoc = { ...document }; + const processedDocuments: Document[] = [...documents]; - for (const field of fieldsNeedingEmbeddings) { - const embedding = embeddingsMap.get(field.path); - if (embedding) { - this.setFieldValue(processedDoc, field.path, embedding); + for (const [index, { fieldPath, documentIndex }] of flattenedEmbeddingsInput.entries()) { + if (!processedDocuments[documentIndex]) { + throw new MongoDBError(ErrorCodes.Unexpected, `Document at index ${documentIndex} does not exist.`); } + // Ensure no nested fields are present in the field path. + this.deleteFieldPath(processedDocuments[documentIndex], fieldPath); + processedDocuments[documentIndex][fieldPath] = generatedEmbeddings[index]; } - return processedDoc; - } - - private getFieldValue(document: Document, path: string): unknown { - const parts = path.split("."); - let current: unknown = document; - - for (const part of parts) { - if (current && typeof current === "object" && part in current) { - current = (current as Record)[part]; - } else { - return undefined; - } - } + await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings( + { database, collection }, + processedDocuments + ); - return current; + return processedDocuments; } - private setFieldValue(document: Document, path: string, value: unknown): void { - const parts = path.split("."); + // Delete a specified field path from a document using dot notation. + private deleteFieldPath(document: Record, fieldPath: string): void { + const parts = fieldPath.split("."); let current: Record = document; - - for (let i = 0; i < parts.length - 1; i++) { + for (let i = 0; i < parts.length; i++) { const part = parts[i]; - if (!part) { - continue; - } - if (!(part in current) || typeof current[part] !== "object") { - current[part] = {}; + const key = part as keyof typeof current; + if (!current[key]) { + return; + } else if (i === parts.length - 1) { + delete current[key]; + } else { + current = current[key] as Record; } - current = current[part] as Record; - } - - const lastPart = parts[parts.length - 1]; - if (lastPart) { - current[lastPart] = value; } } } diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index 4ad96590d..e85844977 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -276,22 +276,37 @@ export class AggregateTool extends MongoDBToolBase { const embeddingParameters = vectorSearchStage.embeddingParameters; delete vectorSearchStage.embeddingParameters; - const [embeddings] = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ + await this.session.vectorSearchEmbeddingsManager.assertVectorSearchIndexExists({ database, collection, path: vectorSearchStage.path, + }); + + const [embeddings] = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ rawValues: [vectorSearchStage.queryVector], embeddingParameters, inputType: "query", }); + if (!embeddings) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + "Failed to generate embeddings for the query vector." + ); + } + // $vectorSearch.queryVector can be a BSON.Binary: that it's not either number or an array. // It's not exactly valid from the LLM perspective (they can't provide binaries). // That's why we overwrite the stage in an untyped way, as what we expose and what LLMs can use is different. - vectorSearchStage.queryVector = embeddings as number[]; + vectorSearchStage.queryVector = embeddings; } } + await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings( + { database, collection }, + pipeline + ); + return pipeline; } diff --git a/tests/accuracy/insertMany.embeddings.test.ts b/tests/accuracy/insertMany.embeddings.test.ts index 9ddb614a9..117a70b62 100644 --- a/tests/accuracy/insertMany.embeddings.test.ts +++ b/tests/accuracy/insertMany.embeddings.test.ts @@ -1,8 +1,9 @@ +import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; import { Matcher } from "./sdk/matcher.js"; const embeddingParameters = { - model: "voyage-3", + model: "voyage-3.5", outputDimension: Matcher.anyOf( Matcher.undefined, Matcher.number((n) => n === 1024) @@ -10,14 +11,56 @@ const embeddingParameters = { outputDType: Matcher.anyOf(Matcher.undefined, Matcher.value("float")), }; +const mockInsertMany = (): CallToolResult => { + return { + content: [ + { + type: "text", + text: "Documents were inserted successfully.", + }, + ], + }; +}; + /** * Accuracy tests for inserting documents with automatic vector embeddings generation. - * Tests scenarios where raw text strings are provided and automatically converted to embeddings. */ describeAccuracyTests( [ { - prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and a plotSummary field with the text 'A computer hacker learns about the true nature of his reality' and a 'plotSummaryEmbeddings' field which should be generated using the voyage-3.5 model.", + prompt: "Insert 2 documents in one call into 'mflix.movies' collection - document should have a 'title' field that has generated embeddings using the voyage-3.5 model: 'The Matrix' and 'Blade Runner'. Assume the collection already exists and has vector index on the 'title' field.", + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + // The title field might be specified, sometimes as "The Matrix" or "Placeholder". This will be overwritten by the embeddings so this is fine. + title: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + }, + { + title: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + }, + ], + embeddingParameters: { + ...embeddingParameters, + input: [ + { + title: "The Matrix", + }, + { + title: "Blade Runner", + }, + ], + }, + }, + }, + ], + }, + { + prompt: "Insert a document into 'mflix.movies' collection with following fields: title is 'The Matrix', plotSummary is 'A computer hacker learns about the true nature of his reality', generate the necesssary vector embeddings for the 'plotSummaryEmbeddings' field using the voyage-3.5 model. Assume the collection already exists and has vector index on the 'plotSummaryEmbeddings' field.", expectedToolCalls: [ { toolName: "insert-many", @@ -28,15 +71,119 @@ describeAccuracyTests( { title: "The Matrix", plotSummary: "A computer hacker learns about the true nature of his reality", - plotSummaryEmbeddings: "A computer hacker learns about the true nature of his reality", + plotSummaryEmbeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + }, + ], + embeddingParameters: { + ...embeddingParameters, + input: [ + { + plotSummaryEmbeddings: + "A computer hacker learns about the true nature of his reality", + }, + ], + }, + }, + }, + ], + }, + { + prompt: "Insert 2 documents in one call into 'mflix.movies' collection - the movie titles are 1. 'The Matrix' and 2. 'Blade Runner'. They should have an info field which has 2 subfields: 'title' and 'titleEmbeddings'. Generate the embeddings for the 'info.titleEmbeddings' subfield using the voyage-3.5 model. Assume the collection already exists and has vector index on the 'info.titleEmbeddings' field.", + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + info: { + titleEmbeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + title: "The Matrix", + }, + }, + { + info: { + titleEmbeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + title: "Blade Runner", + }, + }, + ], + embeddingParameters: { + ...embeddingParameters, + input: [ + { + "info.titleEmbeddings": "The Matrix", + }, + { + "info.titleEmbeddings": "Blade Runner", + }, + ], + }, + }, + }, + ], + mockedTools: { + "insert-many": mockInsertMany, + }, + }, + { + prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and generate the necesssary vector embeddings for the current vector search fields using the voyage-3.5 model.", + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + title: "The Matrix", + title_embeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), }, ], embeddingParameters: { - model: "voyage-3.5", + ...embeddingParameters, + input: [ + { + title_embeddings: "The Matrix", + }, + ], }, }, }, ], + mockedTools: { + "insert-many": mockInsertMany, + "collection-indexes": (): CallToolResult => { + return { + content: [ + { + type: "text", + text: JSON.stringify({ + name: "title_embeddings", + type: "vectorSearch", + status: "READY", + queryable: true, + latestDefinition: { + type: "vector", + path: "title_embeddings", + numDimensions: 1024, + quantization: "none", + similarity: "euclidean", + }, + }), + }, + ], + }; + }, + }, }, ], { diff --git a/tests/accuracy/sdk/matcher.ts b/tests/accuracy/sdk/matcher.ts index d25836814..7f403e3ac 100644 --- a/tests/accuracy/sdk/matcher.ts +++ b/tests/accuracy/sdk/matcher.ts @@ -24,6 +24,10 @@ export abstract class Matcher { return new UndefinedMatcher(); } + public static get null(): Matcher { + return new NullMatcher(); + } + public static boolean(expected?: boolean): Matcher { return new BooleanMatcher(expected); } @@ -102,6 +106,12 @@ class UndefinedMatcher extends Matcher { } } +class NullMatcher extends Matcher { + public match(actual: unknown): number { + return actual === null ? 1 : 0; + } +} + class NotMatcher extends Matcher { constructor(private matcher: Matcher) { super(); diff --git a/tests/integration/tools/mongodb/create/insertMany.test.ts b/tests/integration/tools/mongodb/create/insertMany.test.ts index e7bbd0961..f270d6cb8 100644 --- a/tests/integration/tools/mongodb/create/insertMany.test.ts +++ b/tests/integration/tools/mongodb/create/insertMany.test.ts @@ -12,8 +12,9 @@ import { validateThrowsForInvalidArguments, expectDefined, getDataFromUntrustedContent, + defaultTestConfig, } from "../../../helpers.js"; -import { beforeEach, afterEach, expect, it } from "vitest"; +import { beforeEach, afterEach, expect, it, describe } from "vitest"; import { ObjectId } from "bson"; import type { Collection } from "mongodb"; @@ -24,9 +25,16 @@ describeWithMongoDB("insertMany tool when search is disabled", (integration) => name: "documents", type: "array", description: - "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()", + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany().", required: true, }, + { + name: "embeddingParameters", + type: "object", + description: + "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one.", + required: false, + }, ]); validateThrowsForInvalidArguments(integration, "insert-many", [ @@ -110,10 +118,12 @@ describeWithMongoDB( "insertMany tool when search is enabled", (integration) => { let collection: Collection; + let database: string; beforeEach(async () => { await integration.connectMcpClient(); - collection = await integration.mongoClient().db(integration.randomDbName()).createCollection("test"); + database = integration.randomDbName(); + collection = await integration.mongoClient().db(database).createCollection("test"); await waitUntilSearchIsReady(integration.mongoClient()); }); @@ -122,7 +132,7 @@ describeWithMongoDB( }); it("inserts a document when the embedding is correct", async () => { - await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ { type: "vector", path: "embedding", @@ -135,7 +145,7 @@ describeWithMongoDB( const response = await integration.mcpClient().callTool({ name: "insert-many", arguments: { - database: integration.randomDbName(), + database, collection: "test", documents: [{ embedding: [1, 2, 3, 4, 5, 6, 7, 8] }], }, @@ -150,7 +160,7 @@ describeWithMongoDB( }); it("returns an error when there is a search index and quantisation is wrong", async () => { - await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ { type: "vector", path: "embedding", @@ -181,8 +191,416 @@ describeWithMongoDB( }); expect(oopsieCount).toBe(0); }); + + describe.skipIf(!process.env.TEST_MDB_MCP_VOYAGE_API_KEY)("embeddings generation with Voyage AI", () => { + beforeEach(async () => { + await integration.connectMcpClient(); + database = integration.randomDbName(); + collection = await integration.mongoClient().db(database).createCollection("test"); + await waitUntilSearchIsReady(integration.mongoClient()); + }); + + afterEach(async () => { + await collection.drop(); + }); + + it("generates embeddings for a single document with one field", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [{ title: "The Matrix" }], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [{ titleEmbeddings: "The Matrix" }], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc).toBeDefined(); + expect(doc?.title).toBe("The Matrix"); + expect(doc?.titleEmbeddings).toBeDefined(); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(1024); + // Verify all values are numbers + expect((doc?.titleEmbeddings as number[]).every((n) => typeof n === "number")).toBe(true); + }); + + it("generates embeddings for multiple documents with the same field", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database: integration.randomDbName(), + collection: "test", + documents: [ + { + title: "The Matrix", + }, + { + title: "Blade Runner", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + titleEmbeddings: "The Matrix", + }, + { + titleEmbeddings: "Blade Runner", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(2); + + const doc1 = await collection.findOne({ _id: insertedIds[0] }); + expect(doc1?.title).toBe("The Matrix"); + expect(Array.isArray(doc1?.titleEmbeddings)).toBe(true); + expect((doc1?.titleEmbeddings as number[]).length).toBe(1024); + + const doc2 = await collection.findOne({ _id: insertedIds[1] }); + expect(doc2?.title).toBe("Blade Runner"); + expect(Array.isArray(doc2?.titleEmbeddings)).toBe(true); + expect((doc2?.titleEmbeddings as number[]).length).toBe(1024); + + // Verify embeddings are different + expect(doc1?.titleEmbeddings).not.toEqual(doc2?.titleEmbeddings); + }); + + it("generates embeddings for nested fields", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "info.titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + info: { + title: "The Matrix", + }, + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + "info.titleEmbeddings": "The Matrix", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.info).toBeDefined(); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + expect(doc?.info.title).toBe("The Matrix"); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + expect(Array.isArray(doc?.info.titleEmbeddings)).toBe(true); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + expect((doc?.info.titleEmbeddings as number[]).length).toBe(1024); + }); + + it("overwrites existing field value with generated embeddings", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + titleEmbeddings: [1, 2, 3], // This should be overwritten + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + titleEmbeddings: "The Matrix", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.title).toBe("The Matrix"); + expect(doc?.titleEmbeddings).not.toEqual([1, 2, 3]); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(1024); + }); + + it("removes redundant nested field from document when embeddings are generated", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + { + type: "vector", + path: "title.embeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [{ title: { text: "The Matrix", embeddings: "This should be removed" } }], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [{ "title.embeddings": "The Matrix" }], + }, + }, + }); + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect((doc?.title as Record)?.text).toBe("The Matrix"); + expect((doc?.title as Record)?.embeddings).not.toBeDefined(); + expect((doc?.["title.embeddings"] as unknown as number[]).length).toBe(1024); + }); + + it("returns an error when input field does not have a vector search index", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + nonExistentField: "The Matrix", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Error running insert-many"); + expect(content).toContain("Field 'nonExistentField' does not have a vector search index in collection"); + expect(content).toContain("Only fields with vector search indexes can have embeddings generated"); + }); + + it("inserts documents without embeddings when input array is empty", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.title).toBe("The Matrix"); + expect(doc?.titleEmbeddings).toBeUndefined(); + }); + + it("generates embeddings with 256 dimensions", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 256, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [{ title: "The Matrix" }], + embeddingParameters: { + model: "voyage-3.5-lite", + outputDimension: 256, + input: [{ titleEmbeddings: "The Matrix" }], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(256); + }); + + it("generates embeddings for multiple fields in a single document", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + { + type: "vector", + path: "plotEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + plot: "A computer hacker learns about the true nature of his reality", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + titleEmbeddings: "The Matrix", + plotEmbeddings: "A computer hacker learns about the true nature of his reality", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.title).toBe("The Matrix"); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(1024); + expect(Array.isArray(doc?.plotEmbeddings)).toBe(true); + expect((doc?.plotEmbeddings as number[]).length).toBe(1024); + // Verify embeddings are different for different text + expect(doc?.titleEmbeddings).not.toEqual(doc?.plotEmbeddings); + }); + }); }, - { downloadOptions: { search: true } } + { + getUserConfig: () => ({ + ...defaultTestConfig, + voyageApiKey: process.env.TEST_MDB_MCP_VOYAGE_API_KEY ?? "", + }), + downloadOptions: { search: true }, + } ); function extractInsertedIds(content: string): ObjectId[] { From c22f2c5edbd2b1ba5b62de9c209defa99867c8d7 Mon Sep 17 00:00:00 2001 From: gagik Date: Wed, 29 Oct 2025 15:16:38 +0100 Subject: [PATCH 4/8] chore: fix tests --- .../search/vectorSearchEmbeddingsManager.ts | 2 +- src/tools/mongodb/create/insertMany.ts | 2 +- .../vectorSearchEmbeddingsManager.test.ts | 59 +++++++++++++++---- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/common/search/vectorSearchEmbeddingsManager.ts b/src/common/search/vectorSearchEmbeddingsManager.ts index f1cf9ce37..028964adc 100644 --- a/src/common/search/vectorSearchEmbeddingsManager.ts +++ b/src/common/search/vectorSearchEmbeddingsManager.ts @@ -124,7 +124,7 @@ export class VectorSearchEmbeddingsManager { } } - private async findFieldsWithWrongEmbeddings( + public async findFieldsWithWrongEmbeddings( { database, collection, diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index 304f3c992..29ef54830 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -28,7 +28,7 @@ export class InsertManyTool extends MongoDBToolBase { embeddingParameters: zSupportedEmbeddingParametersWithInput .optional() .describe( - "The embedding model and its parameters to use to generate embeddings for fields that have vector search indexes. When a field has a vector search index and contains a plain text string in the document, embeddings will be automatically generated from that string value. Note to LLM: If unsure which embedding model to use, ask the user before providing one." + "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one." ), }; public operationType: OperationType = "create"; diff --git a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts index 24b921e72..7581889bd 100644 --- a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts +++ b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, vi, beforeEach } from "vitest"; +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; import type { MockedFunction } from "vitest"; import { VectorSearchEmbeddingsManager } from "../../../../src/common/search/vectorSearchEmbeddingsManager.js"; import type { @@ -411,7 +411,7 @@ describe("VectorSearchEmbeddingsManager", () => { ); }); - describe("when atlas search is not available", () => { + describe("assertVectorSearchIndexExists", () => { beforeEach(() => { embeddings = new VectorSearchEmbeddingsManager( embeddingValidationEnabled, @@ -419,12 +419,57 @@ describe("VectorSearchEmbeddingsManager", () => { new Map(), getMockedEmbeddingsProvider ); + }); + + afterEach(() => { + provider.getSearchIndexes.mockReset(); + }); + + it("does not throw an exception when index is available for path", async () => { + provider.getSearchIndexes.mockResolvedValue([ + { + id: "65e8c766d0450e3e7ab9855f", + name: "vector-search-test", + type: "vectorSearch", + status: "READY", + queryable: true, + latestDefinition: { + fields: [ + { + type: "vector", + path: embeddingToGenerate.path, + numDimensions: 1024, + similarity: "euclidean", + }, + ], + }, + }, + ]); + await expect( + embeddings.assertVectorSearchIndexExists({ + database, + collection, + path: embeddingToGenerate.path, + }) + ).resolves.not.toThrowError(); + }); + it("throws an exception when atlas search is not available", async () => { provider.getSearchIndexes.mockRejectedValue(new Error()); + await expect( + embeddings.assertVectorSearchIndexExists({ + database, + collection, + path: embeddingToGenerate.path, + }) + ).rejects.toThrowError(); }); - it("throws an exception", async () => { - await expect(embeddings.generateEmbeddings(embeddingToGenerate)).rejects.toThrowError(); + it("throws an exception when no index is available for path", async () => { + provider.getSearchIndexes.mockResolvedValue([]); + await expect( + embeddings.assertVectorSearchIndexExists({ database, collection, path: embeddingToGenerate.path }) + ).rejects.toThrowError(); }); }); @@ -459,12 +504,6 @@ describe("VectorSearchEmbeddingsManager", () => { ); }); - describe("when no index is available for path", () => { - it("throws an exception", async () => { - await expect(embeddings.generateEmbeddings(embeddingToGenerate)).rejects.toThrowError(); - }); - }); - describe("when index is available on path", () => { beforeEach(() => { provider.getSearchIndexes.mockResolvedValue([ From 6ab92310cf6cca62c8a1b4172821e74468004d53 Mon Sep 17 00:00:00 2001 From: gagik Date: Wed, 29 Oct 2025 15:28:02 +0100 Subject: [PATCH 5/8] chore: fix more tests --- src/common/search/vectorSearchEmbeddingsManager.ts | 8 +++++++- src/tools/mongodb/create/insertMany.ts | 10 +++++----- .../tools/mongodb/create/insertMany.test.ts | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/common/search/vectorSearchEmbeddingsManager.ts b/src/common/search/vectorSearchEmbeddingsManager.ts index 028964adc..c118b3c2d 100644 --- a/src/common/search/vectorSearchEmbeddingsManager.ts +++ b/src/common/search/vectorSearchEmbeddingsManager.ts @@ -6,6 +6,7 @@ import z from "zod"; import { ErrorCodes, MongoDBError } from "../errors.js"; import { getEmbeddingsProvider } from "./embeddingsProvider.js"; import type { EmbeddingParameters, SupportedEmbeddingParameters } from "./embeddingsProvider.js"; +import { formatUntrustedData } from "../../tools/tool.js"; export const similarityEnum = z.enum(["cosine", "euclidean", "dotProduct"]); export type Similarity = z.infer; @@ -120,7 +121,12 @@ export class VectorSearchEmbeddingsManager { `actual quantization: ${validation.actualQuantization}. Error: ${validation.error}` ); - throw new MongoDBError(ErrorCodes.AtlasVectorSearchInvalidQuery, embeddingValidationMessages.join("\n")); + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + formatUntrustedData("", ...embeddingValidationMessages) + .map(({ text }) => text) + .join("\n") + ); } } diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index 29ef54830..8d4bc4285 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -49,6 +49,11 @@ export class InsertManyTool extends MongoDBToolBase { embeddingParameters, }); + await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings( + { database, collection }, + documents + ); + const result = await provider.insertMany(database, collection, documents); const content = formatUntrustedData( "Documents were inserted successfully.", @@ -120,11 +125,6 @@ export class InsertManyTool extends MongoDBToolBase { processedDocuments[documentIndex][fieldPath] = generatedEmbeddings[index]; } - await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings( - { database, collection }, - processedDocuments - ); - return processedDocuments; } diff --git a/tests/integration/tools/mongodb/create/insertMany.test.ts b/tests/integration/tools/mongodb/create/insertMany.test.ts index f270d6cb8..b37ae548a 100644 --- a/tests/integration/tools/mongodb/create/insertMany.test.ts +++ b/tests/integration/tools/mongodb/create/insertMany.test.ts @@ -180,7 +180,7 @@ describeWithMongoDB( }); const content = getResponseContent(response.content); - expect(content).toContain("There were errors when inserting documents. No document was inserted."); + expect(content).toContain("Error running insert-many"); const untrustedContent = getDataFromUntrustedContent(content); expect(untrustedContent).toContain( "- Field embedding is an embedding with 8 dimensions and scalar quantization, and the provided value is not compatible. Actual dimensions: unknown, actual quantization: unknown. Error: not-a-vector" From aa6dd8ad70d3f8bebca06155748869fb76adadee Mon Sep 17 00:00:00 2001 From: gagik Date: Wed, 29 Oct 2025 15:49:48 +0100 Subject: [PATCH 6/8] chore: conditional usage of vector search param --- src/tools/mongodb/create/insertMany.ts | 20 +++++++--- tests/accuracy/insertMany.embeddings.test.ts | 2 +- .../tools/mongodb/create/insertMany.test.ts | 38 +++++++++++++++---- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index 8d4bc4285..86aec3203 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -25,11 +25,15 @@ export class InsertManyTool extends MongoDBToolBase { .describe( "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()." ), - embeddingParameters: zSupportedEmbeddingParametersWithInput - .optional() - .describe( - "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one." - ), + ...(this.isFeatureEnabled("vectorSearch") + ? { + embeddingParameters: zSupportedEmbeddingParametersWithInput + .optional() + .describe( + "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one." + ), + } + : {}), }; public operationType: OperationType = "create"; @@ -37,10 +41,14 @@ export class InsertManyTool extends MongoDBToolBase { database, collection, documents, - embeddingParameters, + embeddingParameters: providedEmbeddingParameters, }: ToolArgs): Promise { const provider = await this.ensureConnected(); + const embeddingParameters = this.isFeatureEnabled("vectorSearch") + ? (providedEmbeddingParameters as z.infer) + : undefined; + // Process documents to replace raw string values with generated embeddings documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({ database, diff --git a/tests/accuracy/insertMany.embeddings.test.ts b/tests/accuracy/insertMany.embeddings.test.ts index 117a70b62..6445b8458 100644 --- a/tests/accuracy/insertMany.embeddings.test.ts +++ b/tests/accuracy/insertMany.embeddings.test.ts @@ -187,7 +187,7 @@ describeAccuracyTests( }, ], { - userConfig: { voyageApiKey: "valid-key" }, + userConfig: { voyageApiKey: "valid-key", previewFeatures: "vectorSearch" }, clusterConfig: { search: true, }, diff --git a/tests/integration/tools/mongodb/create/insertMany.test.ts b/tests/integration/tools/mongodb/create/insertMany.test.ts index b37ae548a..a5dd31cef 100644 --- a/tests/integration/tools/mongodb/create/insertMany.test.ts +++ b/tests/integration/tools/mongodb/create/insertMany.test.ts @@ -28,13 +28,6 @@ describeWithMongoDB("insertMany tool when search is disabled", (integration) => "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany().", required: true, }, - { - name: "embeddingParameters", - type: "object", - description: - "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one.", - required: false, - }, ]); validateThrowsForInvalidArguments(integration, "insert-many", [ @@ -598,11 +591,42 @@ describeWithMongoDB( getUserConfig: () => ({ ...defaultTestConfig, voyageApiKey: process.env.TEST_MDB_MCP_VOYAGE_API_KEY ?? "", + previewFeatures: ["vectorSearch"], }), downloadOptions: { search: true }, } ); +describeWithMongoDB( + "insertMany tool when vector search is enabled", + (integration) => { + validateToolMetadata(integration, "insert-many", "Insert an array of documents into a MongoDB collection", [ + ...databaseCollectionParameters, + { + name: "documents", + type: "array", + description: + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany().", + required: true, + }, + { + name: "embeddingParameters", + type: "object", + description: + "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one.", + required: false, + }, + ]); + }, + { + getUserConfig: () => ({ + ...defaultTestConfig, + voyageApiKey: "valid-key", + previewFeatures: ["vectorSearch"], + }), + } +); + function extractInsertedIds(content: string): ObjectId[] { expect(content).toContain("Documents were inserted successfully."); expect(content).toContain("Inserted IDs:"); From dc2f20764b9252181077c339bc2a28f19f5d7b48 Mon Sep 17 00:00:00 2001 From: gagik Date: Wed, 29 Oct 2025 15:55:03 +0100 Subject: [PATCH 7/8] chore: more usage of database instead of randomDbName --- .../tools/mongodb/create/insertMany.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/tools/mongodb/create/insertMany.test.ts b/tests/integration/tools/mongodb/create/insertMany.test.ts index a5dd31cef..e9964e26e 100644 --- a/tests/integration/tools/mongodb/create/insertMany.test.ts +++ b/tests/integration/tools/mongodb/create/insertMany.test.ts @@ -166,7 +166,7 @@ describeWithMongoDB( const response = await integration.mcpClient().callTool({ name: "insert-many", arguments: { - database: integration.randomDbName(), + database: database, collection: "test", documents: [{ embedding: "oopsie" }], }, @@ -237,7 +237,7 @@ describeWithMongoDB( }); it("generates embeddings for multiple documents with the same field", async () => { - await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ { type: "vector", path: "titleEmbeddings", @@ -250,7 +250,7 @@ describeWithMongoDB( const response = await integration.mcpClient().callTool({ name: "insert-many", arguments: { - database: integration.randomDbName(), + database: database, collection: "test", documents: [ { @@ -388,7 +388,7 @@ describeWithMongoDB( }); it("removes redundant nested field from document when embeddings are generated", async () => { - await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ { type: "vector", path: "title.embeddings", @@ -422,7 +422,7 @@ describeWithMongoDB( }); it("returns an error when input field does not have a vector search index", async () => { - await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ { type: "vector", path: "titleEmbeddings", From fcfdd32f48d1f111d99f111848eda4f03f6b4089 Mon Sep 17 00:00:00 2001 From: gagik Date: Thu, 30 Oct 2025 11:01:46 +0100 Subject: [PATCH 8/8] chore: update types, add tests for assertVectorSearchIndexExists --- src/common/search/embeddingsProvider.ts | 2 +- .../search/vectorSearchEmbeddingsManager.ts | 17 ++- src/tools/mongodb/read/aggregate.ts | 2 +- .../vectorSearchEmbeddingsManager.test.ts | 143 ++++++++++++++++++ 4 files changed, 154 insertions(+), 10 deletions(-) diff --git a/src/common/search/embeddingsProvider.ts b/src/common/search/embeddingsProvider.ts index 24b6e2c34..b87906ef2 100644 --- a/src/common/search/embeddingsProvider.ts +++ b/src/common/search/embeddingsProvider.ts @@ -7,7 +7,7 @@ import { createFetch } from "@mongodb-js/devtools-proxy-support"; import { z } from "zod"; type EmbeddingsInput = string; -type Embeddings = number[]; +type Embeddings = number[] | unknown[]; export type EmbeddingParameters = { inputType: "query" | "document"; }; diff --git a/src/common/search/vectorSearchEmbeddingsManager.ts b/src/common/search/vectorSearchEmbeddingsManager.ts index c118b3c2d..1af3a8a6c 100644 --- a/src/common/search/vectorSearchEmbeddingsManager.ts +++ b/src/common/search/vectorSearchEmbeddingsManager.ts @@ -108,13 +108,14 @@ export class VectorSearchEmbeddingsManager { { database, collection }: { database: string; collection: string }, documents: Document[] ): Promise { - const embeddingValidationResults = await Promise.all( - documents.map((document) => this.findFieldsWithWrongEmbeddings({ database, collection }, document)) - ); - const embeddingValidations = new Set(embeddingValidationResults.flat()); - - if (embeddingValidations.size > 0) { - const embeddingValidationMessages = Array.from(embeddingValidations).map( + const embeddingValidationResults = ( + await Promise.all( + documents.map((document) => this.findFieldsWithWrongEmbeddings({ database, collection }, document)) + ) + ).flat(); + + if (embeddingValidationResults.length > 0) { + const embeddingValidationMessages = embeddingValidationResults.map( (validation) => `- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` + ` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` + @@ -293,7 +294,7 @@ export class VectorSearchEmbeddingsManager { rawValues: string[]; embeddingParameters: SupportedEmbeddingParameters; inputType: EmbeddingParameters["inputType"]; - }): Promise { + }): Promise { const provider = await this.atlasSearchEnabledProvider(); if (!provider) { throw new MongoDBError( diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index e85844977..001df3c41 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -298,7 +298,7 @@ export class AggregateTool extends MongoDBToolBase { // $vectorSearch.queryVector can be a BSON.Binary: that it's not either number or an array. // It's not exactly valid from the LLM perspective (they can't provide binaries). // That's why we overwrite the stage in an untyped way, as what we expose and what LLMs can use is different. - vectorSearchStage.queryVector = embeddings; + vectorSearchStage.queryVector = embeddings as string | number[]; } } diff --git a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts index 7581889bd..9b00e2e38 100644 --- a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts +++ b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts @@ -390,6 +390,149 @@ describe("VectorSearchEmbeddingsManager", () => { }); }); + describe("assertFieldsHaveCorrectEmbeddings", () => { + it("does not throw for invalid documents when validation is disabled", async () => { + const embeddings = new VectorSearchEmbeddingsManager( + embeddingValidationDisabled, + connectionManager, + embeddingConfig + ); + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: "some text" }, + { embedding_field: [1, 2, 3] }, + ]) + ).resolves.not.toThrow(); + }); + + describe("when validation is enabled", () => { + let embeddings: VectorSearchEmbeddingsManager; + + beforeEach(() => { + embeddings = new VectorSearchEmbeddingsManager( + embeddingValidationEnabled, + connectionManager, + embeddingConfig + ); + }); + + it("does not throw when all documents are valid", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3, 4, 5, 6, 7, 8] }, + { embedding_field: [9, 10, 11, 12, 13, 14, 15, 16] }, + { field: "no embeddings here" }, + ]) + ).resolves.not.toThrow(); + }); + + it("throws error when one document has wrong dimensions", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/Field embedding_field is an embedding with 8 dimensions/); + }); + + it("throws error when one document has wrong type", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: "some text" }, + ]) + ).rejects.toThrow(/Field embedding_field is an embedding with 8 dimensions/); + }); + + it("throws error when one document has non-numeric values", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: ["1", "2", "3", "4", "5", "6", "7", "8"] }, + ]) + ).rejects.toThrow(/Field embedding_field is an embedding with 8 dimensions/); + }); + + it("throws error with details about dimension mismatch", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/Actual dimensions: 3/); + }); + + it("throws error with details about quantization", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/actual quantization: scalar/); + }); + + it("throws error with details about error type", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/Error: dimension-mismatch/); + }); + + it("throws error when multiple documents have invalid embeddings", async () => { + try { + await embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + { embedding_field: "some text" }, + ]); + expect.fail("Should have thrown an error"); + } catch (error) { + expect((error as Error).message).toContain("Field embedding_field"); + expect((error as Error).message).toContain("dimension-mismatch"); + } + }); + + it("handles documents with multiple invalid fields", async () => { + try { + await embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { + embedding_field: [1, 2, 3], + embedding_field_binary: "not binary", + }, + ]); + expect.fail("Should have thrown an error"); + } catch (error) { + expect((error as Error).message).toContain("Field embedding_field"); + expect((error as Error).message).toContain("Field embedding_field_binary"); + } + }); + + it("handles mix of valid and invalid documents", async () => { + try { + await embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3, 4, 5, 6, 7, 8] }, // valid + { embedding_field: [1, 2, 3] }, // invalid + { valid_field: "no embeddings" }, // valid (no embedding field) + ]); + expect.fail("Should have thrown an error"); + } catch (error) { + expect((error as Error).message).toContain("Field embedding_field"); + expect((error as Error).message).toContain("dimension-mismatch"); + expect((error as Error).message).not.toContain("Field valid_field"); + } + }); + + it("handles nested fields with validation errors", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { a: { nasty: { scalar: { field: [1, 2, 3] } } } }, + ]) + ).rejects.toThrow(/Field a\.nasty\.scalar\.field/); + }); + + it("handles empty document array", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, []) + ).resolves.not.toThrow(); + }); + }); + }); + describe("generate embeddings", () => { const embeddingToGenerate = { database: "mydb",