diff --git a/.gitignore b/.gitignore index 49550e27..1029194e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ tests/tmp coverage # Generated assets by accuracy runs .accuracy + +.DS_Store \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 34fe72f7..77cd3457 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -76,6 +76,10 @@ npm test -- path/to/test/file.test.ts npm test -- path/to/directory ``` +#### Accuracy Tests and colima + +If you use [colima](https://github.com/abiosoft/colima) to run Docker on Mac, you will need to apply [additional configuration](https://node.testcontainers.org/supported-container-runtimes/#colima) to ensure the accuracy tests run correctly. + ## Troubleshooting ### Restart Server diff --git a/src/common/errors.ts b/src/common/errors.ts index 5880eb78..e44a7272 100644 --- a/src/common/errors.ts +++ b/src/common/errors.ts @@ -7,6 +7,7 @@ export enum ErrorCodes { NoEmbeddingsProviderConfigured = 1_000_005, AtlasVectorSearchIndexNotFound = 1_000_006, AtlasVectorSearchInvalidQuery = 1_000_007, + Unexpected = 1_000_008, } export class MongoDBError extends Error { diff --git a/src/common/search/embeddingsProvider.ts b/src/common/search/embeddingsProvider.ts index 24b6e2c3..b87906ef 100644 --- a/src/common/search/embeddingsProvider.ts +++ b/src/common/search/embeddingsProvider.ts @@ -7,7 +7,7 @@ import { createFetch } from "@mongodb-js/devtools-proxy-support"; import { z } from "zod"; type EmbeddingsInput = string; -type Embeddings = number[]; +type Embeddings = number[] | unknown[]; export type EmbeddingParameters = { inputType: "query" | "document"; }; diff --git a/src/common/search/vectorSearchEmbeddingsManager.ts b/src/common/search/vectorSearchEmbeddingsManager.ts index fc8c53be..1af3a8a6 100644 --- a/src/common/search/vectorSearchEmbeddingsManager.ts +++ b/src/common/search/vectorSearchEmbeddingsManager.ts @@ -6,6 +6,7 @@ import z from "zod"; import { ErrorCodes, MongoDBError } from "../errors.js"; import { getEmbeddingsProvider } from "./embeddingsProvider.js"; import type { EmbeddingParameters, SupportedEmbeddingParameters } from "./embeddingsProvider.js"; +import { formatUntrustedData } from "../../tools/tool.js"; export const similarityEnum = z.enum(["cosine", "euclidean", "dotProduct"]); export type Similarity = z.infer; @@ -103,7 +104,34 @@ export class VectorSearchEmbeddingsManager { return definition; } - async findFieldsWithWrongEmbeddings( + async assertFieldsHaveCorrectEmbeddings( + { database, collection }: { database: string; collection: string }, + documents: Document[] + ): Promise { + const embeddingValidationResults = ( + await Promise.all( + documents.map((document) => this.findFieldsWithWrongEmbeddings({ database, collection }, document)) + ) + ).flat(); + + if (embeddingValidationResults.length > 0) { + const embeddingValidationMessages = embeddingValidationResults.map( + (validation) => + `- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` + + ` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` + + `actual quantization: ${validation.actualQuantization}. Error: ${validation.error}` + ); + + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + formatUntrustedData("", ...embeddingValidationMessages) + .map(({ text }) => text) + .join("\n") + ); + } + } + + public async findFieldsWithWrongEmbeddings( { database, collection, @@ -239,21 +267,34 @@ export class VectorSearchEmbeddingsManager { return undefined; } - public async generateEmbeddings({ + public async assertVectorSearchIndexExists({ database, collection, path, - rawValues, - embeddingParameters, - inputType, }: { database: string; collection: string; path: string; + }): Promise { + const embeddingInfoForCollection = await this.embeddingsForNamespace({ database, collection }); + const embeddingInfoForPath = embeddingInfoForCollection.find((definition) => definition.path === path); + if (!embeddingInfoForPath) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchIndexNotFound, + `No Vector Search index found for path "${path}" in namespace "${database}.${collection}"` + ); + } + } + + public async generateEmbeddings({ + rawValues, + embeddingParameters, + inputType, + }: { rawValues: string[]; embeddingParameters: SupportedEmbeddingParameters; inputType: EmbeddingParameters["inputType"]; - }): Promise { + }): Promise { const provider = await this.atlasSearchEnabledProvider(); if (!provider) { throw new MongoDBError( @@ -275,15 +316,6 @@ export class VectorSearchEmbeddingsManager { }); } - const embeddingInfoForCollection = await this.embeddingsForNamespace({ database, collection }); - const embeddingInfoForPath = embeddingInfoForCollection.find((definition) => definition.path === path); - if (!embeddingInfoForPath) { - throw new MongoDBError( - ErrorCodes.AtlasVectorSearchIndexNotFound, - `No Vector Search index found for path "${path}" in namespace "${database}.${collection}"` - ); - } - return await embeddingsProvider.embed(embeddingParameters.model, rawValues, { inputType, ...embeddingParameters, diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index fa3fc365..86aec320 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -3,6 +3,17 @@ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { type ToolArgs, type OperationType, formatUntrustedData } from "../../tool.js"; import { zEJSON } from "../../args.js"; +import { type Document } from "bson"; +import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js"; +import { ErrorCodes, MongoDBError } from "../../../common/errors.js"; + +const zSupportedEmbeddingParametersWithInput = zSupportedEmbeddingParameters.extend({ + input: z + .array(z.object({}).passthrough()) + .describe( + "Array of objects with vector search index fields as keys (in dot notation) and the raw text values to generate embeddings for as values. The index of each object corresponds to the index of the document in the documents array." + ), +}); export class InsertManyTool extends MongoDBToolBase { public name = "insert-many"; @@ -12,8 +23,17 @@ export class InsertManyTool extends MongoDBToolBase { documents: z .array(zEJSON().describe("An individual MongoDB document")) .describe( - "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()" + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()." ), + ...(this.isFeatureEnabled("vectorSearch") + ? { + embeddingParameters: zSupportedEmbeddingParametersWithInput + .optional() + .describe( + "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one." + ), + } + : {}), }; public operationType: OperationType = "create"; @@ -21,37 +41,26 @@ export class InsertManyTool extends MongoDBToolBase { database, collection, documents, + embeddingParameters: providedEmbeddingParameters, }: ToolArgs): Promise { const provider = await this.ensureConnected(); - const embeddingValidations = new Set( - ...(await Promise.all( - documents.flatMap((document) => - this.session.vectorSearchEmbeddingsManager.findFieldsWithWrongEmbeddings( - { database, collection }, - document - ) - ) - )) - ); + const embeddingParameters = this.isFeatureEnabled("vectorSearch") + ? (providedEmbeddingParameters as z.infer) + : undefined; - if (embeddingValidations.size > 0) { - // tell the LLM what happened - const embeddingValidationMessages = [...embeddingValidations].map( - (validation) => - `- Field ${validation.path} is an embedding with ${validation.expectedNumDimensions} dimensions and ${validation.expectedQuantization}` + - ` quantization, and the provided value is not compatible. Actual dimensions: ${validation.actualNumDimensions}, ` + - `actual quantization: ${validation.actualQuantization}. Error: ${validation.error}` - ); - - return { - content: formatUntrustedData( - "There were errors when inserting documents. No document was inserted.", - ...embeddingValidationMessages - ), - isError: true, - }; - } + // Process documents to replace raw string values with generated embeddings + documents = await this.replaceRawValuesWithEmbeddingsIfNecessary({ + database, + collection, + documents, + embeddingParameters, + }); + + await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings( + { database, collection }, + documents + ); const result = await provider.insertMany(database, collection, documents); const content = formatUntrustedData( @@ -63,4 +72,84 @@ export class InsertManyTool extends MongoDBToolBase { content, }; } + + private async replaceRawValuesWithEmbeddingsIfNecessary({ + database, + collection, + documents, + embeddingParameters, + }: { + database: string; + collection: string; + documents: Document[]; + embeddingParameters?: z.infer; + }): Promise { + // If no embedding parameters or no input specified, return documents as-is + if (!embeddingParameters?.input || embeddingParameters.input.length === 0) { + return documents; + } + + // Get vector search indexes for the collection + const vectorIndexes = await this.session.vectorSearchEmbeddingsManager.embeddingsForNamespace({ + database, + collection, + }); + + // Ensure for inputted fields, the vector search index exists. + for (const input of embeddingParameters.input) { + for (const fieldPath of Object.keys(input)) { + if (!vectorIndexes.some((index) => index.path === fieldPath)) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + `Field '${fieldPath}' does not have a vector search index in collection ${database}.${collection}. Only fields with vector search indexes can have embeddings generated.` + ); + } + } + } + + // We make one call to generate embeddings for all documents at once to avoid making too many API calls. + const flattenedEmbeddingsInput = embeddingParameters.input.flatMap((documentInput, index) => + Object.entries(documentInput).map(([fieldPath, rawTextValue]) => ({ + fieldPath, + rawTextValue, + documentIndex: index, + })) + ); + + const generatedEmbeddings = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ + rawValues: flattenedEmbeddingsInput.map(({ rawTextValue }) => rawTextValue) as string[], + embeddingParameters, + inputType: "document", + }); + + const processedDocuments: Document[] = [...documents]; + + for (const [index, { fieldPath, documentIndex }] of flattenedEmbeddingsInput.entries()) { + if (!processedDocuments[documentIndex]) { + throw new MongoDBError(ErrorCodes.Unexpected, `Document at index ${documentIndex} does not exist.`); + } + // Ensure no nested fields are present in the field path. + this.deleteFieldPath(processedDocuments[documentIndex], fieldPath); + processedDocuments[documentIndex][fieldPath] = generatedEmbeddings[index]; + } + + return processedDocuments; + } + + // Delete a specified field path from a document using dot notation. + private deleteFieldPath(document: Record, fieldPath: string): void { + const parts = fieldPath.split("."); + let current: Record = document; + for (let i = 0; i < parts.length; i++) { + const part = parts[i]; + const key = part as keyof typeof current; + if (!current[key]) { + return; + } else if (i === parts.length - 1) { + delete current[key]; + } else { + current = current[key] as Record; + } + } + } } diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index 4ad96590..001df3c4 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -276,22 +276,37 @@ export class AggregateTool extends MongoDBToolBase { const embeddingParameters = vectorSearchStage.embeddingParameters; delete vectorSearchStage.embeddingParameters; - const [embeddings] = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ + await this.session.vectorSearchEmbeddingsManager.assertVectorSearchIndexExists({ database, collection, path: vectorSearchStage.path, + }); + + const [embeddings] = await this.session.vectorSearchEmbeddingsManager.generateEmbeddings({ rawValues: [vectorSearchStage.queryVector], embeddingParameters, inputType: "query", }); + if (!embeddings) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + "Failed to generate embeddings for the query vector." + ); + } + // $vectorSearch.queryVector can be a BSON.Binary: that it's not either number or an array. // It's not exactly valid from the LLM perspective (they can't provide binaries). // That's why we overwrite the stage in an untyped way, as what we expose and what LLMs can use is different. - vectorSearchStage.queryVector = embeddings as number[]; + vectorSearchStage.queryVector = embeddings as string | number[]; } } + await this.session.vectorSearchEmbeddingsManager.assertFieldsHaveCorrectEmbeddings( + { database, collection }, + pipeline + ); + return pipeline; } diff --git a/tests/accuracy/insertMany.embeddings.test.ts b/tests/accuracy/insertMany.embeddings.test.ts new file mode 100644 index 00000000..6445b845 --- /dev/null +++ b/tests/accuracy/insertMany.embeddings.test.ts @@ -0,0 +1,195 @@ +import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +const embeddingParameters = { + model: "voyage-3.5", + outputDimension: Matcher.anyOf( + Matcher.undefined, + Matcher.number((n) => n === 1024) + ), + outputDType: Matcher.anyOf(Matcher.undefined, Matcher.value("float")), +}; + +const mockInsertMany = (): CallToolResult => { + return { + content: [ + { + type: "text", + text: "Documents were inserted successfully.", + }, + ], + }; +}; + +/** + * Accuracy tests for inserting documents with automatic vector embeddings generation. + */ +describeAccuracyTests( + [ + { + prompt: "Insert 2 documents in one call into 'mflix.movies' collection - document should have a 'title' field that has generated embeddings using the voyage-3.5 model: 'The Matrix' and 'Blade Runner'. Assume the collection already exists and has vector index on the 'title' field.", + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + // The title field might be specified, sometimes as "The Matrix" or "Placeholder". This will be overwritten by the embeddings so this is fine. + title: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + }, + { + title: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + }, + ], + embeddingParameters: { + ...embeddingParameters, + input: [ + { + title: "The Matrix", + }, + { + title: "Blade Runner", + }, + ], + }, + }, + }, + ], + }, + { + prompt: "Insert a document into 'mflix.movies' collection with following fields: title is 'The Matrix', plotSummary is 'A computer hacker learns about the true nature of his reality', generate the necesssary vector embeddings for the 'plotSummaryEmbeddings' field using the voyage-3.5 model. Assume the collection already exists and has vector index on the 'plotSummaryEmbeddings' field.", + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + title: "The Matrix", + plotSummary: "A computer hacker learns about the true nature of his reality", + plotSummaryEmbeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + }, + ], + embeddingParameters: { + ...embeddingParameters, + input: [ + { + plotSummaryEmbeddings: + "A computer hacker learns about the true nature of his reality", + }, + ], + }, + }, + }, + ], + }, + { + prompt: "Insert 2 documents in one call into 'mflix.movies' collection - the movie titles are 1. 'The Matrix' and 2. 'Blade Runner'. They should have an info field which has 2 subfields: 'title' and 'titleEmbeddings'. Generate the embeddings for the 'info.titleEmbeddings' subfield using the voyage-3.5 model. Assume the collection already exists and has vector index on the 'info.titleEmbeddings' field.", + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + info: { + titleEmbeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + title: "The Matrix", + }, + }, + { + info: { + titleEmbeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + title: "Blade Runner", + }, + }, + ], + embeddingParameters: { + ...embeddingParameters, + input: [ + { + "info.titleEmbeddings": "The Matrix", + }, + { + "info.titleEmbeddings": "Blade Runner", + }, + ], + }, + }, + }, + ], + mockedTools: { + "insert-many": mockInsertMany, + }, + }, + { + prompt: "Insert a document into 'mflix.movies' collection with title 'The Matrix' and generate the necesssary vector embeddings for the current vector search fields using the voyage-3.5 model.", + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + title: "The Matrix", + title_embeddings: Matcher.anyOf(Matcher.undefined, Matcher.null, Matcher.string()), + }, + ], + embeddingParameters: { + ...embeddingParameters, + input: [ + { + title_embeddings: "The Matrix", + }, + ], + }, + }, + }, + ], + mockedTools: { + "insert-many": mockInsertMany, + "collection-indexes": (): CallToolResult => { + return { + content: [ + { + type: "text", + text: JSON.stringify({ + name: "title_embeddings", + type: "vectorSearch", + status: "READY", + queryable: true, + latestDefinition: { + type: "vector", + path: "title_embeddings", + numDimensions: 1024, + quantization: "none", + similarity: "euclidean", + }, + }), + }, + ], + }; + }, + }, + }, + ], + { + userConfig: { voyageApiKey: "valid-key", previewFeatures: "vectorSearch" }, + clusterConfig: { + search: true, + }, + } +); diff --git a/tests/accuracy/sdk/matcher.ts b/tests/accuracy/sdk/matcher.ts index d2583681..7f403e3a 100644 --- a/tests/accuracy/sdk/matcher.ts +++ b/tests/accuracy/sdk/matcher.ts @@ -24,6 +24,10 @@ export abstract class Matcher { return new UndefinedMatcher(); } + public static get null(): Matcher { + return new NullMatcher(); + } + public static boolean(expected?: boolean): Matcher { return new BooleanMatcher(expected); } @@ -102,6 +106,12 @@ class UndefinedMatcher extends Matcher { } } +class NullMatcher extends Matcher { + public match(actual: unknown): number { + return actual === null ? 1 : 0; + } +} + class NotMatcher extends Matcher { constructor(private matcher: Matcher) { super(); diff --git a/tests/integration/tools/mongodb/create/insertMany.test.ts b/tests/integration/tools/mongodb/create/insertMany.test.ts index e7bbd096..e9964e26 100644 --- a/tests/integration/tools/mongodb/create/insertMany.test.ts +++ b/tests/integration/tools/mongodb/create/insertMany.test.ts @@ -12,8 +12,9 @@ import { validateThrowsForInvalidArguments, expectDefined, getDataFromUntrustedContent, + defaultTestConfig, } from "../../../helpers.js"; -import { beforeEach, afterEach, expect, it } from "vitest"; +import { beforeEach, afterEach, expect, it, describe } from "vitest"; import { ObjectId } from "bson"; import type { Collection } from "mongodb"; @@ -24,7 +25,7 @@ describeWithMongoDB("insertMany tool when search is disabled", (integration) => name: "documents", type: "array", description: - "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()", + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany().", required: true, }, ]); @@ -110,10 +111,12 @@ describeWithMongoDB( "insertMany tool when search is enabled", (integration) => { let collection: Collection; + let database: string; beforeEach(async () => { await integration.connectMcpClient(); - collection = await integration.mongoClient().db(integration.randomDbName()).createCollection("test"); + database = integration.randomDbName(); + collection = await integration.mongoClient().db(database).createCollection("test"); await waitUntilSearchIsReady(integration.mongoClient()); }); @@ -122,7 +125,7 @@ describeWithMongoDB( }); it("inserts a document when the embedding is correct", async () => { - await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ { type: "vector", path: "embedding", @@ -135,7 +138,7 @@ describeWithMongoDB( const response = await integration.mcpClient().callTool({ name: "insert-many", arguments: { - database: integration.randomDbName(), + database, collection: "test", documents: [{ embedding: [1, 2, 3, 4, 5, 6, 7, 8] }], }, @@ -150,7 +153,7 @@ describeWithMongoDB( }); it("returns an error when there is a search index and quantisation is wrong", async () => { - await createVectorSearchIndexAndWait(integration.mongoClient(), integration.randomDbName(), "test", [ + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ { type: "vector", path: "embedding", @@ -163,14 +166,14 @@ describeWithMongoDB( const response = await integration.mcpClient().callTool({ name: "insert-many", arguments: { - database: integration.randomDbName(), + database: database, collection: "test", documents: [{ embedding: "oopsie" }], }, }); const content = getResponseContent(response.content); - expect(content).toContain("There were errors when inserting documents. No document was inserted."); + expect(content).toContain("Error running insert-many"); const untrustedContent = getDataFromUntrustedContent(content); expect(untrustedContent).toContain( "- Field embedding is an embedding with 8 dimensions and scalar quantization, and the provided value is not compatible. Actual dimensions: unknown, actual quantization: unknown. Error: not-a-vector" @@ -181,8 +184,447 @@ describeWithMongoDB( }); expect(oopsieCount).toBe(0); }); + + describe.skipIf(!process.env.TEST_MDB_MCP_VOYAGE_API_KEY)("embeddings generation with Voyage AI", () => { + beforeEach(async () => { + await integration.connectMcpClient(); + database = integration.randomDbName(); + collection = await integration.mongoClient().db(database).createCollection("test"); + await waitUntilSearchIsReady(integration.mongoClient()); + }); + + afterEach(async () => { + await collection.drop(); + }); + + it("generates embeddings for a single document with one field", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [{ title: "The Matrix" }], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [{ titleEmbeddings: "The Matrix" }], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc).toBeDefined(); + expect(doc?.title).toBe("The Matrix"); + expect(doc?.titleEmbeddings).toBeDefined(); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(1024); + // Verify all values are numbers + expect((doc?.titleEmbeddings as number[]).every((n) => typeof n === "number")).toBe(true); + }); + + it("generates embeddings for multiple documents with the same field", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database: database, + collection: "test", + documents: [ + { + title: "The Matrix", + }, + { + title: "Blade Runner", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + titleEmbeddings: "The Matrix", + }, + { + titleEmbeddings: "Blade Runner", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(2); + + const doc1 = await collection.findOne({ _id: insertedIds[0] }); + expect(doc1?.title).toBe("The Matrix"); + expect(Array.isArray(doc1?.titleEmbeddings)).toBe(true); + expect((doc1?.titleEmbeddings as number[]).length).toBe(1024); + + const doc2 = await collection.findOne({ _id: insertedIds[1] }); + expect(doc2?.title).toBe("Blade Runner"); + expect(Array.isArray(doc2?.titleEmbeddings)).toBe(true); + expect((doc2?.titleEmbeddings as number[]).length).toBe(1024); + + // Verify embeddings are different + expect(doc1?.titleEmbeddings).not.toEqual(doc2?.titleEmbeddings); + }); + + it("generates embeddings for nested fields", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "info.titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + info: { + title: "The Matrix", + }, + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + "info.titleEmbeddings": "The Matrix", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.info).toBeDefined(); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + expect(doc?.info.title).toBe("The Matrix"); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + expect(Array.isArray(doc?.info.titleEmbeddings)).toBe(true); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + expect((doc?.info.titleEmbeddings as number[]).length).toBe(1024); + }); + + it("overwrites existing field value with generated embeddings", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + titleEmbeddings: [1, 2, 3], // This should be overwritten + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + titleEmbeddings: "The Matrix", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.title).toBe("The Matrix"); + expect(doc?.titleEmbeddings).not.toEqual([1, 2, 3]); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(1024); + }); + + it("removes redundant nested field from document when embeddings are generated", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "title.embeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [{ title: { text: "The Matrix", embeddings: "This should be removed" } }], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [{ "title.embeddings": "The Matrix" }], + }, + }, + }); + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect((doc?.title as Record)?.text).toBe("The Matrix"); + expect((doc?.title as Record)?.embeddings).not.toBeDefined(); + expect((doc?.["title.embeddings"] as unknown as number[]).length).toBe(1024); + }); + + it("returns an error when input field does not have a vector search index", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + nonExistentField: "The Matrix", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Error running insert-many"); + expect(content).toContain("Field 'nonExistentField' does not have a vector search index in collection"); + expect(content).toContain("Only fields with vector search indexes can have embeddings generated"); + }); + + it("inserts documents without embeddings when input array is empty", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + expect(insertedIds).toHaveLength(1); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.title).toBe("The Matrix"); + expect(doc?.titleEmbeddings).toBeUndefined(); + }); + + it("generates embeddings with 256 dimensions", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 256, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [{ title: "The Matrix" }], + embeddingParameters: { + model: "voyage-3.5-lite", + outputDimension: 256, + input: [{ titleEmbeddings: "The Matrix" }], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(256); + }); + + it("generates embeddings for multiple fields in a single document", async () => { + await createVectorSearchIndexAndWait(integration.mongoClient(), database, "test", [ + { + type: "vector", + path: "titleEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + { + type: "vector", + path: "plotEmbeddings", + numDimensions: 1024, + similarity: "cosine", + quantization: "scalar", + }, + ]); + + const response = await integration.mcpClient().callTool({ + name: "insert-many", + arguments: { + database, + collection: "test", + documents: [ + { + title: "The Matrix", + plot: "A computer hacker learns about the true nature of his reality", + }, + ], + embeddingParameters: { + model: "voyage-3.5-lite", + input: [ + { + titleEmbeddings: "The Matrix", + plotEmbeddings: "A computer hacker learns about the true nature of his reality", + }, + ], + }, + }, + }); + + const content = getResponseContent(response.content); + expect(content).toContain("Documents were inserted successfully."); + const insertedIds = extractInsertedIds(content); + + const doc = await collection.findOne({ _id: insertedIds[0] }); + expect(doc?.title).toBe("The Matrix"); + expect(Array.isArray(doc?.titleEmbeddings)).toBe(true); + expect((doc?.titleEmbeddings as number[]).length).toBe(1024); + expect(Array.isArray(doc?.plotEmbeddings)).toBe(true); + expect((doc?.plotEmbeddings as number[]).length).toBe(1024); + // Verify embeddings are different for different text + expect(doc?.titleEmbeddings).not.toEqual(doc?.plotEmbeddings); + }); + }); + }, + { + getUserConfig: () => ({ + ...defaultTestConfig, + voyageApiKey: process.env.TEST_MDB_MCP_VOYAGE_API_KEY ?? "", + previewFeatures: ["vectorSearch"], + }), + downloadOptions: { search: true }, + } +); + +describeWithMongoDB( + "insertMany tool when vector search is enabled", + (integration) => { + validateToolMetadata(integration, "insert-many", "Insert an array of documents into a MongoDB collection", [ + ...databaseCollectionParameters, + { + name: "documents", + type: "array", + description: + "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany().", + required: true, + }, + { + name: "embeddingParameters", + type: "object", + description: + "The embedding model and its parameters to use to generate embeddings for fields with vector search indexes. Note to LLM: If unsure which embedding model to use, ask the user before providing one.", + required: false, + }, + ]); }, - { downloadOptions: { search: true } } + { + getUserConfig: () => ({ + ...defaultTestConfig, + voyageApiKey: "valid-key", + previewFeatures: ["vectorSearch"], + }), + } ); function extractInsertedIds(content: string): ObjectId[] { diff --git a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts index 24b921e7..9b00e2e3 100644 --- a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts +++ b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, vi, beforeEach } from "vitest"; +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; import type { MockedFunction } from "vitest"; import { VectorSearchEmbeddingsManager } from "../../../../src/common/search/vectorSearchEmbeddingsManager.js"; import type { @@ -390,6 +390,149 @@ describe("VectorSearchEmbeddingsManager", () => { }); }); + describe("assertFieldsHaveCorrectEmbeddings", () => { + it("does not throw for invalid documents when validation is disabled", async () => { + const embeddings = new VectorSearchEmbeddingsManager( + embeddingValidationDisabled, + connectionManager, + embeddingConfig + ); + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: "some text" }, + { embedding_field: [1, 2, 3] }, + ]) + ).resolves.not.toThrow(); + }); + + describe("when validation is enabled", () => { + let embeddings: VectorSearchEmbeddingsManager; + + beforeEach(() => { + embeddings = new VectorSearchEmbeddingsManager( + embeddingValidationEnabled, + connectionManager, + embeddingConfig + ); + }); + + it("does not throw when all documents are valid", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3, 4, 5, 6, 7, 8] }, + { embedding_field: [9, 10, 11, 12, 13, 14, 15, 16] }, + { field: "no embeddings here" }, + ]) + ).resolves.not.toThrow(); + }); + + it("throws error when one document has wrong dimensions", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/Field embedding_field is an embedding with 8 dimensions/); + }); + + it("throws error when one document has wrong type", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: "some text" }, + ]) + ).rejects.toThrow(/Field embedding_field is an embedding with 8 dimensions/); + }); + + it("throws error when one document has non-numeric values", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: ["1", "2", "3", "4", "5", "6", "7", "8"] }, + ]) + ).rejects.toThrow(/Field embedding_field is an embedding with 8 dimensions/); + }); + + it("throws error with details about dimension mismatch", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/Actual dimensions: 3/); + }); + + it("throws error with details about quantization", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/actual quantization: scalar/); + }); + + it("throws error with details about error type", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + ]) + ).rejects.toThrow(/Error: dimension-mismatch/); + }); + + it("throws error when multiple documents have invalid embeddings", async () => { + try { + await embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3] }, + { embedding_field: "some text" }, + ]); + expect.fail("Should have thrown an error"); + } catch (error) { + expect((error as Error).message).toContain("Field embedding_field"); + expect((error as Error).message).toContain("dimension-mismatch"); + } + }); + + it("handles documents with multiple invalid fields", async () => { + try { + await embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { + embedding_field: [1, 2, 3], + embedding_field_binary: "not binary", + }, + ]); + expect.fail("Should have thrown an error"); + } catch (error) { + expect((error as Error).message).toContain("Field embedding_field"); + expect((error as Error).message).toContain("Field embedding_field_binary"); + } + }); + + it("handles mix of valid and invalid documents", async () => { + try { + await embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { embedding_field: [1, 2, 3, 4, 5, 6, 7, 8] }, // valid + { embedding_field: [1, 2, 3] }, // invalid + { valid_field: "no embeddings" }, // valid (no embedding field) + ]); + expect.fail("Should have thrown an error"); + } catch (error) { + expect((error as Error).message).toContain("Field embedding_field"); + expect((error as Error).message).toContain("dimension-mismatch"); + expect((error as Error).message).not.toContain("Field valid_field"); + } + }); + + it("handles nested fields with validation errors", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, [ + { a: { nasty: { scalar: { field: [1, 2, 3] } } } }, + ]) + ).rejects.toThrow(/Field a\.nasty\.scalar\.field/); + }); + + it("handles empty document array", async () => { + await expect( + embeddings.assertFieldsHaveCorrectEmbeddings({ database, collection }, []) + ).resolves.not.toThrow(); + }); + }); + }); + describe("generate embeddings", () => { const embeddingToGenerate = { database: "mydb", @@ -411,7 +554,7 @@ describe("VectorSearchEmbeddingsManager", () => { ); }); - describe("when atlas search is not available", () => { + describe("assertVectorSearchIndexExists", () => { beforeEach(() => { embeddings = new VectorSearchEmbeddingsManager( embeddingValidationEnabled, @@ -419,12 +562,57 @@ describe("VectorSearchEmbeddingsManager", () => { new Map(), getMockedEmbeddingsProvider ); + }); + + afterEach(() => { + provider.getSearchIndexes.mockReset(); + }); + it("does not throw an exception when index is available for path", async () => { + provider.getSearchIndexes.mockResolvedValue([ + { + id: "65e8c766d0450e3e7ab9855f", + name: "vector-search-test", + type: "vectorSearch", + status: "READY", + queryable: true, + latestDefinition: { + fields: [ + { + type: "vector", + path: embeddingToGenerate.path, + numDimensions: 1024, + similarity: "euclidean", + }, + ], + }, + }, + ]); + await expect( + embeddings.assertVectorSearchIndexExists({ + database, + collection, + path: embeddingToGenerate.path, + }) + ).resolves.not.toThrowError(); + }); + + it("throws an exception when atlas search is not available", async () => { provider.getSearchIndexes.mockRejectedValue(new Error()); + await expect( + embeddings.assertVectorSearchIndexExists({ + database, + collection, + path: embeddingToGenerate.path, + }) + ).rejects.toThrowError(); }); - it("throws an exception", async () => { - await expect(embeddings.generateEmbeddings(embeddingToGenerate)).rejects.toThrowError(); + it("throws an exception when no index is available for path", async () => { + provider.getSearchIndexes.mockResolvedValue([]); + await expect( + embeddings.assertVectorSearchIndexExists({ database, collection, path: embeddingToGenerate.path }) + ).rejects.toThrowError(); }); }); @@ -459,12 +647,6 @@ describe("VectorSearchEmbeddingsManager", () => { ); }); - describe("when no index is available for path", () => { - it("throws an exception", async () => { - await expect(embeddings.generateEmbeddings(embeddingToGenerate)).rejects.toThrowError(); - }); - }); - describe("when index is available on path", () => { beforeEach(() => { provider.getSearchIndexes.mockResolvedValue([