From 2cb086507b254b0a2705902df9053c10c1c3bc55 Mon Sep 17 00:00:00 2001 From: nirinchev Date: Tue, 14 Oct 2025 13:33:18 +0200 Subject: [PATCH 1/4] fix accuracy tests --- scripts/accuracy/generateTestSummary.ts | 3 +- tests/accuracy/createCollection.test.ts | 5 ++ tests/accuracy/dropCollection.test.ts | 12 ++++ tests/accuracy/find.test.ts | 9 ++- tests/accuracy/getPerformanceAdvisor.test.ts | 72 ++++++------------- .../accuracyResultStorage/resultStorage.ts | 4 +- tests/accuracy/sdk/accuracyScorer.ts | 13 ++-- 7 files changed, 59 insertions(+), 59 deletions(-) diff --git a/scripts/accuracy/generateTestSummary.ts b/scripts/accuracy/generateTestSummary.ts index 0d76cc3b..eae58007 100644 --- a/scripts/accuracy/generateTestSummary.ts +++ b/scripts/accuracy/generateTestSummary.ts @@ -73,7 +73,8 @@ function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[ return toolCalls .map((call) => { const params = JSON.stringify(call.parameters, null, 2); - return `${call.toolName}`; + const isOptional = "optional" in call && call.optional; + return `${isOptional ? "(" : ""}${call.toolName}${isOptional ? ")" : ""}`; }) .join(", "); } diff --git a/tests/accuracy/createCollection.test.ts b/tests/accuracy/createCollection.test.ts index 75c32e01..6b42250e 100644 --- a/tests/accuracy/createCollection.test.ts +++ b/tests/accuracy/createCollection.test.ts @@ -28,6 +28,11 @@ describeAccuracyTests([ { prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + optional: true, + }, { toolName: "list-collections", parameters: { diff --git a/tests/accuracy/dropCollection.test.ts b/tests/accuracy/dropCollection.test.ts index 091a5446..565bef90 100644 --- a/tests/accuracy/dropCollection.test.ts +++ b/tests/accuracy/dropCollection.test.ts @@ -4,6 +4,18 @@ describeAccuracyTests([ { prompt: "Remove mflix.movies namespace from my cluster.", expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + optional: true, + }, + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, + optional: true, + }, { toolName: "drop-collection", parameters: { diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 6495912d..67caaf1b 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,3 +1,4 @@ +import { jsonExportFormat } from "../../src/common/exportsManager.js"; import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; import { Matcher } from "./sdk/matcher.js"; @@ -124,6 +125,7 @@ describeAccuracyTests([ limit: Matcher.anyValue, sort: Matcher.anyValue, }, + optional: true, }, { toolName: "export", @@ -137,7 +139,7 @@ describeAccuracyTests([ arguments: Matcher.anyOf( Matcher.emptyObjectOrUndefined, Matcher.value({ - filter: Matcher.anyValue, + filter: Matcher.emptyObjectOrUndefined, projection: Matcher.anyValue, limit: Matcher.anyValue, sort: Matcher.anyValue, @@ -145,6 +147,11 @@ describeAccuracyTests([ ), }, ], + jsonExportFormat: Matcher.anyOf( + Matcher.undefined, + Matcher.value("relaxed"), + Matcher.value("canonical") + ), }, }, ], diff --git a/tests/accuracy/getPerformanceAdvisor.test.ts b/tests/accuracy/getPerformanceAdvisor.test.ts index 62b570c1..9f3fbe59 100644 --- a/tests/accuracy/getPerformanceAdvisor.test.ts +++ b/tests/accuracy/getPerformanceAdvisor.test.ts @@ -35,21 +35,27 @@ const mockedTools = { }, }; +const listProjectsAndClustersToolCalls = [ + { + toolName: "atlas-list-projects", + parameters: {}, + optional: true, + }, + { + toolName: "atlas-list-clusters", + parameters: { + projectId: "mflix", + }, + optional: true, + }, +]; + describeAccuracyTests([ // Test for Suggested Indexes operation { prompt: "Can you give me index suggestions for the database 'mflix' in the project 'mflix' and cluster 'mflix-cluster'?", expectedToolCalls: [ - { - toolName: "atlas-list-projects", - parameters: {}, - }, - { - toolName: "atlas-list-clusters", - parameters: { - projectId: "mflix", - }, - }, + ...listProjectsAndClustersToolCalls, { toolName: "atlas-get-performance-advisor", parameters: { @@ -65,16 +71,7 @@ describeAccuracyTests([ { prompt: "Show me drop index suggestions for the 'mflix' project and 'mflix-cluster' cluster", expectedToolCalls: [ - { - toolName: "atlas-list-projects", - parameters: {}, - }, - { - toolName: "atlas-list-clusters", - parameters: { - projectId: "mflix", - }, - }, + ...listProjectsAndClustersToolCalls, { toolName: "atlas-get-performance-advisor", parameters: { @@ -88,18 +85,9 @@ describeAccuracyTests([ }, // Test for Slow Query Logs operation { - prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025.", + prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025 (a date that is certainly in the past!).", expectedToolCalls: [ - { - toolName: "atlas-list-projects", - parameters: {}, - }, - { - toolName: "atlas-list-clusters", - parameters: { - projectId: "mflix", - }, - }, + ...listProjectsAndClustersToolCalls, { toolName: "atlas-get-performance-advisor", parameters: { @@ -117,16 +105,7 @@ describeAccuracyTests([ { prompt: "Give me schema suggestions for the 'mflix' project and 'mflix-cluster' cluster", expectedToolCalls: [ - { - toolName: "atlas-list-projects", - parameters: {}, - }, - { - toolName: "atlas-list-clusters", - parameters: { - projectId: "mflix", - }, - }, + ...listProjectsAndClustersToolCalls, { toolName: "atlas-get-performance-advisor", parameters: { @@ -142,16 +121,7 @@ describeAccuracyTests([ { prompt: "Show me all performance advisor recommendations for the 'mflix' project and 'mflix-cluster' cluster", expectedToolCalls: [ - { - toolName: "atlas-list-projects", - parameters: {}, - }, - { - toolName: "atlas-list-clusters", - parameters: { - projectId: "mflix", - }, - }, + ...listProjectsAndClustersToolCalls, { toolName: "atlas-get-performance-advisor", parameters: { diff --git a/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts index 845af8a0..02f95e79 100644 --- a/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts +++ b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts @@ -4,7 +4,9 @@ export interface LLMToolCall { parameters: Record; } -export type ExpectedToolCall = Omit; +export type ExpectedToolCall = Omit & { + optional?: boolean; +}; export const AccuracyRunStatus = { Done: "done", diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts index 24a6caf1..8a1a7000 100644 --- a/tests/accuracy/sdk/accuracyScorer.ts +++ b/tests/accuracy/sdk/accuracyScorer.ts @@ -81,12 +81,15 @@ export function calculateToolCallingAccuracy( .sort((a, b) => b.score - a.score || a.index - b.index); const bestMatch = candidates[0]; - if (!bestMatch || bestMatch.score === 0) { - return 0; // No matching tool call found, return 0 + if (bestMatch) { + checkedActualToolCallIndexes.add(bestMatch.index); + currentScore = Math.min(currentScore, bestMatch.score); + } else if (expectedCall.optional) { + // Optional expected tool call not found, but it's okay, continue + continue; + } else { + return 0; // Required expected tool call not found, return 0 } - - checkedActualToolCallIndexes.add(bestMatch.index); - currentScore = Math.min(currentScore, bestMatch.score); } return currentScore; From 8ec9c59ad8e8b755faede7ba3d50fb9e7a9e87a4 Mon Sep 17 00:00:00 2001 From: nirinchev Date: Tue, 14 Oct 2025 13:41:55 +0200 Subject: [PATCH 2/4] remove import --- tests/accuracy/find.test.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 67caaf1b..4b2802bb 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,4 +1,3 @@ -import { jsonExportFormat } from "../../src/common/exportsManager.js"; import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; import { Matcher } from "./sdk/matcher.js"; From 16121e3b4ca76d4722b19bf66fbf4ecf06e5e77f Mon Sep 17 00:00:00 2001 From: nirinchev Date: Wed, 15 Oct 2025 15:27:53 +0200 Subject: [PATCH 3/4] revert before/afterAll solution for createIndex.test.ts --- tests/accuracy/createIndex.test.ts | 15 ++------------- tests/accuracy/export.test.ts | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts index becd5b46..f3c600ea 100644 --- a/tests/accuracy/createIndex.test.ts +++ b/tests/accuracy/createIndex.test.ts @@ -1,19 +1,8 @@ -import { afterAll, beforeAll } from "vitest"; import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; import { Matcher } from "./sdk/matcher.js"; -let originalApiKey: string | undefined; -beforeAll(() => { - originalApiKey = process.env.MDB_VOYAGE_API_KEY; - - // We just need a valid key when registering the tool, the actual value is not important - if (!originalApiKey) { - process.env.MDB_VOYAGE_API_KEY = "valid-key"; - } -}); -afterAll(() => { - process.env.MDB_VOYAGE_API_KEY = originalApiKey; -}); +// TODO: supply this with a proper config API once we refactor describeAccuracyTests to support it +process.env.MDB_VOYAGE_API_KEY = "valid-key"; describeAccuracyTests([ { diff --git a/tests/accuracy/export.test.ts b/tests/accuracy/export.test.ts index 6faddc37..534f2ab6 100644 --- a/tests/accuracy/export.test.ts +++ b/tests/accuracy/export.test.ts @@ -114,12 +114,20 @@ describeAccuracyTests([ arguments: { pipeline: [ { - $group: { - _id: "$release_year", - titles: { - $push: "$title", - }, - }, + $group: Matcher.anyOf( + Matcher.value({ + _id: "$release_year", + titles: { + $push: "$title", + }, + }), + Matcher.value({ + _id: "$release_year", + movies: { + $push: "$title", + }, + }) + ), }, ], }, From 584f02aa8c97971bf50e03c03832c210335be2a9 Mon Sep 17 00:00:00 2001 From: nirinchev Date: Wed, 15 Oct 2025 15:42:04 +0200 Subject: [PATCH 4/4] use a date that's way in the past for slow queries --- tests/accuracy/dropDatabase.test.ts | 5 +++++ tests/accuracy/getPerformanceAdvisor.test.ts | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/accuracy/dropDatabase.test.ts b/tests/accuracy/dropDatabase.test.ts index 3010e83a..f5571486 100644 --- a/tests/accuracy/dropDatabase.test.ts +++ b/tests/accuracy/dropDatabase.test.ts @@ -4,6 +4,11 @@ describeAccuracyTests([ { prompt: "Remove mflix database from my cluster.", expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + optional: true, + }, { toolName: "drop-database", parameters: { diff --git a/tests/accuracy/getPerformanceAdvisor.test.ts b/tests/accuracy/getPerformanceAdvisor.test.ts index 9f3fbe59..02b61b33 100644 --- a/tests/accuracy/getPerformanceAdvisor.test.ts +++ b/tests/accuracy/getPerformanceAdvisor.test.ts @@ -85,7 +85,7 @@ describeAccuracyTests([ }, // Test for Slow Query Logs operation { - prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025 (a date that is certainly in the past!).", + prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2023", expectedToolCalls: [ ...listProjectsAndClustersToolCalls, { @@ -95,7 +95,7 @@ describeAccuracyTests([ clusterName: "mflix-cluster", operations: ["slowQueryLogs"], namespaces: ["mflix.movies", "mflix.shows"], - since: "2025-01-01T00:00:00Z", + since: "2023-01-01T00:00:00Z", }, }, ],