diff --git a/scripts/accuracy/generateTestSummary.ts b/scripts/accuracy/generateTestSummary.ts
index 0d76cc3b4..eae58007b 100644
--- a/scripts/accuracy/generateTestSummary.ts
+++ b/scripts/accuracy/generateTestSummary.ts
@@ -73,7 +73,8 @@ function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[
return toolCalls
.map((call) => {
const params = JSON.stringify(call.parameters, null, 2);
- return `${call.toolName}`;
+ const isOptional = "optional" in call && call.optional;
+ return `${isOptional ? "(" : ""}${call.toolName}${isOptional ? ")" : ""}`;
})
.join(", ");
}
diff --git a/tests/accuracy/createCollection.test.ts b/tests/accuracy/createCollection.test.ts
index 75c32e019..6b42250e6 100644
--- a/tests/accuracy/createCollection.test.ts
+++ b/tests/accuracy/createCollection.test.ts
@@ -28,6 +28,11 @@ describeAccuracyTests([
{
prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
expectedToolCalls: [
+ {
+ toolName: "list-databases",
+ parameters: {},
+ optional: true,
+ },
{
toolName: "list-collections",
parameters: {
diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts
index becd5b464..f3c600eaf 100644
--- a/tests/accuracy/createIndex.test.ts
+++ b/tests/accuracy/createIndex.test.ts
@@ -1,19 +1,8 @@
-import { afterAll, beforeAll } from "vitest";
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
import { Matcher } from "./sdk/matcher.js";
-let originalApiKey: string | undefined;
-beforeAll(() => {
- originalApiKey = process.env.MDB_VOYAGE_API_KEY;
-
- // We just need a valid key when registering the tool, the actual value is not important
- if (!originalApiKey) {
- process.env.MDB_VOYAGE_API_KEY = "valid-key";
- }
-});
-afterAll(() => {
- process.env.MDB_VOYAGE_API_KEY = originalApiKey;
-});
+// TODO: supply this with a proper config API once we refactor describeAccuracyTests to support it
+process.env.MDB_VOYAGE_API_KEY = "valid-key"; // NOTE(review): plain assignment at module load — replaces any pre-set key and is never restored, unlike the removed beforeAll/afterAll; confirm intended
describeAccuracyTests([
{
diff --git a/tests/accuracy/dropCollection.test.ts b/tests/accuracy/dropCollection.test.ts
index 091a54468..565bef903 100644
--- a/tests/accuracy/dropCollection.test.ts
+++ b/tests/accuracy/dropCollection.test.ts
@@ -4,6 +4,18 @@ describeAccuracyTests([
{
prompt: "Remove mflix.movies namespace from my cluster.",
expectedToolCalls: [
+ {
+ toolName: "list-databases",
+ parameters: {},
+ optional: true,
+ },
+ {
+ toolName: "list-collections",
+ parameters: {
+ database: "mflix",
+ },
+ optional: true,
+ },
{
toolName: "drop-collection",
parameters: {
diff --git a/tests/accuracy/dropDatabase.test.ts b/tests/accuracy/dropDatabase.test.ts
index 3010e83ae..f5571486f 100644
--- a/tests/accuracy/dropDatabase.test.ts
+++ b/tests/accuracy/dropDatabase.test.ts
@@ -4,6 +4,11 @@ describeAccuracyTests([
{
prompt: "Remove mflix database from my cluster.",
expectedToolCalls: [
+ {
+ toolName: "list-databases",
+ parameters: {},
+ optional: true,
+ },
{
toolName: "drop-database",
parameters: {
diff --git a/tests/accuracy/export.test.ts b/tests/accuracy/export.test.ts
index 6faddc378..534f2ab6e 100644
--- a/tests/accuracy/export.test.ts
+++ b/tests/accuracy/export.test.ts
@@ -114,12 +114,20 @@ describeAccuracyTests([
arguments: {
pipeline: [
{
- $group: {
- _id: "$release_year",
- titles: {
- $push: "$title",
- },
- },
+ $group: Matcher.anyOf(
+ Matcher.value({
+ _id: "$release_year",
+ titles: {
+ $push: "$title",
+ },
+ }),
+ Matcher.value({
+ _id: "$release_year",
+ movies: {
+ $push: "$title",
+ },
+ })
+ ),
},
],
},
diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts
index 6495912d0..4b2802bbf 100644
--- a/tests/accuracy/find.test.ts
+++ b/tests/accuracy/find.test.ts
@@ -124,6 +124,7 @@ describeAccuracyTests([
limit: Matcher.anyValue,
sort: Matcher.anyValue,
},
+ optional: true,
},
{
toolName: "export",
@@ -137,7 +138,7 @@ describeAccuracyTests([
arguments: Matcher.anyOf(
Matcher.emptyObjectOrUndefined,
Matcher.value({
- filter: Matcher.anyValue,
+ filter: Matcher.emptyObjectOrUndefined,
projection: Matcher.anyValue,
limit: Matcher.anyValue,
sort: Matcher.anyValue,
@@ -145,6 +146,11 @@ describeAccuracyTests([
),
},
],
+ jsonExportFormat: Matcher.anyOf(
+ Matcher.undefined,
+ Matcher.value("relaxed"),
+ Matcher.value("canonical")
+ ),
},
},
],
diff --git a/tests/accuracy/getPerformanceAdvisor.test.ts b/tests/accuracy/getPerformanceAdvisor.test.ts
index 62b570c12..02b61b33f 100644
--- a/tests/accuracy/getPerformanceAdvisor.test.ts
+++ b/tests/accuracy/getPerformanceAdvisor.test.ts
@@ -35,21 +35,27 @@ const mockedTools = {
},
};
+const listProjectsAndClustersToolCalls = [ // shared list spread at the start of each expectedToolCalls array below
+ {
+ toolName: "atlas-list-projects",
+ parameters: {},
+ optional: true, // previously listed inline in every test case without this flag
+ },
+ {
+ toolName: "atlas-list-clusters",
+ parameters: {
+ projectId: "mflix",
+ },
+ optional: true, // previously listed inline in every test case without this flag
+ },
+];
+
describeAccuracyTests([
// Test for Suggested Indexes operation
{
prompt: "Can you give me index suggestions for the database 'mflix' in the project 'mflix' and cluster 'mflix-cluster'?",
expectedToolCalls: [
- {
- toolName: "atlas-list-projects",
- parameters: {},
- },
- {
- toolName: "atlas-list-clusters",
- parameters: {
- projectId: "mflix",
- },
- },
+ ...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
@@ -65,16 +71,7 @@ describeAccuracyTests([
{
prompt: "Show me drop index suggestions for the 'mflix' project and 'mflix-cluster' cluster",
expectedToolCalls: [
- {
- toolName: "atlas-list-projects",
- parameters: {},
- },
- {
- toolName: "atlas-list-clusters",
- parameters: {
- projectId: "mflix",
- },
- },
+ ...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
@@ -88,18 +85,9 @@ describeAccuracyTests([
},
// Test for Slow Query Logs operation
{
- prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025.",
+ prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2023",
expectedToolCalls: [
- {
- toolName: "atlas-list-projects",
- parameters: {},
- },
- {
- toolName: "atlas-list-clusters",
- parameters: {
- projectId: "mflix",
- },
- },
+ ...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
@@ -107,7 +95,7 @@ describeAccuracyTests([
clusterName: "mflix-cluster",
operations: ["slowQueryLogs"],
namespaces: ["mflix.movies", "mflix.shows"],
- since: "2025-01-01T00:00:00Z",
+ since: "2023-01-01T00:00:00Z",
},
},
],
@@ -117,16 +105,7 @@ describeAccuracyTests([
{
prompt: "Give me schema suggestions for the 'mflix' project and 'mflix-cluster' cluster",
expectedToolCalls: [
- {
- toolName: "atlas-list-projects",
- parameters: {},
- },
- {
- toolName: "atlas-list-clusters",
- parameters: {
- projectId: "mflix",
- },
- },
+ ...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
@@ -142,16 +121,7 @@ describeAccuracyTests([
{
prompt: "Show me all performance advisor recommendations for the 'mflix' project and 'mflix-cluster' cluster",
expectedToolCalls: [
- {
- toolName: "atlas-list-projects",
- parameters: {},
- },
- {
- toolName: "atlas-list-clusters",
- parameters: {
- projectId: "mflix",
- },
- },
+ ...listProjectsAndClustersToolCalls,
{
toolName: "atlas-get-performance-advisor",
parameters: {
diff --git a/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts
index 845af8a04..02f95e795 100644
--- a/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts
+++ b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts
@@ -4,7 +4,9 @@ export interface LLMToolCall {
parameters: Record;
}
-export type ExpectedToolCall = Omit;
+export type ExpectedToolCall = Omit & {
+ optional?: boolean; // when truthy, the scorer skips this expected call instead of returning 0 if no matching actual call is found
+};
export const AccuracyRunStatus = {
Done: "done",
diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts
index 24a6caf1a..8a1a7000f 100644
--- a/tests/accuracy/sdk/accuracyScorer.ts
+++ b/tests/accuracy/sdk/accuracyScorer.ts
@@ -81,12 +81,15 @@ export function calculateToolCallingAccuracy(
.sort((a, b) => b.score - a.score || a.index - b.index);
const bestMatch = candidates[0];
- if (!bestMatch || bestMatch.score === 0) {
- return 0; // No matching tool call found, return 0
+ if (bestMatch) {
+ checkedActualToolCallIndexes.add(bestMatch.index);
+ currentScore = Math.min(currentScore, bestMatch.score);
+ } else if (expectedCall.optional) {
+ // Optional expected tool call not found, but it's okay, continue
+ continue;
+ } else {
+ return 0; // Required expected tool call not found, return 0
}
-
- checkedActualToolCallIndexes.add(bestMatch.index);
- currentScore = Math.min(currentScore, bestMatch.score);
}
return currentScore;