diff --git a/.github/workflows/dotnet-build-and-test.yml b/.github/workflows/dotnet-build-and-test.yml
index fcfe26fc0e..b522fc40cf 100644
--- a/.github/workflows/dotnet-build-and-test.yml
+++ b/.github/workflows/dotnet-build-and-test.yml
@@ -273,6 +273,8 @@ jobs:
-c ${{ matrix.configuration }} `
--no-build -v Normal `
--report-xunit-trx `
+ --report-junit `
+ --results-directory ../IntegrationTestResults/ `
--ignore-exit-code 8 `
--filter-not-trait "Category=IntegrationDisabled" `
--filter-not-trait "Category=FoundryHostedAgents" `
@@ -294,6 +296,10 @@ jobs:
AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }}
AZURE_AI_MODEL_DEPLOYMENT_NAME: ${{ vars.AZURE_AI_MODEL_DEPLOYMENT_NAME }}
AZURE_AI_BING_CONNECTION_ID: ${{ vars.AZURE_AI_BING_CONNECTION_ID }}
+ # Anthropic Models
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+ ANTHROPIC_CHAT_MODEL_NAME: ${{ vars.ANTHROPIC_CHAT_MODEL_NAME }}
+ ANTHROPIC_REASONING_MODEL_NAME: ${{ vars.ANTHROPIC_REASONING_MODEL_NAME }}
# Generate test reports and check coverage
- name: Generate test reports
@@ -316,6 +322,14 @@ jobs:
shell: pwsh
run: ./dotnet/eng/scripts/dotnet-check-coverage.ps1 -JsonReportPath "TestResults/Reports/Summary.json" -CoverageThreshold $env:COVERAGE_THRESHOLD
+ - name: Upload integration test results
+ if: always() && github.event_name != 'pull_request' && matrix.integration-tests
+ uses: actions/upload-artifact@v7
+ with:
+ name: dotnet-test-results-${{ matrix.targetFramework }}-${{ matrix.os }}
+ path: IntegrationTestResults/**/*.junit
+ if-no-files-found: ignore
+
# The Foundry hosted-agent IT is costly (it builds a container, pushes to ACR, and provisions
# live agents on a separate Foundry project). Running it in its own job keeps the overall
# workflow time roughly flat: it executes in parallel to dotnet-build and dotnet-test and is
@@ -448,3 +462,64 @@ jobs:
uses: actions/github-script@v8
with:
script: core.setFailed('Integration Tests Cancelled!')
+
+ # Integration test trend report (aggregates JUnit XML results from dotnet test jobs)
+ dotnet-integration-test-report:
+ name: Integration Test Report
+ if: >
+ always() &&
+ github.event_name != 'pull_request' &&
+ (contains(join(needs.*.result, ','), 'success') ||
+ contains(join(needs.*.result, ','), 'failure'))
+ needs: [dotnet-test]
+ runs-on: ubuntu-latest
+ defaults:
+ run:
+ working-directory: python
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ persist-credentials: false
+ sparse-checkout: |
+ .github/actions/python-setup
+ python
+ - name: Set up python and install the project
+ uses: ./.github/actions/python-setup
+ with:
+ python-version: "3.13"
+ os: ${{ runner.os }}
+ - name: Download all test results from current run
+ uses: actions/download-artifact@v4
+ with:
+ pattern: dotnet-test-results-*
+ path: dotnet-test-results/
+ - name: Restore report history cache
+ uses: actions/cache/restore@v4
+ with:
+ path: python/dotnet-integration-report-history.json
+ key: dotnet-integration-report-history-${{ github.run_id }}
+ restore-keys: |
+ dotnet-integration-report-history-
+ - name: Generate trend report
+ run: >
+ uv run python scripts/integration_test_report/aggregate.py
+ ../dotnet-test-results/
+ dotnet-integration-report-history.json
+ dotnet-integration-test-report.md
+ - name: Post to Job Summary
+ if: always()
+ run: cat dotnet-integration-test-report.md >> $GITHUB_STEP_SUMMARY
+ - name: Save report history cache
+ if: always()
+ uses: actions/cache/save@v4
+ with:
+ path: python/dotnet-integration-report-history.json
+ key: dotnet-integration-report-history-${{ github.run_id }}
+ - name: Upload trend report
+ if: always()
+ uses: actions/upload-artifact@v7
+ with:
+ name: dotnet-integration-test-report
+ path: |
+ python/dotnet-integration-test-report.md
+ python/dotnet-integration-report-history.json
diff --git a/dotnet/README.md b/dotnet/README.md
index 328dfdf684..2edb402a94 100644
--- a/dotnet/README.md
+++ b/dotnet/README.md
@@ -33,3 +33,4 @@ Console.WriteLine(await agent.RunAsync("Write a haiku about Microsoft Agent Fram
- [Design Documents](../docs/design)
- [Architectural Decision Records](../docs/decisions)
- [MSFT Learn Docs](https://learn.microsoft.com/agent-framework/overview/agent-framework-overview)
+
diff --git a/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicChatCompletionFixture.cs b/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicChatCompletionFixture.cs
index af98629237..7c2a0c3b6c 100644
--- a/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicChatCompletionFixture.cs
+++ b/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicChatCompletionFixture.cs
@@ -17,9 +17,6 @@ namespace AnthropicChatCompletion.IntegrationTests;
public class AnthropicChatCompletionFixture : IChatClientAgentFixture
{
- // All tests for Anthropic are intended to be ran locally as the CI pipeline for Anthropic is not setup.
- internal const string SkipReason = "Integrations tests for local execution only";
-
private readonly bool _useReasoningModel;
private readonly bool _useBeta;
@@ -105,7 +102,22 @@ public Task DeleteSessionAsync(AgentSession session) =>
public async ValueTask InitializeAsync()
{
- Assert.SkipWhen(SkipReason is not null, SkipReason ?? string.Empty);
+ // Temporarily disabled: Anthropic SDK has a binary incompatibility with the current
+ // Microsoft.Extensions.AI version (WebSearchToolResultContent.Results method not found).
+ // See: https://github.com/microsoft/agent-framework/pull/5515
+ Assert.Skip("Anthropic integration tests temporarily disabled due to SDK incompatibility with Microsoft.Extensions.AI");
+
+ try
+ {
+ _ = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey);
+ _ = TestConfiguration.GetRequiredValue(TestSettings.AnthropicChatModelName);
+ _ = TestConfiguration.GetRequiredValue(TestSettings.AnthropicReasoningModelName);
+ }
+ catch (InvalidOperationException ex)
+ {
+ Assert.Skip("Anthropic configuration could not be loaded. Error:" + ex.Message);
+ }
+
this._agent = await this.CreateChatClientAgentAsync();
}
diff --git a/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicSkillsIntegrationTests.cs b/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicSkillsIntegrationTests.cs
index 452b0c6cf2..82b3511993 100644
--- a/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicSkillsIntegrationTests.cs
+++ b/dotnet/tests/AnthropicChatCompletion.IntegrationTests/AnthropicSkillsIntegrationTests.cs
@@ -1,5 +1,6 @@
// Copyright (c) Microsoft. All rights reserved.
+using System;
using System.Threading.Tasks;
using AgentConformance.IntegrationTests.Support;
using Anthropic;
@@ -17,19 +18,28 @@ namespace AnthropicChatCompletion.IntegrationTests;
/// Integration tests for Anthropic Skills functionality.
/// These tests are designed to be run locally with a valid Anthropic API key.
///
+///
+/// Temporarily disabled due to Anthropic SDK binary incompatibility with
+/// the current Microsoft.Extensions.AI version (WebSearchToolResultContent.Results).
+///
+[Trait("Category", "IntegrationDisabled")]
public sealed class AnthropicSkillsIntegrationTests
{
- // All tests for Anthropic are intended to be ran locally as the CI pipeline for Anthropic is not setup.
- private const string SkipReason = "Integrations tests for local execution only";
-
[Fact]
public async Task CreateAgentWithPptxSkillAsync()
{
- Assert.SkipWhen(SkipReason is not null, SkipReason ?? string.Empty);
-
- // Arrange
- AnthropicClient anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
- string model = TestConfiguration.GetRequiredValue(TestSettings.AnthropicChatModelName);
+ AnthropicClient? anthropicClient;
+ string? model;
+ try
+ {
+ anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
+ model = TestConfiguration.GetRequiredValue(TestSettings.AnthropicChatModelName);
+ }
+ catch (InvalidOperationException ex)
+ {
+ Assert.Skip("Anthropic configuration could not be loaded. Error:" + ex.Message);
+ return;
+ }
BetaSkillParams pptxSkill = new()
{
@@ -56,10 +66,16 @@ public async Task CreateAgentWithPptxSkillAsync()
[Fact]
public async Task ListAnthropicManagedSkillsAsync()
{
- Assert.SkipWhen(SkipReason is not null, SkipReason ?? string.Empty);
-
- // Arrange
- AnthropicClient anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
+ AnthropicClient? anthropicClient;
+ try
+ {
+ anthropicClient = new() { ApiKey = TestConfiguration.GetRequiredValue(TestSettings.AnthropicApiKey) };
+ }
+ catch (InvalidOperationException ex)
+ {
+ Assert.Skip("Anthropic configuration could not be loaded. Error:" + ex.Message);
+ return;
+ }
// Act
SkillListPage skills = await anthropicClient.Beta.Skills.List(
diff --git a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ConsoleAppSamplesValidation.cs b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ConsoleAppSamplesValidation.cs
index 7b8fa3a8f9..5e1142f027 100644
--- a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ConsoleAppSamplesValidation.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ConsoleAppSamplesValidation.cs
@@ -13,8 +13,6 @@ namespace Microsoft.Agents.AI.DurableTask.IntegrationTests;
[Trait("Category", "SampleValidation")]
public sealed class ConsoleAppSamplesValidation(ITestOutputHelper outputHelper) : SamplesValidationBase(outputHelper)
{
- private const string SkipFlakyTimingTest = "Flaky: timing-dependent LLM test, see https://github.com/microsoft/agent-framework/issues/4971";
-
private static readonly string s_samplesPath = Path.GetFullPath(
Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "..", "..", "samples", "04-hosting", "DurableAgents", "ConsoleApps"));
@@ -69,7 +67,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task SingleAgentOrchestrationChainingSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
@@ -105,7 +103,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task MultiAgentConcurrencySampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
@@ -160,7 +158,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task MultiAgentConditionalSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
@@ -237,14 +235,14 @@ private async Task TestSpamDetectionAsync(
Assert.True(foundSuccess, "Orchestration did not complete successfully.");
}
- [Fact(Skip = SkipFlakyTimingTest)]
+ [RetryFact(2, 5000)]
public async Task SingleAgentOrchestrationHITLSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "05_AgentOrchestration_HITL");
await this.RunSampleTestAsync(samplePath, async (process, logs) =>
{
- using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts();
+ using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(180));
// Start the HITL orchestration following the happy path from README
await this.WriteInputAsync(process, "The Future of Artificial Intelligence", testTimeoutCts.Token);
@@ -260,7 +258,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
while ((line = this.ReadLogLine(logs, testTimeoutCts.Token)) != null)
{
// Look for notification that content is ready. The first time we see this, we should send a rejection.
- // The second time we see this, we should send approval.
+ // Subsequent times we see this, we should send approval (LLM may produce extra review cycles).
if (line.Contains("Content is ready for review", StringComparison.OrdinalIgnoreCase))
{
if (!rejectionSent)
@@ -275,20 +273,15 @@ await this.WriteInputAsync(
testTimeoutCts.Token);
rejectionSent = true;
}
- else if (!approvalSent)
+ else
{
- // Prompt: Approve? (y/n):
+ // Approve any subsequent draft (LLM non-determinism may produce extra review cycles)
await this.WriteInputAsync(process, "y", testTimeoutCts.Token);
// Prompt: Feedback (optional):
await this.WriteInputAsync(process, "Looks good!", testTimeoutCts.Token);
approvalSent = true;
}
- else
- {
- // This should never happen
- Assert.Fail("Unexpected message found.");
- }
}
// Look for success message
@@ -311,14 +304,14 @@ await this.WriteInputAsync(
});
}
- [Fact(Skip = SkipFlakyTimingTest)]
+ [RetryFact(2, 5000)]
public async Task LongRunningToolsSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "06_LongRunningTools");
await this.RunSampleTestAsync(samplePath, async (process, logs) =>
{
// This test takes a bit longer to run due to the multiple agent interactions and the lengthy content generation.
- using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(90));
+ using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(180));
// Test starting an agent that schedules a content generation orchestration
await this.WriteInputAsync(
@@ -335,7 +328,7 @@ await this.WriteInputAsync(
while ((line = this.ReadLogLine(logs, testTimeoutCts.Token)) != null)
{
// Look for notification that content is ready. The first time we see this, we should send a rejection.
- // The second time we see this, we should send approval.
+ // Subsequent times we see this, we should send approval (LLM may produce extra review cycles).
if (line.Contains("NOTIFICATION: Please review the following content for approval", StringComparison.OrdinalIgnoreCase))
{
// Wait for the notification to be fully written to the console
@@ -350,20 +343,15 @@ await this.WriteInputAsync(
testTimeoutCts.Token);
rejectionSent = true;
}
- else if (!approvalSent)
+ else
{
- // Approve the content. Note that we need to send a newline character to the console first before sending the input.
+ // Approve any subsequent draft (LLM non-determinism may produce extra review cycles)
await this.WriteInputAsync(
process,
"\nApprove the content",
testTimeoutCts.Token);
approvalSent = true;
}
- else
- {
- // This should never happen
- Assert.Fail("Unexpected message found.");
- }
}
// Look for success message
@@ -396,14 +384,14 @@ await this.WriteInputAsync(
});
}
- [Fact(Skip = SkipFlakyTimingTest)]
+ [RetryFact(2, 5000)]
public async Task ReliableStreamingSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "07_ReliableStreaming");
await this.RunSampleTestAsync(samplePath, async (process, logs) =>
{
// This test takes a bit longer to run due to the multiple agent interactions and the lengthy content generation.
- using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(90));
+ using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(TimeSpan.FromSeconds(150));
// Test the agent endpoint with a simple prompt
await this.WriteInputAsync(process, "Plan a 5-day trip to Seattle. Include daily activities.", testTimeoutCts.Token);
diff --git a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ExternalClientTests.cs b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ExternalClientTests.cs
index 134a12e688..aa1edab7da 100644
--- a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ExternalClientTests.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/ExternalClientTests.cs
@@ -19,11 +19,9 @@ namespace Microsoft.Agents.AI.DurableTask.IntegrationTests;
[Trait("Category", "Integration")]
public sealed class ExternalClientTests(ITestOutputHelper outputHelper) : IDisposable
{
- private const string SkipFlakyTimingTest = "Flaky: timing-dependent LLM test, see https://github.com/microsoft/agent-framework/issues/4971";
-
private static readonly TimeSpan s_defaultTimeout = Debugger.IsAttached
? TimeSpan.FromMinutes(5)
- : TimeSpan.FromSeconds(60);
+ : TimeSpan.FromSeconds(120);
private static readonly IConfiguration s_configuration =
new ConfigurationBuilder()
@@ -38,7 +36,7 @@ public sealed class ExternalClientTests(ITestOutputHelper outputHelper) : IDispo
public void Dispose() => this._cts.Dispose();
- [Fact]
+ [RetryFact(2, 5000)]
public async Task SimplePromptAsync()
{
// Setup
@@ -77,7 +75,7 @@ await simpleAgentProxy.RunAsync(
Assert.Contains(agentLogs, log => log.EventId.Name == "LogAgentResponse");
}
- [Fact(Skip = SkipFlakyTimingTest)]
+ [RetryFact(2, 5000)]
public async Task CallFunctionToolsAsync()
{
int weatherToolInvocationCount = 0;
@@ -129,7 +127,7 @@ string SuggestPackingList(string weather, bool isSunny)
Assert.Equal(1, packingListToolInvocationCount);
}
- [Fact(Skip = SkipFlakyTimingTest)]
+ [RetryFact(2, 5000)]
public async Task CallLongRunningFunctionToolsAsync()
{
[Description("Starts a greeting workflow and returns the workflow instance ID")]
diff --git a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/SamplesValidationBase.cs b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/SamplesValidationBase.cs
index f5ecf0354d..3f01b83e54 100644
--- a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/SamplesValidationBase.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/SamplesValidationBase.cs
@@ -217,7 +217,7 @@ protected async Task WriteInputAsync(Process process, string input, Cancellation
///
protected CancellationTokenSource CreateTestTimeoutCts(TimeSpan? timeout = null)
{
- TimeSpan testTimeout = Debugger.IsAttached ? TimeSpan.FromMinutes(5) : timeout ?? TimeSpan.FromSeconds(60);
+ TimeSpan testTimeout = Debugger.IsAttached ? TimeSpan.FromMinutes(5) : timeout ?? TimeSpan.FromSeconds(120);
return new CancellationTokenSource(testTimeout);
}
diff --git a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/WorkflowConsoleAppSamplesValidation.cs b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/WorkflowConsoleAppSamplesValidation.cs
index f137e4abd9..390b3586ce 100644
--- a/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/WorkflowConsoleAppSamplesValidation.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.DurableTask.IntegrationTests/WorkflowConsoleAppSamplesValidation.cs
@@ -22,7 +22,7 @@ public sealed class WorkflowConsoleAppSamplesValidation(ITestOutputHelper output
///
protected override string TaskHubPrefix => "workflow";
- [Fact]
+ [RetryFact(2, 5000)]
public async Task SequentialWorkflowSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -71,7 +71,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task ConcurrentWorkflowSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -120,7 +120,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task ConditionalEdgesWorkflowSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -182,7 +182,7 @@ private void AssertNoError(string line)
}
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task WorkflowEventsSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -278,7 +278,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task WorkflowSharedStateSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -376,7 +376,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task SubWorkflowsSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -452,7 +452,7 @@ await this.RunSampleTestAsync(samplePath, async (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task WorkflowHITLSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
@@ -505,7 +505,7 @@ await this.RunSampleTestAsync(samplePath, (process, logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task WorkflowAndAgentsSampleValidationAsync()
{
using CancellationTokenSource testTimeoutCts = this.CreateTestTimeoutCts(s_testTimeout);
diff --git a/dotnet/tests/Microsoft.Agents.AI.Hosting.AzureFunctions.IntegrationTests/SamplesValidation.cs b/dotnet/tests/Microsoft.Agents.AI.Hosting.AzureFunctions.IntegrationTests/SamplesValidation.cs
index 078b6af790..be9d2b7434 100644
--- a/dotnet/tests/Microsoft.Agents.AI.Hosting.AzureFunctions.IntegrationTests/SamplesValidation.cs
+++ b/dotnet/tests/Microsoft.Agents.AI.Hosting.AzureFunctions.IntegrationTests/SamplesValidation.cs
@@ -15,8 +15,6 @@ namespace Microsoft.Agents.AI.Hosting.AzureFunctions.IntegrationTests;
[Trait("Category", "SampleValidation")]
public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLifetime
{
- private const string SkipFlakyTimingTest = "Flaky: timing-dependent LLM test, see https://github.com/microsoft/agent-framework/issues/4971";
-
private const string AzureFunctionsPort = "7071";
private const string AzuritePort = "10000";
private const string DtsPort = "8080";
@@ -37,7 +35,7 @@ public sealed class SamplesValidation(ITestOutputHelper outputHelper) : IAsyncLi
.Build();
private static bool s_infrastructureStarted;
- private static readonly TimeSpan s_orchestrationTimeout = TimeSpan.FromMinutes(2);
+ private static readonly TimeSpan s_orchestrationTimeout = TimeSpan.FromMinutes(3);
// In CI, `dotnet run` builds the Functions project from scratch before the host starts, so 60s is not enough.
private static readonly TimeSpan s_functionsReadyTimeout = TimeSpan.FromSeconds(180);
@@ -62,7 +60,7 @@ async ValueTask IAsyncDisposable.DisposeAsync()
await Task.CompletedTask;
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task SingleAgentSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "01_SingleAgent");
@@ -107,7 +105,7 @@ await this.WaitForConditionAsync(
});
}
- [Fact]
+ [Fact(Skip = "Flaky: LLM non-determinism can produce null orchestration results")]
public async Task SingleAgentOrchestrationChainingSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "02_AgentOrchestration_Chaining");
@@ -150,7 +148,7 @@ await this.RunSampleTestAsync(samplePath, async (logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task MultiAgentOrchestrationConcurrentSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "03_AgentOrchestration_Concurrency");
@@ -200,7 +198,7 @@ await this.RunSampleTestAsync(samplePath, async (logs) =>
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task MultiAgentOrchestrationConditionalsSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "04_AgentOrchestration_Conditionals");
@@ -218,7 +216,7 @@ await this.TestSpamDetectionAsync("email-002",
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task SingleAgentOrchestrationHITLSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "05_AgentOrchestration_HITL");
@@ -274,7 +272,7 @@ await this.RunSampleTestAsync(samplePath, async (logs) =>
});
}
- [Fact(Skip = SkipFlakyTimingTest)]
+ [RetryFact(2, 5000)]
public async Task LongRunningToolsSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "06_LongRunningTools");
@@ -316,7 +314,7 @@ await this.WaitForConditionAsync(
}
},
message: "Orchestration is requesting human feedback",
- timeout: TimeSpan.FromSeconds(60));
+ timeout: TimeSpan.FromSeconds(180));
// Approve the content
Uri approvalUri = new($"{runAgentUri}?thread_id={sessionId}");
@@ -336,7 +334,7 @@ await this.WaitForConditionAsync(
}
},
message: "Content published notification is logged",
- timeout: TimeSpan.FromSeconds(60));
+ timeout: TimeSpan.FromSeconds(180));
// Verify the final orchestration status by asking the agent for the status
Uri statusUri = new($"{runAgentUri}?thread_id={sessionId}");
@@ -360,11 +358,11 @@ await this.WaitForConditionAsync(
return isCompleted && hasContent;
},
message: "Orchestration is completed",
- timeout: TimeSpan.FromSeconds(60));
+ timeout: TimeSpan.FromSeconds(180));
});
}
- [Fact]
+ [RetryFact(2, 5000)]
public async Task AgentAsMcpToolAsync()
{
string samplePath = Path.Combine(s_samplesPath, "07_AgentAsMcpTool");
@@ -404,7 +402,7 @@ await this.WaitForConditionAsync(
});
}
- [Fact(Skip = SkipFlakyTimingTest)]
+ [RetryFact(2, 5000)]
public async Task ReliableStreamingSampleValidationAsync()
{
string samplePath = Path.Combine(s_samplesPath, "08_ReliableStreaming");
diff --git a/python/scripts/integration_test_report/aggregate.py b/python/scripts/integration_test_report/aggregate.py
index e803add730..708f47fcf0 100644
--- a/python/scripts/integration_test_report/aggregate.py
+++ b/python/scripts/integration_test_report/aggregate.py
@@ -2,16 +2,18 @@
"""Aggregate per-provider JUnit XML test results and generate a trend report.
-Parses ``pytest.xml`` (JUnit XML) files produced by each CI job, merges them
-into a single run, combines with historical data, and generates a markdown
-trend table — the same pattern used by ``scripts/sample_validation/aggregate.py``.
+Parses JUnit XML files produced by CI jobs — both ``pytest.xml`` (Python) and
+xunit v3 ``*.junit`` (dotnet) — merges them into a single run, combines
+with historical data, and generates a markdown trend table.
Usage (from CI):
python aggregate.py
-The reports directory is expected to contain subdirectories named
-``test-results-/`` each containing a ``pytest.xml`` file
-(created by ``actions/download-artifact``).
+The reports directory is expected to contain artifact subdirectories. Two
+layouts are supported:
+
+- **Python (pytest):** ``test-results-/pytest.xml``
+- **Dotnet (xunit):** ``dotnet-test-results--/*.junit``
"""
from __future__ import annotations
@@ -46,9 +48,21 @@ def _format_run_label(timestamp: str) -> str:
def _derive_provider(directory_name: str) -> str:
"""Derive a provider label from a report directory name.
- ``test-results-openai`` → ``OpenAI``
- ``test-results-azure-openai`` → ``Azure OpenAI``
+ Handles both Python and dotnet naming conventions:
+ - ``test-results-openai`` → ``OpenAI``
+ - ``test-results-azure-openai`` → ``Azure OpenAI``
+ - ``dotnet-test-results-net10.0-ubuntu-latest`` → ``net10.0 (ubuntu)``
"""
+ # Dotnet convention: dotnet-test-results--
+ if directory_name.startswith("dotnet-test-results-"):
+ raw = directory_name.replace("dotnet-test-results-", "")
+ # e.g. "net10.0-ubuntu-latest" → framework="net10.0", os="ubuntu-latest"
+ parts = raw.split("-", 1)
+ framework = parts[0]
+ os_label = parts[1].split("-")[0] if len(parts) > 1 else ""
+ return f"{framework} ({os_label})" if os_label else framework
+
+ # Python convention: test-results-
raw = directory_name.replace("test-results-", "")
known = {
"openai": "OpenAI",
@@ -102,11 +116,21 @@ def _parse_junit_xml(xml_path: Path) -> list[dict[str, str]]:
# it appends the class name, e.g.:
# "packages.foundry.tests.foundry.test_foundry_embedding_client.TestFoundryEmbeddingIntegration"
# We want the file-level module: "test_foundry_embedding_client"
+ #
+ # xunit (dotnet) writes classname as the full C# type, e.g.:
+ # "OpenAIChatCompletion.IntegrationTests.ChatCompletionTests"
+ # We want the project prefix: "OpenAIChatCompletion"
if classname:
parts = classname.rsplit(".", 2)
# If the last segment starts with uppercase it's a class name — take the one before it
if len(parts) >= 2 and parts[-1][0:1].isupper():
- module = parts[-2]
+ # For dotnet: if the penultimate part is "IntegrationTests" or "UnitTests",
+ # use the part before that (the project name) instead
+ if parts[-2] in ("IntegrationTests", "UnitTests") and len(parts) >= 3:
+ # parts[0] may contain dots — take the last segment of it
+ module = parts[0].rsplit(".", 1)[-1]
+ else:
+ module = parts[-2]
else:
module = parts[-1]
else:
@@ -148,28 +172,61 @@ def _parse_junit_xml(xml_path: Path) -> list[dict[str, str]]:
# ---------------------------------------------------------------------------
+def _discover_xml_files(reports_dir: Path) -> list[tuple[str, Path]]:
+ """Discover JUnit XML test result files in artifact subdirectories.
+
+ Handles two directory layouts:
+ - **Python (pytest):** ``test-results-/pytest.xml``
+ - **Dotnet (xunit):** ``dotnet-test-results--/*.junit``
+
+ Returns:
+ List of ``(directory_name, xml_path)`` tuples.
+ """
+ xml_files: list[tuple[str, Path]] = []
+ if not reports_dir.is_dir():
+ return xml_files
+
+ for subdir in sorted(reports_dir.iterdir()):
+ if not subdir.is_dir():
+ continue
+
+ # Python layout: single pytest.xml per artifact
+ pytest_xml = subdir / "pytest.xml"
+ if pytest_xml.exists():
+ xml_files.append((subdir.name, pytest_xml))
+ continue
+
+ # Dotnet layout: multiple *.junit files per artifact
+ junit_files = sorted(subdir.rglob("*.junit"))
+ for jf in junit_files:
+ xml_files.append((subdir.name, jf))
+
+ # Fallback: any .xml file that looks like JUnit (not .trx, not cobertura)
+ if not junit_files:
+ for xf in sorted(subdir.rglob("*.xml")):
+ if xf.suffix == ".xml" and not xf.name.endswith(".cobertura.xml"):
+ xml_files.append((subdir.name, xf))
+
+ return xml_files
+
+
def load_current_run(reports_dir: Path) -> dict[str, Any]:
"""Load per-provider JUnit XML reports from the current CI run and merge.
+ Supports both pytest (Python) and xunit v3 (dotnet) JUnit XML formats.
+
Args:
- reports_dir: Directory containing ``test-results-/`` subdirs.
+ reports_dir: Directory containing artifact subdirectories with XML reports.
Returns:
Merged run dict with ``timestamp``, ``summary``, ``results``.
"""
combined_results: dict[str, dict[str, str]] = {} # nodeid → {status, provider}
- # actions/download-artifact creates: reports_dir/test-results-openai/pytest.xml
- xml_files: list[tuple[str, Path]] = []
- if reports_dir.is_dir():
- for subdir in sorted(reports_dir.iterdir()):
- if subdir.is_dir():
- xml_file = subdir / "pytest.xml"
- if xml_file.exists():
- xml_files.append((subdir.name, xml_file))
+ xml_files = _discover_xml_files(reports_dir)
if not xml_files:
- print(f"Warning: No pytest.xml files found in {reports_dir}")
+ print(f"Warning: No JUnit XML files found in {reports_dir}")
return {
"timestamp": datetime.now(timezone.utc).isoformat(),
"summary": {
@@ -181,19 +238,42 @@ def load_current_run(reports_dir: Path) -> dict[str, Any]:
"results": {},
}
+ # Dotnet tests always run under multiple frameworks, so we always
+ # qualify their keys with the provider to ensure deterministic,
+ # stable keys across runs regardless of file parse order.
+ is_dotnet = any(d.startswith("dotnet-test-results-") for d, _ in xml_files)
+
for dir_name, xml_file in xml_files:
print(f" Loading: {xml_file}")
provider = _derive_provider(dir_name)
tests = _parse_junit_xml(xml_file)
for test in tests:
- combined_results[test["nodeid"]] = {
+ raw_id = test["nodeid"]
+ key = f"{provider}::{raw_id}" if is_dotnet else raw_id
+
+ combined_results[key] = {
"status": test["status"],
"provider": provider,
"module": test.get("module", ""),
}
- # Build summary counts using mutually exclusive status buckets.
- # Errors are folded into the failed count for display purposes.
+ # Build per-provider summary counts so the report can show one row per
+ # framework (dotnet) or per provider (Python).
+ provider_counts: dict[str, dict[str, int]] = {}
+ for r in combined_results.values():
+ prov = r.get("provider", "Unknown")
+ if prov not in provider_counts:
+ provider_counts[prov] = {"total": 0, "passed": 0, "failed": 0, "skipped": 0}
+ provider_counts[prov]["total"] += 1
+ st = r["status"]
+ if st == "passed":
+ provider_counts[prov]["passed"] += 1
+ elif st in ("failed", "error"):
+ provider_counts[prov]["failed"] += 1
+ elif st == "skipped":
+ provider_counts[prov]["skipped"] += 1
+
+ # Overall summary (sum across all providers).
statuses = [r["status"] for r in combined_results.values()]
summary = {
"total": len(statuses),
@@ -205,6 +285,7 @@ def load_current_run(reports_dir: Path) -> dict[str, Any]:
return {
"timestamp": datetime.now(timezone.utc).isoformat(),
"summary": summary,
+ "provider_summaries": provider_counts,
"results": combined_results,
}
@@ -253,7 +334,29 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:
"",
]
- # --- Overall status table (most recent first) ---
+ # Detect whether this is a dotnet report (provider-qualified keys).
+ is_dotnet = False
+ for run in runs:
+ provider_sums = run.get("provider_summaries", {})
+ if any(p.startswith("net") for p in provider_sums):
+ is_dotnet = True
+ break
+
+ if is_dotnet:
+ _generate_dotnet_report(lines, runs)
+ else:
+ _generate_python_report(lines, runs)
+
+ lines.append("")
+ lines.append("**Legend:** ✅ Passed · ❌ Failed · ⏭️ Skipped · ⚠️ Expected Failure (xfail) · N/A Not available")
+ lines.append("")
+
+ return "\n".join(lines)
+
+
+def _generate_python_report(lines: list[str], runs: list[dict[str, Any]]) -> None:
+ """Generate the original single-table Python report format."""
+ # --- Overall status table ---
lines.append("## Overall Status (Last 5 Runs)")
lines.append("")
lines.append("| Run | Total | ✅ Passed | ❌ Failed | ⏭️ Skipped |")
@@ -276,27 +379,91 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:
lines.append("")
- # --- Per-test results table ---
- lines.append("## Per-Test Results")
- lines.append("")
+ # --- Single per-test results table ---
+ _generate_per_test_table(lines, runs, "## Per-Test Results")
+
+
+def _generate_dotnet_report(lines: list[str], runs: list[dict[str, Any]]) -> None:
+ """Generate per-framework tables for dotnet (net10.0, net472, etc.)."""
+ # Collect all providers seen across all runs, sorted for stable ordering
+ all_providers: set[str] = set()
+ for run in runs:
+ all_providers.update(run.get("provider_summaries", {}).keys())
+ providers = sorted(all_providers)
+
+ for provider in providers:
+ lines.append(f"## {provider}")
+ lines.append("")
+
+ # --- Per-provider summary table ---
+ lines.append("| Run | Total | ✅ Passed | ❌ Failed | ⏭️ Skipped |")
+ lines.append("|-----|-------|-----------|-----------|------------|")
+
+ for run in reversed(runs):
+ ps = run.get("provider_summaries", {}).get(provider, {})
+ total = ps.get("total", 0)
+ label = _format_run_label(run["timestamp"])
+ if total == 0:
+ lines.append(f"| {label} | N/A | N/A | N/A | N/A |")
+ else:
+ lines.append(
+ f"| {label} "
+ f"| {total} "
+ f"| {ps.get('passed', 0)}/{total} "
+ f"| {ps.get('failed', 0)}/{total} "
+ f"| {ps.get('skipped', 0)}/{total} |"
+ )
- # Collect all test nodeids, providers, and modules across all runs
- all_tests: dict[str, str] = {} # nodeid → provider (from most recent run)
- all_modules: dict[str, str] = {} # nodeid → module (from most recent run)
+ for _ in range(MAX_HISTORY - len(runs)):
+ lines.append("| N/A | N/A | N/A | N/A | N/A |")
+
+ lines.append("")
+
+ # --- Per-test table filtered to this provider ---
+ _generate_per_test_table(
+ lines, runs,
+ heading=None,
+ provider_filter=provider,
+ )
+
+
+def _generate_per_test_table(
+ lines: list[str],
+ runs: list[dict[str, Any]],
+ heading: str | None = None,
+ provider_filter: str | None = None,
+) -> None:
+ """Emit a per-test trend table, optionally filtered to a single provider."""
+ if heading:
+ lines.append(heading)
+ lines.append("")
+
+ # Collect all test nodeids (and metadata) across all runs
+ all_tests: dict[str, str] = {} # nodeid → provider
+ all_modules: dict[str, str] = {} # nodeid → module
for run in runs:
for nodeid, info in run.get("results", {}).items():
- provider = info.get("provider", "Unknown") if isinstance(info, dict) else "Unknown"
- module = info.get("module", "") if isinstance(info, dict) else ""
- all_tests[nodeid] = provider
+ if not isinstance(info, dict):
+ continue
+ prov = info.get("provider", "Unknown")
+ if provider_filter and prov != provider_filter:
+ continue
+ module = info.get("module", "")
+ all_tests[nodeid] = prov
all_modules[nodeid] = module
if not all_tests:
lines.append("*No test results available.*")
- return "\n".join(lines)
+ lines.append("")
+ return
- # Build header (most recent run first)
- header = "| Test | File | Provider |"
- separator = "|------|------|----------|"
+ # Build header
+ if provider_filter:
+ header = "| Test | File |"
+ separator = "|------|------|"
+ else:
+ header = "| Test | File | Provider |"
+ separator = "|------|------|----------|"
for run in reversed(runs):
label = _format_run_label(run["timestamp"])
header += f" {label} |"
@@ -308,12 +475,15 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:
lines.append(header)
lines.append(separator)
- # Sort by provider then test name
- for nodeid in sorted(all_tests, key=lambda n: (all_tests[n], n)):
- provider = all_tests[nodeid]
+ # Sort by module then test name
+ for nodeid in sorted(all_tests, key=lambda n: (all_modules.get(n, ""), n)):
module = all_modules.get(nodeid, "")
short = _short_name(nodeid)
- row = f"| `{short}` | `{module}` | {provider} |"
+ if provider_filter:
+ row = f"| `{short}` | `{module}` |"
+ else:
+ provider = all_tests[nodeid]
+ row = f"| `{short}` | `{module}` | {provider} |"
for run in reversed(runs):
result = run.get("results", {}).get(nodeid)
@@ -330,10 +500,6 @@ def generate_trend_report(runs: list[dict[str, Any]]) -> str:
lines.append(row)
lines.append("")
- lines.append("**Legend:** ✅ Passed · ❌ Failed · ⏭️ Skipped · ⚠️ Expected Failure (xfail) · N/A Not available")
- lines.append("")
-
- return "\n".join(lines)
# ---------------------------------------------------------------------------