Skip to content

Commit b7ae840

Browse files
.Net: feat(connectors): Support ImageContent in tool/function results (#13431)
### Motivation and Context

Fixes #13430

Currently, when a Semantic Kernel function returns `ImageContent`, it gets serialized to JSON - losing the binary image data and preventing multimodal-capable models from processing the image.

This PR enables `ImageContent` preservation in tool/function results, allowing connectors with multimodal capabilities (Gemini 3+, Anthropic) to pass images natively to the model. This is essential for agentic workflows where tools generate or process images that the model needs to analyze.

### Description

#### FunctionCallsProcessor (shared infrastructure)

- Changed `ProcessFunctionResult()` return type from `string` to `object`
- Added early return for `ImageContent` to preserve it for multimodal-capable connectors
- Added `ImageContentNotSupportedErrorMessage` constant for consistent error messaging

#### Google Gemini Connector (native support)

- Extended `FunctionResponsePart` with `Parts` property for nested multimodal content
- Added `FunctionResponsePartContent` class with `InlineData` support
- Implemented `CreateImageFunctionResponsePart()` to convert `ImageContent` to Gemini's native `inlineData` format

#### OpenAI Connector (error handling)

- Added `ImageContent` check with clear error message (API does not support images in tool results)

#### OpenAI Agents (error handling)

- Added `GetFunctionResultAsString()` helper with `ImageContent` error handling

#### Amazon Bedrock Agents (error handling)

- Added `GetFunctionResultAsString()` helper with `ImageContent` error handling

#### New Unit Tests

| Test | File |
|------|------|
| `ItShouldPreserveImageContentWithoutSerialization` | `FunctionCallsProcessorTests.cs` |
| `FromChatHistoryImageContentInToolResultCreatesInlineDataPart` | `GeminiRequestTests.cs` |
| `FromChatHistoryImageContentWithoutDataThrowsInvalidOperationException` | `GeminiRequestTests.cs` |
| `FromChatHistoryImageContentWithoutMimeTypeThrowsInvalidOperationException` | `GeminiRequestTests.cs` |
| `VerifyAssistantMessageAdapterGetMessageWithImageContentInFunctionResult` | `AssistantMessageFactoryTests.cs` |

#### Notes

- **No breaking changes**: All changes are to internal APIs. The public `FunctionResultContent.Result` property is already `object?`.
- **Gemini version detection**: The connector does not check the model version. If an older Gemini model does not support `functionResponse.parts`, the API will return an appropriate error.
- **URI-based ImageContent**: Only `ImageContent` with binary data is supported. URI-based `ImageContent` will throw `InvalidOperationException`.
- **MistralAI**: Out of scope - has its own `ProcessFunctionResult` implementation.
- **Related**: Prepares infrastructure for PR #13419 (Anthropic Connector multimodal support)

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄

[sk-pr-MultimodalToolResults.md](https://github.com/user-attachments/files/24299998/sk-pr-MultimodalToolResults.md)

---------

Co-authored-by: Roger Barreto <19890735+rogerbarreto@users.noreply.github.com>
1 parent 52d4e5c commit b7ae840

12 files changed

Lines changed: 378 additions & 18 deletions

File tree

dotnet/src/Agents/Bedrock/Extensions/BedrockAgentInvokeExtensions.cs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,12 +214,28 @@ private static SessionState CreateSessionStateWithFunctionResults(List<FunctionR
214214
Function = functionResult.FunctionName,
215215
ResponseBody = new Dictionary<string, ContentBody>
216216
{
217-
{ "TEXT", new ContentBody() { Body = FunctionCallsProcessor.ProcessFunctionResult(functionResult.Result ?? string.Empty) } }
217+
{ "TEXT", new ContentBody() { Body = GetFunctionResultAsString(functionResult.Result) } }
218218
}
219219
}
220220
};
221221
}
222222
)],
223223
};
224224
}
225+
226+
/// <summary>
/// Converts a function result into the text placed in the Bedrock <c>TEXT</c> response body.
/// </summary>
/// <remarks>
/// The result is first run through <c>FunctionCallsProcessor.ProcessFunctionResult</c>. Bedrock does not
/// support multimodal tool results, so an <see cref="ImageContent"/> outcome is replaced by
/// <c>FunctionCallsProcessor.ImageContentNotSupportedErrorMessage</c>.
/// </remarks>
/// <param name="result">The raw function result; <see langword="null"/> is processed as an empty string.</param>
/// <returns>The processed text, the image-not-supported message, or <see cref="string.Empty"/>.</returns>
private static string GetFunctionResultAsString(object? result) =>
    FunctionCallsProcessor.ProcessFunctionResult(result ?? string.Empty) switch
    {
        // Images cannot be forwarded to Bedrock; surface the shared error text instead.
        ImageContent => FunctionCallsProcessor.ImageContentNotSupportedErrorMessage,

        // Everything else is expected to be the processed string (null falls back to empty).
        var processed => (string?)processed ?? string.Empty,
    };
225241
}

dotnet/src/Agents/OpenAI/Internal/AssistantMessageFactory.cs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,24 @@ public static IEnumerable<MessageContent> GetMessageContents(ChatMessageContent
7171
else if (content is FunctionResultContent resultContent && resultContent.Result != null && !hasTextContent)
7272
{
7373
// Only convert a function result when text-content is not already present
74-
yield return MessageContent.FromText(FunctionCallsProcessor.ProcessFunctionResult(resultContent.Result));
74+
yield return MessageContent.FromText(GetFunctionResultAsString(resultContent.Result));
7575
}
7676
}
7777
}
78+
79+
/// <summary>
/// Produces the text used for the assistant message content of a function result.
/// </summary>
/// <remarks>
/// OpenAI Assistants do not support multimodal tool results, so when processing yields an
/// <see cref="ImageContent"/> the shared <c>FunctionCallsProcessor.ImageContentNotSupportedErrorMessage</c>
/// is returned in its place.
/// </remarks>
/// <param name="result">The function result to convert.</param>
/// <returns>The processed text, the image-not-supported message, or <see cref="string.Empty"/>.</returns>
private static string GetFunctionResultAsString(object result)
{
    object? processedResult = FunctionCallsProcessor.ProcessFunctionResult(result);

    return processedResult is ImageContent
        ? FunctionCallsProcessor.ImageContentNotSupportedErrorMessage
        : ((string?)processedResult ?? string.Empty);
}
7894
}

dotnet/src/Agents/OpenAI/Internal/ResponseThreadActions.cs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ await functionProcessor.InvokeFunctionCallsAsync(
9393
agent.GetKernel(options),
9494
isStreaming: false,
9595
cancellationToken).ConfigureAwait(false);
96-
var functionOutputItems = functionResults.Select(fr => ResponseItem.CreateFunctionCallOutputItem(fr.CallId, fr.Result?.ToString() ?? string.Empty)).ToList();
96+
var functionOutputItems = functionResults.Select(fr => ResponseItem.CreateFunctionCallOutputItem(fr.CallId, GetFunctionResultAsString(fr.Result))).ToList();
9797

9898
// If store is enabled we only need to send the function output items
9999
if (agent.StoreEnabled)
@@ -267,7 +267,7 @@ await functionProcessor.InvokeFunctionCallsAsync(
267267
agent.GetKernel(options),
268268
isStreaming: true,
269269
cancellationToken).ConfigureAwait(false);
270-
var functionOutputItems = functionResults.Select(fr => ResponseItem.CreateFunctionCallOutputItem(fr.CallId, fr.Result?.ToString() ?? string.Empty)).ToList();
270+
var functionOutputItems = functionResults.Select(fr => ResponseItem.CreateFunctionCallOutputItem(fr.CallId, GetFunctionResultAsString(fr.Result))).ToList();
271271

272272
// If store is enabled we only need to send the function output items
273273
if (agent.StoreEnabled)
@@ -318,6 +318,22 @@ private static void ThrowIfIncompleteOrFailed(OpenAIResponseAgent agent, Respons
318318
}
319319
}
320320

321+
/// <summary>
/// Converts a function result into the string sent as a function-call output item.
/// </summary>
/// <remarks>
/// The OpenAI Responses API does not support multimodal tool results, so an <see cref="ImageContent"/>
/// outcome is replaced by <c>FunctionCallsProcessor.ImageContentNotSupportedErrorMessage</c>.
/// </remarks>
/// <param name="result">The raw function result; <see langword="null"/> is processed as an empty string.</param>
/// <returns>The processed text, the image-not-supported message, or <see cref="string.Empty"/>.</returns>
internal static string GetFunctionResultAsString(object? result) =>
    FunctionCallsProcessor.ProcessFunctionResult(result ?? string.Empty) switch
    {
        ImageContent => FunctionCallsProcessor.ImageContentNotSupportedErrorMessage,
        var processed => (string?)processed ?? string.Empty,
    };
336+
321337
/// <summary>POCO representing function calling info.</summary>
322338
/// <remarks>Used to concatenate information for a single function call across multiple streaming updates.</remarks>
323339
private sealed class FunctionCallInfo(FunctionCallResponseItem item)

dotnet/src/Agents/UnitTests/OpenAI/Internal/AssistantMessageFactoryTests.cs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using Microsoft.SemanticKernel;
66
using Microsoft.SemanticKernel.Agents.OpenAI.Internal;
77
using Microsoft.SemanticKernel.ChatCompletion;
8+
89
using OpenAI.Assistants;
910
using Xunit;
1011

@@ -207,4 +208,28 @@ public void VerifyAssistantMessageAdapterGetMessageWithAll()
207208
Assert.NotNull(contents);
208209
Assert.Equal(3, contents.Length);
209210
}
211+
212+
/// <summary>
/// Verify that an <see cref="ImageContent"/> carried by a <see cref="FunctionResultContent"/> is
/// converted to the error message, since OpenAI Assistants do not support multimodal tool results.
/// </summary>
[Fact]
public void VerifyAssistantMessageAdapterGetMessageWithImageContentInFunctionResult()
{
    // Arrange: a tool message whose function result is an image (PNG magic bytes only)
    ReadOnlyMemory<byte> pngHeader = new byte[] { 0x89, 0x50, 0x4E, 0x47 };
    ImageContent image = new(pngHeader, "image/png");
    FunctionResultContent functionResult = new("TestFunction", "TestPlugin", "call-id", image);
    ChatMessageContent message = new(AuthorRole.Tool, items: [functionResult]);

    // Act
    MessageContent[] contents = AssistantMessageFactory.GetMessageContents(message).ToArray();

    // Assert: exactly one text content holding the error message
    // (the literal mirrors FunctionCallsProcessor.ImageContentNotSupportedErrorMessage)
    Assert.NotNull(contents);
    MessageContent onlyContent = Assert.Single(contents);
    Assert.NotNull(onlyContent.Text);
    Assert.Equal("Error: This model does not support image content in tool results.", onlyContent.Text);
}
210235
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System;
4+
using Microsoft.SemanticKernel;
5+
using Microsoft.SemanticKernel.Agents.OpenAI.Internal;
6+
using Xunit;
7+
8+
namespace SemanticKernel.Agents.UnitTests.OpenAI.Internal;
9+
10+
/// <summary>
/// Unit tests for <see cref="ResponseThreadActions"/>.
/// </summary>
public class ResponseThreadActionsTests
{
    /// <summary>
    /// An <see cref="ImageContent"/> function result must be replaced by the shared
    /// <c>ImageContentNotSupportedErrorMessage</c>, because the OpenAI Responses API does not
    /// support multimodal tool results.
    /// </summary>
    [Fact]
    public void VerifyResponseThreadActionsGetFunctionResultAsStringReturnsErrorMessageForImageContent()
    {
        // Arrange: image with binary data (PNG magic bytes only)
        ReadOnlyMemory<byte> pngHeader = new byte[] { 0x89, 0x50, 0x4E, 0x47 };
        ImageContent image = new(pngHeader, "image/png");

        // Act
        string actual = ResponseThreadActions.GetFunctionResultAsString(image);

        // Assert
        Assert.Equal("Error: This model does not support image content in tool results.", actual);
    }

    /// <summary>
    /// A string function result must be returned verbatim.
    /// </summary>
    [Fact]
    public void VerifyResponseThreadActionsGetFunctionResultAsStringReturnsStringVerbatim()
    {
        // Arrange
        const string ToolOutput = "tool result text";

        // Act
        string actual = ResponseThreadActions.GetFunctionResultAsString(ToolOutput);

        // Assert
        Assert.Equal(ToolOutput, actual);
    }

    /// <summary>
    /// A <see langword="null"/> function result must yield <see cref="string.Empty"/>.
    /// </summary>
    [Fact]
    public void VerifyResponseThreadActionsGetFunctionResultAsStringReturnsEmptyForNull()
    {
        // Act
        string actual = ResponseThreadActions.GetFunctionResultAsString(null);

        // Assert
        Assert.Equal(string.Empty, actual);
    }
}

dotnet/src/Connectors/Connectors.Google.UnitTests/Core/Gemini/GeminiRequestTests.cs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,71 @@ public void FromChatHistoryMultiTurnConversationPreservesAllRoles()
781781
Assert.Equal("assistant-message-2", request.Contents[3].Parts![0].Text);
782782
}
783783

784+
[Fact]
public void FromChatHistoryImageContentInToolResultCreatesInlineDataPart()
{
    // Arrange: a tool message whose function result is an image with binary data
    byte[] pngHeader = [0x89, 0x50, 0x4E, 0x47]; // PNG magic bytes
    var image = new ImageContent(pngHeader, "image/png");
    var function = KernelFunctionFactory.CreateFromMethod(() => image);
    var call = new GeminiFunctionToolCall(new GeminiPart.FunctionCallPart { FunctionName = "capture-screenshot" });
    GeminiFunctionToolResult callResult = new(call, new FunctionResult(function, image));

    ChatHistory chatHistory = [];
    chatHistory.Add(new GeminiChatMessageContent(AuthorRole.Tool, string.Empty, "modelId", callResult));
    var settings = new GeminiPromptExecutionSettings();

    // Act
    var request = GeminiRequest.FromChatHistoryAndExecutionSettings(chatHistory, settings);

    // Assert: one content entry whose function response nests the image as inline data
    var content = Assert.Single(request.Contents);
    var functionResponse = content.Parts![0].FunctionResponse;
    Assert.NotNull(functionResponse);
    Assert.Equal("capture-screenshot", functionResponse.FunctionName);
    Assert.NotNull(functionResponse.Parts);
    var nestedPart = Assert.Single(functionResponse.Parts);
    Assert.NotNull(nestedPart.InlineData);
    Assert.Equal("image/png", nestedPart.InlineData!.MimeType);
    Assert.Equal(Convert.ToBase64String(pngHeader), nestedPart.InlineData!.InlineData);
}
811+
812+
[Fact]
public void FromChatHistoryImageContentWithoutDataThrowsInvalidOperationException()
{
    // Arrange: URI-based image, i.e. no binary payload
    var image = new ImageContent(new Uri("https://example.com/image.png")) { MimeType = "image/png" };
    var function = KernelFunctionFactory.CreateFromMethod(() => image);
    var call = new GeminiFunctionToolCall(new GeminiPart.FunctionCallPart { FunctionName = "capture-screenshot" });
    GeminiFunctionToolResult callResult = new(call, new FunctionResult(function, image));

    ChatHistory chatHistory = [];
    chatHistory.Add(new GeminiChatMessageContent(AuthorRole.Tool, string.Empty, "modelId", callResult));
    var settings = new GeminiPromptExecutionSettings();

    // Act & Assert: building the request must fail with the binary-data message
    var thrown = Assert.Throws<InvalidOperationException>(
        () => GeminiRequest.FromChatHistoryAndExecutionSettings(chatHistory, settings));

    Assert.Equal("ImageContent in function result must contain binary data.", thrown.Message);
}
829+
830+
[Fact]
public void FromChatHistoryImageContentWithoutMimeTypeThrowsInvalidOperationException()
{
    // Arrange: binary image data is present, but no MimeType is supplied
    ReadOnlyMemory<byte> pngHeader = new byte[] { 0x89, 0x50, 0x4E, 0x47 };
    var image = new ImageContent(pngHeader, mimeType: null);
    var function = KernelFunctionFactory.CreateFromMethod(() => image);
    var call = new GeminiFunctionToolCall(new GeminiPart.FunctionCallPart { FunctionName = "capture-screenshot" });
    GeminiFunctionToolResult callResult = new(call, new FunctionResult(function, image));

    ChatHistory chatHistory = [];
    chatHistory.Add(new GeminiChatMessageContent(AuthorRole.Tool, string.Empty, "modelId", callResult));
    var settings = new GeminiPromptExecutionSettings();

    // Act & Assert: building the request must fail with the MimeType message
    var thrown = Assert.Throws<InvalidOperationException>(
        () => GeminiRequest.FromChatHistoryAndExecutionSettings(chatHistory, settings));

    Assert.Equal("Image content MimeType is empty.", thrown.Message);
}
848+
784849
[Fact]
785850
public void FromChatHistoryToolCallsWithThoughtSignatureIncludesSignatureInRequest()
786851
{

dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiPart.cs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,14 @@ internal sealed class FunctionResponsePart
185185
[JsonRequired]
186186
public FunctionResponseEntity Response { get; set; } = null!;
187187

188+
/// <summary>
189+
/// Optional. Nested parts for multimodal function responses (Gemini 3+ only).
190+
/// Contains inlineData with image/binary data as part of tool results.
191+
/// </summary>
192+
[JsonPropertyName("parts")]
193+
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
194+
public FunctionResponsePartContent[]? Parts { get; set; }
195+
188196
internal sealed class FunctionResponseEntity
189197
{
190198
[JsonConstructor]
@@ -202,5 +210,16 @@ public FunctionResponseEntity(object? response)
202210
[JsonRequired]
203211
public JsonNode Arguments { get; set; } = null!;
204212
}
213+
214+
/// <summary>
215+
/// Represents a part within a Gemini function response (for multimodal content).
216+
/// Used in Gemini 3+ to include images/binary data as part of tool results.
217+
/// </summary>
218+
internal sealed class FunctionResponsePartContent
219+
{
220+
/// <summary>
/// Inline binary payload carried by this nested part, serialized under the JSON name <c>inlineData</c>.
/// The property is left out of the serialized JSON when it is <see langword="null"/>.
/// </summary>
[JsonPropertyName("inlineData")]
221+
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
222+
public InlineDataPart? InlineData { get; set; }
223+
}
205224
}
206225
}

dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiRequest.cs

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ internal sealed class GeminiRequest
2323
}
2424
};
2525

26+
/// <summary>
27+
/// Synthetic envelope used as the <c>functionResponse.response</c> body when emitting a multimodal
28+
/// (image) tool result. The actual image data is carried in <c>functionResponse.parts[].inlineData</c>;
29+
/// the envelope keeps the required <c>response</c> field present and gives the model a short hint.
30+
/// </summary>
31+
private static readonly object s_imageFunctionResponseEnvelope = new { status = "success", message = "Image data attached" };
32+
2633
[JsonPropertyName("contents")]
2734
public IList<GeminiContent> Contents { get; set; } = null!;
2835

@@ -194,14 +201,24 @@ private static List<GeminiPart> CreateGeminiParts(ChatMessageContent content)
194201
case GeminiChatMessageContent { CalledToolResults: not null } contentWithCalledTools:
195202
// Add all function responses as separate parts in a single message
196203
parts.AddRange(contentWithCalledTools.CalledToolResults.Select(toolResult =>
197-
new GeminiPart
204+
{
205+
var resultValue = toolResult.FunctionResult.GetValue<object>();
206+
207+
// Handle ImageContent for multimodal tool results (Gemini 3+ only)
208+
if (resultValue is ImageContent imageContent)
209+
{
210+
return CreateImageFunctionResponsePart(toolResult.FullyQualifiedName, imageContent);
211+
}
212+
213+
return new GeminiPart
198214
{
199215
FunctionResponse = new GeminiPart.FunctionResponsePart
200216
{
201217
FunctionName = toolResult.FullyQualifiedName,
202-
Response = new(toolResult.FunctionResult.GetValue<object>())
218+
Response = new(resultValue)
203219
}
204-
}));
220+
};
221+
}));
205222
break;
206223
case GeminiChatMessageContent { ToolCalls: not null } contentWithToolCalls:
207224
parts.AddRange(contentWithToolCalls.ToolCalls.Select(toolCall =>
@@ -302,6 +319,37 @@ private static string GetMimeTypeFromImageContent(ImageContent imageContent)
302319
?? throw new InvalidOperationException("Image content MimeType is empty.");
303320
}
304321

322+
/// <summary>
/// Builds the <see cref="GeminiPart"/> for a tool result that is an image (Gemini 3+ only): the function
/// response carries the synthetic envelope as its <c>response</c> and the image as a single nested
/// <c>inlineData</c> part.
/// </summary>
/// <param name="functionName">Name set on the function response.</param>
/// <param name="imageContent">Image returned by the tool; it must contain non-empty binary data.</param>
/// <returns>A part whose function response holds the base64-encoded image in <c>Parts</c>.</returns>
/// <exception cref="InvalidOperationException">
/// The image has no binary data, or its MIME type could not be resolved.
/// </exception>
private static GeminiPart CreateImageFunctionResponsePart(string functionName, ImageContent imageContent)
{
    // Only in-memory image bytes can be inlined; missing or empty data is rejected up front.
    if (imageContent.Data is not { IsEmpty: false } imageBytes)
    {
        throw new InvalidOperationException("ImageContent in function result must contain binary data.");
    }

    // The MIME type is resolved before encoding, so a missing type fails before any encoding work.
    string mimeType = GetMimeTypeFromImageContent(imageContent);
    string base64Image = Convert.ToBase64String(imageBytes.ToArray());

    var inlineImage = new GeminiPart.FunctionResponsePart.FunctionResponsePartContent
    {
        InlineData = new GeminiPart.InlineDataPart
        {
            MimeType = mimeType,
            InlineData = base64Image
        }
    };

    var functionResponse = new GeminiPart.FunctionResponsePart
    {
        FunctionName = functionName,
        Response = new(s_imageFunctionResponseEnvelope),
        Parts = [inlineImage]
    };

    return new GeminiPart { FunctionResponse = functionResponse };
}
352+
305353
private static GeminiPart CreateGeminiPartFromAudio(AudioContent audioContent)
306354
{
307355
// Binary data takes precedence over URI.

0 commit comments

Comments
 (0)