.Net: Updated audio abstractions to return multiple values (#5149)

### Motivation and Context

This PR changes the audio abstractions to follow the abstraction pattern used elsewhere in SK: they now return multiple values instead of just one, which is more flexible when an AI connector returns multiple results (as the chat completion service does with choices). For easier usage when the AI connector provides only one result, an extension method is available that returns a single result.

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
microsoft · Feb 26, 2024 · e4d2975 · e4d2975
1 parent 5817c2b
commit e4d2975
Show file tree

Hide file tree

Showing 20 changed files with 142 additions and 55 deletions.
diff --git a/dotnet/samples/KernelSyntaxExamples/Example82_Audio.cs b/dotnet/samples/KernelSyntaxExamples/Example82_Audio.cs
@@ -1,12 +1,12 @@
 // Copyright (c) Microsoft. All rights reserved.
 
 using System;
-using System.IO;
 using System.Threading.Tasks;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.AudioToText;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
 using Microsoft.SemanticKernel.TextToAudio;
+using Resources;
 using Xunit;
 using Xunit.Abstractions;
 
@@ -19,7 +19,7 @@ public sealed class Example82_Audio : BaseTest
 {
     private const string TextToAudioModel = "tts-1";
     private const string AudioToTextModel = "whisper-1";
-    private const string AudioFilePath = "audio.wav";
+    private const string AudioFilename = "test_audio.wav";
 
     [Fact(Skip = "Uncomment the line to write the audio file output before running this test.")]
     public async Task TextToAudioAsync()
@@ -66,7 +66,7 @@ public async Task AudioToTextAsync()
         var audioToTextService = kernel.GetRequiredService<IAudioToTextService>();
 
         // Set execution settings (optional)
-        OpenAIAudioToTextExecutionSettings executionSettings = new(AudioFilePath)
+        OpenAIAudioToTextExecutionSettings executionSettings = new(AudioFilename)
         {
             Language = "en", // The language of the audio data as two-letter ISO-639-1 language code (e.g. 'en' or 'es').
             Prompt = "sample prompt", // An optional text to guide the model's style or continue a previous audio segment.
@@ -78,8 +78,9 @@ public async Task AudioToTextAsync()
         };
 
         // Read audio content from a file
-        ReadOnlyMemory<byte> audioData = await File.ReadAllBytesAsync(AudioFilePath);
-        AudioContent audioContent = new(new BinaryData(audioData));
+        await using var audioFileStream = EmbeddedResource.ReadStream(AudioFilename);
+        var audioFileBinaryData = await BinaryData.FromStreamAsync(audioFileStream!);
+        AudioContent audioContent = new(audioFileBinaryData);
 
         // Convert audio to text
         var textContent = await audioToTextService.GetTextContentAsync(audioContent, executionSettings);

diff --git a/dotnet/samples/KernelSyntaxExamples/Resources/test_audio.wav b/dotnet/samples/KernelSyntaxExamples/Resources/test_audio.wav
diff --git a/dotnet/src/Connectors/Connectors.OpenAI/AudioToText/AzureOpenAIAudioToTextService.cs b/dotnet/src/Connectors/Connectors.OpenAI/AudioToText/AzureOpenAIAudioToTextService.cs
@@ -85,7 +85,7 @@ public sealed class AzureOpenAIAudioToTextService : IAudioToTextService
     }
 
     /// <inheritdoc/>
-    public Task<TextContent> GetTextContentAsync(
+    public Task<IReadOnlyList<TextContent>> GetTextContentsAsync(
         AudioContent content,
         PromptExecutionSettings? executionSettings = null,
         Kernel? kernel = null,

diff --git a/dotnet/src/Connectors/Connectors.OpenAI/AudioToText/OpenAIAudioToTextService.cs b/dotnet/src/Connectors/Connectors.OpenAI/AudioToText/OpenAIAudioToTextService.cs
@@ -62,7 +62,7 @@ public sealed class OpenAIAudioToTextService : IAudioToTextService
     }
 
     /// <inheritdoc/>
-    public Task<TextContent> GetTextContentAsync(
+    public Task<IReadOnlyList<TextContent>> GetTextContentsAsync(
         AudioContent content,
         PromptExecutionSettings? executionSettings = null,
         Kernel? kernel = null,

diff --git a/dotnet/src/Connectors/Connectors.OpenAI/AzureSdk/AzureOpenAITextToAudioClient.cs b/dotnet/src/Connectors/Connectors.OpenAI/AzureSdk/AzureOpenAITextToAudioClient.cs
@@ -62,7 +62,7 @@ internal sealed class AzureOpenAITextToAudioClient
         this._logger = logger ?? NullLogger.Instance;
     }
 
-    internal async Task<AudioContent> GetAudioContentAsync(
+    internal async Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
         string text,
         PromptExecutionSettings? executionSettings,
         CancellationToken cancellationToken)
@@ -79,7 +79,7 @@ internal sealed class AzureOpenAITextToAudioClient
 
         var binaryData = await BinaryData.FromStreamAsync(stream, cancellationToken).ConfigureAwait(false);
 
-        return new AudioContent(binaryData, modelId);
+        return new List<AudioContent> { new(binaryData, modelId) };
     }
 
     internal void AddAttribute(string key, string? value)

diff --git a/dotnet/src/Connectors/Connectors.OpenAI/AzureSdk/ClientCore.cs b/dotnet/src/Connectors/Connectors.OpenAI/AzureSdk/ClientCore.cs
@@ -241,7 +241,7 @@ await foreach (Completions completions in response)
         return result;
     }
 
-    internal async Task<TextContent> GetTextContentFromAudioAsync(
+    internal async Task<IReadOnlyList<TextContent>> GetTextContentFromAudioAsync(
         AudioContent content,
         PromptExecutionSettings? executionSettings,
         CancellationToken cancellationToken)
@@ -265,7 +265,7 @@ await foreach (Completions completions in response)
 
         AudioTranscription responseData = (await RunRequestAsync(() => this.Client.GetAudioTranscriptionAsync(audioOptions, cancellationToken)).ConfigureAwait(false)).Value;
 
-        return new TextContent(responseData.Text, this.DeploymentOrModelName, metadata: GetResponseMetadata(responseData));
+        return new List<TextContent> { new(responseData.Text, this.DeploymentOrModelName, metadata: GetResponseMetadata(responseData)) };
     }
 
     /// <summary>

diff --git a/dotnet/src/Connectors/Connectors.OpenAI/AzureSdk/OpenAITextToAudioClient.cs b/dotnet/src/Connectors/Connectors.OpenAI/AzureSdk/OpenAITextToAudioClient.cs
@@ -56,7 +56,7 @@ internal sealed class OpenAITextToAudioClient
         this._logger = logger ?? NullLogger.Instance;
     }
 
-    internal async Task<AudioContent> GetAudioContentAsync(
+    internal async Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
         string text,
         PromptExecutionSettings? executionSettings,
         CancellationToken cancellationToken)
@@ -71,7 +71,7 @@ internal sealed class OpenAITextToAudioClient
 
         var binaryData = await BinaryData.FromStreamAsync(stream, cancellationToken).ConfigureAwait(false);
 
-        return new AudioContent(binaryData, this._modelId);
+        return new List<AudioContent> { new(binaryData, this._modelId) };
     }
 
     internal void AddAttribute(string key, string? value)

diff --git a/dotnet/src/Connectors/Connectors.OpenAI/TextToAudio/AzureOpenAITextToAudioService.cs b/dotnet/src/Connectors/Connectors.OpenAI/TextToAudio/AzureOpenAITextToAudioService.cs
@@ -54,10 +54,10 @@ public sealed class AzureOpenAITextToAudioService : ITextToAudioService
     }
 
     /// <inheritdoc/>
-    public Task<AudioContent> GetAudioContentAsync(
+    public Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
         string text,
         PromptExecutionSettings? executionSettings = null,
         Kernel? kernel = null,
         CancellationToken cancellationToken = default)
-        => this._client.GetAudioContentAsync(text, executionSettings, cancellationToken);
+        => this._client.GetAudioContentsAsync(text, executionSettings, cancellationToken);
 }
diff --git a/dotnet/src/Connectors/Connectors.OpenAI/TextToAudio/OpenAITextToAudioService.cs b/dotnet/src/Connectors/Connectors.OpenAI/TextToAudio/OpenAITextToAudioService.cs
@@ -52,10 +52,10 @@ public sealed class OpenAITextToAudioService : ITextToAudioService
     }
 
     /// <inheritdoc/>
-    public Task<AudioContent> GetAudioContentAsync(
+    public Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
         string text,
         PromptExecutionSettings? executionSettings = null,
         Kernel? kernel = null,
         CancellationToken cancellationToken = default)
-        => this._client.GetAudioContentAsync(text, executionSettings, cancellationToken);
+        => this._client.GetAudioContentsAsync(text, executionSettings, cancellationToken);
 }
diff --git a/.../Connectors/Connectors.UnitTests/OpenAI/AudioToText/AzureOpenAIAudioToTextServiceTests.cs b/.../Connectors/Connectors.UnitTests/OpenAI/AudioToText/AzureOpenAIAudioToTextServiceTests.cs
@@ -88,7 +88,7 @@ public async Task GetTextContentWithInvalidSettingsThrowsExceptionAsync(OpenAIAu
         };
 
         // Act
-        var exception = await Record.ExceptionAsync(() => service.GetTextContentAsync(new AudioContent(new BinaryData("data")), settings));
+        var exception = await Record.ExceptionAsync(() => service.GetTextContentsAsync(new AudioContent(new BinaryData("data")), settings));
 
         // Assert
         Assert.NotNull(exception);
@@ -106,11 +106,11 @@ public async Task GetTextContentByDefaultWorksCorrectlyAsync()
         };
 
         // Act
-        var result = await service.GetTextContentAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));
+        var result = await service.GetTextContentsAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));
 
         // Assert
         Assert.NotNull(result);
-        Assert.Equal("Test audio-to-text response", result.Text);
+        Assert.Equal("Test audio-to-text response", result[0].Text);
     }
 
     public void Dispose()

diff --git a/...t/src/Connectors/Connectors.UnitTests/OpenAI/AudioToText/OpenAIAudioToTextServiceTests.cs b/...t/src/Connectors/Connectors.UnitTests/OpenAI/AudioToText/OpenAIAudioToTextServiceTests.cs
@@ -70,11 +70,11 @@ public async Task GetTextContentByDefaultWorksCorrectlyAsync()
         };
 
         // Act
-        var result = await service.GetTextContentAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));
+        var result = await service.GetTextContentsAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));
 
         // Assert
         Assert.NotNull(result);
-        Assert.Equal("Test audio-to-text response", result.Text);
+        Assert.Equal("Test audio-to-text response", result[0].Text);
     }
 
     public void Dispose()

diff --git a/.../Connectors/Connectors.UnitTests/OpenAI/TextToAudio/AzureOpenAITextToAudioServiceTests.cs b/.../Connectors/Connectors.UnitTests/OpenAI/TextToAudio/AzureOpenAITextToAudioServiceTests.cs
@@ -59,7 +59,7 @@ public async Task GetAudioContentWithInvalidSettingsThrowsExceptionAsync(OpenAIT
         };
 
         // Act
-        var exception = await Record.ExceptionAsync(() => service.GetAudioContentAsync("Some text", settings));
+        var exception = await Record.ExceptionAsync(() => service.GetAudioContentsAsync("Some text", settings));
 
         // Assert
         Assert.NotNull(exception);
@@ -81,11 +81,11 @@ public async Task GetAudioContentByDefaultWorksCorrectlyAsync()
         };
 
         // Act
-        var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
+        var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
 
         // Assert
-        Assert.NotNull(result?.Data);
-        Assert.True(result.Data.ToArray().SequenceEqual(expectedByteArray));
+        Assert.NotNull(result[0].Data);
+        Assert.True(result[0].Data!.ToArray().SequenceEqual(expectedByteArray));
     }
 
     [Theory]
@@ -110,7 +110,7 @@ public async Task GetAudioContentUsesValidBaseUrlAsync(bool useHttpClientBaseAdd
         };
 
         // Act
-        var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
+        var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
 
         // Assert
         Assert.StartsWith(expectedBaseAddress, this._messageHandlerStub.RequestUri!.AbsoluteUri, StringComparison.InvariantCulture);

diff --git a/...t/src/Connectors/Connectors.UnitTests/OpenAI/TextToAudio/OpenAITextToAudioServiceTests.cs b/...t/src/Connectors/Connectors.UnitTests/OpenAI/TextToAudio/OpenAITextToAudioServiceTests.cs
@@ -58,7 +58,7 @@ public async Task GetAudioContentWithInvalidSettingsThrowsExceptionAsync(OpenAIT
         };
 
         // Act
-        var exception = await Record.ExceptionAsync(() => service.GetAudioContentAsync("Some text", settings));
+        var exception = await Record.ExceptionAsync(() => service.GetAudioContentsAsync("Some text", settings));
 
         // Assert
         Assert.NotNull(exception);
@@ -80,11 +80,11 @@ public async Task GetAudioContentByDefaultWorksCorrectlyAsync()
         };
 
         // Act
-        var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
+        var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
 
         // Assert
-        Assert.NotNull(result?.Data);
-        Assert.True(result.Data.ToArray().SequenceEqual(expectedByteArray));
+        Assert.NotNull(result[0].Data);
+        Assert.True(result[0].Data!.ToArray().SequenceEqual(expectedByteArray));
     }
 
     [Theory]
@@ -109,7 +109,7 @@ public async Task GetAudioContentUsesValidBaseUrlAsync(bool useHttpClientBaseAdd
         };
 
         // Act
-        var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
+        var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
 
         // Assert
         Assert.StartsWith(expectedBaseAddress, this._messageHandlerStub.RequestUri!.AbsoluteUri, StringComparison.InvariantCulture);

diff --git a/dotnet/src/IntegrationTests/Connectors/OpenAI/OpenAIAudioToTextTests.cs b/dotnet/src/IntegrationTests/Connectors/OpenAI/OpenAIAudioToTextTests.cs
@@ -5,6 +5,7 @@
 using System.Threading.Tasks;
 using Microsoft.Extensions.Configuration;
 using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.AudioToText;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
 using SemanticKernel.IntegrationTests.TestSettings;
 using Xunit;
@@ -40,7 +41,11 @@ public async Task OpenAIAudioToTextTestAsync()
         OpenAIConfiguration? openAIConfiguration = this._configuration.GetSection("OpenAIAudioToText").Get<OpenAIConfiguration>();
         Assert.NotNull(openAIConfiguration);
 
-        var service = new OpenAIAudioToTextService(openAIConfiguration.ModelId, openAIConfiguration.ApiKey);
+        var kernel = Kernel.CreateBuilder()
+            .AddOpenAIAudioToText(openAIConfiguration.ModelId, openAIConfiguration.ApiKey)
+            .Build();
+
+        var service = kernel.GetRequiredService<IAudioToTextService>();
 
         await using Stream audio = File.OpenRead($"./TestData/{Filename}");
         var audioData = await BinaryData.FromStreamAsync(audio);
@@ -61,10 +66,14 @@ public async Task AzureOpenAIAudioToTextTestAsync()
         AzureOpenAIConfiguration? azureOpenAIConfiguration = this._configuration.GetSection("AzureOpenAIAudioToText").Get<AzureOpenAIConfiguration>();
         Assert.NotNull(azureOpenAIConfiguration);
 
-        var service = new AzureOpenAIAudioToTextService(
-            azureOpenAIConfiguration.DeploymentName,
-            azureOpenAIConfiguration.Endpoint,
-            azureOpenAIConfiguration.ApiKey);
+        var kernel = Kernel.CreateBuilder()
+            .AddAzureOpenAIAudioToText(
+                azureOpenAIConfiguration.DeploymentName,
+                azureOpenAIConfiguration.Endpoint,
+                azureOpenAIConfiguration.ApiKey)
+            .Build();
+
+        var service = kernel.GetRequiredService<IAudioToTextService>();
 
         await using Stream audio = File.OpenRead($"./TestData/{Filename}");
         var audioData = await BinaryData.FromStreamAsync(audio);

diff --git a/dotnet/src/IntegrationTests/Connectors/OpenAI/OpenAITextToAudioTests.cs b/dotnet/src/IntegrationTests/Connectors/OpenAI/OpenAITextToAudioTests.cs
@@ -3,7 +3,8 @@
 using System;
 using System.Threading.Tasks;
 using Microsoft.Extensions.Configuration;
-using Microsoft.SemanticKernel.Connectors.OpenAI;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.TextToAudio;
 using SemanticKernel.IntegrationTests.TestSettings;
 using Xunit;
 using Xunit.Abstractions;
@@ -36,14 +37,18 @@ public async Task OpenAITextToAudioTestAsync()
         OpenAIConfiguration? openAIConfiguration = this._configuration.GetSection("OpenAITextToAudio").Get<OpenAIConfiguration>();
         Assert.NotNull(openAIConfiguration);
 
-        var service = new OpenAITextToAudioService(openAIConfiguration.ModelId, openAIConfiguration.ApiKey);
+        var kernel = Kernel.CreateBuilder()
+            .AddOpenAITextToAudio(openAIConfiguration.ModelId, openAIConfiguration.ApiKey)
+            .Build();
+
+        var service = kernel.GetRequiredService<ITextToAudioService>();
 
         // Act
         var result = await service.GetAudioContentAsync("The sun rises in the east and sets in the west.");
 
         // Assert
-        Assert.NotNull(result?.Data);
-        Assert.False(result.Data.IsEmpty);
+        Assert.NotNull(result.Data);
+        Assert.False(result.Data!.IsEmpty);
     }
 
     [Fact]
@@ -53,17 +58,21 @@ public async Task AzureOpenAITextToAudioTestAsync()
         AzureOpenAIConfiguration? azureOpenAIConfiguration = this._configuration.GetSection("AzureOpenAITextToAudio").Get<AzureOpenAIConfiguration>();
         Assert.NotNull(azureOpenAIConfiguration);
 
-        var service = new AzureOpenAITextToAudioService(
-            azureOpenAIConfiguration.DeploymentName,
-            azureOpenAIConfiguration.Endpoint,
-            azureOpenAIConfiguration.ApiKey);
+        var kernel = Kernel.CreateBuilder()
+            .AddAzureOpenAITextToAudio(
+                azureOpenAIConfiguration.DeploymentName,
+                azureOpenAIConfiguration.Endpoint,
+                azureOpenAIConfiguration.ApiKey)
+            .Build();
+
+        var service = kernel.GetRequiredService<ITextToAudioService>();
 
         // Act
         var result = await service.GetAudioContentAsync("The sun rises in the east and sets in the west.");
 
         // Assert
-        Assert.NotNull(result?.Data);
-        Assert.False(result.Data.IsEmpty);
+        Assert.NotNull(result.Data);
+        Assert.False(result.Data!.IsEmpty);
     }
 
     public void Dispose()

diff --git a/dotnet/src/SemanticKernel.Abstractions/AI/AudioToText/AudioToTextServiceExtensions.cs b/dotnet/src/SemanticKernel.Abstractions/AI/AudioToText/AudioToTextServiceExtensions.cs
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Microsoft.SemanticKernel.AudioToText;
+
+/// <summary>
+/// Extension methods for the <see cref="IAudioToTextService"/> interface.
+/// </summary>
+[Experimental("SKEXP0005")]
+public static class AudioToTextServiceExtensions
+{
+    /// <summary>
+    /// Gets a single text content from audio content by calling <c>GetTextContentsAsync</c> on the service and returning its only result.
+    /// </summary>
+    /// <param name="audioToTextService">Target <see cref="IAudioToTextService"/> instance.</param>
+    /// <param name="content">Audio content to pass to the service.</param>
+    /// <param name="executionSettings">The AI execution settings (optional).</param>
+    /// <param name="kernel">The <see cref="Kernel"/> containing services, plugins, and other state for use throughout the operation (optional).</param>
+    /// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
+    /// <returns>The only <see cref="TextContent"/> returned by the service; <see cref="System.InvalidOperationException"/> is thrown if the service returns no results or more than one result.</returns>
+    public static async Task<TextContent> GetTextContentAsync(
+        this IAudioToTextService audioToTextService,
+        AudioContent content,
+        PromptExecutionSettings? executionSettings = null,
+        Kernel? kernel = null,
+        CancellationToken cancellationToken = default)
+        => (await audioToTextService.GetTextContentsAsync(content, executionSettings, kernel, cancellationToken).ConfigureAwait(false))
+        .Single();
+}