Skip to content

Commit

Permalink
.Net: Updated audio abstractions to return multiple values (#5149)
Browse files Browse the repository at this point in the history
### Motivation and Context

<!-- Thank you for your contribution to the semantic-kernel repo!
Please help reviewers and future users, providing the following
information:
  1. Why is this change required?
  2. What problem does it solve?
  3. What scenario does it contribute to?
  4. If it fixes an open issue, please link to the issue here.
-->

This PR changes the audio abstractions to follow the abstraction pattern
used elsewhere in SK: they now return multiple values instead of a single
one. This makes them more flexible when an AI connector returns multiple
results (as the chat completion service does with choices).

For easier usage when the AI connector provides only one result, an
extension method is available that returns that single result.

### Contribution Checklist

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
  • Loading branch information
dmytrostruk committed Feb 26, 2024
1 parent 5817c2b commit e4d2975
Show file tree
Hide file tree
Showing 20 changed files with 142 additions and 55 deletions.
11 changes: 6 additions & 5 deletions dotnet/samples/KernelSyntaxExamples/Example82_Audio.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.IO;
using System.Threading.Tasks;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using Microsoft.SemanticKernel.TextToAudio;
using Resources;
using Xunit;
using Xunit.Abstractions;

Expand All @@ -19,7 +19,7 @@ public sealed class Example82_Audio : BaseTest
{
private const string TextToAudioModel = "tts-1";
private const string AudioToTextModel = "whisper-1";
private const string AudioFilePath = "audio.wav";
private const string AudioFilename = "test_audio.wav";

[Fact(Skip = "Uncomment the line to write the audio file output before running this test.")]
public async Task TextToAudioAsync()
Expand Down Expand Up @@ -66,7 +66,7 @@ public async Task AudioToTextAsync()
var audioToTextService = kernel.GetRequiredService<IAudioToTextService>();

// Set execution settings (optional)
OpenAIAudioToTextExecutionSettings executionSettings = new(AudioFilePath)
OpenAIAudioToTextExecutionSettings executionSettings = new(AudioFilename)
{
Language = "en", // The language of the audio data as two-letter ISO-639-1 language code (e.g. 'en' or 'es').
Prompt = "sample prompt", // An optional text to guide the model's style or continue a previous audio segment.
Expand All @@ -78,8 +78,9 @@ public async Task AudioToTextAsync()
};

// Read audio content from a file
ReadOnlyMemory<byte> audioData = await File.ReadAllBytesAsync(AudioFilePath);
AudioContent audioContent = new(new BinaryData(audioData));
await using var audioFileStream = EmbeddedResource.ReadStream(AudioFilename);
var audioFileBinaryData = await BinaryData.FromStreamAsync(audioFileStream!);
AudioContent audioContent = new(audioFileBinaryData);

// Convert audio to text
var textContent = await audioToTextService.GetTextContentAsync(audioContent, executionSettings);
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public sealed class AzureOpenAIAudioToTextService : IAudioToTextService
}

/// <inheritdoc/>
public Task<TextContent> GetTextContentAsync(
public Task<IReadOnlyList<TextContent>> GetTextContentsAsync(
AudioContent content,
PromptExecutionSettings? executionSettings = null,
Kernel? kernel = null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public sealed class OpenAIAudioToTextService : IAudioToTextService
}

/// <inheritdoc/>
public Task<TextContent> GetTextContentAsync(
public Task<IReadOnlyList<TextContent>> GetTextContentsAsync(
AudioContent content,
PromptExecutionSettings? executionSettings = null,
Kernel? kernel = null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ internal sealed class AzureOpenAITextToAudioClient
this._logger = logger ?? NullLogger.Instance;
}

internal async Task<AudioContent> GetAudioContentAsync(
internal async Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
string text,
PromptExecutionSettings? executionSettings,
CancellationToken cancellationToken)
Expand All @@ -79,7 +79,7 @@ internal sealed class AzureOpenAITextToAudioClient

var binaryData = await BinaryData.FromStreamAsync(stream, cancellationToken).ConfigureAwait(false);

return new AudioContent(binaryData, modelId);
return new List<AudioContent> { new(binaryData, modelId) };
}

internal void AddAttribute(string key, string? value)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ await foreach (Completions completions in response)
return result;
}

internal async Task<TextContent> GetTextContentFromAudioAsync(
internal async Task<IReadOnlyList<TextContent>> GetTextContentFromAudioAsync(
AudioContent content,
PromptExecutionSettings? executionSettings,
CancellationToken cancellationToken)
Expand All @@ -265,7 +265,7 @@ await foreach (Completions completions in response)

AudioTranscription responseData = (await RunRequestAsync(() => this.Client.GetAudioTranscriptionAsync(audioOptions, cancellationToken)).ConfigureAwait(false)).Value;

return new TextContent(responseData.Text, this.DeploymentOrModelName, metadata: GetResponseMetadata(responseData));
return new List<TextContent> { new(responseData.Text, this.DeploymentOrModelName, metadata: GetResponseMetadata(responseData)) };
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ internal sealed class OpenAITextToAudioClient
this._logger = logger ?? NullLogger.Instance;
}

internal async Task<AudioContent> GetAudioContentAsync(
internal async Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
string text,
PromptExecutionSettings? executionSettings,
CancellationToken cancellationToken)
Expand All @@ -71,7 +71,7 @@ internal sealed class OpenAITextToAudioClient

var binaryData = await BinaryData.FromStreamAsync(stream, cancellationToken).ConfigureAwait(false);

return new AudioContent(binaryData, this._modelId);
return new List<AudioContent> { new(binaryData, this._modelId) };
}

internal void AddAttribute(string key, string? value)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ public sealed class AzureOpenAITextToAudioService : ITextToAudioService
}

/// <inheritdoc/>
public Task<AudioContent> GetAudioContentAsync(
public Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
string text,
PromptExecutionSettings? executionSettings = null,
Kernel? kernel = null,
CancellationToken cancellationToken = default)
=> this._client.GetAudioContentAsync(text, executionSettings, cancellationToken);
=> this._client.GetAudioContentsAsync(text, executionSettings, cancellationToken);
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ public sealed class OpenAITextToAudioService : ITextToAudioService
}

/// <inheritdoc/>
public Task<AudioContent> GetAudioContentAsync(
public Task<IReadOnlyList<AudioContent>> GetAudioContentsAsync(
string text,
PromptExecutionSettings? executionSettings = null,
Kernel? kernel = null,
CancellationToken cancellationToken = default)
=> this._client.GetAudioContentAsync(text, executionSettings, cancellationToken);
=> this._client.GetAudioContentsAsync(text, executionSettings, cancellationToken);
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public async Task GetTextContentWithInvalidSettingsThrowsExceptionAsync(OpenAIAu
};

// Act
var exception = await Record.ExceptionAsync(() => service.GetTextContentAsync(new AudioContent(new BinaryData("data")), settings));
var exception = await Record.ExceptionAsync(() => service.GetTextContentsAsync(new AudioContent(new BinaryData("data")), settings));

// Assert
Assert.NotNull(exception);
Expand All @@ -106,11 +106,11 @@ public async Task GetTextContentByDefaultWorksCorrectlyAsync()
};

// Act
var result = await service.GetTextContentAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));
var result = await service.GetTextContentsAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));

// Assert
Assert.NotNull(result);
Assert.Equal("Test audio-to-text response", result.Text);
Assert.Equal("Test audio-to-text response", result[0].Text);
}

public void Dispose()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ public async Task GetTextContentByDefaultWorksCorrectlyAsync()
};

// Act
var result = await service.GetTextContentAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));
var result = await service.GetTextContentsAsync(new AudioContent(new BinaryData("data")), new OpenAIAudioToTextExecutionSettings("file.mp3"));

// Assert
Assert.NotNull(result);
Assert.Equal("Test audio-to-text response", result.Text);
Assert.Equal("Test audio-to-text response", result[0].Text);
}

public void Dispose()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public async Task GetAudioContentWithInvalidSettingsThrowsExceptionAsync(OpenAIT
};

// Act
var exception = await Record.ExceptionAsync(() => service.GetAudioContentAsync("Some text", settings));
var exception = await Record.ExceptionAsync(() => service.GetAudioContentsAsync("Some text", settings));

// Assert
Assert.NotNull(exception);
Expand All @@ -81,11 +81,11 @@ public async Task GetAudioContentByDefaultWorksCorrectlyAsync()
};

// Act
var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));

// Assert
Assert.NotNull(result?.Data);
Assert.True(result.Data.ToArray().SequenceEqual(expectedByteArray));
Assert.NotNull(result[0].Data);
Assert.True(result[0].Data!.ToArray().SequenceEqual(expectedByteArray));
}

[Theory]
Expand All @@ -110,7 +110,7 @@ public async Task GetAudioContentUsesValidBaseUrlAsync(bool useHttpClientBaseAdd
};

// Act
var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));

// Assert
Assert.StartsWith(expectedBaseAddress, this._messageHandlerStub.RequestUri!.AbsoluteUri, StringComparison.InvariantCulture);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public async Task GetAudioContentWithInvalidSettingsThrowsExceptionAsync(OpenAIT
};

// Act
var exception = await Record.ExceptionAsync(() => service.GetAudioContentAsync("Some text", settings));
var exception = await Record.ExceptionAsync(() => service.GetAudioContentsAsync("Some text", settings));

// Assert
Assert.NotNull(exception);
Expand All @@ -80,11 +80,11 @@ public async Task GetAudioContentByDefaultWorksCorrectlyAsync()
};

// Act
var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));

// Assert
Assert.NotNull(result?.Data);
Assert.True(result.Data.ToArray().SequenceEqual(expectedByteArray));
Assert.NotNull(result[0].Data);
Assert.True(result[0].Data!.ToArray().SequenceEqual(expectedByteArray));
}

[Theory]
Expand All @@ -109,7 +109,7 @@ public async Task GetAudioContentUsesValidBaseUrlAsync(bool useHttpClientBaseAdd
};

// Act
var result = await service.GetAudioContentAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));
var result = await service.GetAudioContentsAsync("Some text", new OpenAITextToAudioExecutionSettings("voice"));

// Assert
Assert.StartsWith(expectedBaseAddress, this._messageHandlerStub.RequestUri!.AbsoluteUri, StringComparison.InvariantCulture);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Threading.Tasks;
using Microsoft.Extensions.Configuration;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.AudioToText;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using SemanticKernel.IntegrationTests.TestSettings;
using Xunit;
Expand Down Expand Up @@ -40,7 +41,11 @@ public async Task OpenAIAudioToTextTestAsync()
OpenAIConfiguration? openAIConfiguration = this._configuration.GetSection("OpenAIAudioToText").Get<OpenAIConfiguration>();
Assert.NotNull(openAIConfiguration);

var service = new OpenAIAudioToTextService(openAIConfiguration.ModelId, openAIConfiguration.ApiKey);
var kernel = Kernel.CreateBuilder()
.AddOpenAIAudioToText(openAIConfiguration.ModelId, openAIConfiguration.ApiKey)
.Build();

var service = kernel.GetRequiredService<IAudioToTextService>();

await using Stream audio = File.OpenRead($"./TestData/{Filename}");
var audioData = await BinaryData.FromStreamAsync(audio);
Expand All @@ -61,10 +66,14 @@ public async Task AzureOpenAIAudioToTextTestAsync()
AzureOpenAIConfiguration? azureOpenAIConfiguration = this._configuration.GetSection("AzureOpenAIAudioToText").Get<AzureOpenAIConfiguration>();
Assert.NotNull(azureOpenAIConfiguration);

var service = new AzureOpenAIAudioToTextService(
azureOpenAIConfiguration.DeploymentName,
azureOpenAIConfiguration.Endpoint,
azureOpenAIConfiguration.ApiKey);
var kernel = Kernel.CreateBuilder()
.AddAzureOpenAIAudioToText(
azureOpenAIConfiguration.DeploymentName,
azureOpenAIConfiguration.Endpoint,
azureOpenAIConfiguration.ApiKey)
.Build();

var service = kernel.GetRequiredService<IAudioToTextService>();

await using Stream audio = File.OpenRead($"./TestData/{Filename}");
var audioData = await BinaryData.FromStreamAsync(audio);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
using System;
using System.Threading.Tasks;
using Microsoft.Extensions.Configuration;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.TextToAudio;
using SemanticKernel.IntegrationTests.TestSettings;
using Xunit;
using Xunit.Abstractions;
Expand Down Expand Up @@ -36,14 +37,18 @@ public async Task OpenAITextToAudioTestAsync()
OpenAIConfiguration? openAIConfiguration = this._configuration.GetSection("OpenAITextToAudio").Get<OpenAIConfiguration>();
Assert.NotNull(openAIConfiguration);

var service = new OpenAITextToAudioService(openAIConfiguration.ModelId, openAIConfiguration.ApiKey);
var kernel = Kernel.CreateBuilder()
.AddOpenAITextToAudio(openAIConfiguration.ModelId, openAIConfiguration.ApiKey)
.Build();

var service = kernel.GetRequiredService<ITextToAudioService>();

// Act
var result = await service.GetAudioContentAsync("The sun rises in the east and sets in the west.");

// Assert
Assert.NotNull(result?.Data);
Assert.False(result.Data.IsEmpty);
Assert.NotNull(result.Data);
Assert.False(result.Data!.IsEmpty);
}

[Fact]
Expand All @@ -53,17 +58,21 @@ public async Task AzureOpenAITextToAudioTestAsync()
AzureOpenAIConfiguration? azureOpenAIConfiguration = this._configuration.GetSection("AzureOpenAITextToAudio").Get<AzureOpenAIConfiguration>();
Assert.NotNull(azureOpenAIConfiguration);

var service = new AzureOpenAITextToAudioService(
azureOpenAIConfiguration.DeploymentName,
azureOpenAIConfiguration.Endpoint,
azureOpenAIConfiguration.ApiKey);
var kernel = Kernel.CreateBuilder()
.AddAzureOpenAITextToAudio(
azureOpenAIConfiguration.DeploymentName,
azureOpenAIConfiguration.Endpoint,
azureOpenAIConfiguration.ApiKey)
.Build();

var service = kernel.GetRequiredService<ITextToAudioService>();

// Act
var result = await service.GetAudioContentAsync("The sun rises in the east and sets in the west.");

// Assert
Assert.NotNull(result?.Data);
Assert.False(result.Data.IsEmpty);
Assert.NotNull(result.Data);
Assert.False(result.Data!.IsEmpty);
}

public void Dispose()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;

namespace Microsoft.SemanticKernel.AudioToText;

/// <summary>
/// Provides convenience extension methods for the <see cref="IAudioToTextService"/> interface.
/// </summary>
[Experimental("SKEXP0005")]
public static class AudioToTextServiceExtensions
{
    /// <summary>
    /// Converts the given audio content to text and returns the single resulting text content.
    /// </summary>
    /// <param name="audioToTextService">Target <see cref="IAudioToTextService"/> instance.</param>
    /// <param name="content">Audio content.</param>
    /// <param name="executionSettings">The AI execution settings (optional).</param>
    /// <param name="kernel">The <see cref="Kernel"/> containing services, plugins, and other state for use throughout the operation.</param>
    /// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
    /// <returns>The one text content item produced from the audio content.</returns>
    /// <exception cref="System.InvalidOperationException">
    /// The service returned no results or more than one result.
    /// </exception>
    public static async Task<TextContent> GetTextContentAsync(
        this IAudioToTextService audioToTextService,
        AudioContent content,
        PromptExecutionSettings? executionSettings = null,
        Kernel? kernel = null,
        CancellationToken cancellationToken = default)
    {
        // Delegate to the multi-result method, forwarding every argument unchanged.
        var textContents = await audioToTextService
            .GetTextContentsAsync(content, executionSettings, kernel, cancellationToken)
            .ConfigureAwait(false);

        // Single() requires exactly one element; any other count is surfaced as an exception.
        return textContents.Single();
    }
}

0 comments on commit e4d2975

Please sign in to comment.