-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
### Motivation and Context

<!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, providing the following information:
  1. Why is this change required?
  2. What problem does it solve?
  3. What scenario does it contribute to?
  4. If it fixes an open issue, please link to the issue here.
-->

Issue #5016

### Description

<!-- Describe your changes, the overall approach, the underlying design. These notes will help understanding how your code works. Thanks! -->

Added samples which demonstrate using ITextToAudioService and IAudioToTextService.

### Contribution Checklist

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄

---------

Co-authored-by: Dmytro Struk <13853051+dmytrostruk@users.noreply.github.com>
- Loading branch information
1 parent
6fb8a46
commit 6feffd3
Showing
2 changed files
with
94 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System; | ||
using System.IO; | ||
using System.Threading.Tasks; | ||
using Microsoft.SemanticKernel; | ||
using Microsoft.SemanticKernel.AudioToText; | ||
using Microsoft.SemanticKernel.Connectors.OpenAI; | ||
using Microsoft.SemanticKernel.Contents; | ||
using Microsoft.SemanticKernel.TextToAudio; | ||
using Xunit; | ||
using Xunit.Abstractions; | ||
|
||
namespace Examples; | ||
|
||
/// <summary>
/// Represents a class that demonstrates audio processing functionality:
/// converting text to audio with <see cref="ITextToAudioService"/> and
/// transcribing audio to text with <see cref="IAudioToTextService"/>.
/// </summary>
public sealed class Example28_Audio : BaseTest
{
    private const string TextToAudioModel = "tts-1";
    private const string AudioToTextModel = "whisper-1";
    private const string AudioFilePath = "input.wav";

    /// <summary>
    /// Builds a kernel with the OpenAI text-to-audio service and converts a sample sentence to audio.
    /// </summary>
    [Fact(Skip = "Needs setup.")]
    public async Task TextToAudioAsync()
    {
        // Create a kernel with OpenAI text to audio service
        var kernel = Kernel.CreateBuilder()
            .AddOpenAITextToAudio(
                modelId: TextToAudioModel,
                apiKey: TestConfiguration.OpenAI.ApiKey)
            .Build();

        var textToAudioService = kernel.GetRequiredService<ITextToAudioService>();

        string sampleText = "Hello, my name is John. I am a software engineer. I am working on a project to convert text to audio.";

        // Set execution settings (optional)
        OpenAITextToAudioExecutionSettings executionSettings = new("alloy")
        {
            Voice = "alloy", // The voice to use when generating the audio.
                             // Supported voices are alloy, echo, fable, onyx, nova, and shimmer.
            ResponseFormat = "mp3", // The format of the generated audio.
                                    // Supported formats are mp3, opus, aac, and flac.
            Speed = 1.0f // The speed of the generated audio.
                         // Select a value from 0.25 to 4.0. 1.0 is the default.
        };

        // Convert text to audio
        AudioContent audioContent = await textToAudioService.GetAudioContentAsync(sampleText, executionSettings);

        // Save audio content to a file.
        // The extension matches the ResponseFormat requested above ("mp3").
        // await File.WriteAllBytesAsync("output.mp3", audioContent.Data.ToArray());
    }

    /// <summary>
    /// Builds a kernel with the OpenAI audio-to-text service, reads the audio file at
    /// <see cref="AudioFilePath"/> and writes the transcribed text to the test output.
    /// </summary>
    [Fact(Skip = "Setup audio file input before running this test.")]
    public async Task AudioToTextAsync()
    {
        // Create a kernel with OpenAI audio to text service
        var kernel = Kernel.CreateBuilder()
            .AddOpenAIAudioToText(
                modelId: AudioToTextModel,
                apiKey: TestConfiguration.OpenAI.ApiKey)
            .Build();

        var audioToTextService = kernel.GetRequiredService<IAudioToTextService>();

        // Set execution settings (optional).
        // The same constant is used here and when reading the file below, so the two cannot drift apart.
        OpenAIAudioToTextExecutionSettings executionSettings = new(AudioFilePath)
        {
            Language = "en", // The language of the audio data as two-letter ISO-639-1 language code (e.g. 'en' or 'es').
            Prompt = "sample prompt", // An optional text to guide the model's style or continue a previous audio segment.
                                      // The prompt should match the audio language.
            ResponseFormat = "json", // The format to return the transcribed text in.
                                     // Supported formats are json, text, srt, verbose_json, or vtt. Default is 'json'.
            Temperature = 0.3f, // The randomness of the generated text.
                                // Select a value from 0.0 to 1.0. 0 is the default.
        };

        // Read audio content from a file
        ReadOnlyMemory<byte> audioData = await File.ReadAllBytesAsync(AudioFilePath);
        AudioContent audioContent = new(new BinaryData(audioData));

        // Convert audio to text
        var textContent = await audioToTextService.GetTextContentAsync(audioContent, executionSettings);

        // Output the transcribed text
        this.WriteLine(textContent.Text);
    }

    /// <summary>
    /// Initializes the example, forwarding the xUnit output helper to the base test class.
    /// </summary>
    /// <param name="output">The xUnit test output helper.</param>
    public Example28_Audio(ITestOutputHelper output) : base(output) { }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters