Skip to content

Commit

Permalink
.Net ImageToText Abstraction + HuggingFace Connector ImageToText supp…
Browse files Browse the repository at this point in the history
…ort. (#5150)

### Motivation and Context

Add support for a new modality Image -> Text including a Hugging Face
service for it.

Resolves #4947

---------

Co-authored-by: Krzysztof Kasprowicz <60486987+Krzysztof318@users.noreply.github.com>
Co-authored-by: Dmytro Struk <13853051+dmytrostruk@users.noreply.github.com>
  • Loading branch information
3 people committed Feb 27, 2024
1 parent 0f97fdc commit 0e10dd5
Show file tree
Hide file tree
Showing 16 changed files with 551 additions and 15 deletions.
1 change: 1 addition & 0 deletions dotnet/SK-dotnet.sln
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
Directory.Build.targets = Directory.Build.targets
Directory.Packages.props = Directory.Packages.props
..\.github\workflows\dotnet-format.yml = ..\.github\workflows\dotnet-format.yml
docs\EXPERIMENTS.md = docs\EXPERIMENTS.md
..\nuget.config = ..\nuget.config
..\README.md = ..\README.md
EndProjectSection
Expand Down
52 changes: 52 additions & 0 deletions dotnet/samples/KernelSyntaxExamples/Example85_ImageToText.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Threading.Tasks;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.HuggingFace;
using Microsoft.SemanticKernel.ImageToText;
using Resources;
using Xunit;
using Xunit.Abstractions;

namespace Examples;

/// <summary>
/// Sample that sends an embedded JPEG image to a Hugging Face image-to-text model
/// and writes the generated text to the test output.
/// </summary>
public sealed class Example85_ImageToText : BaseTest
{
    private const string ImageToTextModel = "Salesforce/blip-image-captioning-base";
    private const string ImageFilePath = "test_image.jpg";

    public Example85_ImageToText(ITestOutputHelper output) : base(output)
    {
    }

    /// <summary>
    /// Builds a kernel with the Hugging Face image-to-text service, loads the sample image
    /// and prints the text returned by the service.
    /// </summary>
    [Fact]
    public async Task ImageToTextAsync()
    {
        // Register the Hugging Face image-to-text service on a fresh kernel.
        var builder = Kernel.CreateBuilder();
        builder = builder.AddHuggingFaceImageToText(
            model: ImageToTextModel,
            apiKey: TestConfiguration.HuggingFace.ApiKey);
        var kernel = builder.Build();

        // Resolve the service that was just registered.
        var service = kernel.GetRequiredService<IImageToTextService>();

        // Execution settings are optional; here they only set the token limit.
        var executionSettings = new HuggingFacePromptExecutionSettings { MaxTokens = 500 };

        // Load the embedded image and wrap the bytes as JPEG image content.
        ReadOnlyMemory<byte> jpegBytes = await EmbeddedResource.ReadAllAsync(ImageFilePath);
        var jpegData = new BinaryData(jpegBytes, "image/jpeg");
        var image = new ImageContent(jpegData);

        // Ask the service for the text that corresponds to the image.
        var result = await service.GetTextContentAsync(image, executionSettings);

        // Emit the generated text.
        this.WriteLine(result.Text);
    }
}
15 changes: 15 additions & 0 deletions dotnet/samples/KernelSyntaxExamples/Resources/EmbeddedResource.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.IO;
using System.Reflection;
using System.Threading.Tasks;
using RepoUtils;

namespace Resources;
Expand Down Expand Up @@ -49,4 +51,17 @@ internal static string Read(string fileName)
var resourceName = $"{s_namespace}." + fileName;
return assembly.GetManifestResourceStream(resourceName);
}

/// <summary>
/// Reads an embedded resource completely into memory.
/// </summary>
/// <param name="fileName">Name of the resource file, as accepted by <c>ReadStream</c>.</param>
/// <returns>The full content of the resource as a read-only block of bytes.</returns>
/// <exception cref="FileNotFoundException">
/// Thrown when <c>ReadStream</c> returns <c>null</c> for <paramref name="fileName"/>.
/// </exception>
internal static async Task<ReadOnlyMemory<byte>> ReadAllAsync(string fileName)
{
    await using Stream? resourceStream = ReadStream(fileName);

    // ReadStream is declared as returning a nullable stream; fail with a descriptive
    // error instead of letting a NullReferenceException escape from the copy below.
    if (resourceStream is null)
    {
        throw new FileNotFoundException($"Embedded resource '{fileName}' was not found.", fileName);
    }

    using var memoryStream = new MemoryStream();

    // Copy the resource stream to the memory stream
    await resourceStream.CopyToAsync(memoryStream);

    // ToArray() returns a copy of the buffer, so the result stays valid after memoryStream is disposed.
    return new ReadOnlyMemory<byte>(memoryStream.ToArray());
}
}
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.IO;
using System.Net.Http;
using System.Threading;
Expand All @@ -23,6 +24,11 @@ internal static string GetTestResponse(string fileName)
return File.ReadAllText($"./TestData/{fileName}");
}

/// <summary>
/// Reads a file from the <c>./TestData</c> directory as raw bytes.
/// </summary>
/// <param name="fileName">Name of the file inside <c>./TestData</c>.</param>
/// <returns>The file content wrapped as read-only memory.</returns>
internal static ReadOnlyMemory<byte> GetTestResponseBytes(string fileName)
{
    byte[] fileBytes = File.ReadAllBytes($"./TestData/{fileName}");
    return new ReadOnlyMemory<byte>(fileBytes);
}

/// <summary>
/// Returns mocked instance of <see cref="HttpClientHandler"/>.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.HuggingFace;
using Microsoft.SemanticKernel.ImageToText;
using Xunit;

namespace SemanticKernel.Connectors.HuggingFace.UnitTests;

/// <summary>
/// Unit tests for <see cref="HuggingFaceImageToTextService"/> class.
/// </summary>
public sealed class HuggingFaceImageToTextTests : IDisposable
{
    // Handler stub: the tests read the captured request from it (RequestUri, RequestHeaders,
    // RequestContent) and configure the reply through ResponseToReturn.
    private readonly HttpMessageHandlerStub _messageHandlerStub;
    private readonly HttpClient _httpClient;
    private readonly ImageContent _imageContentInput;

    /// <summary>
    /// Prepares a stubbed HTTP pipeline that answers with the canned JSON response
    /// and an image input built from the JPEG test file.
    /// </summary>
    public HuggingFaceImageToTextTests()
    {
        this._messageHandlerStub = new HttpMessageHandlerStub();
        this._messageHandlerStub.ResponseToReturn.Content = new StringContent(HuggingFaceTestHelper.GetTestResponse("imagetotext_test_response.json"));

        // disposeHandler: false - the handler is disposed explicitly in Dispose().
        this._httpClient = new HttpClient(this._messageHandlerStub, false);

        var imageBytes = HuggingFaceTestHelper.GetTestResponseBytes("imagetotext_test_request.jpg");
        this._imageContentInput = new ImageContent(new BinaryData(imageBytes, "image/jpeg"), "model");
    }

    [Fact]
    public async Task SpecifiedModelShouldBeUsedAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.EndsWith("/fake-model", this._messageHandlerStub.RequestUri?.AbsoluteUri, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public async Task NoAuthorizationHeaderShouldBeAddedIfApiKeyIsNotProvidedAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", apiKey: null, httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.False(this._messageHandlerStub.RequestHeaders?.Contains("Authorization"));
    }

    [Fact]
    public async Task AuthorizationHeaderShouldBeAddedIfApiKeyIsProvidedAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", apiKey: "fake-api-key", httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.True(this._messageHandlerStub.RequestHeaders?.Contains("Authorization"));

        var values = this._messageHandlerStub.RequestHeaders!.GetValues("Authorization");

        var value = values.SingleOrDefault();
        Assert.Equal("Bearer fake-api-key", value);
    }

    [Fact]
    public async Task UserAgentHeaderShouldBeUsedAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.True(this._messageHandlerStub.RequestHeaders?.Contains("User-Agent"));

        var values = this._messageHandlerStub.RequestHeaders!.GetValues("User-Agent");

        var value = values.SingleOrDefault();
        Assert.Equal("Semantic-Kernel", value);
    }

    [Fact]
    public async Task ProvidedEndpointShouldBeUsedAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", endpoint: new Uri("https://fake-random-test-host/fake-path"), httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.StartsWith("https://fake-random-test-host/fake-path", this._messageHandlerStub.RequestUri?.AbsoluteUri, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public async Task HttpClientBaseAddressShouldBeUsedAsync()
    {
        // Arrange
        this._httpClient.BaseAddress = new Uri("https://fake-random-test-host/fake-path");

        var sut = new HuggingFaceImageToTextService("fake-model", httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.StartsWith("https://fake-random-test-host/fake-path", this._messageHandlerStub.RequestUri?.AbsoluteUri, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public async Task DefaultAddressShouldBeUsedAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.StartsWith("https://api-inference.huggingface.co/models", this._messageHandlerStub.RequestUri?.AbsoluteUri, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public async Task ModelUrlShouldBeBuiltSuccessfullyAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", endpoint: new Uri("https://fake-random-test-host/fake-path"), httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert
        Assert.Equal("https://fake-random-test-host/fake-path/models/fake-model", this._messageHandlerStub.RequestUri?.AbsoluteUri);
    }

    [Fact]
    public async Task ShouldSendPromptToServiceAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", httpClient: this._httpClient);

        // Act
        await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert: the request body must be exactly the bytes of the input image.
        var requestPayload = this._messageHandlerStub.RequestContent;

        Assert.NotNull(requestPayload);
        Assert.Equal(this._imageContentInput.Data!.ToArray(), requestPayload);
    }

    [Fact]
    public async Task ShouldHandleServiceResponseAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", endpoint: new Uri("https://fake-random-test-host/fake-path"), httpClient: this._httpClient);

        // Act
        var contents = await sut.GetTextContentsAsync(this._imageContentInput);

        // Assert: the text comes from the canned response configured in the constructor.
        Assert.NotNull(contents);

        var content = contents.SingleOrDefault();
        Assert.NotNull(content);
        Assert.Equal("This is test completion response", content.Text);
    }

    [Fact]
    public async Task GetTextContentsShouldHaveModelIdDefinedAsync()
    {
        // Arrange
        var sut = new HuggingFaceImageToTextService("fake-model", endpoint: new Uri("https://fake-random-test-host/fake-path"), httpClient: this._httpClient);

        // The response must be in place before the single request made by this test.
        this._messageHandlerStub.ResponseToReturn = new HttpResponseMessage(System.Net.HttpStatusCode.OK)
        {
            Content = new StringContent(@"
[
{
""generated_text"": ""Why the sky is blue? | Dept. of Science & Mathematics Education | University of Notre Dame\nWhen I was in high school I had a pretty simple conception of reality. I believed that if something made sense to me, then it must also be true. I believed that some problems were so fundamental that I couldn’t understand""
}
]",
                Encoding.UTF8,
                "application/json")
        };

        // Act
        var textContent = await sut.GetTextContentAsync(this._imageContentInput);

        // Assert
        Assert.NotNull(textContent.ModelId);
        Assert.Equal("fake-model", textContent.ModelId);
    }

    /// <summary>
    /// Disposes the HTTP client and the handler stub created in the constructor.
    /// </summary>
    public void Dispose()
    {
        this._httpClient.Dispose();
        this._messageHandlerStub.Dispose();
    }
}
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{
"generated_text": "This is test completion response"
}
]
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- THIS PROPERTY GROUP MUST COME FIRST -->
Expand Down

0 comments on commit 0e10dd5

Please sign in to comment.