diff --git a/.github/_typos.toml b/.github/_typos.toml
index 841b71e15743..a56c70770c47 100644
--- a/.github/_typos.toml
+++ b/.github/_typos.toml
@@ -27,6 +27,7 @@ EOF = "EOF" # End of File
 ans = "ans" # Short for answers
 arange = "arange" # Method in Python numpy package
 prompty = "prompty" # prompty is a format name.
+ist = "ist" # German language
 
 [default.extend-identifiers]
 ags = "ags" # Azure Graph Service
diff --git a/dotnet/SK-dotnet.sln b/dotnet/SK-dotnet.sln
index b661c90a9405..8b58bb93f4aa 100644
--- a/dotnet/SK-dotnet.sln
+++ b/dotnet/SK-dotnet.sln
@@ -307,6 +307,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Connectors.Memory.SqlServer
 EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "CodeInterpreterPlugin", "samples\Demos\CodeInterpreterPlugin\CodeInterpreterPlugin.csproj", "{3ED53702-0E53-473A-A0F4-645DB33541C2}"
 EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "QualityCheckWithFilters", "samples\Demos\QualityCheck\QualityCheckWithFilters\QualityCheckWithFilters.csproj", "{1D3EEB5B-0E06-4700-80D5-164956E43D0A}"
+EndProject
 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TimePlugin", "samples\Demos\TimePlugin\TimePlugin.csproj", "{F312FCE1-12D7-4DEF-BC29-2FF6618509F3}"
 EndProject
 Global
@@ -748,6 +750,12 @@ Global
 	{3ED53702-0E53-473A-A0F4-645DB33541C2}.Publish|Any CPU.Build.0 = Debug|Any CPU
 	{3ED53702-0E53-473A-A0F4-645DB33541C2}.Release|Any CPU.ActiveCfg = Release|Any CPU
 	{3ED53702-0E53-473A-A0F4-645DB33541C2}.Release|Any CPU.Build.0 = Release|Any CPU
+	{1D3EEB5B-0E06-4700-80D5-164956E43D0A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+	{1D3EEB5B-0E06-4700-80D5-164956E43D0A}.Debug|Any CPU.Build.0 = Debug|Any CPU
+	{1D3EEB5B-0E06-4700-80D5-164956E43D0A}.Publish|Any CPU.ActiveCfg = Debug|Any CPU
+	{1D3EEB5B-0E06-4700-80D5-164956E43D0A}.Publish|Any CPU.Build.0 = Debug|Any CPU
+	{1D3EEB5B-0E06-4700-80D5-164956E43D0A}.Release|Any CPU.ActiveCfg = Release|Any CPU
+	{1D3EEB5B-0E06-4700-80D5-164956E43D0A}.Release|Any CPU.Build.0 = Release|Any CPU
 	{F312FCE1-12D7-4DEF-BC29-2FF6618509F3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
 	{F312FCE1-12D7-4DEF-BC29-2FF6618509F3}.Debug|Any CPU.Build.0 = Debug|Any CPU
 	{F312FCE1-12D7-4DEF-BC29-2FF6618509F3}.Publish|Any CPU.ActiveCfg = Debug|Any CPU
@@ -857,6 +865,7 @@ Global
 	{6B56D8EE-9991-43E3-90B2-B8F5C5CE77C2} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263}
 	{24B8041B-92C6-4BB3-A699-C593AF5A870F} = {24503383-A8C4-4255-9998-28D70FE8E99A}
 	{3ED53702-0E53-473A-A0F4-645DB33541C2} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263}
+	{1D3EEB5B-0E06-4700-80D5-164956E43D0A} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263}
 	{F312FCE1-12D7-4DEF-BC29-2FF6618509F3} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263}
 EndGlobalSection
 GlobalSection(ExtensibilityGlobals) = postSolution
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/BertSummarizationEvaluationFilter.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/BertSummarizationEvaluationFilter.cs
new file mode 100644
index 000000000000..22f990b52e6e
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/BertSummarizationEvaluationFilter.cs
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.Logging;
+using Microsoft.SemanticKernel;
+using QualityCheckWithFilters.Models;
+using QualityCheckWithFilters.Services;
+
+namespace QualityCheckWithFilters.Filters;
+
+/// <summary>
+/// Filter which performs text summarization evaluation using the BERTScore metric: https://huggingface.co/spaces/evaluate-metric/bertscore.
+/// The evaluation result contains three values: precision, recall and F1 score.
+/// The higher the F1 score, the better the quality of the summary.
+/// </summary>
+internal sealed class BertSummarizationEvaluationFilter(
+    EvaluationService evaluationService,
+    ILogger logger,
+    double threshold) : IFunctionInvocationFilter
+{
+    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
+    {
+        await next(context);
+
+        var sourceText = context.Result.RenderedPrompt!;
+        var summary = context.Result.ToString();
+
+        var request = new SummarizationEvaluationRequest { Sources = [sourceText], Summaries = [summary] };
+        var response = await evaluationService.EvaluateAsync<SummarizationEvaluationRequest, BertSummarizationEvaluationResponse>(request);
+
+        var precision = Math.Round(response.Precision[0], 4);
+        var recall = Math.Round(response.Recall[0], 4);
+        var f1 = Math.Round(response.F1[0], 4);
+
+        logger.LogInformation("[BERT] Precision: {Precision}, Recall: {Recall}, F1: {F1}", precision, recall, f1);
+
+        if (f1 < threshold)
+        {
+            throw new KernelException($"BERT summary evaluation score ({f1}) is lower than threshold ({threshold})");
+        }
+    }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/BleuSummarizationEvaluationFilter.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/BleuSummarizationEvaluationFilter.cs
new file mode 100644
index 000000000000..0ac339f353d4
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/BleuSummarizationEvaluationFilter.cs
@@ -0,0 +1,46 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.Logging;
+using Microsoft.SemanticKernel;
+using QualityCheckWithFilters.Models;
+using QualityCheckWithFilters.Services;
+
+namespace QualityCheckWithFilters.Filters;
+
+/// <summary>
+/// Filter which performs text summarization evaluation using the BLEU metric: https://huggingface.co/spaces/evaluate-metric/bleu.
+/// The evaluation result contains values like score, precisions, brevity penalty and length ratio.
+/// The closer the score and precision values are to 1, the better the quality of the summary.
+/// </summary>
+internal sealed class BleuSummarizationEvaluationFilter(
+    EvaluationService evaluationService,
+    ILogger logger,
+    double threshold) : IFunctionInvocationFilter
+{
+    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
+    {
+        await next(context);
+
+        var sourceText = context.Result.RenderedPrompt!;
+        var summary = context.Result.ToString();
+
+        var request = new SummarizationEvaluationRequest { Sources = [sourceText], Summaries = [summary] };
+        var response = await evaluationService.EvaluateAsync<SummarizationEvaluationRequest, BleuSummarizationEvaluationResponse>(request);
+
+        var score = Math.Round(response.Score, 4);
+        var precisions = response.Precisions.Select(l => Math.Round(l, 4)).ToList();
+        var brevityPenalty = Math.Round(response.BrevityPenalty, 4);
+        var lengthRatio = Math.Round(response.LengthRatio, 4);
+
+        logger.LogInformation("[BLEU] Score: {Score}, Precisions: {Precisions}, Brevity penalty: {BrevityPenalty}, Length ratio: {LengthRatio}",
+            score,
+            string.Join(", ", precisions),
+            brevityPenalty,
+            lengthRatio);
+
+        if (precisions[0] < threshold)
+        {
+            throw new KernelException($"BLEU summary evaluation score ({precisions[0]}) is lower than threshold ({threshold})");
+        }
+    }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/CometTranslationEvaluationFilter.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/CometTranslationEvaluationFilter.cs
new file mode 100644
index 000000000000..a1319336cdca
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/CometTranslationEvaluationFilter.cs
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.Logging;
+using Microsoft.SemanticKernel;
+using QualityCheckWithFilters.Models;
+using QualityCheckWithFilters.Services;
+
+namespace QualityCheckWithFilters.Filters;
+
+/// <summary>
+/// Filter which performs text translation evaluation using the COMET metric: https://huggingface.co/Unbabel/wmt22-cometkiwi-da.
+/// The COMET score ranges from 0 to 1, where higher values indicate better translation quality.
+/// </summary>
+internal sealed class CometTranslationEvaluationFilter(
+    EvaluationService evaluationService,
+    ILogger logger,
+    double threshold) : IFunctionInvocationFilter
+{
+    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
+    {
+        await next(context);
+
+        var sourceText = context.Result.RenderedPrompt!;
+        var translation = context.Result.ToString();
+
+        logger.LogInformation("Translation: {Translation}", translation);
+
+        var request = new TranslationEvaluationRequest { Sources = [sourceText], Translations = [translation] };
+        var response = await evaluationService.EvaluateAsync<TranslationEvaluationRequest, CometTranslationEvaluationResponse>(request);
+
+        var score = Math.Round(response.Scores[0], 4);
+
+        logger.LogInformation("[COMET] Score: {Score}", score);
+
+        if (score < threshold)
+        {
+            throw new KernelException($"COMET translation evaluation score ({score}) is lower than threshold ({threshold})");
+        }
+    }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/FilterFactory.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/FilterFactory.cs
new file mode 100644
index 000000000000..866420d6096d
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/FilterFactory.cs
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.Logging;
+using Microsoft.SemanticKernel;
+using QualityCheckWithFilters.Models;
+using QualityCheckWithFilters.Services;
+
+namespace QualityCheckWithFilters.Filters;
+
+/// <summary>
+/// Factory class for function invocation filters based on evaluation score type.
+/// </summary>
+internal sealed class FilterFactory
+{
+    private static readonly Dictionary<EvaluationScoreType, Func<EvaluationService, ILogger, double, IFunctionInvocationFilter>> s_filters = new()
+    {
+        [EvaluationScoreType.BERT] = (service, logger, threshold) => new BertSummarizationEvaluationFilter(service, logger, threshold),
+        [EvaluationScoreType.BLEU] = (service, logger, threshold) => new BleuSummarizationEvaluationFilter(service, logger, threshold),
+        [EvaluationScoreType.METEOR] = (service, logger, threshold) => new MeteorSummarizationEvaluationFilter(service, logger, threshold),
+        [EvaluationScoreType.COMET] = (service, logger, threshold) => new CometTranslationEvaluationFilter(service, logger, threshold),
+    };
+
+    public static IFunctionInvocationFilter Create(EvaluationScoreType type, EvaluationService evaluationService, ILogger logger, double threshold)
+        => s_filters[type].Invoke(evaluationService, logger, threshold);
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/MeteorSummarizationEvaluationFilter.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/MeteorSummarizationEvaluationFilter.cs
new file mode 100644
index 000000000000..4909c81caf0b
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/MeteorSummarizationEvaluationFilter.cs
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.Logging;
+using Microsoft.SemanticKernel;
+using QualityCheckWithFilters.Models;
+using QualityCheckWithFilters.Services;
+
+namespace QualityCheckWithFilters.Filters;
+
+/// <summary>
+/// Filter which performs text summarization evaluation using the METEOR metric: https://huggingface.co/spaces/evaluate-metric/meteor.
+/// The METEOR score ranges from 0 to 1, where higher values indicate better similarity between the original text and the generated summary.
+/// </summary>
+internal sealed class MeteorSummarizationEvaluationFilter(
+    EvaluationService evaluationService,
+    ILogger logger,
+    double threshold) : IFunctionInvocationFilter
+{
+    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
+    {
+        await next(context);
+
+        var sourceText = context.Result.RenderedPrompt!;
+        var summary = context.Result.ToString();
+
+        var request = new SummarizationEvaluationRequest { Sources = [sourceText], Summaries = [summary] };
+        var response = await evaluationService.EvaluateAsync<SummarizationEvaluationRequest, MeteorSummarizationEvaluationResponse>(request);
+
+        var score = Math.Round(response.Score, 4);
+
+        logger.LogInformation("[METEOR] Score: {Score}", score);
+
+        if (score < threshold)
+        {
+            throw new KernelException($"METEOR summary evaluation score ({score}) is lower than threshold ({threshold})");
+        }
+    }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationRequest.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationRequest.cs
new file mode 100644
index 000000000000..96650762fec4
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationRequest.cs
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Text.Json.Serialization;
+
+namespace QualityCheckWithFilters.Models;
+
+/// <summary>Base request model with source texts.</summary>
+internal class EvaluationRequest
+{
+    [JsonPropertyName("sources")]
+    public List<string> Sources { get; set; }
+}
+
+/// <summary>Request model with generated summaries.</summary>
+internal sealed class SummarizationEvaluationRequest : EvaluationRequest
+{
+    [JsonPropertyName("summaries")]
+    public List<string> Summaries { get; set; }
+}
+
+/// <summary>Request model with generated translations.</summary>
+internal sealed class TranslationEvaluationRequest : EvaluationRequest
+{
+    [JsonPropertyName("translations")]
+    public List<string> Translations { get; set; }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationResponse.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationResponse.cs
new file mode 100644
index 000000000000..1552c0ec1aaa
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationResponse.cs
@@ -0,0 +1,51 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Text.Json.Serialization;
+
+namespace QualityCheckWithFilters.Models;
+
+/// <summary>Response model for the BERTScore metric: https://huggingface.co/spaces/evaluate-metric/bertscore.</summary>
+internal sealed class BertSummarizationEvaluationResponse
+{
+    [JsonPropertyName("precision")]
+    public List<double> Precision { get; set; }
+
+    [JsonPropertyName("recall")]
+    public List<double> Recall { get; set; }
+
+    [JsonPropertyName("f1")]
+    public List<double> F1 { get; set; }
+}
+
+/// <summary>Response model for the BLEU metric: https://huggingface.co/spaces/evaluate-metric/bleu.</summary>
+internal sealed class BleuSummarizationEvaluationResponse
+{
+    [JsonPropertyName("bleu")]
+    public double Score { get; set; }
+
+    [JsonPropertyName("precisions")]
+    public List<double> Precisions { get; set; }
+
+    [JsonPropertyName("brevity_penalty")]
+    public double BrevityPenalty { get; set; }
+
+    [JsonPropertyName("length_ratio")]
+    public double LengthRatio { get; set; }
+}
+
+/// <summary>Response model for the METEOR metric: https://huggingface.co/spaces/evaluate-metric/meteor.</summary>
+internal sealed class MeteorSummarizationEvaluationResponse
+{
+    [JsonPropertyName("meteor")]
+    public double Score { get; set; }
+}
+
+/// <summary>Response model for the COMET metric: https://huggingface.co/Unbabel/wmt22-cometkiwi-da.</summary>
+internal sealed class CometTranslationEvaluationResponse
+{
+    [JsonPropertyName("scores")]
+    public List<double> Scores { get; set; }
+
+    [JsonPropertyName("system_score")]
+    public double SystemScore { get; set; }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationScoreType.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationScoreType.cs
new file mode 100644
index 000000000000..354ce46f0a05
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationScoreType.cs
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Diagnostics.CodeAnalysis;
+
+namespace QualityCheckWithFilters.Models;
+
+/// <summary>
+/// Internal representation of evaluation score type to configure and run examples.
+/// </summary>
+internal readonly struct EvaluationScoreType(string endpoint) : IEquatable<EvaluationScoreType>
+{
+    public string Endpoint { get; } = endpoint;
+
+    public static EvaluationScoreType BERT = new("bert-score");
+    public static EvaluationScoreType BLEU = new("bleu-score");
+    public static EvaluationScoreType METEOR = new("meteor-score");
+    public static EvaluationScoreType COMET = new("comet-score");
+
+    public static bool operator ==(EvaluationScoreType left, EvaluationScoreType right) => left.Equals(right);
+    public static bool operator !=(EvaluationScoreType left, EvaluationScoreType right) => !(left == right);
+
+    /// <inheritdoc/>
+    public override bool Equals([NotNullWhen(true)] object? obj) => obj is EvaluationScoreType other && this == other;
+
+    /// <inheritdoc/>
+    public bool Equals(EvaluationScoreType other) => string.Equals(this.Endpoint, other.Endpoint, StringComparison.OrdinalIgnoreCase);
+
+    /// <inheritdoc/>
+    public override int GetHashCode() => StringComparer.OrdinalIgnoreCase.GetHashCode(this.Endpoint ?? string.Empty);
+
+    /// <inheritdoc/>
+    public override string ToString() => this.Endpoint ?? string.Empty;
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Program.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Program.cs
new file mode 100644
index 000000000000..dae1a5f6ec20
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Program.cs
@@ -0,0 +1,213 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+using QualityCheckWithFilters.Filters;
+using QualityCheckWithFilters.Models;
+using QualityCheckWithFilters.Services;
+
+namespace QualityCheckWithFilters;
+
+public class Program
+{
+    /// <summary>
+    /// This example demonstrates how to evaluate LLM results on tasks such as text summarization and translation
+    /// using the following metrics:
+    /// - BERTScore: https://github.com/Tiiiger/bert_score
+    /// - BLEU (BiLingual Evaluation Understudy): https://en.wikipedia.org/wiki/BLEU
+    /// - METEOR (Metric for Evaluation of Translation with Explicit ORdering): https://en.wikipedia.org/wiki/METEOR
+    /// - COMET (Crosslingual Optimized Metric for Evaluation of Translation): https://unbabel.github.io/COMET
+    /// Semantic Kernel Filters are used to perform the following tasks during function invocation:
+    /// 1. Get the original text to summarize/translate.
+    /// 2. Get the LLM result.
+    /// 3. Call the evaluation server to get the specific metric score.
+    /// 4. Compare the metric score to the configured threshold and throw an exception if the score is lower.
+    /// </summary>
+    public static async Task Main()
+    {
+        await SummarizationEvaluationAsync(EvaluationScoreType.BERT, threshold: 0.85);
+
+        // Output:
+        // Extractive summary: [BERT] Precision: 0.9756, Recall: 0.9114, F1: 0.9424
+        // Abstractive summary: [BERT] Precision: 0.8953, Recall: 0.8656, F1: 0.8802
+        // Random summary: [BERT] Precision: 0.8433, Recall: 0.787, F1: 0.8142
+        // Exception occurred during function invocation: BERT summary evaluation score (0.8142) is lower than threshold (0.85)
+
+        await SummarizationEvaluationAsync(EvaluationScoreType.BLEU, threshold: 0.5);
+
+        // Output:
+        // Extractive summary: [BLEU] Score: 0.3281, Precisions: 1, 1, 0.9726, 0.9444, Brevity penalty: 0.3351, Length ratio: 0.4777
+        // Abstractive summary: [BLEU] Score: 0, Precisions: 0.678, 0.1552, 0.0175, 0, Brevity penalty: 0.1899, Length ratio: 0.3758
+        // Random summary: [BLEU] Score: 0, Precisions: 0.2, 0, 0, 0, Brevity penalty: 0, Length ratio: 0.0318
+        // Exception occurred during function invocation: BLEU summary evaluation score (0.2) is lower than threshold (0.5)
+
+        await SummarizationEvaluationAsync(EvaluationScoreType.METEOR, threshold: 0.1);
+
+        // Output:
+        // Extractive summary: [METEOR] Score: 0.438
+        // Abstractive summary: [METEOR] Score: 0.1661
+        // Random summary: [METEOR] Score: 0.0035
+        // Exception occurred during function invocation: METEOR summary evaluation score (0.0035) is lower than threshold (0.1)
+
+        await TranslationEvaluationAsync(threshold: 0.4);
+
+        // Output:
+        // Text to translate: Berlin ist die Hauptstadt der Deutschland.
+        // Translation: Berlin is the capital of Germany - [COMET] Score: 0.8695
+        // Translation: Berlin capital Germany is of The - [COMET] Score: 0.4724
+        // Translation: This is random translation - [COMET] Score: 0.3525
+        // Exception occurred during function invocation: COMET translation evaluation score (0.3525) is lower than threshold (0.4)
+    }
+
+    #region Scenarios
+
+    /// <summary>
+    /// This method performs summarization evaluation and compares the following types of summaries:
+    /// - Extractive summary: selects and extracts key sentences, phrases, or segments directly from the original text to create a summary.
+    /// - Abstractive summary: generates new sentences that convey the key information from the original text.
+    /// - Random summary: text unrelated to the original source, included for comparison purposes.
+    /// </summary>
+    private static async Task SummarizationEvaluationAsync(EvaluationScoreType scoreType, double threshold)
+    {
+        // Define the text to summarize and possible LLM summaries.
+        const string TextToSummarize =
+            """
+            The sun rose over the horizon, casting a warm glow across the landscape.
+            Birds began to chirp, greeting the new day with their melodious songs.
+            The flowers in the garden slowly opened their petals, revealing vibrant colors and delicate fragrances.
+            A gentle breeze rustled through the trees, creating a soothing sound that complemented the morning stillness.
+            People started to emerge from their homes, ready to embark on their daily routines.
+            Some went for a morning jog, enjoying the fresh air and the peaceful surroundings.
+            Others sipped their coffee while reading the newspaper on their porches.
+            The streets gradually filled with the hum of cars and the chatter of pedestrians.
+            In the park, children played joyfully, their laughter echoing through the air.
+            As the day progressed, the town buzzed with activity, each moment bringing new opportunities and experiences.
+            """;
+
+        const string ExtractiveSummary =
+            """
+            The sun rose over the horizon, casting a warm glow across the landscape.
+            Birds began to chirp, greeting the new day with their melodious songs.
+            People started to emerge from their homes, ready to embark on their daily routines.
+            The streets gradually filled with the hum of cars and the chatter of pedestrians.
+            In the park, children played joyfully, their laughter echoing through the air.
+            """;
+
+        const string AbstractiveSummary =
+            """
+            As the sun rises, nature awakens with birds singing and flowers blooming.
+            People begin their day with various routines, from jogging to enjoying coffee.
+            The town gradually becomes lively with the sounds of traffic and children's laughter in the park,
+            marking the start of a bustling day filled with new activities and opportunities.
+            """;
+
+        const string RandomSummary =
+            """
+            This is random text.
+            """;
+
+        // Get a kernel builder with the initial configuration.
+        var builder = GetKernelBuilder(scoreType, threshold);
+
+        // It doesn't matter which LLM is used for text summarization, since the main goal is to demonstrate how to evaluate the result and compare metrics.
+        // For demonstration purposes, a fake chat completion service is used to simulate the LLM response with a predefined summary.
+        builder.Services.AddSingleton<IChatCompletionService>(new FakeChatCompletionService("extractive-summary-model", ExtractiveSummary));
+        builder.Services.AddSingleton<IChatCompletionService>(new FakeChatCompletionService("abstractive-summary-model", AbstractiveSummary));
+        builder.Services.AddSingleton<IChatCompletionService>(new FakeChatCompletionService("random-summary-model", RandomSummary));
+
+        // Build the kernel.
+        var kernel = builder.Build();
+
+        // Invoke the function to perform text summarization with a predefined result, trigger the function invocation filter and evaluate the result.
+        await InvokeAsync(kernel, TextToSummarize, "extractive-summary-model");
+        await InvokeAsync(kernel, TextToSummarize, "abstractive-summary-model");
+        await InvokeAsync(kernel, TextToSummarize, "random-summary-model");
+    }
+
+    /// <summary>
+    /// This method performs translation evaluation and compares the results.
+    /// </summary>
+    private static async Task TranslationEvaluationAsync(double threshold)
+    {
+        EvaluationScoreType scoreType = EvaluationScoreType.COMET;
+
+        // Define the text to translate and possible LLM translations.
+        const string TextToTranslate = "Berlin ist die Hauptstadt der Deutschland.";
+        const string Translation1 = "Berlin is the capital of Germany.";
+        const string Translation2 = "Berlin capital Germany is of The.";
+        const string Translation3 = "This is random translation.";
+
+        // Get a kernel builder with the initial configuration.
+        var builder = GetKernelBuilder(scoreType, threshold);
+
+        // It doesn't matter which LLM is used for text translation, since the main goal is to demonstrate how to evaluate the result and compare metrics.
+        // For demonstration purposes, a fake chat completion service is used to simulate the LLM response with a predefined translation.
+        builder.Services.AddSingleton<IChatCompletionService>(new FakeChatCompletionService("translation-1-model", Translation1));
+        builder.Services.AddSingleton<IChatCompletionService>(new FakeChatCompletionService("translation-2-model", Translation2));
+        builder.Services.AddSingleton<IChatCompletionService>(new FakeChatCompletionService("translation-3-model", Translation3));
+
+        // Build the kernel.
+        var kernel = builder.Build();
+
+        // Invoke the function to perform text translation with a predefined result, trigger the function invocation filter and evaluate the result.
+        await InvokeAsync(kernel, TextToTranslate, "translation-1-model");
+        await InvokeAsync(kernel, TextToTranslate, "translation-2-model");
+        await InvokeAsync(kernel, TextToTranslate, "translation-3-model");
+    }
+
+    #endregion
+
+    #region Helpers
+
+    /// <summary>
+    /// Gets a kernel builder with the initial configuration.
+    /// </summary>
+    private static IKernelBuilder GetKernelBuilder(EvaluationScoreType scoreType, double threshold)
+    {
+        // Create a kernel builder.
+        var builder = Kernel.CreateBuilder();
+
+        // Add logging.
+        builder.Services.AddLogging(loggingBuilder => loggingBuilder.AddConsole().SetMinimumLevel(LogLevel.Information));
+
+        // Add a default HTTP client with the base address of the local evaluation server.
+        builder.Services.AddHttpClient("default", client => { client.BaseAddress = new Uri("http://localhost:8080"); });
+
+        // Add the service which performs HTTP requests to the evaluation server.
+        builder.Services.AddSingleton<EvaluationService>(
+            sp => new EvaluationService(
+                sp.GetRequiredService<IHttpClientFactory>().CreateClient("default"),
+                scoreType.Endpoint));
+
+        // Add the function invocation filter to perform evaluation and compare the metric score with the configured threshold.
+        builder.Services.AddSingleton<IFunctionInvocationFilter>(
+            sp => FilterFactory.Create(
+                scoreType,
+                sp.GetRequiredService<EvaluationService>(),
+                sp.GetRequiredService<ILogger<Program>>(),
+                threshold));
+
+        return builder;
+    }
+
+    /// <summary>
+    /// Invokes a kernel function with the provided input and model ID.
+    /// </summary>
+    private static async Task InvokeAsync(Kernel kernel, string input, string modelId)
+    {
+        var logger = kernel.Services.GetRequiredService<ILogger<Program>>();
+
+        try
+        {
+            await kernel.InvokePromptAsync(input, new(new PromptExecutionSettings { ModelId = modelId }));
+        }
+        catch (KernelException exception)
+        {
+            logger.LogError(exception, "Exception occurred during function invocation: {Message}", exception.Message);
+        }
+    }
+
+    #endregion
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/QualityCheckWithFilters.csproj b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/QualityCheckWithFilters.csproj
new file mode 100644
index 000000000000..f5221179c54f
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/QualityCheckWithFilters.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+    <NoWarn>$(NoWarn);VSTHRD111,CA2007,CS8618,CS1591,CA1052,SKEXP0001</NoWarn>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!-- Versions resolve via the repository's central package management. -->
+    <PackageReference Include="Microsoft.Extensions.Logging" />
+    <PackageReference Include="Microsoft.Extensions.Logging.Console" />
+    <PackageReference Include="Microsoft.SemanticKernel" />
+  </ItemGroup>
+
+</Project>
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Services/EvaluationService.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Services/EvaluationService.cs
new file mode 100644
index 000000000000..b550ca8848ab
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Services/EvaluationService.cs
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Text;
+using System.Text.Json;
+using QualityCheckWithFilters.Models;
+
+namespace QualityCheckWithFilters.Services;
+
+/// <summary>
+/// Service which performs HTTP requests to the evaluation server.
+/// </summary>
+internal sealed class EvaluationService(HttpClient httpClient, string endpoint)
+{
+    public async Task<TResponse> EvaluateAsync<TRequest, TResponse>(TRequest request)
+        where TRequest : EvaluationRequest
+    {
+        var requestContent = new StringContent(JsonSerializer.Serialize(request), Encoding.UTF8, "application/json");
+
+        var response = await httpClient.PostAsync(new Uri(endpoint, UriKind.Relative), requestContent);
+
+        response.EnsureSuccessStatusCode();
+
+        var responseContent = await response.Content.ReadAsStringAsync();
+
+        return JsonSerializer.Deserialize<TResponse>(responseContent) ??
+            throw new Exception("Response is not available.");
+    }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Services/FakeChatCompletionService.cs b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Services/FakeChatCompletionService.cs
new file mode 100644
index 000000000000..246888b9423f
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Services/FakeChatCompletionService.cs
@@ -0,0 +1,28 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Runtime.CompilerServices;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+using Microsoft.SemanticKernel.Services;
+
+namespace QualityCheckWithFilters.Services;
+
+#pragma warning disable CS1998
+
+/// <summary>
+/// Fake chat completion service to simulate a call to an LLM and return a predefined result for demonstration purposes.
+/// </summary>
+internal sealed class FakeChatCompletionService(string modelId, string result) : IChatCompletionService
+{
+    public IReadOnlyDictionary<string, object?> Attributes => new Dictionary<string, object?> { [AIServiceExtensions.ModelIdKey] = modelId };
+
+    public Task<IReadOnlyList<ChatMessageContent>> GetChatMessageContentsAsync(ChatHistory chatHistory, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, CancellationToken cancellationToken = default)
+    {
+        return Task.FromResult<IReadOnlyList<ChatMessageContent>>([new(AuthorRole.Assistant, result)]);
+    }
+
+    public async IAsyncEnumerable<StreamingChatMessageContent> GetStreamingChatMessageContentsAsync(ChatHistory chatHistory, PromptExecutionSettings? executionSettings = null, Kernel? kernel = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
+    {
+        yield return new StreamingChatMessageContent(AuthorRole.Assistant, result);
+    }
+}
diff --git a/dotnet/samples/Demos/QualityCheck/README.md b/dotnet/samples/Demos/QualityCheck/README.md
new file mode 100644
index 000000000000..ae05bd35f42e
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/README.md
@@ -0,0 +1,76 @@
+# Quality Check with Filters
+
+This sample provides a practical demonstration of how to perform a quality check on LLM results for tasks such as text summarization and translation with Semantic Kernel Filters.
+
+Metrics used in this example:
+- [BERTScore](https://github.com/Tiiiger/bert_score) - leverages pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity.
+- [BLEU](https://en.wikipedia.org/wiki/BLEU) (BiLingual Evaluation Understudy) - evaluates the quality of text which has been machine-translated from one natural language to another.
+- [METEOR](https://en.wikipedia.org/wiki/METEOR) (Metric for Evaluation of Translation with Explicit ORdering) - evaluates the similarity between the generated summary and the reference summary, taking into account grammar and semantics.
+- [COMET](https://unbabel.github.io/COMET) (Crosslingual Optimized Metric for Evaluation of Translation) - an open-source framework used to train machine translation metrics that achieve high levels of correlation with different types of human judgments.
+
+In this example, SK Filters call a dedicated [server](./python-server/) which is responsible for task evaluation using the metrics described above. If the evaluation score of a specific metric doesn't meet the configured threshold, an exception is thrown with the evaluation details.
+
+The [Hugging Face Evaluate Metric](https://github.com/huggingface/evaluate) library is used to evaluate summarization and translation results.
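+On the .NET side, the check is wired up through dependency injection. The sketch below is condensed from `GetKernelBuilder` in `QualityCheckWithFilters/Program.cs` (`EvaluationService`, `FilterFactory` and `EvaluationScoreType` are types defined in this sample; the BERT metric and `0.85` threshold are just one possible configuration):
+
+```csharp
+var builder = Kernel.CreateBuilder();
+
+// HTTP client pointing to the local evaluation server.
+builder.Services.AddHttpClient("default", client => { client.BaseAddress = new Uri("http://localhost:8080"); });
+
+// Service that posts sources and generated texts to the selected metric endpoint (e.g. "bert-score").
+builder.Services.AddSingleton<EvaluationService>(
+    sp => new EvaluationService(
+        sp.GetRequiredService<IHttpClientFactory>().CreateClient("default"),
+        EvaluationScoreType.BERT.Endpoint));
+
+// Filter that scores each function result and throws a KernelException when the score is below the threshold.
+builder.Services.AddSingleton<IFunctionInvocationFilter>(
+    sp => FilterFactory.Create(
+        EvaluationScoreType.BERT,
+        sp.GetRequiredService<EvaluationService>(),
+        sp.GetRequiredService<ILogger<Program>>(),
+        threshold: 0.85));
+
+var kernel = builder.Build();
+```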
+
+## Prerequisites
+
+1. [Python 3.12](https://www.python.org/downloads/)
+2. Get a [Hugging Face API token](https://huggingface.co/docs/api-inference/en/quicktour#get-your-api-token).
+3. Accept the conditions to access the [Unbabel/wmt22-cometkiwi-da](https://huggingface.co/Unbabel/wmt22-cometkiwi-da) model on the Hugging Face portal.
+
+## Setup
+
+It's possible to run the Python server for task evaluation directly or with Docker.
+
+### Run server
+
+1. Open the Python server directory:
+```bash
+cd python-server
+```
+
+2. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/Scripts/activate # activate on Windows
+source venv/bin/activate # activate on Unix/macOS
+```
+
+3. Set up the Hugging Face API token:
+```bash
+pip install "huggingface_hub[cli]"
+huggingface-cli login --token <your_token>
+```
+
+4. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+5. Run the server:
+```bash
+cd app
+uvicorn main:app --port 8080 --reload
+```
+
+6. Open `http://localhost:8080/docs` and check the available endpoints.
+
+### Run server with Docker
+
+1. Open the Python server directory:
+```bash
+cd python-server
+```
+
+2. Create a `.env/hf_token.txt` file and put the Hugging Face API token in it.
+
+3. Build the image and run the container:
+```bash
+docker-compose up --build
+```
+
+4. Open `http://localhost:8080/docs` and check the available endpoints.
+
+## Testing
+
+Open and run `QualityCheckWithFilters/Program.cs` to experiment with different evaluation metrics, thresholds and input parameters.
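+
+For example, the quality gates can be tightened or relaxed by changing the threshold passed to each scenario in `Main` (the values below are illustrative; the sample ships with 0.85, 0.5, 0.1 and 0.4):
+
+```csharp
+// Each scenario registers the corresponding filter and throws when the score drops below the threshold.
+await SummarizationEvaluationAsync(EvaluationScoreType.BERT, threshold: 0.9);    // stricter than the default 0.85
+await SummarizationEvaluationAsync(EvaluationScoreType.METEOR, threshold: 0.05); // more lenient than the default 0.1
+await TranslationEvaluationAsync(threshold: 0.4);                                // COMET-based translation check
+```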
diff --git a/dotnet/samples/Demos/QualityCheck/python-server/Dockerfile b/dotnet/samples/Demos/QualityCheck/python-server/Dockerfile
new file mode 100644
index 000000000000..e270b2e08ab0
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/python-server/Dockerfile
@@ -0,0 +1,17 @@
+# syntax=docker/dockerfile:1.2
+FROM python:3.12
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install "huggingface_hub[cli]"
+RUN --mount=type=secret,id=hf_token \
+    huggingface-cli login --token $(cat /run/secrets/hf_token)
+
+RUN pip install cmake
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY ./app /code/app
+
+CMD ["fastapi", "run", "app/main.py", "--port", "80"]
diff --git a/dotnet/samples/Demos/QualityCheck/python-server/app/__init__.py b/dotnet/samples/Demos/QualityCheck/python-server/app/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/dotnet/samples/Demos/QualityCheck/python-server/app/main.py b/dotnet/samples/Demos/QualityCheck/python-server/app/main.py
new file mode 100644
index 000000000000..7a17f552da54
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/python-server/app/main.py
@@ -0,0 +1,40 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+from typing import List
+from pydantic import BaseModel
+
+from fastapi import FastAPI
+from evaluate import load
+from comet import download_model, load_from_checkpoint
+
+app = FastAPI()
+
+class SummarizationEvaluationRequest(BaseModel):
+    sources: List[str]
+    summaries: List[str]
+
+class TranslationEvaluationRequest(BaseModel):
+    sources: List[str]
+    translations: List[str]
+
+@app.post("/bert-score/")
+def bert_score(request: SummarizationEvaluationRequest):
+    bertscore = load("bertscore")
+    return bertscore.compute(predictions=request.summaries, references=request.sources, lang="en")
+
+@app.post("/meteor-score/")
+def meteor_score(request: SummarizationEvaluationRequest):
+    meteor = load("meteor")
+    return meteor.compute(predictions=request.summaries, references=request.sources)
+
+@app.post("/bleu-score/")
+def bleu_score(request: SummarizationEvaluationRequest):
+    bleu = load("bleu")
+    return bleu.compute(predictions=request.summaries, references=request.sources)
+
+@app.post("/comet-score/")
+def comet_score(request: TranslationEvaluationRequest):
+    model_path = download_model("Unbabel/wmt22-cometkiwi-da")
+    model = load_from_checkpoint(model_path)
+    data = [{"src": src, "mt": mt} for src, mt in zip(request.sources, request.translations)]
+    return model.predict(data, accelerator="cpu")
diff --git a/dotnet/samples/Demos/QualityCheck/python-server/docker-compose.yml b/dotnet/samples/Demos/QualityCheck/python-server/docker-compose.yml
new file mode 100644
index 000000000000..6701b53fadd8
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/python-server/docker-compose.yml
@@ -0,0 +1,16 @@
+version: '3.8'
+
+services:
+  quality-check:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      secrets:
+        - hf_token
+    ports:
+      - "8080:80"
+    secrets:
+      - hf_token
+secrets:
+  hf_token:
+    file: .env/hf_token.txt
diff --git a/dotnet/samples/Demos/QualityCheck/python-server/requirements.txt b/dotnet/samples/Demos/QualityCheck/python-server/requirements.txt
new file mode 100644
index 000000000000..24b95da19607
--- /dev/null
+++ b/dotnet/samples/Demos/QualityCheck/python-server/requirements.txt
@@ -0,0 +1,8 @@
+fastapi
+uvicorn
+pydantic
+bert_score
+nltk
+evaluate
+cmake
+unbabel-comet