-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
.Net: Summarization and translation evaluation examples with Filters (#…
…6262) ### Motivation and Context <!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, providing the following information: 1. Why is this change required? 2. What problem does it solve? 3. What scenario does it contribute to? 4. If it fixes an open issue, please link to the issue here. --> This example demonstrates how to perform quality checks on LLM results for tasks such as text summarization and translation with Semantic Kernel Filters. Metrics used in this example: - [BERTScore](https://github.com/Tiiiger/bert_score) - leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity. - [BLEU](https://en.wikipedia.org/wiki/BLEU) (BiLingual Evaluation Understudy) - evaluates the quality of text which has been machine-translated from one natural language to another. - [METEOR](https://en.wikipedia.org/wiki/METEOR) (Metric for Evaluation of Translation with Explicit ORdering) - evaluates the similarity between the generated summary and the reference summary, taking into account grammar and semantics. - [COMET](https://unbabel.github.io/COMET) (Crosslingual Optimized Metric for Evaluation of Translation) - is an open-source framework used to train Machine Translation metrics that achieve high levels of correlation with different types of human judgments. In this example, SK Filters call a dedicated server, which is responsible for task evaluation using the metrics described above. If the evaluation score of a specific metric doesn't meet the configured threshold, an exception is thrown with evaluation details. The [Hugging Face Evaluate Metric](https://github.com/huggingface/evaluate) library is used to evaluate summarization and translation results. 
### Contribution Checklist <!-- Before submitting this PR, please make sure: --> - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone 😄
- Loading branch information
1 parent
dbe6aa2
commit 51af5ee
Showing
20 changed files
with
754 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
...s/Demos/QualityCheck/QualityCheckWithFilters/Filters/BertSummarizationEvaluationFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.Extensions.Logging; | ||
using Microsoft.SemanticKernel; | ||
using QualityCheckWithFilters.Models; | ||
using QualityCheckWithFilters.Services; | ||
|
||
namespace QualityCheckWithFilters.Filters; | ||
|
||
/// <summary>
/// Filter which performs text summarization evaluation using BERTScore metric: https://huggingface.co/spaces/evaluate-metric/bertscore.
/// Evaluation result contains three values: precision, recall and F1 score.
/// The higher F1 score - the better the quality of the summary.
/// </summary>
/// <param name="evaluationService">Service which sends the evaluation request and returns the metric values.</param>
/// <param name="logger">Logger which receives the rounded precision, recall and F1 values.</param>
/// <param name="threshold">Minimum acceptable F1 score; the rounded F1 value is compared against it.</param>
internal sealed class BertSummarizationEvaluationFilter(
    EvaluationService evaluationService,
    ILogger logger,
    double threshold) : IFunctionInvocationFilter
{
    /// <summary>
    /// Runs the rest of the pipeline, then evaluates the function result against the rendered prompt.
    /// </summary>
    /// <param name="context">Invocation context whose result provides the rendered prompt and the summary.</param>
    /// <param name="next">Delegate which continues the invocation; it is awaited before the evaluation starts.</param>
    /// <exception cref="KernelException">
    /// Thrown when the result has no rendered prompt, when the evaluation response lacks metric values,
    /// or when the rounded F1 score is lower than the threshold.
    /// </exception>
    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
    {
        await next(context);

        // RenderedPrompt is nullable; fail with a clear message instead of sending a null source text.
        // NOTE(review): presumably it is null for functions which are not prompt-based - confirm.
        var sourceText = context.Result.RenderedPrompt
            ?? throw new KernelException("BERT summary evaluation requires a rendered prompt, but the function result does not contain one");
        var summary = context.Result.ToString();

        var request = new SummarizationEvaluationRequest { Sources = [sourceText], Summaries = [summary] };
        var response = await evaluationService.EvaluateAsync<SummarizationEvaluationRequest, BertSummarizationEvaluationResponse>(request);

        // A single source/summary pair is sent, so only the first value of each metric is read.
        // The list patterns also reject null or empty lists, which would otherwise fail on the indexer.
        if (response.Precision is not [var rawPrecision, ..] ||
            response.Recall is not [var rawRecall, ..] ||
            response.F1 is not [var rawF1, ..])
        {
            throw new KernelException("BERT summary evaluation response does not contain precision, recall and F1 values");
        }

        var precision = Math.Round(rawPrecision, 4);
        var recall = Math.Round(rawRecall, 4);
        var f1 = Math.Round(rawF1, 4);

        logger.LogInformation("[BERT] Precision: {Precision}, Recall: {Recall}, F1: {F1}", precision, recall, f1);

        if (f1 < threshold)
        {
            throw new KernelException($"BERT summary evaluation score ({f1}) is lower than threshold ({threshold})");
        }
    }
}
46 changes: 46 additions & 0 deletions
46
...s/Demos/QualityCheck/QualityCheckWithFilters/Filters/BleuSummarizationEvaluationFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.Extensions.Logging; | ||
using Microsoft.SemanticKernel; | ||
using QualityCheckWithFilters.Models; | ||
using QualityCheckWithFilters.Services; | ||
|
||
namespace QualityCheckWithFilters.Filters; | ||
|
||
/// <summary>
/// Filter which performs text summarization evaluation using BLEU metric: https://huggingface.co/spaces/evaluate-metric/bleu.
/// Evaluation result contains values like score, precisions, brevity penalty and length ratio.
/// The closer the score and precision values are to 1 - the better the quality of the summary.
/// </summary>
/// <param name="evaluationService">Service which sends the evaluation request and returns the metric values.</param>
/// <param name="logger">Logger which receives the rounded metric values.</param>
/// <param name="threshold">Minimum acceptable value; the first rounded precision value is compared against it.</param>
internal sealed class BleuSummarizationEvaluationFilter(
    EvaluationService evaluationService,
    ILogger logger,
    double threshold) : IFunctionInvocationFilter
{
    /// <summary>
    /// Runs the rest of the pipeline, then evaluates the function result against the rendered prompt.
    /// </summary>
    /// <param name="context">Invocation context whose result provides the rendered prompt and the summary.</param>
    /// <param name="next">Delegate which continues the invocation; it is awaited before the evaluation starts.</param>
    /// <exception cref="KernelException">
    /// Thrown when the result has no rendered prompt, when the evaluation response has no precision values,
    /// or when the first rounded precision value is lower than the threshold.
    /// </exception>
    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
    {
        await next(context);

        // RenderedPrompt is nullable; fail with a clear message instead of sending a null source text.
        // NOTE(review): presumably it is null for functions which are not prompt-based - confirm.
        var sourceText = context.Result.RenderedPrompt
            ?? throw new KernelException("BLEU summary evaluation requires a rendered prompt, but the function result does not contain one");
        var summary = context.Result.ToString();

        var request = new SummarizationEvaluationRequest { Sources = [sourceText], Summaries = [summary] };
        var response = await evaluationService.EvaluateAsync<SummarizationEvaluationRequest, BleuSummarizationEvaluationResponse>(request);

        // The threshold check reads precisions[0], so a null or empty list must be rejected up front.
        if (response.Precisions is not { Count: > 0 })
        {
            throw new KernelException("BLEU summary evaluation response does not contain precision values");
        }

        var score = Math.Round(response.Score, 4);
        var precisions = response.Precisions.Select(value => Math.Round(value, 4)).ToList();
        var brevityPenalty = Math.Round(response.BrevityPenalty, 4);
        var lengthRatio = Math.Round(response.LengthRatio, 4);

        logger.LogInformation("[BLEU] Score: {Score}, Precisions: {Precisions}, Brevity penalty: {BrevityPenalty}, Length Ratio: {LengthRatio}",
            score,
            string.Join(", ", precisions),
            brevityPenalty,
            lengthRatio);

        // The first precision value (not the overall score) is the one compared against the threshold.
        if (precisions[0] < threshold)
        {
            throw new KernelException($"BLEU summary evaluation score ({precisions[0]}) is lower than threshold ({threshold})");
        }
    }
}
40 changes: 40 additions & 0 deletions
40
...es/Demos/QualityCheck/QualityCheckWithFilters/Filters/CometTranslationEvaluationFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.Extensions.Logging; | ||
using Microsoft.SemanticKernel; | ||
using QualityCheckWithFilters.Models; | ||
using QualityCheckWithFilters.Services; | ||
|
||
namespace QualityCheckWithFilters.Filters; | ||
|
||
/// <summary>
/// Filter which performs text translation evaluation using COMET metric: https://huggingface.co/Unbabel/wmt22-cometkiwi-da.
/// COMET score ranges from 0 to 1, where higher values indicate better translation.
/// </summary>
/// <param name="evaluationService">Service which sends the evaluation request and returns the metric values.</param>
/// <param name="logger">Logger which receives the translation and the rounded score.</param>
/// <param name="threshold">Minimum acceptable score; the first rounded score is compared against it.</param>
internal sealed class CometTranslationEvaluationFilter(
    EvaluationService evaluationService,
    ILogger logger,
    double threshold) : IFunctionInvocationFilter
{
    /// <summary>
    /// Runs the rest of the pipeline, then evaluates the function result against the rendered prompt.
    /// </summary>
    /// <param name="context">Invocation context whose result provides the rendered prompt and the translation.</param>
    /// <param name="next">Delegate which continues the invocation; it is awaited before the evaluation starts.</param>
    /// <exception cref="KernelException">
    /// Thrown when the result has no rendered prompt, when the evaluation response has no scores,
    /// or when the first rounded score is lower than the threshold.
    /// </exception>
    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
    {
        await next(context);

        // RenderedPrompt is nullable; fail with a clear message instead of sending a null source text.
        // NOTE(review): presumably it is null for functions which are not prompt-based - confirm.
        var sourceText = context.Result.RenderedPrompt
            ?? throw new KernelException("COMET translation evaluation requires a rendered prompt, but the function result does not contain one");
        var translation = context.Result.ToString();

        logger.LogInformation("Translation: {Translation}", translation);

        var request = new TranslationEvaluationRequest { Sources = [sourceText], Translations = [translation] };
        var response = await evaluationService.EvaluateAsync<TranslationEvaluationRequest, CometTranslationEvaluationResponse>(request);

        // A single source/translation pair is sent, so only the first score is read.
        // The list pattern also rejects a null or empty list, which would otherwise fail on the indexer.
        if (response.Scores is not [var rawScore, ..])
        {
            throw new KernelException("COMET translation evaluation response does not contain scores");
        }

        var score = Math.Round(rawScore, 4);

        logger.LogInformation("[COMET] Score: {Score}", score);

        if (score < threshold)
        {
            throw new KernelException($"COMET translation evaluation score ({score}) is lower than threshold ({threshold})");
        }
    }
}
25 changes: 25 additions & 0 deletions
25
dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Filters/FilterFactory.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.Extensions.Logging; | ||
using Microsoft.SemanticKernel; | ||
using QualityCheckWithFilters.Models; | ||
using QualityCheckWithFilters.Services; | ||
|
||
namespace QualityCheckWithFilters.Filters; | ||
|
||
/// <summary>
/// Creates function invocation filters which match a given evaluation score type.
/// </summary>
internal sealed class FilterFactory
{
    /// <summary>Lookup from score type to the delegate which constructs the corresponding filter.</summary>
    private static readonly Dictionary<EvaluationScoreType, Func<EvaluationService, ILogger, double, IFunctionInvocationFilter>> s_filters = new()
    {
        { EvaluationScoreType.BERT, static (svc, log, minScore) => new BertSummarizationEvaluationFilter(svc, log, minScore) },
        { EvaluationScoreType.BLEU, static (svc, log, minScore) => new BleuSummarizationEvaluationFilter(svc, log, minScore) },
        { EvaluationScoreType.METEOR, static (svc, log, minScore) => new MeteorSummarizationEvaluationFilter(svc, log, minScore) },
        { EvaluationScoreType.COMET, static (svc, log, minScore) => new CometTranslationEvaluationFilter(svc, log, minScore) },
    };

    /// <summary>
    /// Builds the filter registered for <paramref name="type"/>.
    /// </summary>
    /// <param name="type">Score type which selects the filter implementation.</param>
    /// <param name="evaluationService">Evaluation service passed to the filter.</param>
    /// <param name="logger">Logger passed to the filter.</param>
    /// <param name="threshold">Score threshold passed to the filter.</param>
    /// <returns>A new filter instance for the requested score type.</returns>
    /// <exception cref="KeyNotFoundException">Thrown by the dictionary lookup when no filter is registered for <paramref name="type"/>.</exception>
    public static IFunctionInvocationFilter Create(EvaluationScoreType type, EvaluationService evaluationService, ILogger logger, double threshold)
    {
        var createFilter = s_filters[type];
        return createFilter(evaluationService, logger, threshold);
    }
}
38 changes: 38 additions & 0 deletions
38
...Demos/QualityCheck/QualityCheckWithFilters/Filters/MeteorSummarizationEvaluationFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.Extensions.Logging; | ||
using Microsoft.SemanticKernel; | ||
using QualityCheckWithFilters.Models; | ||
using QualityCheckWithFilters.Services; | ||
|
||
namespace QualityCheckWithFilters.Filters; | ||
|
||
/// <summary>
/// Filter which performs text summarization evaluation using METEOR metric: https://huggingface.co/spaces/evaluate-metric/meteor.
/// METEOR score ranges from 0 to 1, where higher values indicate better similarity between original text and generated summary.
/// </summary>
/// <param name="evaluationService">Service which sends the evaluation request and returns the metric value.</param>
/// <param name="logger">Logger which receives the rounded score.</param>
/// <param name="threshold">Minimum acceptable score; the rounded score is compared against it.</param>
internal sealed class MeteorSummarizationEvaluationFilter(
    EvaluationService evaluationService,
    ILogger logger,
    double threshold) : IFunctionInvocationFilter
{
    /// <summary>
    /// Runs the rest of the pipeline, then evaluates the function result against the rendered prompt.
    /// </summary>
    /// <param name="context">Invocation context whose result provides the rendered prompt and the summary.</param>
    /// <param name="next">Delegate which continues the invocation; it is awaited before the evaluation starts.</param>
    /// <exception cref="KernelException">
    /// Thrown when the result has no rendered prompt or when the rounded score is lower than the threshold.
    /// </exception>
    public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
    {
        await next(context);

        // RenderedPrompt is nullable; fail with a clear message instead of sending a null source text.
        // NOTE(review): presumably it is null for functions which are not prompt-based - confirm.
        var sourceText = context.Result.RenderedPrompt
            ?? throw new KernelException("METEOR summary evaluation requires a rendered prompt, but the function result does not contain one");
        var summary = context.Result.ToString();

        var request = new SummarizationEvaluationRequest { Sources = [sourceText], Summaries = [summary] };
        var response = await evaluationService.EvaluateAsync<SummarizationEvaluationRequest, MeteorSummarizationEvaluationResponse>(request);

        var score = Math.Round(response.Score, 4);

        logger.LogInformation("[METEOR] Score: {Score}", score);

        if (score < threshold)
        {
            throw new KernelException($"METEOR summary evaluation score ({score}) is lower than threshold ({threshold})");
        }
    }
}
26 changes: 26 additions & 0 deletions
26
dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationRequest.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Text.Json.Serialization; | ||
|
||
namespace QualityCheckWithFilters.Models; | ||
|
||
/// <summary>Base request model with source texts.</summary>
internal class EvaluationRequest
{
    /// <summary>
    /// Source texts, serialized as the <c>sources</c> JSON property.
    /// Initialized to an empty list so the non-nullable property is never null by default.
    /// </summary>
    [JsonPropertyName("sources")]
    public List<string> Sources { get; set; } = [];
}

/// <summary>Request model with generated summaries.</summary>
internal sealed class SummarizationEvaluationRequest : EvaluationRequest
{
    /// <summary>
    /// Generated summaries, serialized as the <c>summaries</c> JSON property.
    /// Initialized to an empty list so the non-nullable property is never null by default.
    /// </summary>
    [JsonPropertyName("summaries")]
    public List<string> Summaries { get; set; } = [];
}

/// <summary>Request model with generated translations.</summary>
internal sealed class TranslationEvaluationRequest : EvaluationRequest
{
    /// <summary>
    /// Generated translations, serialized as the <c>translations</c> JSON property.
    /// Initialized to an empty list so the non-nullable property is never null by default.
    /// </summary>
    [JsonPropertyName("translations")]
    public List<string> Translations { get; set; } = [];
}
51 changes: 51 additions & 0 deletions
51
dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationResponse.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Text.Json.Serialization; | ||
|
||
namespace QualityCheckWithFilters.Models; | ||
|
||
/// <summary>Response model for BERTScore metric: https://huggingface.co/spaces/evaluate-metric/bertscore.</summary>
internal sealed class BertSummarizationEvaluationResponse
{
    /// <summary>Precision values read from the <c>precision</c> JSON property; empty when the property is absent.</summary>
    [JsonPropertyName("precision")]
    public List<double> Precision { get; set; } = [];

    /// <summary>Recall values read from the <c>recall</c> JSON property; empty when the property is absent.</summary>
    [JsonPropertyName("recall")]
    public List<double> Recall { get; set; } = [];

    /// <summary>F1 values read from the <c>f1</c> JSON property; empty when the property is absent.</summary>
    [JsonPropertyName("f1")]
    public List<double> F1 { get; set; } = [];
}
|
||
/// <summary>Response model for BLEU metric: https://huggingface.co/spaces/evaluate-metric/bleu.</summary>
internal sealed class BleuSummarizationEvaluationResponse
{
    /// <summary>Overall score read from the <c>bleu</c> JSON property.</summary>
    [JsonPropertyName("bleu")]
    public double Score { get; set; }

    /// <summary>Precision values read from the <c>precisions</c> JSON property; empty when the property is absent.</summary>
    [JsonPropertyName("precisions")]
    public List<double> Precisions { get; set; } = [];

    /// <summary>Value read from the <c>brevity_penalty</c> JSON property.</summary>
    [JsonPropertyName("brevity_penalty")]
    public double BrevityPenalty { get; set; }

    /// <summary>Value read from the <c>length_ratio</c> JSON property.</summary>
    [JsonPropertyName("length_ratio")]
    public double LengthRatio { get; set; }
}
|
||
/// <summary>
/// Response model for METEOR metric: https://huggingface.co/spaces/evaluate-metric/meteor.
/// </summary>
internal sealed class MeteorSummarizationEvaluationResponse
{
    /// <summary>Metric value read from the <c>meteor</c> JSON property.</summary>
    [JsonPropertyName("meteor")]
    public double Score { get; set; }
}
|
||
/// <summary>Response model for COMET metric: https://huggingface.co/Unbabel/wmt22-cometkiwi-da.</summary>
internal sealed class CometTranslationEvaluationResponse
{
    /// <summary>Score values read from the <c>scores</c> JSON property; empty when the property is absent.</summary>
    [JsonPropertyName("scores")]
    public List<double> Scores { get; set; } = [];

    /// <summary>Value read from the <c>system_score</c> JSON property.</summary>
    [JsonPropertyName("system_score")]
    public double SystemScore { get; set; }
}
33 changes: 33 additions & 0 deletions
33
dotnet/samples/Demos/QualityCheck/QualityCheckWithFilters/Models/EvaluationScoreType.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Diagnostics.CodeAnalysis; | ||
|
||
namespace QualityCheckWithFilters.Models; | ||
|
||
/// <summary>
/// Internal representation of evaluation score type to configure and run examples.
/// Two values are equal when their <see cref="Endpoint"/> strings match, ignoring case.
/// </summary>
/// <param name="endpoint">Endpoint string which identifies the score type.</param>
internal readonly struct EvaluationScoreType(string endpoint) : IEquatable<EvaluationScoreType>
{
    /// <summary>Endpoint string of this score type; null for the default struct value.</summary>
    public string Endpoint { get; } = endpoint;

    // The well-known instances are readonly so they cannot be reassigned at runtime.

    /// <summary>Score type with the <c>bert-score</c> endpoint.</summary>
    public static readonly EvaluationScoreType BERT = new("bert-score");

    /// <summary>Score type with the <c>bleu-score</c> endpoint.</summary>
    public static readonly EvaluationScoreType BLEU = new("bleu-score");

    /// <summary>Score type with the <c>meteor-score</c> endpoint.</summary>
    public static readonly EvaluationScoreType METEOR = new("meteor-score");

    /// <summary>Score type with the <c>comet-score</c> endpoint.</summary>
    public static readonly EvaluationScoreType COMET = new("comet-score");

    /// <summary>Returns true when both values have the same endpoint, ignoring case.</summary>
    public static bool operator ==(EvaluationScoreType left, EvaluationScoreType right) => left.Equals(right);

    /// <summary>Returns true when the values have different endpoints, ignoring case.</summary>
    public static bool operator !=(EvaluationScoreType left, EvaluationScoreType right) => !(left == right);

    /// <inheritdoc/>
    public override bool Equals([NotNullWhen(true)] object? obj) => obj is EvaluationScoreType other && this.Equals(other);

    /// <inheritdoc/>
    public bool Equals(EvaluationScoreType other) => string.Equals(this.Endpoint, other.Endpoint, StringComparison.OrdinalIgnoreCase);

    /// <inheritdoc/>
    // Uses the same case-insensitive comparison as Equals; a null endpoint hashes like an empty string.
    public override int GetHashCode() => StringComparer.OrdinalIgnoreCase.GetHashCode(this.Endpoint ?? string.Empty);

    /// <inheritdoc/>
    public override string ToString() => this.Endpoint ?? string.Empty;
}
Oops, something went wrong.