Skip to content

Commit

Permalink
.Net: Loosen TextChunker's lines input type (#5502)
Browse files Browse the repository at this point in the history
It currently requires a `List<string>`. This is both annoying and
unnecessary.

I also removed duplicative experimental attributes.

(Note this is a binary breaking change, not source breaking, but the
type is marked experimental.)
  • Loading branch information
stephentoub committed Mar 19, 2024
1 parent 2a06539 commit 0a44a66
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 14 deletions.
18 changes: 18 additions & 0 deletions dotnet/src/SemanticKernel.Core/CompatibilitySuppressions.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- https://learn.microsoft.com/en-us/dotnet/fundamentals/package-validation/diagnostic-ids -->
<!--
  Baseline suppressions for the netstandard2.0 build of Microsoft.SemanticKernel.Core.
  Both entries target TextChunker overloads whose first parameter is List<string>.
  NOTE(review): CP0002 is presumably the "member exists on the left but not on the right"
  diagnostic; confirm against the diagnostic-ids page linked above.
-->
<Suppressions xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<!-- TextChunker.SplitMarkdownParagraphs(List<string>, int, int, string, TokenCounter) -->
<Suppression>
<DiagnosticId>CP0002</DiagnosticId>
<Target>M:Microsoft.SemanticKernel.Text.TextChunker.SplitMarkdownParagraphs(System.Collections.Generic.List{System.String},System.Int32,System.Int32,System.String,Microsoft.SemanticKernel.Text.TextChunker.TokenCounter)</Target>
<Left>lib/netstandard2.0/Microsoft.SemanticKernel.Core.dll</Left>
<Right>lib/netstandard2.0/Microsoft.SemanticKernel.Core.dll</Right>
<IsBaselineSuppression>true</IsBaselineSuppression>
</Suppression>
<!-- TextChunker.SplitPlainTextParagraphs(List<string>, int, int, string, TokenCounter) -->
<Suppression>
<DiagnosticId>CP0002</DiagnosticId>
<Target>M:Microsoft.SemanticKernel.Text.TextChunker.SplitPlainTextParagraphs(System.Collections.Generic.List{System.String},System.Int32,System.Int32,System.String,Microsoft.SemanticKernel.Text.TextChunker.TokenCounter)</Target>
<Left>lib/netstandard2.0/Microsoft.SemanticKernel.Core.dll</Left>
<Right>lib/netstandard2.0/Microsoft.SemanticKernel.Core.dll</Right>
<IsBaselineSuppression>true</IsBaselineSuppression>
</Suppression>
</Suppressions>
19 changes: 5 additions & 14 deletions dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ public static class TextChunker
/// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
/// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
/// <returns>List of lines.</returns>
[Experimental("SKEXP0050")]
public static List<string> SplitPlainTextLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null)
{
    // Forward to the shared line splitter with trimming enabled and the plain-text split options.
    return InternalSplitLines(text, maxTokensPerLine, trim: true, s_plaintextSplitOptions, tokenCounter);
}

Expand All @@ -46,7 +45,6 @@ public static class TextChunker
/// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
/// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
/// <returns>List of lines.</returns>
[Experimental("SKEXP0050")]
public static List<string> SplitMarkDownLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null)
{
    // Forward to the shared line splitter with trimming enabled and the markdown split options.
    return InternalSplitLines(text, maxTokensPerLine, trim: true, s_markdownSplitOptions, tokenCounter);
}

Expand All @@ -59,8 +57,7 @@ public static class TextChunker
/// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
/// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
/// <returns>List of paragraphs.</returns>
[Experimental("SKEXP0050")]
public static List<string> SplitPlainTextParagraphs(List<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null) =>
public static List<string> SplitPlainTextParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null)
{
    // Forward to the shared paragraph routine. The lambda is supplied as the long-lines
    // splitter; it splits without trimming, using the plain-text split options.
    return InternalSplitTextParagraphs(
        lines,
        maxTokensPerParagraph,
        overlapTokens,
        chunkHeader,
        static (text, maxTokens, tokenCounter) => InternalSplitLines(text, maxTokens, trim: false, s_plaintextSplitOptions, tokenCounter),
        tokenCounter);
}

/// <summary>
Expand All @@ -72,12 +69,10 @@ public static class TextChunker
/// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
/// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
/// <returns>List of paragraphs.</returns>
[Experimental("SKEXP0050")]
public static List<string> SplitMarkdownParagraphs(List<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null) =>
public static List<string> SplitMarkdownParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null)
{
    // Forward to the shared paragraph routine. The lambda is supplied as the long-lines
    // splitter; it splits without trimming, using the markdown split options.
    return InternalSplitTextParagraphs(
        lines,
        maxTokensPerParagraph,
        overlapTokens,
        chunkHeader,
        static (text, maxTokens, tokenCounter) => InternalSplitLines(text, maxTokens, trim: false, s_markdownSplitOptions, tokenCounter),
        tokenCounter);
}

[Experimental("SKEXP0050")]
private static List<string> InternalSplitTextParagraphs(List<string> lines, int maxTokensPerParagraph, int overlapTokens, string? chunkHeader, Func<string, int, TokenCounter?, List<string>> longLinesSplitter, TokenCounter? tokenCounter)
private static List<string> InternalSplitTextParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens, string? chunkHeader, Func<string, int, TokenCounter?, List<string>> longLinesSplitter, TokenCounter? tokenCounter)
{
if (maxTokensPerParagraph <= 0)
{
Expand All @@ -89,7 +84,8 @@ private static List<string> InternalSplitTextParagraphs(List<string> lines, int
throw new ArgumentException("overlapTokens cannot be larger than maxTokensPerParagraph", nameof(maxTokensPerParagraph));
}

if (lines.Count == 0)
// Optimize empty inputs if we can efficiently determine they're empty
if (lines is ICollection<string> c && c.Count == 0)
{
return new List<string>();
}
Expand All @@ -106,7 +102,6 @@ private static List<string> InternalSplitTextParagraphs(List<string> lines, int
return processedParagraphs;
}

[Experimental("SKEXP0050")]
private static List<string> BuildParagraph(IEnumerable<string> truncatedLines, int maxTokensPerParagraph, TokenCounter? tokenCounter)
{
StringBuilder paragraphBuilder = new();
Expand Down Expand Up @@ -147,7 +142,6 @@ private static List<string> BuildParagraph(IEnumerable<string> truncatedLines, i
return paragraphs;
}

[Experimental("SKEXP0050")]
private static List<string> ProcessParagraphs(List<string> paragraphs, int adjustedMaxTokensPerParagraph, int overlapTokens, string? chunkHeader, Func<string, int, TokenCounter?, List<string>> longLinesSplitter, TokenCounter? tokenCounter)
{
// distribute text more evenly in the last paragraphs when the last paragraph is too short.
Expand Down Expand Up @@ -212,7 +206,6 @@ private static List<string> ProcessParagraphs(List<string> paragraphs, int adjus
return processedParagraphs;
}

[Experimental("SKEXP0050")]
private static List<string> InternalSplitLines(string text, int maxTokensPerLine, bool trim, string?[] splitOptions, TokenCounter? tokenCounter)
{
var result = new List<string>();
Expand All @@ -233,7 +226,6 @@ private static List<string> InternalSplitLines(string text, int maxTokensPerLine
return result;
}

[Experimental("SKEXP0050")]
private static (List<string>, bool) Split(List<string> input, int maxTokens, ReadOnlySpan<char> separators, bool trim, TokenCounter? tokenCounter)
{
bool inputWasSplit = false;
Expand All @@ -248,7 +240,6 @@ private static (List<string>, bool) Split(List<string> input, int maxTokens, Rea
return (result, inputWasSplit);
}

[Experimental("SKEXP0050")]
private static (List<string>, bool) Split(ReadOnlySpan<char> input, string? inputString, int maxTokens, ReadOnlySpan<char> separators, bool trim, TokenCounter? tokenCounter)
{
Debug.Assert(inputString is null || input.SequenceEqual(inputString.AsSpan()));
Expand Down

0 comments on commit 0a44a66

Please sign in to comment.