Skip to content

Commit

Permalink
Refactor Encoding to split fast-path and fallback logic (dotnet/corec…
Browse files Browse the repository at this point in the history
…lr#23098)

This refactoring is limited to ASCIIEncoding at the moment, but it can easily be applied to UTF-8 / UTF-16 / UTF-32.

High-level changes:
- Fallback logic has been split from the fast-path, improving performance of GetBytes and similar routines.
- All of the plumbing of when to invoke the fallback logic and how to manage leftover data has been moved into the base class.
- Almost all of the logic except for the fast-path is now written in terms of verifiable code (Span and ReadOnlySpan).
- Minor bug fixes in EncoderNLS.Convert (see https://github.com/dotnet/coreclr/issues/23020).

Commit migrated from dotnet/coreclr@43a5159
  • Loading branch information
GrabYourPitchforks committed Mar 11, 2019
1 parent 8d27a06 commit 2883820
Show file tree
Hide file tree
Showing 13 changed files with 2,584 additions and 658 deletions.
22 changes: 22 additions & 0 deletions src/coreclr/tests/CoreFX/CoreFX.issues.json
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,28 @@
]
}
},
{
"name": "System.Text.Encoding.Tests",
"enabled": true,
"exclusions": {
"namespaces": null,
"classes": null,
"methods": [
{
"name": "System.Text.Tests.EncoderConvert2.EncoderASCIIConvertMixedASCIIUnicodeCharArrayPartial",
"reason": "https://github.com/dotnet/coreclr/issues/23020"
},
{
"name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertMixedASCIIUnicodeCharArrayPartial",
"reason": "https://github.com/dotnet/coreclr/issues/23020"
},
{
"name": "System.Text.Tests.EncoderConvert2.EncoderUTF8ConvertUnicodeCharArrayPartial",
"reason": "https://github.com/dotnet/coreclr/issues/23020"
}
]
}
},
{
"name": "System.Text.RegularExpressions.Tests",
"enabled": true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\StringSplitOptions.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIEncoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ASCIIUtility.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\StringBuilderCache.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\CodePageDataItem.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Decoder.cs" />
Expand All @@ -776,6 +777,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncoderFallback.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncoderReplacementFallback.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Encoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Encoding.Internal.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingData.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingInfo.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\EncodingNLS.cs" />
Expand Down
4 changes: 2 additions & 2 deletions src/libraries/System.Private.CoreLib/src/System/String.cs
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ public static bool IsNullOrWhiteSpace(string value)
Debug.Assert(byteLength >= 0);

// Get our string length
int stringLength = encoding.GetCharCount(bytes, byteLength, null);
int stringLength = encoding.GetCharCount(bytes, byteLength);
Debug.Assert(stringLength >= 0, "stringLength >= 0");

// They gave us an empty string if they needed one
Expand All @@ -491,7 +491,7 @@ public static bool IsNullOrWhiteSpace(string value)
string s = FastAllocateString(stringLength);
fixed (char* pTempChars = &s._firstChar)
{
int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength, null);
int doubleCheck = encoding.GetChars(bytes, byteLength, pTempChars, stringLength);
Debug.Assert(stringLength == doubleCheck,
"Expected encoding.GetChars to return same length as encoding.GetCharCount");
}
Expand Down
1,128 changes: 520 additions & 608 deletions src/libraries/System.Private.CoreLib/src/System/Text/ASCIIEncoding.cs

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Runtime.CompilerServices;

namespace System.Text
{
/*
* Contains naive unoptimized (non-SIMD) implementations of ASCII transcoding
* operations. Vectorized methods can be substituted here as a drop-in replacement.
*/

internal unsafe static class ASCIIUtility
{
[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
public static uint GetIndexOfFirstNonAsciiByte(byte* pBytes, uint byteCount)
{
uint idx = 0;
for (; idx < byteCount; idx++)
{
if ((sbyte)pBytes[idx] < 0)
{
break;
}
}
return idx;
}

[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
public static uint GetIndexOfFirstNonAsciiChar(char* pChars, uint charCount)
{
uint idx = 0;
for (; idx < charCount; idx++)
{
if (pChars[idx] > 0x7Fu)
{
break;
}
}
return idx;
}

[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
public static uint NarrowUtf16ToAscii(char* pChars, byte* pBytes, uint elementCount)
{
uint idx = 0;
for (; idx < elementCount; idx++)
{
uint ch = pChars[idx];
if (ch > 0x7Fu)
{
break;
}
pBytes[idx] = (byte)ch;
}
return idx;
}

[MethodImpl(MethodImplOptions.NoInlining)] // the actual implementation won't be inlined, so this shouldn't be either, lest it throw off benchmarks
public static uint WidenAsciiToUtf16(byte* pBytes, char* pChars, uint elementCount)
{
uint idx = 0;
for (; idx < elementCount; idx++)
{
byte b = pBytes[idx];
if (b > 0x7F)
{
break;
}
pChars[idx] = (char)b;
}
return idx;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ public virtual void Reset()
internal unsafe byte* byteStart;
internal unsafe char* charEnd;

internal Encoding _encoding;
internal DecoderNLS _decoder;
private int _originalByteCount;

// Internal Reset
internal unsafe void InternalReset()
{
Expand All @@ -82,6 +86,22 @@ internal unsafe void InternalInitialize(byte* byteStart, char* charEnd)
this.charEnd = charEnd;
}

internal static DecoderFallbackBuffer CreateAndInitialize(Encoding encoding, DecoderNLS decoder, int originalByteCount)
{
// The original byte count is only used for keeping track of what 'index' value needs
// to be passed to the abstract Fallback method. The index value is calculated by subtracting
// 'bytes.Length' (where bytes is expected to be the entire remaining input buffer)
// from the 'originalByteCount' value specified here.

DecoderFallbackBuffer fallbackBuffer = (decoder is null) ? encoding.DecoderFallback.CreateFallbackBuffer() : decoder.FallbackBuffer;

fallbackBuffer._encoding = encoding;
fallbackBuffer._decoder = decoder;
fallbackBuffer._originalByteCount = originalByteCount;

return fallbackBuffer;
}

// Fallback the current byte by sticking it into the remaining char buffer.
// This can only be called by our encodings (other have to use the public fallback methods), so
// we can use our DecoderNLS here too (except we don't).
Expand Down Expand Up @@ -191,6 +211,90 @@ internal unsafe virtual int InternalFallback(byte[] bytes, byte* pBytes)
return 0;
}

internal int InternalFallbackGetCharCount(ReadOnlySpan<byte> remainingBytes, int fallbackLength)
{
return (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length))
? DrainRemainingDataForGetCharCount()
: 0;
}

internal bool TryInternalFallbackGetChars(ReadOnlySpan<byte> remainingBytes, int fallbackLength, Span<char> chars, out int charsWritten)
{
if (Fallback(remainingBytes.Slice(0, fallbackLength).ToArray(), index: _originalByteCount - remainingBytes.Length))
{
return TryDrainRemainingDataForGetChars(chars, out charsWritten);
}
else
{
// Return true because we weren't asked to write anything, so this is a "success" in the sense that
// the output buffer was large enough to hold the desired 0 chars of output.

charsWritten = 0;
return true;
}
}

private Rune GetNextRune()
{
// Call GetNextChar() and try treating it as a non-surrogate character.
// If that fails, call GetNextChar() again and attempt to treat the two chars
// as a surrogate pair. If that still fails, throw an exception since the fallback
// mechanism is giving us a bad replacement character.

Rune rune;
char ch = GetNextChar();
if (!Rune.TryCreate(ch, out rune) && !Rune.TryCreate(ch, GetNextChar(), out rune))
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex);
}

return rune;
}

internal int DrainRemainingDataForGetCharCount()
{
int totalCharCount = 0;

Rune thisRune;
while ((thisRune = GetNextRune()).Value != 0)
{
// We need to check for overflow while tallying the fallback char count.

totalCharCount += thisRune.Utf16SequenceLength;
if (totalCharCount < 0)
{
InternalReset();
Encoding.ThrowConversionOverflow();
}
}

return totalCharCount;
}

internal bool TryDrainRemainingDataForGetChars(Span<char> chars, out int charsWritten)
{
int originalCharCount = chars.Length;

Rune thisRune;
while ((thisRune = GetNextRune()).Value != 0)
{
if (thisRune.TryEncode(chars, out int charsWrittenJustNow))
{
chars = chars.Slice(charsWrittenJustNow);
continue;
}
else
{
InternalReset();
charsWritten = default;
return false;
}
}

charsWritten = originalCharCount - chars.Length;
return true;
}

// private helper methods
internal void ThrowLastBytesRecursive(byte[] bytesUnknown)
{
Expand Down
Loading

0 comments on commit 2883820

Please sign in to comment.