Vectorized HttpUserAgentParser.TryExtractVersion #79

gfoidl · 2025-09-06T20:04:23Z

Fixes #73

I tried quite a couple of variants (more than in the below benchmark code), but it's really hard to beat the scalar implementation, as the code is quite trivial and straightforward for the cpu to prefetch, etc.
So for vectorization something along the comment in

HttpUserAgentParser/src/HttpUserAgentParser/HttpUserAgentParser.cs

Lines 212 to 219 in 57bfa71

    
           // Vectorization is used in an optimistic way and specialized to common (trimmed down) user agents. 
        
           // When the first two char-vectors don't yield any success, we fall back to the scalar path. 
        
           // This penalizes not-found versions, but has an advantage for found versions. 
        
           // Vector512 is left out, because there are no common inputs with length 128 or more. 
        
           // 
        
           // Two short (same size as char) vectors are read, then packed to byte vectors on which the 
        
           // operation is done. For short / char the higher byte is not of interest and zero or outside 
        
           // the target characters, thus with bytes we can process twice as much elements at once.

is used.

benchmark code

//#define DEV

using System.Buffers;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;

#if !DEBUG
using BenchmarkDotNet.Running;
#endif

// Print the runtime and the platform identifier the program runs on, followed by a blank line.
Console.WriteLine(RuntimeInformation.FrameworkDescription);
Console.WriteLine(RuntimeInformation.RuntimeIdentifier);
Console.WriteLine();

// Sanity check: run every variant once and print its result.
// NOTE: Print derives its label from the literal text of the argument expression
// (CallerArgumentExpression) and strips the "bench." prefix, so the local's name matters here.
Bench bench = new();
Print(bench.Current());
Print(bench.IndexOf());
Print(bench.VectorSpecialized());

// The BenchmarkDotNet run itself is only compiled when DEBUG is not defined.
#if !DEBUG
BenchmarkRunner.Run<Bench>();
#endif

// Writes one result line: the caller's argument expression with "bench.", "(" and ")" removed,
// left-aligned in a 25 character wide column, followed by the version in angle brackets.
static void Print(string version, [CallerArgumentExpression(nameof(version))] string? argument = null)
{
    string label = argument!
        .Replace("bench.", "")
        .Replace("(", "")
        .Replace(")", "");

    Console.WriteLine($"{label,-25}<{version}>");
}

[MemoryDiagnoser]
#if DEV
[ShortRunJob]
[DisassemblyDiagnoser]
#endif
public class Bench
{
    /// <summary>
    /// The input handed to every benchmarked extraction variant.
    /// </summary>
    /// <remarks>
    /// The parameter values cover three shapes of input: a version followed by further text,
    /// a short input that starts with the version, and an input that contains no digit at all.
    /// The initializer equals the first parameter value, so the sanity-check calls made outside
    /// of BenchmarkDotNet operate on that input.
    /// </remarks>
    [Params(
        "/90.0.4430.212 Safari/537.36 OPR/76.0.4017.107",
        "11.0) like Gecko",
        "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
    )]
    public string TrimmedUserAgent { get; set; } = "/90.0.4430.212 Safari/537.36 OPR/76.0.4017.107";

#if !DEV
    [Benchmark(Baseline = true)]
#endif
    // Scalar reference variant: returns the extracted version text, or "fail" when
    // TryExtractVersion reports that no version was found.
    public string Current()
    {
        string userAgent = this.TrimmedUserAgent;

        return TryExtractVersion(userAgent, out Range versionRange)
            ? userAgent[versionRange]
            : "fail";
    }

#if !DEV
    [Benchmark]
#endif
    // Span-helper variant: returns the extracted version text, or "fail" when
    // TryExtractVersion_IndexOf reports that no version was found.
    public string IndexOf()
    {
        string input = this.TrimmedUserAgent;

        if (!TryExtractVersion_IndexOf(input, out Range found))
        {
            return "fail";
        }

        return input[found];
    }

    /// <summary>
    /// Hand-vectorized variant: returns the extracted version text, or <c>"fail"</c> when
    /// <c>TryExtractVersion_VectorSpecialized</c> reports that no version was found.
    /// </summary>
    [Benchmark]
    public string VectorSpecialized()
        => TryExtractVersion_VectorSpecialized(this.TrimmedUserAgent, out Range span)
            ? this.TrimmedUserAgent[span]
            : "fail";

    /// <summary>
    /// Scalar reference implementation: finds the first digit within the first 128 characters of
    /// <paramref name="haystack"/> and extends the match over all directly following digits and dots.
    /// </summary>
    /// <param name="haystack">The text to scan.</param>
    /// <param name="range">
    /// On success the half-open range (exclusive end) of the version within <paramref name="haystack"/>;
    /// otherwise <c>default</c>.
    /// </param>
    /// <returns><c>true</c> when a digit was found inside the search window; otherwise <c>false</c>.</returns>
    private static bool TryExtractVersion(ReadOnlySpan<char> haystack, out Range range)
    {
        // Only the leading part of the input is inspected.
        const int MaxScanLength = 128;

        ReadOnlySpan<char> window = haystack.Length > MaxScanLength
            ? haystack[..MaxScanLength]
            : haystack;

        // Skip everything in front of the first digit.
        int first = 0;
        while (first < window.Length && !IsDigit(window[first]))
        {
            first++;
        }

        if (first == window.Length)
        {
            // No digit inside the window => no version.
            range = default;
            return false;
        }

        // Extend over the digits and dots that follow the first digit.
        int last = first + 1;
        while (last < window.Length && (IsDigit(window[last]) || window[last] == '.'))
        {
            last++;
        }

        range = new Range(first, last);
        return true;

        static bool IsDigit(char c) => c is >= '0' and <= '9';
    }

    // The characters that may continue a version after its first digit.
    private static readonly SearchValues<char> s_versionEnd = SearchValues.Create("0123456789.");

    /// <summary>
    /// Variant built on the span search helpers: finds the first digit within the first 128 characters
    /// of <paramref name="haystack"/> and extends the match over all directly following digits and dots.
    /// </summary>
    /// <param name="haystack">The text to scan.</param>
    /// <param name="range">
    /// On success the half-open range (exclusive end) of the version within <paramref name="haystack"/>;
    /// otherwise <c>default</c>.
    /// </param>
    /// <returns><c>true</c> when a digit was found inside the search window; otherwise <c>false</c>.</returns>
    private static bool TryExtractVersion_IndexOf(ReadOnlySpan<char> haystack, out Range range)
    {
        range = default;

        // Limit search window to avoid scanning entire UA string unnecessarily
        const int Window = 128;
        if (haystack.Length > Window)
        {
            haystack = haystack.Slice(0, Window);
        }

        int start = haystack.IndexOfAnyInRange('0', '9');
        if (start < 0)
        {
            // No digit found => no version
            return false;
        }

        // Count the digits and dots that directly follow the first digit.
        ReadOnlySpan<char> tail = haystack.Slice(start + 1);
        int tailLength = tail.IndexOfAnyExcept(s_versionEnd);
        if (tailLength < 0)
        {
            // No terminating character: the version runs to the end of the window.
            // This is still a match, exactly as in the scalar implementation.
            tailLength = tail.Length;
        }

        range = new Range(start, start + 1 + tailLength);
        return true;
    }

    /// <summary>
    /// Optimistically vectorized variant: inspects only the first 32 (Vector256) or 16 (Vector128)
    /// characters with SIMD and falls back to a scalar scan when that block does not contain both
    /// the first digit and the character that terminates the version.
    /// </summary>
    /// <param name="haystack">The text to scan.</param>
    /// <param name="range">
    /// On success the half-open range (exclusive end) of the version within <paramref name="haystack"/>;
    /// otherwise <c>default</c>.
    /// </param>
    /// <returns><c>true</c> when a version was found; otherwise <c>false</c>.</returns>
    private static bool TryExtractVersion_VectorSpecialized(ReadOnlySpan<char> haystack, out Range range)
    {
        range = default;

        if (Vector256.IsHardwareAccelerated && haystack.Length >= 2 * Vector256<short>.Count)
        {
            ref char ptr = ref MemoryMarshal.GetReference(haystack);

            // Two char vectors packed into one byte vector, i.e. the first 32 characters.
            Vector256<byte> vec = ptr.ReadVector256AsBytes(0);

            // (b - '0') < 10 in unsigned arithmetic is true exactly for '0'..'9'.
            Vector256<byte> between0and9 = Vector256.LessThan(vec - Vector256.Create((byte)'0'), Vector256.Create((byte)('9' - '0' + 1)));

            if (between0and9 == Vector256<byte>.Zero)
            {
                goto Scalar;
            }

            uint bitMask = between0and9.ExtractMostSignificantBits();
            int start = (int)uint.TrailingZeroCount(bitMask);
            Debug.Assert(start is >= 0 and < 32);

            // One bit per character that is neither a digit nor a dot; the bits in front of
            // the first digit are shifted out so that only characters after it are considered.
            Vector256<byte> versionChars = between0and9 | Vector256.Equals(vec, Vector256.Create((byte)'.'));
            bitMask = (~versionChars).ExtractMostSignificantBits();
            bitMask >>= start;

            if (bitMask == 0)
            {
                // No terminator after the first digit inside this block: the version may continue
                // beyond it (or end with the input), so let the scalar path determine the end.
                // Without this check TrailingZeroCount(0) == 32 would produce a bogus end index.
                goto Scalar;
            }

            int end = start + (int)uint.TrailingZeroCount(bitMask);
            Debug.Assert(end > start && end < 32);

            range = new Range(start, end);
            return true;
        }
        else if (Vector128.IsHardwareAccelerated && haystack.Length >= 2 * Vector128<short>.Count)
        {
            ref char ptr = ref MemoryMarshal.GetReference(haystack);

            // Two char vectors packed into one byte vector, i.e. the first 16 characters.
            Vector128<byte> vec = ptr.ReadVector128AsBytes(0);

            // (b - '0') < 10 in unsigned arithmetic is true exactly for '0'..'9'.
            Vector128<byte> between0and9 = Vector128.LessThan(vec - Vector128.Create((byte)'0'), Vector128.Create((byte)('9' - '0' + 1)));

            if (between0and9 == Vector128<byte>.Zero)
            {
                goto Scalar;
            }

            uint bitMask = between0and9.ExtractMostSignificantBits();
            int start = (int)uint.TrailingZeroCount(bitMask);
            Debug.Assert(start is >= 0 and < 16);

            // One bit per character that is neither a digit nor a dot; the bits in front of
            // the first digit are shifted out so that only characters after it are considered.
            Vector128<byte> versionChars = between0and9 | Vector128.Equals(vec, Vector128.Create((byte)'.'));
            bitMask = (~versionChars).ExtractMostSignificantBits();
            bitMask >>= start;

            if (bitMask == 0)
            {
                // No terminator after the first digit inside this block: the version may continue
                // beyond it (or end with the input), so let the scalar path determine the end.
                // Without this check TrailingZeroCount(0) == 32 would produce a bogus end index.
                goto Scalar;
            }

            int end = start + (int)uint.TrailingZeroCount(bitMask);
            Debug.Assert(end > start && end < 16);

            range = new Range(start, end);
            return true;
        }

    Scalar:
        {
            // Limit search window to avoid scanning entire UA string unnecessarily
            const int Window = 128;
            if (haystack.Length > Window)
            {
                haystack = haystack.Slice(0, Window);
            }

            // Find the first digit.
            int start = 0;
            while (start < haystack.Length && !char.IsBetween(haystack[start], '0', '9'))
            {
                start++;
            }

            if (start == haystack.Length)
            {
                // No digit found => no version
                return false;
            }

            // Consume the digits and dots after the first digit.
            int end = start + 1;
            while (end < haystack.Length && (char.IsBetween(haystack[end], '0', '9') || haystack[end] == '.'))
            {
                end++;
            }

            range = new Range(start, end);
            return true;
        }
    }
}

/// <summary>
/// Helpers that load two consecutive vectors of chars and pack them into a single vector of bytes.
/// </summary>
/// <remarks>
/// Every code path narrows with saturation (or clamps before narrowing), so a char outside of the
/// ASCII range can never be packed to the byte value of an ASCII digit or of '.'. A plain truncating
/// narrow would map e.g. U+0130 to 0x30 ('0') and produce false matches. The exact byte a non-ASCII
/// char is packed to differs between the code paths and must not be relied upon.
/// </remarks>
file static class VectorExtensions
{
    extension(ref char c)
    {
        /// <summary>
        /// Reads 16 chars starting <paramref name="offset"/> chars after the receiver and packs
        /// them into 16 bytes. No bounds check is done; the caller must guarantee that 16 chars are readable.
        /// </summary>
        /// <param name="offset">Offset, in chars, of the first char to read.</param>
        /// <returns>The packed bytes, in the same order as the chars.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public Vector128<byte> ReadVector128AsBytes(int offset)
        {
            ref short ptr = ref Unsafe.As<char, short>(ref c);

            Vector128<short> lower = Vector128.LoadUnsafe(ref ptr, (uint)offset);
            Vector128<short> upper = Vector128.LoadUnsafe(ref ptr, (uint)(offset + Vector128<short>.Count));

#if NET10_0_OR_GREATER
            return Vector128.NarrowWithSaturation(lower, upper).AsByte();
#else
            if (Sse2.IsSupported)
            {
                return Sse2.PackUnsignedSaturate(lower, upper);
            }
            else if (AdvSimd.IsSupported)
            {
                // Saturating narrow instead of taking the low bytes (UnzipEven), which would
                // let non-ASCII chars alias ASCII characters.
                return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(
                    AdvSimd.ExtractNarrowingSaturateUnsignedLower(lower),
                    upper);
            }
            else
            {
                // Vector128.Narrow truncates, so clamp to the byte range first.
                Vector128<ushort> byteMax = Vector128.Create((ushort)byte.MaxValue);

                return Vector128.Narrow(
                    Vector128.Min(lower.AsUInt16(), byteMax),
                    Vector128.Min(upper.AsUInt16(), byteMax));
            }
#endif
        }

        /// <summary>
        /// Reads 32 chars starting <paramref name="offset"/> chars after the receiver and packs
        /// them into 32 bytes. No bounds check is done; the caller must guarantee that 32 chars are readable.
        /// </summary>
        /// <param name="offset">Offset, in chars, of the first char to read.</param>
        /// <returns>The packed bytes, in the same order as the chars.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public Vector256<byte> ReadVector256AsBytes(int offset)
        {
            ref short ptr = ref Unsafe.As<char, short>(ref c);

            Vector256<short> lower = Vector256.LoadUnsafe(ref ptr, (uint)offset);
            Vector256<short> upper = Vector256.LoadUnsafe(ref ptr, (uint)offset + (uint)Vector256<short>.Count);

#if NET10_0_OR_GREATER
            return Vector256.NarrowWithSaturation(lower, upper).AsByte();
#else
            if (Avx2.IsSupported)
            {
                // The AVX2 pack works per 128-bit lane, so the 64-bit quarters come out in the
                // order 0, 2, 1, 3 and have to be permuted back into sequential order.
                Vector256<byte> packed = Avx2.PackUnsignedSaturate(lower, upper);

                return Avx2.Permute4x64(packed.AsInt64(), 0b_11_01_10_00).AsByte();
            }
            else
            {
                // Vector256.Narrow truncates, so clamp to the byte range first.
                Vector256<ushort> byteMax = Vector256.Create((ushort)byte.MaxValue);

                return Vector256.Narrow(
                    Vector256.Min(lower.AsUInt16(), byteMax),
                    Vector256.Min(upper.AsUInt16(), byteMax));
            }
#endif
        }
    }
}

Method	TrimmedUserAgent	Mean	Error	StdDev	Ratio	RatioSD	Gen0	Allocated	Alloc Ratio
Current	/90.0(...)7.107 [46]	23.343 ns	0.2926 ns	0.2737 ns	1.00	0.02	0.0153	48 B	1.00
IndexOf	/90.0(...)7.107 [46]	19.492 ns	0.4090 ns	0.6488 ns	0.84	0.03	0.0153	48 B	1.00
VectorSpecialized	/90.0(...)7.107 [46]	15.186 ns	0.0803 ns	0.0751 ns	0.65	0.01	0.0153	48 B	1.00

Current	11.0) like Gecko	12.587 ns	0.2726 ns	0.2550 ns	1.00	0.03	0.0102	32 B	1.00
IndexOf	11.0) like Gecko	21.743 ns	0.4945 ns	0.6769 ns	1.73	0.06	0.0102	32 B	1.00
VectorSpecialized	11.0) like Gecko	12.813 ns	0.2961 ns	0.2908 ns	1.02	0.03	0.0102	32 B	1.00

Current	aaaaa(...)aaaaa [54]	21.786 ns	0.2205 ns	0.1841 ns	1.00	0.01	-	-	NA
IndexOf	aaaaa(...)aaaaa [54]	3.507 ns	0.1247 ns	0.1166 ns	0.16	0.01	-	-	NA
VectorSpecialized	aaaaa(...)aaaaa [54]	24.644 ns	0.1723 ns	0.1612 ns	1.13	0.01	-	-	NA

The cases where a version is found profit from vectorization (in the second benchmark the difference is just a fraction of a nanosecond), but when no version is found the vectorized approach is slower -- here the IndexOf-variant shines.

Overall I'm not really sure whether the added code complexity is worth it to add this to the codebase.
I see more potential in finding browser, bots, etc. by a better approach. ATM we loop over the known sets, but I think there's some better approach that isn't $O(n)$ and this will give more perf (maybe by some kind of tree structure). I'll think about this later, leaving the decision to take this PR open to @BenjaminAbt 😉.

gfoidl · 2025-09-06T20:06:01Z

@BenjaminAbt did we add tests for the faulty user agent that caused trouble the last time? I don't remember...

gfoidl · 2025-09-06T20:19:33Z

src/HttpUserAgentParser/HttpUserAgentParser.cs

-        // Find first digit
-        int start = -1;
-        for (int i = 0; i < haystack.Length; i++)
+        if (Vector256.IsHardwareAccelerated && haystack.Length >= 2 * Vector256<short>.Count)


I'm pretty sure the implementation is correct, but should we add some kind of toggle (env variable) with which vectorization can be disabled in case there's a bug in the code (i.e. from some strange user agent that we don't know at the moment), so a user could still use the lib w/o the need to wait for hotfix release?

I don't think the environment variable is visible to users. It's kind of hidden black magic and users will remove the library without checking. For me it feels the IndexOf way is the most stable and a very fast solution for now, no?

IndexOf is quite good, but for the 11.0) like Gecko benchmark it's +75% slower and such short versions are quite common.

The specialized vectorized code is nice, but it's a lot of hard-to-maintain code, so actually I don't like this approach very much.

I'd like to keep this PR open for a moment, so I'll be able to explore other approaches for speed-up too.
The easiest one is to shorten the "window" in

HttpUserAgentParser/src/HttpUserAgentParser/HttpUserAgentParser.cs

Lines 210 to 214 in 4a82130

const int Window = 128;

if (haystack.Length > Window)

{

haystack = haystack.Slice(0, Window);

}

, so that a) vectorization of IndexOf can still be used and b) it's no longer than the longest assumed version.

Further the biggest speed-up may come from a not linear scan through all possible patterns.

Further the biggest speed-up may come from a not linear scan through all possible patterns.

In my head I'm ready with such an approach, but I need to finish a work-project, then I'll prototype it and see how it goes.

Feel free, you have my complete confidence; everything looks impressively good!

The first try with that idea is nice, as the user agent only needs to be scanned once ($O(n)$) and not for every possibility.
But for some browsers, where we depend on the order in the arrays, the wrong result is yielded. So I have to research a bit more.

Maybe some things will also move around (w/o API-breaking changes), so I'd like to leave this PR open so that everything can be done together, or only parts of it.

I hope to continue on this in the coming weeks.

gfoidl added 2 commits September 6, 2025 21:45

Vectorized HttpUserAgentParser.TryExtractVersion

232c6cf

Added a comment

57bfa71

gfoidl requested a review from BenjaminAbt September 6, 2025 20:04

gfoidl commented Sep 6, 2025

View reviewed changes

BenjaminAbt approved these changes Sep 14, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Vectorized HttpUserAgentParser.TryExtractVersion #79

Vectorized HttpUserAgentParser.TryExtractVersion #79

Uh oh!

gfoidl commented Sep 6, 2025

Uh oh!

gfoidl commented Sep 6, 2025

Uh oh!

gfoidl Sep 6, 2025

Uh oh!

BenjaminAbt Sep 7, 2025

Uh oh!

gfoidl Sep 7, 2025

Uh oh!

gfoidl Sep 9, 2025

Uh oh!

BenjaminAbt Sep 14, 2025

Uh oh!

gfoidl Sep 14, 2025

Uh oh!

Uh oh!

	// Vectorization is used in an optimistic way and specialized to common (trimmed down) user agents.
	// When the first two char-vectors don't yield any success, we fall back to the scalar path.
	// This penalizes not-found versions, but has an advantage for found versions.
	// Vector512 is left out, because there are no common inputs with length 128 or more.
	//
	// Two short (same size as char) vectors are read, then packed to byte vectors on which the
	// operation is done. For short / char the higher byte is not of interest and zero or outside
	// the target characters, thus with bytes we can process twice as much elements at once.

	const int Window = 128;
	if (haystack.Length > Window)
	{
	haystack = haystack.Slice(0, Window);
	}

Vectorized HttpUserAgentParser.TryExtractVersion #79

Are you sure you want to change the base?

Vectorized HttpUserAgentParser.TryExtractVersion #79

Uh oh!

Conversation

gfoidl commented Sep 6, 2025

Uh oh!

gfoidl commented Sep 6, 2025

Uh oh!

gfoidl Sep 6, 2025

Choose a reason for hiding this comment

Uh oh!

BenjaminAbt Sep 7, 2025

Choose a reason for hiding this comment

Uh oh!

gfoidl Sep 7, 2025

Choose a reason for hiding this comment

Uh oh!

gfoidl Sep 9, 2025

Choose a reason for hiding this comment

Uh oh!

BenjaminAbt Sep 14, 2025

Choose a reason for hiding this comment

Uh oh!

gfoidl Sep 14, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!