From d40152414d3849c3bf943ed05298edbd4ffa43f0 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 23:21:09 +0800 Subject: [PATCH 01/14] refactor: runtime detect simd features --- Cargo.lock | 1 - Cargo.toml | 1 - src/lib.rs | 429 +++++++++--- src/simd/README.md | 11 + src/simd/avx2.rs | 100 +++ src/simd/avx512.rs | 92 +++ src/simd/bits.rs | 105 +++ src/simd/mod.rs | 16 + src/simd/neon.rs | 102 +++ src/simd/sse2.rs | 97 +++ src/simd/traits.rs | 71 ++ src/simd/v128.rs | 113 ++++ test.rs | 1614 ++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 2663 insertions(+), 89 deletions(-) create mode 100644 src/simd/README.md create mode 100644 src/simd/avx2.rs create mode 100644 src/simd/avx512.rs create mode 100644 src/simd/bits.rs create mode 100644 src/simd/mod.rs create mode 100644 src/simd/neon.rs create mode 100644 src/simd/sse2.rs create mode 100644 src/simd/traits.rs create mode 100644 src/simd/v128.rs create mode 100644 test.rs diff --git a/Cargo.lock b/Cargo.lock index fb0b6d0..6c22aed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -322,7 +322,6 @@ dependencies = [ "serde", "serde_json", "sonic-rs", - "sonic-simd", "v_jsonescape", ] diff --git a/Cargo.toml b/Cargo.toml index cda204f..db691f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,6 @@ name = "escape" harness = false [dependencies] -sonic-simd = "0.1" [dev-dependencies] criterion2 = "3" diff --git a/src/lib.rs b/src/lib.rs index e0114ec..543a488 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,13 +2,13 @@ //! //! Only takes the string escaping part to avoid the abstraction overhead. +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +use std::arch::is_x86_feature_detected; use std::slice::from_raw_parts; -#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] -use sonic_simd::u8x32; -use sonic_simd::{BitMask, Mask, Simd}; -#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] -use sonic_simd::{bits::NeonBits, u8x16}; +use simd::{BitMask, Mask, Simd}; + +mod simd; #[inline(always)] unsafe fn load(ptr: *const u8) -> V { @@ -292,6 +292,10 @@ const NEED_ESCAPED: [u8; 256] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +static COMPUTE_LANES: std::sync::Once = std::sync::Once::new(); +static mut LANES: usize = 16; + // only check the src length. 
#[inline(always)] unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { @@ -323,37 +327,137 @@ fn check_cross_page(ptr: *const u8, step: usize) -> bool { ((ptr as usize & (page_size - 1)) + step) > page_size } +#[inline] +fn escaped_mask_generic(v: simd::v128::Simd128u) -> u16 { + use simd::v128::Simd128u as u8x16; + + let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[cfg(target_arch = "aarch64")] +#[inline] +fn escaped_mask_neon(v: simd::neon::Simd128u) -> simd::bits::NeonBits { + use simd::neon::Simd128u as u8x16; + + let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline] +fn escaped_mask_sse2(v: simd::sse2::Simd128u) -> u16 { + use simd::sse2::Simd128u as u8x16; + + let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline] +fn escaped_mask_avx2(v: simd::avx2::Simd256u) -> u32 { + use simd::avx2::Simd256u as u8x32; + + let x1f = u8x32::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x32::splat(b'\\'); + let quote = u8x32::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline] +fn escaped_mask_avx512(v: simd::avx512::Simd512u) -> u64 { + use simd::avx512::Simd512u as u8x64; + + let x1f = u8x64::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x64::splat(b'\\'); + let quote = u8x64::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +macro_rules! escape { + ($mask:expr, $nb:expr, $dptr:expr, $sptr:expr) => { + if $mask.all_zero() { + $nb -= LANES; + $dptr = $dptr.add(LANES); + $sptr = $sptr.add(LANES); + } else { + let cn = $mask.first_offset(); + $nb -= cn; + $dptr = $dptr.add(cn); + $sptr = $sptr.add(cn); + escape_unchecked(&mut $sptr, &mut $nb, &mut $dptr); + } + }; +} + +macro_rules! 
load_v { + ($placeholder:expr, $sptr:expr, $nb:expr) => {{ + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); + load($placeholder[..].as_ptr()) + } + #[cfg(any(target_os = "linux", target_os = "macos"))] + { + if check_cross_page($sptr, LANES) { + std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); + load($placeholder[..].as_ptr()) + } else { + #[cfg(not(debug_assertions))] + { + // disable memory sanitizer here + load($sptr) + } + #[cfg(debug_assertions)] + { + std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); + load($placeholder[..].as_ptr()) + } + } + } + }}; +} + #[inline(always)] fn format_string(value: &str, dst: &mut [u8]) -> usize { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - let mut v: u8x16; - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - let mut v: u8x32; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - const LANES: usize = 16; - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - const LANES: usize = 32; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - #[inline] - fn escaped_mask(v: u8x16) -> NeonBits { - let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() - } - - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - #[inline] - fn escaped_mask(v: u8x32) -> u32 { - let x1f = u8x32::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x32::splat(b'\\'); - let quote = u8x32::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() - } + #[cfg(target_arch = "aarch64")] + let mut v_neon: simd::neon::Simd128u; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let mut v_sse2: simd::sse2::Simd128u; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let mut v_avx2: simd::avx2::Simd256u; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let mut v_avx512: simd::avx512::Simd512u; + + let mut v_generic: simd::v128::Simd128u; + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + COMPUTE_LANES.call_once(|| { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512f") { + unsafe { + LANES = simd::avx512::Simd512u::LANES; + } + } else if is_x86_feature_detected!("avx2") { + unsafe { + LANES = simd::avx2::Simd256u::LANES; + } + } + } + }); unsafe { let slice = value.as_bytes(); @@ -365,66 +469,214 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { *dptr = b'"'; dptr = dptr.add(1); while nb >= LANES { - v = load(sptr); - v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); - let mask = escaped_mask(v); - if mask.all_zero() { - nb -= LANES; - dptr = dptr.add(LANES); - sptr = sptr.add(LANES); - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + if cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon") { + v_neon = load(sptr); + v_neon.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_neon(v_neon); + escape!(mask, nb, dptr, sptr); + } else { + v_generic = load(sptr); + v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + 
dptr, LANES, + )); + let mask = escaped_mask_generic(v_generic); + escape!(mask, nb, dptr, sptr); + } + } + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + if is_x86_feature_detected!("avx512f") { + v_avx512 = load(sptr); + v_avx512.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_avx512(v_avx512); + escape!(mask, nb, dptr, sptr); + } else if is_x86_feature_detected!("avx2") { + v_avx2 = load(sptr); + v_avx2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_avx2(v_avx2); + escape!(mask, nb, dptr, sptr); + } else if is_x86_feature_detected!("sse2") { + v_sse2 = load(sptr); + v_sse2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_sse2(v_sse2); + escape!(mask, nb, dptr, sptr); + } else { + v_generic = load(sptr); + v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_generic(v_generic); + escape!(mask, nb, dptr, sptr); + } } } - // Scratch buffer reused for mask materialisation; stay uninitialised. - #[cfg(not(miri))] - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); - #[cfg(miri)] - let mut placeholder: [u8; LANES] = [0; LANES]; - while nb > 0 { - v = { - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); - load(placeholder[..].as_ptr()) + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + if cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon") { + const LANES: usize = simd::neon::Simd128u::LANES; + // Scratch buffer reused for mask materialisation; stay uninitialised. + #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v_neon = load_v!(placeholder, sptr, nb); + v_neon.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_neon(v_neon).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } } - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - if check_cross_page(sptr, LANES) { - std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); - load(placeholder[..].as_ptr()) + } else { + const LANES: usize = simd::v128::Simd128u::LANES; + // Scratch buffer reused for mask materialisation; stay uninitialised. 
+ #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v_generic = load_v!(placeholder, sptr, nb); + v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_generic(v_generic).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; } else { - #[cfg(not(debug_assertions))] - { - // disable memory sanitizer here - load(sptr) - } - #[cfg(debug_assertions)] - { - std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); - load(placeholder[..].as_ptr()) - } + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + } + } + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + if is_x86_feature_detected!("avx512f") { + const LANES: usize = simd::avx512::Simd512u::LANES; + // Scratch buffer reused for mask materialisation; stay uninitialised. + #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v_avx512 = load_v!(placeholder, sptr, nb); + v_avx512.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_avx512(v_avx512).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + } else if is_x86_feature_detected!("avx2") { + const LANES: usize = simd::avx2::Simd256u::LANES; + // Scratch buffer reused for mask materialisation; stay uninitialised. + #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v_avx2 = load_v!(placeholder, sptr, nb); + v_avx2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_avx2(v_avx2).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + } else if is_x86_feature_detected!("sse2") { + const LANES: usize = simd::sse2::Simd128u::LANES; + // Scratch buffer reused for mask materialisation; stay uninitialised. 
+ #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v_sse2 = load_v!(placeholder, sptr, nb); + v_sse2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_sse2(v_sse2).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); } } - }; - v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); - - let mask = escaped_mask(v).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); + const LANES: usize = simd::v128::Simd128u::LANES; + // Scratch buffer reused for mask materialisation; stay uninitialised. + #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v_generic = load_v!(placeholder, sptr, nb); + v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( + dptr, LANES, + )); + let mask = escaped_mask_generic(v_generic).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } } } *dptr = b'"'; @@ -436,7 +688,10 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { pub fn escape(value: &str) -> String { let capacity = value.len() * 6 + 32 + 3; let mut buf = Vec::with_capacity(capacity); - unsafe { buf.set_len(capacity) }; + #[allow(clippy::uninit_vec)] + unsafe { + buf.set_len(capacity) + }; let cnt = format_string(value, &mut buf); unsafe { buf.set_len(cnt) }; unsafe { String::from_utf8_unchecked(buf) } diff --git a/src/simd/README.md b/src/simd/README.md new file mode 100644 index 0000000..aa1b464 --- /dev/null +++ b/src/simd/README.md @@ -0,0 +1,11 @@ +# sonic_simd + +Borrowed from https://github.com/cloudwego/sonic-rs. +With the runtime SIMD features detection rather than compile-time detection. + +A portable SIMD library that provides low-level APIs for x86, ARM. Other platforms will use the fallback scalar implementation. + +TODO: + +1. support RISC-V. +2. support wasm. 
\ No newline at end of file diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs new file mode 100644 index 0000000..60cde12 --- /dev/null +++ b/src/simd/avx2.rs @@ -0,0 +1,100 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use std::ops::{BitAnd, BitOr, BitOrAssign}; + +use super::{Mask, Simd}; + +#[derive(Debug)] +#[repr(transparent)] +pub struct Simd256u(__m256i); + +#[derive(Debug)] +#[repr(transparent)] +pub struct Mask256(__m256i); + +impl Mask for Mask256 { + type BitMask = u32; + type Element = u8; + + #[inline(always)] + fn bitmask(self) -> Self::BitMask { + unsafe { _mm256_movemask_epi8(self.0) as u32 } + } + + #[inline(always)] + fn splat(b: bool) -> Self { + let v: i8 = if b { -1 } else { 0 }; + unsafe { Mask256(_mm256_set1_epi8(v)) } + } +} + +impl BitAnd for Mask256 { + type Output = Self; + + #[inline(always)] + fn bitand(self, rhs: Mask256) -> Self::Output { + unsafe { Mask256(_mm256_and_si256(self.0, rhs.0)) } + } +} + +impl BitOr for Mask256 { + type Output = Self; + + #[inline(always)] + fn bitor(self, rhs: Mask256) -> Self::Output { + unsafe { Mask256(_mm256_or_si256(self.0, rhs.0)) } + } +} + +impl BitOrAssign for Mask256 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: Mask256) { + unsafe { self.0 = _mm256_or_si256(self.0, rhs.0) } + } +} + +impl Simd for Simd256u { + const LANES: usize = 32; + type Mask = Mask256; + type Element = u8; + + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + unsafe { Simd256u(_mm256_loadu_si256(ptr as *const __m256i)) } + } + + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { _mm256_storeu_si256(ptr as *mut __m256i, self.0) } + } + + #[inline(always)] + fn eq(&self, rhs: &Self) -> Self::Mask { + unsafe { + let eq = _mm256_cmpeq_epi8(self.0, rhs.0); + Mask256(eq) + } + } + + #[inline(always)] + fn splat(ch: u8) -> Self { + unsafe { Simd256u(_mm256_set1_epi8(ch as i8)) } + } + + #[inline(always)] + fn le(&self, rhs: &Self) -> Self::Mask { + unsafe { + let max = _mm256_max_epu8(self.0, rhs.0); + let eq = _mm256_cmpeq_epi8(max, rhs.0); + Mask256(eq) + } + } + + #[inline(always)] + fn gt(&self, _rhs: &Self) -> Self::Mask { + todo!() + } +} diff --git a/src/simd/avx512.rs b/src/simd/avx512.rs new file mode 100644 index 0000000..e798044 --- /dev/null +++ b/src/simd/avx512.rs @@ -0,0 +1,92 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use std::ops::{BitAnd, BitOr, BitOrAssign}; + +use super::{Mask, Simd}; + +#[derive(Debug)] +#[repr(transparent)] +pub struct Simd512u(__m512i); + +#[derive(Debug, Clone, Copy)] +#[repr(transparent)] +pub struct Mask512(__mmask64); + +impl Mask for Mask512 { + type BitMask = u64; + type Element = u8; + + #[inline(always)] + fn bitmask(self) -> Self::BitMask { + self.0 + } + + #[inline(always)] + fn splat(b: bool) -> Self { + if b { Mask512(u64::MAX) } else { Mask512(0) } + } +} + +impl BitOr for Mask512 { + type Output = Self; + + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + Mask512(self.0 | rhs.0) + } +} + +impl BitOrAssign for Mask512 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + self.0 |= rhs.0; + } +} + +impl BitAnd for Mask512 { + type Output = Self; + + #[inline(always)] + fn bitand(self, rhs: Mask512) -> Self::Output { + Mask512(self.0 & rhs.0) + } +} + +impl Simd for Simd512u { + const LANES: usize = 64; + type Element = u8; + type Mask = Mask512; + + #[inline(always)] + unsafe fn 
loadu(ptr: *const u8) -> Self { + unsafe { Simd512u(_mm512_loadu_si512(ptr as *const __m512i)) } + } + + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { _mm512_storeu_si512(ptr as *mut __m512i, self.0) } + } + + #[inline(always)] + fn eq(&self, rhs: &Self) -> Self::Mask { + unsafe { Mask512(_mm512_cmpeq_epi8_mask(self.0, rhs.0)) } + } + + #[inline(always)] + fn splat(ch: u8) -> Self { + unsafe { Simd512u(_mm512_set1_epi8(ch as i8)) } + } + + #[inline(always)] + fn le(&self, rhs: &Self) -> Self::Mask { + unsafe { Mask512(_mm512_cmple_epu8_mask(self.0, rhs.0)) } + } + + #[inline(always)] + fn gt(&self, rhs: &Self) -> Self::Mask { + unsafe { Mask512(_mm512_cmpgt_epu8_mask(self.0, rhs.0)) } + } +} diff --git a/src/simd/bits.rs b/src/simd/bits.rs new file mode 100644 index 0000000..3bdb694 --- /dev/null +++ b/src/simd/bits.rs @@ -0,0 +1,105 @@ +use super::traits::BitMask; + +macro_rules! impl_bits { + () => {}; + ($($ty:ty)*) => { + $( + impl BitMask for $ty { + const LEN: usize = std::mem::size_of::<$ty>() * 8; + + #[inline] + fn before(&self, rhs: &Self) -> bool { + (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 + } + + #[inline] + fn first_offset(&self) -> usize { + self.as_little_endian().trailing_zeros() as usize + } + + #[inline] + fn as_little_endian(&self) -> Self { + #[cfg(target_endian = "little")] + { + self.clone() + } + #[cfg(target_endian = "big")] + { + self.swap_bytes() + } + } + + #[inline] + fn all_zero(&self) -> bool { + *self == 0 + } + + #[inline] + fn clear_high_bits(&self, n: usize) -> Self { + debug_assert!(n <= Self::LEN); + *self & ((u64::MAX as $ty) >> n) + } + } + )* + }; +} + +impl_bits!(u16 u32 u64); + +#[cfg(target_arch = "aarch64")] +/// Use u64 representation the bitmask of Neon vector. 
+/// (low) +/// Vector: 00-ff-ff-ff-ff-00-00-00 +/// Mask : 0000-1111-1111-1111-1111-0000-0000-0000 +/// +/// first_offset() = 1 +/// clear_high_bits(4) = Mask(0000-1111-1111-1111-[0000]-0000-0000-0000) +/// +/// reference: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon +pub struct NeonBits(u64); + +#[cfg(target_arch = "aarch64")] +impl NeonBits { + #[inline] + pub fn new(u: u64) -> Self { + Self(u) + } +} + +#[cfg(target_arch = "aarch64")] +impl BitMask for NeonBits { + const LEN: usize = 16; + + #[inline] + fn first_offset(&self) -> usize { + (self.as_little_endian().0.trailing_zeros() as usize) >> 2 + } + + #[inline] + fn before(&self, rhs: &Self) -> bool { + (self.as_little_endian().0 & rhs.as_little_endian().0.wrapping_sub(1)) != 0 + } + + #[inline] + fn as_little_endian(&self) -> Self { + #[cfg(target_endian = "little")] + { + Self::new(self.0) + } + #[cfg(target_endian = "big")] + { + Self::new(self.0.swap_bytes()) + } + } + + #[inline] + fn all_zero(&self) -> bool { + self.0 == 0 + } + + #[inline] + fn clear_high_bits(&self, n: usize) -> Self { + debug_assert!(n <= Self::LEN); + Self(self.0 & u64::MAX >> (n * 4)) + } +} diff --git a/src/simd/mod.rs b/src/simd/mod.rs new file mode 100644 index 0000000..a4c80ff --- /dev/null +++ b/src/simd/mod.rs @@ -0,0 +1,16 @@ +#![allow(non_camel_case_types)] + +pub mod bits; +mod traits; + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub(crate) mod avx2; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub(crate) mod avx512; +#[cfg(target_arch = "aarch64")] +pub(crate) mod neon; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub(crate) mod sse2; +pub(crate) mod v128; + +pub use self::traits::{BitMask, Mask, Simd}; diff --git a/src/simd/neon.rs b/src/simd/neon.rs new file mode 100644 index 0000000..cc6bb99 --- /dev/null +++ b/src/simd/neon.rs @@ -0,0 +1,102 @@ +use std::arch::aarch64::*; + +use super::{Mask, Simd, bits::NeonBits}; + +#[derive(Debug)] +#[repr(transparent)] +pub struct Simd128u(uint8x16_t); + +impl Simd for Simd128u { + const LANES: usize = 16; + type Mask = Mask128; + type Element = u8; + + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + unsafe { Self(vld1q_u8(ptr)) } + } + + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { vst1q_u8(ptr, self.0) }; + } + + #[inline(always)] + fn eq(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vceqq_u8(self.0, lhs.0)) } + } + + #[inline(always)] + fn splat(ch: u8) -> Self { + unsafe { Self(vdupq_n_u8(ch)) } + } + + // less or equal + #[inline(always)] + fn le(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vcleq_u8(self.0, lhs.0)) } + } + + // greater than + #[inline(always)] + fn gt(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vcgtq_u8(self.0, lhs.0)) } + } +} + +#[derive(Debug)] +#[repr(transparent)] +pub struct Mask128(pub(crate) uint8x16_t); + +impl Mask for Mask128 { + type BitMask = NeonBits; + type Element = u8; + + /// Convert Mask Vector 0x00-ff-ff to Bits 0b0000-1111-1111 + /// Reference: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + #[inline(always)] + fn bitmask(self) -> Self::BitMask { + unsafe { + let v16 = vreinterpretq_u16_u8(self.0); + let sr4 = vshrn_n_u16(v16, 4); + let v64 = vreinterpret_u64_u8(sr4); + NeonBits::new(vget_lane_u64(v64, 0)) + } + } + + #[inline(always)] + fn splat(b: bool) -> Self { + let 
v: i8 = if b { -1 } else { 0 }; + unsafe { Self(vdupq_n_u8(v as u8)) } + } +} + +// Bitwise AND for Mask128 +impl std::ops::BitAnd for Mask128 { + type Output = Self; + + #[inline(always)] + fn bitand(self, rhs: Mask128) -> Self::Output { + unsafe { Self(vandq_u8(self.0, rhs.0)) } + } +} + +// Bitwise OR for Mask128 +impl std::ops::BitOr for Mask128 { + type Output = Self; + + #[inline(always)] + fn bitor(self, rhs: Mask128) -> Self::Output { + unsafe { Self(vorrq_u8(self.0, rhs.0)) } + } +} + +// Bitwise OR assignment for Mask128 +impl std::ops::BitOrAssign for Mask128 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: Mask128) { + unsafe { + self.0 = vorrq_u8(self.0, rhs.0); + } + } +} diff --git a/src/simd/sse2.rs b/src/simd/sse2.rs new file mode 100644 index 0000000..63a95b8 --- /dev/null +++ b/src/simd/sse2.rs @@ -0,0 +1,97 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use std::ops::{BitAnd, BitOr, BitOrAssign}; + +use super::{Mask, Simd}; + +#[derive(Debug)] +#[repr(transparent)] +pub struct Simd128u(__m128i); + +#[derive(Debug)] +#[repr(transparent)] +pub struct Mask128(__m128i); + +impl Mask for Mask128 { + type BitMask = u16; + type Element = u8; + + #[inline(always)] + fn bitmask(self) -> Self::BitMask { + unsafe { _mm_movemask_epi8(self.0) as u16 } + } + + #[inline(always)] + fn splat(b: bool) -> Self { + let v: i8 = if b { -1 } else { 0 }; + unsafe { Mask128(_mm_set1_epi8(v)) } + } +} + +impl BitAnd for Mask128 { + type Output = Self; + + #[inline(always)] + fn bitand(self, rhs: Mask128) -> Self::Output { + unsafe { Mask128(_mm_and_si128(self.0, rhs.0)) } + } +} + +impl BitOr for Mask128 { + type Output = Self; + + #[inline(always)] + fn bitor(self, rhs: Mask128) -> Self::Output { + unsafe { Mask128(_mm_or_si128(self.0, rhs.0)) } + } +} + +impl BitOrAssign for Mask128 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: Mask128) { + self.0 = unsafe { _mm_or_si128(self.0, rhs.0) }; + } +} + +impl Simd for Simd128u { + const LANES: usize = 16; + type Mask = Mask128; + type Element = u8; + + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + Simd128u(unsafe { _mm_loadu_si128(ptr as *const __m128i) }) + } + + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { _mm_storeu_si128(ptr as *mut __m128i, self.0) } + } + + #[inline(always)] + fn eq(&self, rhs: &Self) -> Self::Mask { + Mask128(unsafe { _mm_cmpeq_epi8(self.0, rhs.0) }) + } + + #[inline(always)] + fn splat(ch: u8) -> Self { + Simd128u(unsafe { _mm_set1_epi8(ch as i8) }) + } + + #[inline(always)] + fn le(&self, rhs: &Self) -> Self::Mask { + unsafe { + let max = _mm_max_epu8(self.0, rhs.0); + let eq = _mm_cmpeq_epi8(max, rhs.0); + Mask128(eq) + } + } + + #[inline(always)] + fn gt(&self, _rhs: &Self) -> Self::Mask { + todo!() + } +} diff --git a/src/simd/traits.rs b/src/simd/traits.rs new file mode 100644 index 0000000..b3f77f6 --- /dev/null +++ b/src/simd/traits.rs @@ -0,0 +1,71 @@ +use std::ops::{BitAnd, BitOr, BitOrAssign}; + +/// Portbal SIMD traits +pub trait Simd: Sized { + const LANES: usize; + + type Element; + type Mask: Mask; + + /// # Safety + unsafe fn from_slice_unaligned_unchecked(slice: &[u8]) -> Self { + debug_assert!(slice.len() >= Self::LANES); + unsafe { Self::loadu(slice.as_ptr()) } + } + + /// # Safety + unsafe fn write_to_slice_unaligned_unchecked(&self, slice: &mut [u8]) { + debug_assert!(slice.len() >= Self::LANES); + unsafe { self.storeu(slice.as_mut_ptr()) } + } + + /// # Safety + unsafe fn 
loadu(ptr: *const u8) -> Self; + + /// # Safety + unsafe fn storeu(&self, ptr: *mut u8); + + fn eq(&self, rhs: &Self) -> Self::Mask; + + fn splat(elem: Self::Element) -> Self; + + #[allow(unused)] + /// greater than + fn gt(&self, rhs: &Self) -> Self::Mask; + + /// less or equal + fn le(&self, rhs: &Self) -> Self::Mask; +} + +/// Portbal SIMD mask traits +pub trait Mask: Sized + BitOr + BitOrAssign + BitAnd { + type Element; + type BitMask: BitMask; + + fn bitmask(self) -> Self::BitMask; + + #[allow(unused)] + fn splat(b: bool) -> Self; +} + +/// Trait for the bitmask of a vector Mask. +pub trait BitMask { + /// Total bits in the bitmask. + const LEN: usize; + + /// get the offset of the first `1` bit. + fn first_offset(&self) -> usize; + + #[allow(unused)] + /// check if this bitmask is before the other bitmask. + fn before(&self, rhs: &Self) -> bool; + + /// convert bitmask as little endian + fn as_little_endian(&self) -> Self; + + /// whether all bits are zero. + fn all_zero(&self) -> bool; + + /// clear high n bits. + fn clear_high_bits(&self, n: usize) -> Self; +} diff --git a/src/simd/v128.rs b/src/simd/v128.rs new file mode 100644 index 0000000..448dece --- /dev/null +++ b/src/simd/v128.rs @@ -0,0 +1,113 @@ +use std::ops::{BitAnd, BitOr, BitOrAssign}; + +use super::{Mask, Simd}; + +#[derive(Debug)] +pub struct Simd128u([u8; 16]); + +#[derive(Debug)] +pub struct Mask128(pub(crate) [u8; 16]); + +impl Simd for Simd128u { + type Element = u8; + const LANES: usize = 16; + type Mask = Mask128; + + unsafe fn loadu(ptr: *const u8) -> Self { + let v = unsafe { std::slice::from_raw_parts(ptr, Self::LANES) }; + let mut res = [0u8; 16]; + res.copy_from_slice(v); + Self(res) + } + + unsafe fn storeu(&self, ptr: *mut u8) { + let data = &self.0; + unsafe { std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, Self::LANES) }; + } + + fn eq(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for (i, item) in mask.iter_mut().enumerate().take(Self::LANES) { + *item = if self.0[i] == rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + + fn splat(value: u8) -> Self { + Self([value; Self::LANES]) + } + + fn le(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] <= rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + + fn gt(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] > rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } +} + +impl Mask for Mask128 { + type BitMask = u16; + type Element = u8; + + fn bitmask(self) -> Self::BitMask { + #[cfg(target_endian = "little")] + { + self.0 + .iter() + .enumerate() + .fold(0, |acc, (i, &b)| acc | ((b as u16) << i)) + } + #[cfg(target_endian = "big")] + { + self.0 + .iter() + .enumerate() + .fold(0, |acc, (i, &b)| acc | ((b as u16) << (15 - i))) + } + } + + fn splat(b: bool) -> Self { + Mask128([b as u8; 16]) + } +} + +impl BitAnd for Mask128 { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self::Output { + let mut result = [0u8; 16]; + for i in 0..16 { + result[i] = self.0[i] & rhs.0[i]; + } + Mask128(result) + } +} + +impl BitOr for Mask128 { + type Output = Self; + + fn bitor(self, rhs: Self) -> Self::Output { + let mut result = [0u8; 16]; + for i in 0..16 { + result[i] = self.0[i] | rhs.0[i]; + } + Mask128(result) + } +} + +impl BitOrAssign for Mask128 { + fn bitor_assign(&mut self, rhs: Self) { + for i in 0..16 { + self.0[i] |= rhs.0[i]; + } + } +} diff --git a/test.rs b/test.rs new file mode 100644 index 
0000000..b159465 --- /dev/null +++ b/test.rs @@ -0,0 +1,1614 @@ +#![feature(prelude_import)] +//! Borrowed from +//! +//! Only takes the string escaping part to avoid the abstraction overhead. +#[macro_use] +extern crate std; +#[prelude_import] +use std::prelude::rust_2024::*; +use std::slice::from_raw_parts; +use simd::{BitMask, Mask, Simd}; +mod simd { + #![allow(non_camel_case_types)] + pub mod bits { + use super::traits::BitMask; + impl BitMask for u16 { + const LEN: usize = std::mem::size_of::() * 8; + #[inline] + fn before(&self, rhs: &Self) -> bool { + (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 + } + #[inline] + fn first_offset(&self) -> usize { + self.as_little_endian().trailing_zeros() as usize + } + #[inline] + fn as_little_endian(&self) -> Self { + { self.clone() } + } + #[inline] + fn all_zero(&self) -> bool { + *self == 0 + } + #[inline] + fn clear_high_bits(&self, n: usize) -> Self { + if true { + if !(n <= Self::LEN) { + ::core::panicking::panic("assertion failed: n <= Self::LEN") + } + } + *self & ((u64::MAX as u16) >> n) + } + } + impl BitMask for u32 { + const LEN: usize = std::mem::size_of::() * 8; + #[inline] + fn before(&self, rhs: &Self) -> bool { + (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 + } + #[inline] + fn first_offset(&self) -> usize { + self.as_little_endian().trailing_zeros() as usize + } + #[inline] + fn as_little_endian(&self) -> Self { + { self.clone() } + } + #[inline] + fn all_zero(&self) -> bool { + *self == 0 + } + #[inline] + fn clear_high_bits(&self, n: usize) -> Self { + if true { + if !(n <= Self::LEN) { + ::core::panicking::panic("assertion failed: n <= Self::LEN") + } + } + *self & ((u64::MAX as u32) >> n) + } + } + impl BitMask for u64 { + const LEN: usize = std::mem::size_of::() * 8; + #[inline] + fn before(&self, rhs: &Self) -> bool { + (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 + } + #[inline] + fn first_offset(&self) -> usize { + self.as_little_endian().trailing_zeros() as usize + } + #[inline] + fn as_little_endian(&self) -> Self { + { self.clone() } + } + #[inline] + fn all_zero(&self) -> bool { + *self == 0 + } + #[inline] + fn clear_high_bits(&self, n: usize) -> Self { + if true { + if !(n <= Self::LEN) { + ::core::panicking::panic("assertion failed: n <= Self::LEN") + } + } + *self & ((u64::MAX as u64) >> n) + } + } + /// Use u64 representation the bitmask of Neon vector. 
+ /// (low) + /// Vector: 00-ff-ff-ff-ff-00-00-00 + /// Mask : 0000-1111-1111-1111-1111-0000-0000-0000 + /// + /// first_offset() = 1 + /// clear_high_bits(4) = Mask(0000-1111-1111-1111-[0000]-0000-0000-0000) + /// + /// reference: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + pub struct NeonBits(u64); + impl NeonBits { + #[inline] + pub fn new(u: u64) -> Self { + Self(u) + } + } + impl BitMask for NeonBits { + const LEN: usize = 16; + #[inline] + fn first_offset(&self) -> usize { + (self.as_little_endian().0.trailing_zeros() as usize) >> 2 + } + #[inline] + fn before(&self, rhs: &Self) -> bool { + (self.as_little_endian().0 & rhs.as_little_endian().0.wrapping_sub(1)) + != 0 + } + #[inline] + fn as_little_endian(&self) -> Self { + { Self::new(self.0) } + } + #[inline] + fn all_zero(&self) -> bool { + self.0 == 0 + } + #[inline] + fn clear_high_bits(&self, n: usize) -> Self { + if true { + if !(n <= Self::LEN) { + ::core::panicking::panic("assertion failed: n <= Self::LEN") + } + } + Self(self.0 & u64::MAX >> (n * 4)) + } + } + } + mod traits { + use std::ops::{BitAnd, BitOr, BitOrAssign}; + /// Portbal SIMD traits + pub trait Simd: Sized { + const LANES: usize; + type Element; + type Mask: Mask; + /// # Safety + unsafe fn from_slice_unaligned_unchecked(slice: &[u8]) -> Self { + if true { + if !(slice.len() >= Self::LANES) { + ::core::panicking::panic( + "assertion failed: slice.len() >= Self::LANES", + ) + } + } + unsafe { Self::loadu(slice.as_ptr()) } + } + /// # Safety + unsafe fn write_to_slice_unaligned_unchecked(&self, slice: &mut [u8]) { + if true { + if !(slice.len() >= Self::LANES) { + ::core::panicking::panic( + "assertion failed: slice.len() >= Self::LANES", + ) + } + } + unsafe { self.storeu(slice.as_mut_ptr()) } + } + /// # Safety + unsafe fn loadu(ptr: *const u8) -> Self; + /// # Safety + unsafe fn storeu(&self, ptr: *mut u8); + fn eq(&self, rhs: &Self) -> Self::Mask; + fn splat(elem: Self::Element) -> Self; + #[allow(unused)] + /// greater than + fn gt(&self, rhs: &Self) -> Self::Mask; + /// less or equal + fn le(&self, rhs: &Self) -> Self::Mask; + } + /// Portbal SIMD mask traits + pub trait Mask: Sized + BitOr + BitOrAssign + BitAnd { + type Element; + type BitMask: BitMask; + fn bitmask(self) -> Self::BitMask; + fn splat(b: bool) -> Self; + } + /// Trait for the bitmask of a vector Mask. + pub trait BitMask { + /// Total bits in the bitmask. + const LEN: usize; + /// get the offset of the first `1` bit. + fn first_offset(&self) -> usize; + /// check if this bitmask is before the other bitmask. + fn before(&self, rhs: &Self) -> bool; + /// convert bitmask as little endian + fn as_little_endian(&self) -> Self; + /// whether all bits are zero. + fn all_zero(&self) -> bool; + /// clear high n bits. 
+ fn clear_high_bits(&self, n: usize) -> Self; + } + } + pub(crate) mod neon { + use std::arch::aarch64::*; + use super::{Mask, Simd, bits::NeonBits}; + #[repr(transparent)] + pub struct Simd128u(uint8x16_t); + #[automatically_derived] + impl ::core::fmt::Debug for Simd128u { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd128u", + &&self.0, + ) + } + } + #[repr(transparent)] + pub struct Simd128i(int8x16_t); + #[automatically_derived] + impl ::core::fmt::Debug for Simd128i { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd128i", + &&self.0, + ) + } + } + impl Simd for Simd128u { + const LANES: usize = 16; + type Mask = Mask128; + type Element = u8; + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + unsafe { Self(vld1q_u8(ptr)) } + } + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { vst1q_u8(ptr, self.0) }; + } + #[inline(always)] + fn eq(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vceqq_u8(self.0, lhs.0)) } + } + #[inline(always)] + fn splat(ch: u8) -> Self { + unsafe { Self(vdupq_n_u8(ch)) } + } + #[inline(always)] + fn le(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vcleq_u8(self.0, lhs.0)) } + } + #[inline(always)] + fn gt(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vcgtq_u8(self.0, lhs.0)) } + } + } + impl Simd for Simd128i { + const LANES: usize = 16; + type Mask = Mask128; + type Element = i8; + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + Self(unsafe { vld1q_s8(ptr as *const i8) }) + } + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { vst1q_s8(ptr as *mut i8, self.0) }; + } + #[inline(always)] + fn eq(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vceqq_s8(self.0, lhs.0)) } + } + #[inline(always)] + fn splat(elem: i8) -> Self { + unsafe { Self(vdupq_n_s8(elem)) } + } + #[inline(always)] + fn le(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vcleq_s8(self.0, lhs.0)) } + } + #[inline(always)] + fn gt(&self, lhs: &Self) -> Self::Mask { + unsafe { Mask128(vcgtq_s8(self.0, lhs.0)) } + } + } + pub(crate) const BIT_MASK_TAB: [u8; 16] = [ + 0x01u8, + 0x02, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, + 0x01, + 0x02, + 0x4, + 0x8, + 0x10, + 0x20, + 0x40, + 0x80, + ]; + #[repr(transparent)] + pub struct Mask128(pub(crate) uint8x16_t); + #[automatically_derived] + impl ::core::fmt::Debug for Mask128 { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask128", &&self.0) + } + } + impl Mask for Mask128 { + type BitMask = NeonBits; + type Element = u8; + /// Convert Mask Vector 0x00-ff-ff to Bits 0b0000-1111-1111 + /// Reference: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + #[inline(always)] + fn bitmask(self) -> Self::BitMask { + unsafe { + let v16 = vreinterpretq_u16_u8(self.0); + let sr4 = vshrn_n_u16(v16, 4); + let v64 = vreinterpret_u64_u8(sr4); + NeonBits::new(vget_lane_u64(v64, 0)) + } + } + #[inline(always)] + fn splat(b: bool) -> Self { + let v: i8 = if b { -1 } else { 0 }; + unsafe { Self(vdupq_n_u8(v as u8)) } + } + } + impl std::ops::BitAnd for Mask128 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: Mask128) -> Self::Output { + unsafe { Self(vandq_u8(self.0, 
rhs.0)) } + } + } + impl std::ops::BitOr for Mask128 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: Mask128) -> Self::Output { + unsafe { Self(vorrq_u8(self.0, rhs.0)) } + } + } + impl std::ops::BitOrAssign for Mask128 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: Mask128) { + unsafe { + self.0 = vorrq_u8(self.0, rhs.0); + } + } + } + #[inline(always)] + pub unsafe fn to_bitmask64( + v0: uint8x16_t, + v1: uint8x16_t, + v2: uint8x16_t, + v3: uint8x16_t, + ) -> u64 { + let bit_mask = unsafe { + std::mem::transmute::<[u8; 16], uint8x16_t>(BIT_MASK_TAB) + }; + let t0 = unsafe { vandq_u8(v0, bit_mask) }; + let t1 = unsafe { vandq_u8(v1, bit_mask) }; + let t2 = unsafe { vandq_u8(v2, bit_mask) }; + let t3 = unsafe { vandq_u8(v3, bit_mask) }; + let pair0 = unsafe { vpaddq_u8(t0, t1) }; + let pair1 = unsafe { vpaddq_u8(t2, t3) }; + let quad = unsafe { vpaddq_u8(pair0, pair1) }; + let octa = unsafe { vpaddq_u8(quad, quad) }; + unsafe { vgetq_lane_u64(vreinterpretq_u64_u8(octa), 0) } + } + #[inline(always)] + pub(crate) unsafe fn to_bitmask32(v0: uint8x16_t, v1: uint8x16_t) -> u32 { + let bit_mask = unsafe { + std::mem::transmute::<[u8; 16], uint8x16_t>(BIT_MASK_TAB) + }; + let t0 = vandq_u8(v0, bit_mask); + let t1 = vandq_u8(v1, bit_mask); + let pair = vpaddq_u8(t0, t1); + let quad = vpaddq_u8(pair, pair); + let octa = vpaddq_u8(quad, quad); + vgetq_lane_u32(vreinterpretq_u32_u8(octa), 0) + } + } + pub(crate) mod v128 { + use std::ops::{BitAnd, BitOr, BitOrAssign}; + use super::{Mask, Simd}; + pub struct Simd128i([i8; 16]); + #[automatically_derived] + impl ::core::fmt::Debug for Simd128i { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd128i", + &&self.0, + ) + } + } + pub struct Simd128u([u8; 16]); + #[automatically_derived] + impl ::core::fmt::Debug for Simd128u { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd128u", + &&self.0, + ) + } + } + pub struct Mask128(pub(crate) [u8; 16]); + #[automatically_derived] + impl ::core::fmt::Debug for Mask128 { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask128", &&self.0) + } + } + impl Simd for Simd128i { + type Element = i8; + const LANES: usize = 16; + type Mask = Mask128; + unsafe fn loadu(ptr: *const u8) -> Self { + let v = unsafe { std::slice::from_raw_parts(ptr, Self::LANES) }; + let mut res = [0i8; 16]; + res.copy_from_slice(unsafe { std::mem::transmute::<&[u8], &[i8]>(v) }); + Self(res) + } + unsafe fn storeu(&self, ptr: *mut u8) { + let data = unsafe { std::mem::transmute::<&[i8], &[u8]>(&self.0) }; + unsafe { + std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, Self::LANES) + }; + } + fn eq(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] == rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + fn splat(value: i8) -> Self { + Self([value as i8; Self::LANES]) + } + fn le(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] <= rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + fn gt(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] > rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + } + impl Simd for Simd128u { + type Element = 
u8; + const LANES: usize = 16; + type Mask = Mask128; + unsafe fn loadu(ptr: *const u8) -> Self { + let v = unsafe { std::slice::from_raw_parts(ptr, Self::LANES) }; + let mut res = [0u8; 16]; + res.copy_from_slice(v); + Self(res) + } + unsafe fn storeu(&self, ptr: *mut u8) { + let data = &self.0; + unsafe { + std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, Self::LANES) + }; + } + fn eq(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] == rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + fn splat(value: u8) -> Self { + Self([value; Self::LANES]) + } + fn le(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] <= rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + fn gt(&self, rhs: &Self) -> Self::Mask { + let mut mask = [0u8; 16]; + for i in 0..Self::LANES { + mask[i] = if self.0[i] > rhs.0[i] { 1 } else { 0 }; + } + Mask128(mask) + } + } + impl Mask for Mask128 { + type BitMask = u16; + type Element = u8; + fn bitmask(self) -> Self::BitMask { + { + self.0 + .iter() + .enumerate() + .fold(0, |acc, (i, &b)| acc | ((b as u16) << i)) + } + } + fn splat(b: bool) -> Self { + Mask128([b as u8; 16]) + } + } + impl BitAnd for Mask128 { + type Output = Self; + fn bitand(self, rhs: Self) -> Self::Output { + let mut result = [0u8; 16]; + for i in 0..16 { + result[i] = self.0[i] & rhs.0[i]; + } + Mask128(result) + } + } + impl BitOr for Mask128 { + type Output = Self; + fn bitor(self, rhs: Self) -> Self::Output { + let mut result = [0u8; 16]; + for i in 0..16 { + result[i] = self.0[i] | rhs.0[i]; + } + Mask128(result) + } + } + impl BitOrAssign for Mask128 { + fn bitor_assign(&mut self, rhs: Self) { + for i in 0..16 { + self.0[i] |= rhs.0[i]; + } + } + } + } + pub(crate) mod v256 { + use std::ops::{BitAnd, BitOr, BitOrAssign}; + use super::{Mask, Simd, v128::{Mask128, Simd128i, Simd128u}}; + #[repr(transparent)] + pub struct Simd256u((Simd128u, Simd128u)); + #[automatically_derived] + impl ::core::fmt::Debug for Simd256u { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd256u", + &&self.0, + ) + } + } + #[repr(transparent)] + pub struct Simd256i((Simd128i, Simd128i)); + #[automatically_derived] + impl ::core::fmt::Debug for Simd256i { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd256i", + &&self.0, + ) + } + } + #[repr(transparent)] + pub struct Mask256(pub(crate) (Mask128, Mask128)); + #[automatically_derived] + impl ::core::fmt::Debug for Mask256 { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask256", &&self.0) + } + } + impl Mask for Mask256 { + type BitMask = u32; + type Element = u8; + #[inline(always)] + fn bitmask(self) -> Self::BitMask { + fn combine_u16(lo: u16, hi: u16) -> u32 { + { (lo as u32) | ((hi as u32) << 16) } + } + combine_u16(self.0.0.bitmask(), self.0.1.bitmask()) + } + #[inline(always)] + fn splat(b: bool) -> Self { + Mask256((Mask128::splat(b), Mask128::splat(b))) + } + } + impl BitOr for Mask256 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + let lo = self.0.0 | rhs.0.0; + let hi = self.0.1 | rhs.0.1; + Mask256((lo, hi)) + } + } + impl BitOrAssign for Mask256 { + #[inline(always)] + fn bitor_assign(&mut 
self, rhs: Self) { + self.0.0 |= rhs.0.0; + self.0.1 |= rhs.0.1; + } + } + impl BitAnd for Mask256 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: Mask256) -> Self::Output { + let lo = self.0.0 & rhs.0.0; + let hi = self.0.1 & rhs.0.1; + Mask256((lo, hi)) + } + } + impl Simd for Simd256u { + const LANES: usize = 32; + type Mask = Mask256; + type Element = u8; + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + let lo = unsafe { Simd128u::loadu(ptr) }; + let hi = unsafe { Simd128u::loadu(ptr.add(Simd128u::LANES)) }; + Simd256u((lo, hi)) + } + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { Simd128u::storeu(&self.0.0, ptr) }; + unsafe { Simd128u::storeu(&self.0.1, ptr.add(Simd128u::LANES)) }; + } + #[inline(always)] + fn eq(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.eq(&rhs.0.0); + let hi = self.0.1.eq(&rhs.0.1); + Mask256((lo, hi)) + } + #[inline(always)] + fn splat(elem: u8) -> Self { + Simd256u((Simd128u::splat(elem), Simd128u::splat(elem))) + } + #[inline(always)] + fn le(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.le(&rhs.0.0); + let hi = self.0.1.le(&rhs.0.1); + Mask256((lo, hi)) + } + #[inline(always)] + fn gt(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.gt(&rhs.0.0); + let hi = self.0.1.gt(&rhs.0.1); + Mask256((lo, hi)) + } + } + impl Simd for Simd256i { + const LANES: usize = 32; + type Mask = Mask256; + type Element = i8; + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + let lo = unsafe { Simd128i::loadu(ptr) }; + let hi = unsafe { Simd128i::loadu(ptr.add(Simd128i::LANES)) }; + Simd256i((lo, hi)) + } + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { Simd128i::storeu(&self.0.0, ptr) }; + unsafe { Simd128i::storeu(&self.0.1, ptr.add(Simd128i::LANES)) }; + } + #[inline(always)] + fn eq(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.eq(&rhs.0.0); + let hi = self.0.1.eq(&rhs.0.1); + Mask256((lo, hi)) + } + #[inline(always)] + fn splat(elem: i8) -> Self { + Simd256i((Simd128i::splat(elem), Simd128i::splat(elem))) + } + #[inline(always)] + fn le(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.le(&rhs.0.0); + let hi = self.0.1.le(&rhs.0.1); + Mask256((lo, hi)) + } + #[inline(always)] + fn gt(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.gt(&rhs.0.0); + let hi = self.0.1.gt(&rhs.0.1); + Mask256((lo, hi)) + } + } + } + pub(crate) mod v512 { + use std::ops::{BitAnd, BitOr, BitOrAssign}; + use super::{Mask, Simd, v256::{Mask256, Simd256i, Simd256u}}; + #[repr(transparent)] + pub struct Simd512u((Simd256u, Simd256u)); + #[automatically_derived] + impl ::core::fmt::Debug for Simd512u { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd512u", + &&self.0, + ) + } + } + #[repr(transparent)] + pub struct Simd512i((Simd256i, Simd256i)); + #[automatically_derived] + impl ::core::fmt::Debug for Simd512i { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish( + f, + "Simd512i", + &&self.0, + ) + } + } + #[repr(transparent)] + pub struct Mask512((Mask256, Mask256)); + #[automatically_derived] + impl ::core::fmt::Debug for Mask512 { + #[inline] + fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { + ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask512", &&self.0) + } + } + impl Mask for Mask512 { + type BitMask = u64; + type Element = u8; + 
#[inline(always)] + fn bitmask(self) -> Self::BitMask { + fn combine_u32(lo: u32, hi: u32) -> u64 { + { (lo as u64) | ((hi as u64) << 32) } + } + combine_u32(self.0.0.bitmask(), self.0.1.bitmask()) + } + #[inline(always)] + fn splat(b: bool) -> Self { + Mask512((Mask256::splat(b), Mask256::splat(b))) + } + } + impl BitOr for Mask512 { + type Output = Self; + #[inline(always)] + fn bitor(self, rhs: Self) -> Self::Output { + let lo = self.0.0 | rhs.0.0; + let hi = self.0.1 | rhs.0.1; + Mask512((lo, hi)) + } + } + impl BitOrAssign for Mask512 { + #[inline(always)] + fn bitor_assign(&mut self, rhs: Self) { + self.0.0 |= rhs.0.0; + self.0.1 |= rhs.0.1; + } + } + impl BitAnd for Mask512 { + type Output = Self; + #[inline(always)] + fn bitand(self, rhs: Mask512) -> Self::Output { + let lo = self.0.0 & rhs.0.0; + let hi = self.0.1 & rhs.0.1; + Mask512((lo, hi)) + } + } + impl Simd for Simd512u { + const LANES: usize = 64; + type Element = u8; + type Mask = Mask512; + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + let lo = unsafe { Simd256u::loadu(ptr) }; + let hi = unsafe { Simd256u::loadu(ptr.add(Simd256u::LANES)) }; + Simd512u((lo, hi)) + } + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { Simd256u::storeu(&self.0.0, ptr) }; + unsafe { Simd256u::storeu(&self.0.1, ptr.add(Simd256u::LANES)) }; + } + #[inline(always)] + fn eq(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.eq(&rhs.0.0); + let hi = self.0.1.eq(&rhs.0.1); + Mask512((lo, hi)) + } + #[inline(always)] + fn splat(ch: u8) -> Self { + Simd512u((Simd256u::splat(ch), Simd256u::splat(ch))) + } + #[inline(always)] + fn le(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.le(&rhs.0.0); + let hi = self.0.1.le(&rhs.0.1); + Mask512((lo, hi)) + } + #[inline(always)] + fn gt(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.gt(&rhs.0.0); + let hi = self.0.1.gt(&rhs.0.1); + Mask512((lo, hi)) + } + } + impl Simd for Simd512i { + const LANES: usize = 64; + type Element = i8; + type Mask = Mask512; + #[inline(always)] + unsafe fn loadu(ptr: *const u8) -> Self { + let lo = unsafe { Simd256i::loadu(ptr) }; + let hi = unsafe { Simd256i::loadu(ptr.add(Simd256i::LANES)) }; + Simd512i((lo, hi)) + } + #[inline(always)] + unsafe fn storeu(&self, ptr: *mut u8) { + unsafe { Simd256i::storeu(&self.0.0, ptr) }; + unsafe { Simd256i::storeu(&self.0.1, ptr.add(Simd256i::LANES)) }; + } + #[inline(always)] + fn eq(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.eq(&rhs.0.0); + let hi = self.0.1.eq(&rhs.0.1); + Mask512((lo, hi)) + } + #[inline(always)] + fn splat(elem: i8) -> Self { + Simd512i((Simd256i::splat(elem), Simd256i::splat(elem))) + } + #[inline(always)] + fn le(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.le(&rhs.0.0); + let hi = self.0.1.le(&rhs.0.1); + Mask512((lo, hi)) + } + #[inline(always)] + fn gt(&self, rhs: &Self) -> Self::Mask { + let lo = self.0.0.gt(&rhs.0.0); + let hi = self.0.1.gt(&rhs.0.1); + Mask512((lo, hi)) + } + } + } + pub use self::traits::{BitMask, Mask, Simd}; +} +#[inline(always)] +unsafe fn load(ptr: *const u8) -> V { + let chunk = unsafe { from_raw_parts(ptr, V::LANES) }; + unsafe { V::from_slice_unaligned_unchecked(chunk) } +} +const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ + (6, *b"\\u0000\0\0"), + (6, *b"\\u0001\0\0"), + (6, *b"\\u0002\0\0"), + (6, *b"\\u0003\0\0"), + (6, *b"\\u0004\0\0"), + (6, *b"\\u0005\0\0"), + (6, *b"\\u0006\0\0"), + (6, *b"\\u0007\0\0"), + (2, *b"\\b\0\0\0\0\0\0"), + (2, *b"\\t\0\0\0\0\0\0"), + (2, *b"\\n\0\0\0\0\0\0"), + (6, 
*b"\\u000b\0\0"), + (2, *b"\\f\0\0\0\0\0\0"), + (2, *b"\\r\0\0\0\0\0\0"), + (6, *b"\\u000e\0\0"), + (6, *b"\\u000f\0\0"), + (6, *b"\\u0010\0\0"), + (6, *b"\\u0011\0\0"), + (6, *b"\\u0012\0\0"), + (6, *b"\\u0013\0\0"), + (6, *b"\\u0014\0\0"), + (6, *b"\\u0015\0\0"), + (6, *b"\\u0016\0\0"), + (6, *b"\\u0017\0\0"), + (6, *b"\\u0018\0\0"), + (6, *b"\\u0019\0\0"), + (6, *b"\\u001a\0\0"), + (6, *b"\\u001b\0\0"), + (6, *b"\\u001c\0\0"), + (6, *b"\\u001d\0\0"), + (6, *b"\\u001e\0\0"), + (6, *b"\\u001f\0\0"), + (0, [0; 8]), + (0, [0; 8]), + (2, *b"\\\"\0\0\0\0\0\0"), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (2, *b"\\\\\0\0\0\0\0\0"), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, 
[0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), +]; +const NEED_ESCAPED: [u8; 256] = [ + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +]; +#[inline(always)] +unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { + if true { + if !(*nb >= 1) { + ::core::panicking::panic("assertion failed: *nb >= 1") + } + } + loop { + let ch = unsafe { *(*src) }; + let cnt = QUOTE_TAB[ch as usize].0 as usize; + if true { + if !(cnt != 0) { + { + ::core::panicking::panic_fmt( + format_args!( + "char is {0}, cnt is {1}, NEED_ESCAPED is {2}", + ch as char, + cnt, + NEED_ESCAPED[ch as usize], + ), + ); + } + } + } + unsafe { + std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8) + }; + unsafe { (*dst) = (*dst).add(cnt) }; + unsafe { (*src) = (*src).add(1) }; + (*nb) -= 1; + if (*nb) == 0 || unsafe { NEED_ESCAPED[*(*src) as usize] == 0 } { + return; + } + } +} +#[inline(always)] +fn check_cross_page(ptr: *const u8, step: usize) -> bool { + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size +} +const LANES: usize = 16; +#[inline] +fn escaped_mask_generic(v: simd::v128::Simd128u) -> u16 { + use simd::v128::Simd128u as u8x16; + let x1f = u8x16::splat(0x1f); + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} +#[inline] +fn escaped_mask_neon(v: simd::neon::Simd128u) -> simd::bits::NeonBits { + use simd::neon::Simd128u as u8x16; + let x1f = u8x16::splat(0x1f); + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} +#[inline(always)] +fn format_string(value: &str, dst: &mut [u8]) -> usize { + let mut v_neon: simd::neon::Simd128u; + let mut v_generic: simd::v128::Simd128u; + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + *dptr = b'"'; + dptr = dptr.add(1); + while nb >= 
LANES { + { + if true || (true || ::std_detect::detect::__is_feature_detected::asimd()) + { + v_neon = load(sptr); + v_neon + .write_to_slice_unaligned_unchecked( + std::slice::from_raw_parts_mut(dptr, LANES), + ); + let mask = escaped_mask_neon(v_neon); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + }; + } else { + v_generic = load(sptr); + v_generic + .write_to_slice_unaligned_unchecked( + std::slice::from_raw_parts_mut(dptr, LANES), + ); + let mask = escaped_mask_generic(v_generic); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + }; + } + } + } + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit() + .assume_init(); + while nb > 0 { + { + if true || (true || ::std_detect::detect::__is_feature_detected::asimd()) + { + v_neon = { + { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping( + sptr, + placeholder[..].as_mut_ptr(), + nb, + ); + load(placeholder[..].as_ptr()) + } else { + { + std::ptr::copy_nonoverlapping( + sptr, + placeholder[..].as_mut_ptr(), + nb, + ); + load(placeholder[..].as_ptr()) + } + } + } + }; + v_neon + .write_to_slice_unaligned_unchecked( + std::slice::from_raw_parts_mut(dptr, LANES), + ); + let mask = escaped_mask_neon(v_neon); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } else { + v_generic = { + { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping( + sptr, + placeholder[..].as_mut_ptr(), + nb, + ); + load(placeholder[..].as_ptr()) + } else { + { + std::ptr::copy_nonoverlapping( + sptr, + placeholder[..].as_mut_ptr(), + nb, + ); + load(placeholder[..].as_ptr()) + } + } + } + }; + v_generic + .write_to_slice_unaligned_unchecked( + std::slice::from_raw_parts_mut(dptr, LANES), + ); + let mask = escaped_mask_generic(v_generic); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + } + } + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize + } +} +pub fn escape(value: &str) -> String { + let capacity = value.len() * 6 + 32 + 3; + let mut buf = Vec::with_capacity(capacity); + unsafe { buf.set_len(capacity) }; + let cnt = format_string(value, &mut buf); + unsafe { buf.set_len(cnt) }; + unsafe { String::from_utf8_unchecked(buf) } +} +pub fn escape_into>(value: S, dst: &mut Vec) -> usize { + let value = value.as_ref(); + let needed_capacity = value.len() * 6 + 32 + 3; + dst.reserve(needed_capacity); + let old_len = dst.len(); + unsafe { + let spare = std::slice::from_raw_parts_mut( + dst.as_mut_ptr().add(old_len), + dst.capacity() - old_len, + ); + let cnt = format_string(value, spare); + dst.set_len(old_len + cnt); + cnt + } +} From 9988f060f60251b3e18ae3505bbc32d650e4388a Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 10:10:32 +0800 Subject: [PATCH 02/14] warmup --- benches/escape.rs | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/benches/escape.rs b/benches/escape.rs index 02616b1..a60916a 100644 --- a/benches/escape.rs +++ b/benches/escape.rs @@ -41,6 +41,10 @@ fn get_affine_sources() -> Vec { } fn run_benchmarks(c: &mut Criterion, sources: &[String], prefix: &str) { + let first = &sources[0]; + assert_eq!(escape(first), sonic_rs::to_string(first).unwrap()); + assert_eq!(escape(first), serde_json::to_string(first).unwrap()); + c.bench_function(&format!("{} escape simd", prefix), |b| { b.iter(|| { for source in sources { From f8e56b5760aa67cf180a8b74f50380b8d8dfbdbc Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 10:29:52 +0800 Subject: [PATCH 03/14] update --- src/lib.rs | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 543a488..bb5d629 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -292,9 +292,21 @@ const NEED_ESCAPED: [u8; 256] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(feature = "codspeed") +))] static COMPUTE_LANES: std::sync::Once = std::sync::Once::new(); -static mut LANES: usize = 16; +#[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(feature = "codspeed") +))] +static mut LANES: usize = 32; +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "codspeed"))] +const LANES: usize = 32; + +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +const LANES: usize = 16; // only check the src length. #[inline(always)] @@ -443,7 +455,10 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { let mut v_generic: simd::v128::Simd128u; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(feature = "codspeed") + ))] COMPUTE_LANES.call_once(|| { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { @@ -451,9 +466,9 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { unsafe { LANES = simd::avx512::Simd512u::LANES; } - } else if is_x86_feature_detected!("avx2") { + } else if is_x86_feature_detected!("sse2") { unsafe { - LANES = simd::avx2::Simd256u::LANES; + LANES = simd::sse2::Simd128u::LANES; } } } From c3f01b91f149f7f0f5080c3826c1e9351aa27f65 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 10:38:52 +0800 Subject: [PATCH 04/14] update --- src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bb5d629..b807066 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -301,7 +301,7 @@ static COMPUTE_LANES: std::sync::Once = std::sync::Once::new(); any(target_arch = "x86", target_arch = "x86_64"), not(feature = "codspeed") ))] -static mut LANES: usize = 32; +static mut LANES: usize = simd::avx2::Simd256u::LANES; #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "codspeed"))] const LANES: usize = 32; @@ -351,7 +351,7 @@ fn escaped_mask_generic(v: simd::v128::Simd128u) -> u16 { } #[cfg(target_arch = "aarch64")] -#[inline] +#[inline(always)] fn escaped_mask_neon(v: simd::neon::Simd128u) -> simd::bits::NeonBits { use simd::neon::Simd128u as u8x16; @@ -363,7 +363,7 @@ fn escaped_mask_neon(v: simd::neon::Simd128u) -> simd::bits::NeonBits { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline] +#[inline(always)] fn escaped_mask_sse2(v: simd::sse2::Simd128u) -> u16 { use simd::sse2::Simd128u as u8x16; 
@@ -375,7 +375,7 @@ fn escaped_mask_sse2(v: simd::sse2::Simd128u) -> u16 { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline] +#[inline(always)] fn escaped_mask_avx2(v: simd::avx2::Simd256u) -> u32 { use simd::avx2::Simd256u as u8x32; @@ -387,7 +387,7 @@ fn escaped_mask_avx2(v: simd::avx2::Simd256u) -> u32 { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline] +#[inline(always)] fn escaped_mask_avx512(v: simd::avx512::Simd512u) -> u64 { use simd::avx512::Simd512u as u8x64; From ab1978576b7ad5981b01ceacf5729742a3f5d7fb Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 10:41:40 +0800 Subject: [PATCH 05/14] cond --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b807066..0d04882 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -303,7 +303,7 @@ static COMPUTE_LANES: std::sync::Once = std::sync::Once::new(); ))] static mut LANES: usize = simd::avx2::Simd256u::LANES; #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "codspeed"))] -const LANES: usize = 32; +const LANES: usize = simd::avx2::Simd256u::LANES; #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] const LANES: usize = 16; @@ -466,7 +466,7 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { unsafe { LANES = simd::avx512::Simd512u::LANES; } - } else if is_x86_feature_detected!("sse2") { + } else if !is_x86_feature_detected!("avx2") { unsafe { LANES = simd::sse2::Simd128u::LANES; } From aa9bf7f7c1e27e446e61bd66d65634ae2b1044b5 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 10:51:35 +0800 Subject: [PATCH 06/14] cache feature detect --- src/lib.rs | 19 +- test.rs | 1614 ---------------------------------------------------- 2 files changed, 13 insertions(+), 1620 deletions(-) delete mode 100644 test.rs diff --git a/src/lib.rs b/src/lib.rs index 0d04882..f18ff7d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -453,6 +453,13 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] let mut v_avx512: simd::avx512::Simd512u; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let has_avx512 = is_x86_feature_detected!("avx512f"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let has_avx2 = is_x86_feature_detected!("avx2"); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let has_sse2 = is_x86_feature_detected!("sse2"); + let mut v_generic: simd::v128::Simd128u; #[cfg(all( @@ -504,21 +511,21 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("avx512f") { + if has_avx512 { v_avx512 = load(sptr); v_avx512.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( dptr, LANES, )); let mask = escaped_mask_avx512(v_avx512); escape!(mask, nb, dptr, sptr); - } else if is_x86_feature_detected!("avx2") { + } else if has_avx2 { v_avx2 = load(sptr); v_avx2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( dptr, LANES, )); let mask = escaped_mask_avx2(v_avx2); escape!(mask, nb, dptr, sptr); - } else if is_x86_feature_detected!("sse2") { + } else if has_sse2 { v_sse2 = load(sptr); v_sse2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( dptr, LANES, @@ -592,7 +599,7 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("avx512f") { + if has_avx512 { const LANES: usize = 
simd::avx512::Simd512u::LANES; // Scratch buffer reused for mask materialisation; stay uninitialised. #[cfg(not(miri))] @@ -617,7 +624,7 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { escape_unchecked(&mut sptr, &mut nb, &mut dptr); } } - } else if is_x86_feature_detected!("avx2") { + } else if has_avx2 { const LANES: usize = simd::avx2::Simd256u::LANES; // Scratch buffer reused for mask materialisation; stay uninitialised. #[cfg(not(miri))] @@ -642,7 +649,7 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { escape_unchecked(&mut sptr, &mut nb, &mut dptr); } } - } else if is_x86_feature_detected!("sse2") { + } else if has_sse2 { const LANES: usize = simd::sse2::Simd128u::LANES; // Scratch buffer reused for mask materialisation; stay uninitialised. #[cfg(not(miri))] diff --git a/test.rs b/test.rs deleted file mode 100644 index b159465..0000000 --- a/test.rs +++ /dev/null @@ -1,1614 +0,0 @@ -#![feature(prelude_import)] -//! Borrowed from -//! -//! Only takes the string escaping part to avoid the abstraction overhead. -#[macro_use] -extern crate std; -#[prelude_import] -use std::prelude::rust_2024::*; -use std::slice::from_raw_parts; -use simd::{BitMask, Mask, Simd}; -mod simd { - #![allow(non_camel_case_types)] - pub mod bits { - use super::traits::BitMask; - impl BitMask for u16 { - const LEN: usize = std::mem::size_of::() * 8; - #[inline] - fn before(&self, rhs: &Self) -> bool { - (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 - } - #[inline] - fn first_offset(&self) -> usize { - self.as_little_endian().trailing_zeros() as usize - } - #[inline] - fn as_little_endian(&self) -> Self { - { self.clone() } - } - #[inline] - fn all_zero(&self) -> bool { - *self == 0 - } - #[inline] - fn clear_high_bits(&self, n: usize) -> Self { - if true { - if !(n <= Self::LEN) { - ::core::panicking::panic("assertion failed: n <= Self::LEN") - } - } - *self & ((u64::MAX as u16) >> n) - } - } - impl BitMask for u32 { - const LEN: usize = std::mem::size_of::() * 8; - #[inline] - fn before(&self, rhs: &Self) -> bool { - (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 - } - #[inline] - fn first_offset(&self) -> usize { - self.as_little_endian().trailing_zeros() as usize - } - #[inline] - fn as_little_endian(&self) -> Self { - { self.clone() } - } - #[inline] - fn all_zero(&self) -> bool { - *self == 0 - } - #[inline] - fn clear_high_bits(&self, n: usize) -> Self { - if true { - if !(n <= Self::LEN) { - ::core::panicking::panic("assertion failed: n <= Self::LEN") - } - } - *self & ((u64::MAX as u32) >> n) - } - } - impl BitMask for u64 { - const LEN: usize = std::mem::size_of::() * 8; - #[inline] - fn before(&self, rhs: &Self) -> bool { - (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 - } - #[inline] - fn first_offset(&self) -> usize { - self.as_little_endian().trailing_zeros() as usize - } - #[inline] - fn as_little_endian(&self) -> Self { - { self.clone() } - } - #[inline] - fn all_zero(&self) -> bool { - *self == 0 - } - #[inline] - fn clear_high_bits(&self, n: usize) -> Self { - if true { - if !(n <= Self::LEN) { - ::core::panicking::panic("assertion failed: n <= Self::LEN") - } - } - *self & ((u64::MAX as u64) >> n) - } - } - /// Use u64 representation the bitmask of Neon vector. 
- /// (low) - /// Vector: 00-ff-ff-ff-ff-00-00-00 - /// Mask : 0000-1111-1111-1111-1111-0000-0000-0000 - /// - /// first_offset() = 1 - /// clear_high_bits(4) = Mask(0000-1111-1111-1111-[0000]-0000-0000-0000) - /// - /// reference: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - pub struct NeonBits(u64); - impl NeonBits { - #[inline] - pub fn new(u: u64) -> Self { - Self(u) - } - } - impl BitMask for NeonBits { - const LEN: usize = 16; - #[inline] - fn first_offset(&self) -> usize { - (self.as_little_endian().0.trailing_zeros() as usize) >> 2 - } - #[inline] - fn before(&self, rhs: &Self) -> bool { - (self.as_little_endian().0 & rhs.as_little_endian().0.wrapping_sub(1)) - != 0 - } - #[inline] - fn as_little_endian(&self) -> Self { - { Self::new(self.0) } - } - #[inline] - fn all_zero(&self) -> bool { - self.0 == 0 - } - #[inline] - fn clear_high_bits(&self, n: usize) -> Self { - if true { - if !(n <= Self::LEN) { - ::core::panicking::panic("assertion failed: n <= Self::LEN") - } - } - Self(self.0 & u64::MAX >> (n * 4)) - } - } - } - mod traits { - use std::ops::{BitAnd, BitOr, BitOrAssign}; - /// Portbal SIMD traits - pub trait Simd: Sized { - const LANES: usize; - type Element; - type Mask: Mask; - /// # Safety - unsafe fn from_slice_unaligned_unchecked(slice: &[u8]) -> Self { - if true { - if !(slice.len() >= Self::LANES) { - ::core::panicking::panic( - "assertion failed: slice.len() >= Self::LANES", - ) - } - } - unsafe { Self::loadu(slice.as_ptr()) } - } - /// # Safety - unsafe fn write_to_slice_unaligned_unchecked(&self, slice: &mut [u8]) { - if true { - if !(slice.len() >= Self::LANES) { - ::core::panicking::panic( - "assertion failed: slice.len() >= Self::LANES", - ) - } - } - unsafe { self.storeu(slice.as_mut_ptr()) } - } - /// # Safety - unsafe fn loadu(ptr: *const u8) -> Self; - /// # Safety - unsafe fn storeu(&self, ptr: *mut u8); - fn eq(&self, rhs: &Self) -> Self::Mask; - fn splat(elem: Self::Element) -> Self; - #[allow(unused)] - /// greater than - fn gt(&self, rhs: &Self) -> Self::Mask; - /// less or equal - fn le(&self, rhs: &Self) -> Self::Mask; - } - /// Portbal SIMD mask traits - pub trait Mask: Sized + BitOr + BitOrAssign + BitAnd { - type Element; - type BitMask: BitMask; - fn bitmask(self) -> Self::BitMask; - fn splat(b: bool) -> Self; - } - /// Trait for the bitmask of a vector Mask. - pub trait BitMask { - /// Total bits in the bitmask. - const LEN: usize; - /// get the offset of the first `1` bit. - fn first_offset(&self) -> usize; - /// check if this bitmask is before the other bitmask. - fn before(&self, rhs: &Self) -> bool; - /// convert bitmask as little endian - fn as_little_endian(&self) -> Self; - /// whether all bits are zero. - fn all_zero(&self) -> bool; - /// clear high n bits. 
- fn clear_high_bits(&self, n: usize) -> Self; - } - } - pub(crate) mod neon { - use std::arch::aarch64::*; - use super::{Mask, Simd, bits::NeonBits}; - #[repr(transparent)] - pub struct Simd128u(uint8x16_t); - #[automatically_derived] - impl ::core::fmt::Debug for Simd128u { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd128u", - &&self.0, - ) - } - } - #[repr(transparent)] - pub struct Simd128i(int8x16_t); - #[automatically_derived] - impl ::core::fmt::Debug for Simd128i { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd128i", - &&self.0, - ) - } - } - impl Simd for Simd128u { - const LANES: usize = 16; - type Mask = Mask128; - type Element = u8; - #[inline(always)] - unsafe fn loadu(ptr: *const u8) -> Self { - unsafe { Self(vld1q_u8(ptr)) } - } - #[inline(always)] - unsafe fn storeu(&self, ptr: *mut u8) { - unsafe { vst1q_u8(ptr, self.0) }; - } - #[inline(always)] - fn eq(&self, lhs: &Self) -> Self::Mask { - unsafe { Mask128(vceqq_u8(self.0, lhs.0)) } - } - #[inline(always)] - fn splat(ch: u8) -> Self { - unsafe { Self(vdupq_n_u8(ch)) } - } - #[inline(always)] - fn le(&self, lhs: &Self) -> Self::Mask { - unsafe { Mask128(vcleq_u8(self.0, lhs.0)) } - } - #[inline(always)] - fn gt(&self, lhs: &Self) -> Self::Mask { - unsafe { Mask128(vcgtq_u8(self.0, lhs.0)) } - } - } - impl Simd for Simd128i { - const LANES: usize = 16; - type Mask = Mask128; - type Element = i8; - #[inline(always)] - unsafe fn loadu(ptr: *const u8) -> Self { - Self(unsafe { vld1q_s8(ptr as *const i8) }) - } - #[inline(always)] - unsafe fn storeu(&self, ptr: *mut u8) { - unsafe { vst1q_s8(ptr as *mut i8, self.0) }; - } - #[inline(always)] - fn eq(&self, lhs: &Self) -> Self::Mask { - unsafe { Mask128(vceqq_s8(self.0, lhs.0)) } - } - #[inline(always)] - fn splat(elem: i8) -> Self { - unsafe { Self(vdupq_n_s8(elem)) } - } - #[inline(always)] - fn le(&self, lhs: &Self) -> Self::Mask { - unsafe { Mask128(vcleq_s8(self.0, lhs.0)) } - } - #[inline(always)] - fn gt(&self, lhs: &Self) -> Self::Mask { - unsafe { Mask128(vcgtq_s8(self.0, lhs.0)) } - } - } - pub(crate) const BIT_MASK_TAB: [u8; 16] = [ - 0x01u8, - 0x02, - 0x4, - 0x8, - 0x10, - 0x20, - 0x40, - 0x80, - 0x01, - 0x02, - 0x4, - 0x8, - 0x10, - 0x20, - 0x40, - 0x80, - ]; - #[repr(transparent)] - pub struct Mask128(pub(crate) uint8x16_t); - #[automatically_derived] - impl ::core::fmt::Debug for Mask128 { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask128", &&self.0) - } - } - impl Mask for Mask128 { - type BitMask = NeonBits; - type Element = u8; - /// Convert Mask Vector 0x00-ff-ff to Bits 0b0000-1111-1111 - /// Reference: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - #[inline(always)] - fn bitmask(self) -> Self::BitMask { - unsafe { - let v16 = vreinterpretq_u16_u8(self.0); - let sr4 = vshrn_n_u16(v16, 4); - let v64 = vreinterpret_u64_u8(sr4); - NeonBits::new(vget_lane_u64(v64, 0)) - } - } - #[inline(always)] - fn splat(b: bool) -> Self { - let v: i8 = if b { -1 } else { 0 }; - unsafe { Self(vdupq_n_u8(v as u8)) } - } - } - impl std::ops::BitAnd for Mask128 { - type Output = Self; - #[inline(always)] - fn bitand(self, rhs: Mask128) -> Self::Output { - unsafe { Self(vandq_u8(self.0, 
rhs.0)) } - } - } - impl std::ops::BitOr for Mask128 { - type Output = Self; - #[inline(always)] - fn bitor(self, rhs: Mask128) -> Self::Output { - unsafe { Self(vorrq_u8(self.0, rhs.0)) } - } - } - impl std::ops::BitOrAssign for Mask128 { - #[inline(always)] - fn bitor_assign(&mut self, rhs: Mask128) { - unsafe { - self.0 = vorrq_u8(self.0, rhs.0); - } - } - } - #[inline(always)] - pub unsafe fn to_bitmask64( - v0: uint8x16_t, - v1: uint8x16_t, - v2: uint8x16_t, - v3: uint8x16_t, - ) -> u64 { - let bit_mask = unsafe { - std::mem::transmute::<[u8; 16], uint8x16_t>(BIT_MASK_TAB) - }; - let t0 = unsafe { vandq_u8(v0, bit_mask) }; - let t1 = unsafe { vandq_u8(v1, bit_mask) }; - let t2 = unsafe { vandq_u8(v2, bit_mask) }; - let t3 = unsafe { vandq_u8(v3, bit_mask) }; - let pair0 = unsafe { vpaddq_u8(t0, t1) }; - let pair1 = unsafe { vpaddq_u8(t2, t3) }; - let quad = unsafe { vpaddq_u8(pair0, pair1) }; - let octa = unsafe { vpaddq_u8(quad, quad) }; - unsafe { vgetq_lane_u64(vreinterpretq_u64_u8(octa), 0) } - } - #[inline(always)] - pub(crate) unsafe fn to_bitmask32(v0: uint8x16_t, v1: uint8x16_t) -> u32 { - let bit_mask = unsafe { - std::mem::transmute::<[u8; 16], uint8x16_t>(BIT_MASK_TAB) - }; - let t0 = vandq_u8(v0, bit_mask); - let t1 = vandq_u8(v1, bit_mask); - let pair = vpaddq_u8(t0, t1); - let quad = vpaddq_u8(pair, pair); - let octa = vpaddq_u8(quad, quad); - vgetq_lane_u32(vreinterpretq_u32_u8(octa), 0) - } - } - pub(crate) mod v128 { - use std::ops::{BitAnd, BitOr, BitOrAssign}; - use super::{Mask, Simd}; - pub struct Simd128i([i8; 16]); - #[automatically_derived] - impl ::core::fmt::Debug for Simd128i { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd128i", - &&self.0, - ) - } - } - pub struct Simd128u([u8; 16]); - #[automatically_derived] - impl ::core::fmt::Debug for Simd128u { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd128u", - &&self.0, - ) - } - } - pub struct Mask128(pub(crate) [u8; 16]); - #[automatically_derived] - impl ::core::fmt::Debug for Mask128 { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask128", &&self.0) - } - } - impl Simd for Simd128i { - type Element = i8; - const LANES: usize = 16; - type Mask = Mask128; - unsafe fn loadu(ptr: *const u8) -> Self { - let v = unsafe { std::slice::from_raw_parts(ptr, Self::LANES) }; - let mut res = [0i8; 16]; - res.copy_from_slice(unsafe { std::mem::transmute::<&[u8], &[i8]>(v) }); - Self(res) - } - unsafe fn storeu(&self, ptr: *mut u8) { - let data = unsafe { std::mem::transmute::<&[i8], &[u8]>(&self.0) }; - unsafe { - std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, Self::LANES) - }; - } - fn eq(&self, rhs: &Self) -> Self::Mask { - let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] == rhs.0[i] { 1 } else { 0 }; - } - Mask128(mask) - } - fn splat(value: i8) -> Self { - Self([value as i8; Self::LANES]) - } - fn le(&self, rhs: &Self) -> Self::Mask { - let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] <= rhs.0[i] { 1 } else { 0 }; - } - Mask128(mask) - } - fn gt(&self, rhs: &Self) -> Self::Mask { - let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] > rhs.0[i] { 1 } else { 0 }; - } - Mask128(mask) - } - } - impl Simd for Simd128u { - type Element = 
u8; - const LANES: usize = 16; - type Mask = Mask128; - unsafe fn loadu(ptr: *const u8) -> Self { - let v = unsafe { std::slice::from_raw_parts(ptr, Self::LANES) }; - let mut res = [0u8; 16]; - res.copy_from_slice(v); - Self(res) - } - unsafe fn storeu(&self, ptr: *mut u8) { - let data = &self.0; - unsafe { - std::ptr::copy_nonoverlapping(data.as_ptr(), ptr, Self::LANES) - }; - } - fn eq(&self, rhs: &Self) -> Self::Mask { - let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] == rhs.0[i] { 1 } else { 0 }; - } - Mask128(mask) - } - fn splat(value: u8) -> Self { - Self([value; Self::LANES]) - } - fn le(&self, rhs: &Self) -> Self::Mask { - let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] <= rhs.0[i] { 1 } else { 0 }; - } - Mask128(mask) - } - fn gt(&self, rhs: &Self) -> Self::Mask { - let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] > rhs.0[i] { 1 } else { 0 }; - } - Mask128(mask) - } - } - impl Mask for Mask128 { - type BitMask = u16; - type Element = u8; - fn bitmask(self) -> Self::BitMask { - { - self.0 - .iter() - .enumerate() - .fold(0, |acc, (i, &b)| acc | ((b as u16) << i)) - } - } - fn splat(b: bool) -> Self { - Mask128([b as u8; 16]) - } - } - impl BitAnd for Mask128 { - type Output = Self; - fn bitand(self, rhs: Self) -> Self::Output { - let mut result = [0u8; 16]; - for i in 0..16 { - result[i] = self.0[i] & rhs.0[i]; - } - Mask128(result) - } - } - impl BitOr for Mask128 { - type Output = Self; - fn bitor(self, rhs: Self) -> Self::Output { - let mut result = [0u8; 16]; - for i in 0..16 { - result[i] = self.0[i] | rhs.0[i]; - } - Mask128(result) - } - } - impl BitOrAssign for Mask128 { - fn bitor_assign(&mut self, rhs: Self) { - for i in 0..16 { - self.0[i] |= rhs.0[i]; - } - } - } - } - pub(crate) mod v256 { - use std::ops::{BitAnd, BitOr, BitOrAssign}; - use super::{Mask, Simd, v128::{Mask128, Simd128i, Simd128u}}; - #[repr(transparent)] - pub struct Simd256u((Simd128u, Simd128u)); - #[automatically_derived] - impl ::core::fmt::Debug for Simd256u { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd256u", - &&self.0, - ) - } - } - #[repr(transparent)] - pub struct Simd256i((Simd128i, Simd128i)); - #[automatically_derived] - impl ::core::fmt::Debug for Simd256i { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd256i", - &&self.0, - ) - } - } - #[repr(transparent)] - pub struct Mask256(pub(crate) (Mask128, Mask128)); - #[automatically_derived] - impl ::core::fmt::Debug for Mask256 { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask256", &&self.0) - } - } - impl Mask for Mask256 { - type BitMask = u32; - type Element = u8; - #[inline(always)] - fn bitmask(self) -> Self::BitMask { - fn combine_u16(lo: u16, hi: u16) -> u32 { - { (lo as u32) | ((hi as u32) << 16) } - } - combine_u16(self.0.0.bitmask(), self.0.1.bitmask()) - } - #[inline(always)] - fn splat(b: bool) -> Self { - Mask256((Mask128::splat(b), Mask128::splat(b))) - } - } - impl BitOr for Mask256 { - type Output = Self; - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - let lo = self.0.0 | rhs.0.0; - let hi = self.0.1 | rhs.0.1; - Mask256((lo, hi)) - } - } - impl BitOrAssign for Mask256 { - #[inline(always)] - fn bitor_assign(&mut 
self, rhs: Self) { - self.0.0 |= rhs.0.0; - self.0.1 |= rhs.0.1; - } - } - impl BitAnd for Mask256 { - type Output = Self; - #[inline(always)] - fn bitand(self, rhs: Mask256) -> Self::Output { - let lo = self.0.0 & rhs.0.0; - let hi = self.0.1 & rhs.0.1; - Mask256((lo, hi)) - } - } - impl Simd for Simd256u { - const LANES: usize = 32; - type Mask = Mask256; - type Element = u8; - #[inline(always)] - unsafe fn loadu(ptr: *const u8) -> Self { - let lo = unsafe { Simd128u::loadu(ptr) }; - let hi = unsafe { Simd128u::loadu(ptr.add(Simd128u::LANES)) }; - Simd256u((lo, hi)) - } - #[inline(always)] - unsafe fn storeu(&self, ptr: *mut u8) { - unsafe { Simd128u::storeu(&self.0.0, ptr) }; - unsafe { Simd128u::storeu(&self.0.1, ptr.add(Simd128u::LANES)) }; - } - #[inline(always)] - fn eq(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.eq(&rhs.0.0); - let hi = self.0.1.eq(&rhs.0.1); - Mask256((lo, hi)) - } - #[inline(always)] - fn splat(elem: u8) -> Self { - Simd256u((Simd128u::splat(elem), Simd128u::splat(elem))) - } - #[inline(always)] - fn le(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.le(&rhs.0.0); - let hi = self.0.1.le(&rhs.0.1); - Mask256((lo, hi)) - } - #[inline(always)] - fn gt(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.gt(&rhs.0.0); - let hi = self.0.1.gt(&rhs.0.1); - Mask256((lo, hi)) - } - } - impl Simd for Simd256i { - const LANES: usize = 32; - type Mask = Mask256; - type Element = i8; - #[inline(always)] - unsafe fn loadu(ptr: *const u8) -> Self { - let lo = unsafe { Simd128i::loadu(ptr) }; - let hi = unsafe { Simd128i::loadu(ptr.add(Simd128i::LANES)) }; - Simd256i((lo, hi)) - } - #[inline(always)] - unsafe fn storeu(&self, ptr: *mut u8) { - unsafe { Simd128i::storeu(&self.0.0, ptr) }; - unsafe { Simd128i::storeu(&self.0.1, ptr.add(Simd128i::LANES)) }; - } - #[inline(always)] - fn eq(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.eq(&rhs.0.0); - let hi = self.0.1.eq(&rhs.0.1); - Mask256((lo, hi)) - } - #[inline(always)] - fn splat(elem: i8) -> Self { - Simd256i((Simd128i::splat(elem), Simd128i::splat(elem))) - } - #[inline(always)] - fn le(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.le(&rhs.0.0); - let hi = self.0.1.le(&rhs.0.1); - Mask256((lo, hi)) - } - #[inline(always)] - fn gt(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.gt(&rhs.0.0); - let hi = self.0.1.gt(&rhs.0.1); - Mask256((lo, hi)) - } - } - } - pub(crate) mod v512 { - use std::ops::{BitAnd, BitOr, BitOrAssign}; - use super::{Mask, Simd, v256::{Mask256, Simd256i, Simd256u}}; - #[repr(transparent)] - pub struct Simd512u((Simd256u, Simd256u)); - #[automatically_derived] - impl ::core::fmt::Debug for Simd512u { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd512u", - &&self.0, - ) - } - } - #[repr(transparent)] - pub struct Simd512i((Simd256i, Simd256i)); - #[automatically_derived] - impl ::core::fmt::Debug for Simd512i { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish( - f, - "Simd512i", - &&self.0, - ) - } - } - #[repr(transparent)] - pub struct Mask512((Mask256, Mask256)); - #[automatically_derived] - impl ::core::fmt::Debug for Mask512 { - #[inline] - fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result { - ::core::fmt::Formatter::debug_tuple_field1_finish(f, "Mask512", &&self.0) - } - } - impl Mask for Mask512 { - type BitMask = u64; - type Element = u8; - 
#[inline(always)] - fn bitmask(self) -> Self::BitMask { - fn combine_u32(lo: u32, hi: u32) -> u64 { - { (lo as u64) | ((hi as u64) << 32) } - } - combine_u32(self.0.0.bitmask(), self.0.1.bitmask()) - } - #[inline(always)] - fn splat(b: bool) -> Self { - Mask512((Mask256::splat(b), Mask256::splat(b))) - } - } - impl BitOr for Mask512 { - type Output = Self; - #[inline(always)] - fn bitor(self, rhs: Self) -> Self::Output { - let lo = self.0.0 | rhs.0.0; - let hi = self.0.1 | rhs.0.1; - Mask512((lo, hi)) - } - } - impl BitOrAssign for Mask512 { - #[inline(always)] - fn bitor_assign(&mut self, rhs: Self) { - self.0.0 |= rhs.0.0; - self.0.1 |= rhs.0.1; - } - } - impl BitAnd for Mask512 { - type Output = Self; - #[inline(always)] - fn bitand(self, rhs: Mask512) -> Self::Output { - let lo = self.0.0 & rhs.0.0; - let hi = self.0.1 & rhs.0.1; - Mask512((lo, hi)) - } - } - impl Simd for Simd512u { - const LANES: usize = 64; - type Element = u8; - type Mask = Mask512; - #[inline(always)] - unsafe fn loadu(ptr: *const u8) -> Self { - let lo = unsafe { Simd256u::loadu(ptr) }; - let hi = unsafe { Simd256u::loadu(ptr.add(Simd256u::LANES)) }; - Simd512u((lo, hi)) - } - #[inline(always)] - unsafe fn storeu(&self, ptr: *mut u8) { - unsafe { Simd256u::storeu(&self.0.0, ptr) }; - unsafe { Simd256u::storeu(&self.0.1, ptr.add(Simd256u::LANES)) }; - } - #[inline(always)] - fn eq(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.eq(&rhs.0.0); - let hi = self.0.1.eq(&rhs.0.1); - Mask512((lo, hi)) - } - #[inline(always)] - fn splat(ch: u8) -> Self { - Simd512u((Simd256u::splat(ch), Simd256u::splat(ch))) - } - #[inline(always)] - fn le(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.le(&rhs.0.0); - let hi = self.0.1.le(&rhs.0.1); - Mask512((lo, hi)) - } - #[inline(always)] - fn gt(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.gt(&rhs.0.0); - let hi = self.0.1.gt(&rhs.0.1); - Mask512((lo, hi)) - } - } - impl Simd for Simd512i { - const LANES: usize = 64; - type Element = i8; - type Mask = Mask512; - #[inline(always)] - unsafe fn loadu(ptr: *const u8) -> Self { - let lo = unsafe { Simd256i::loadu(ptr) }; - let hi = unsafe { Simd256i::loadu(ptr.add(Simd256i::LANES)) }; - Simd512i((lo, hi)) - } - #[inline(always)] - unsafe fn storeu(&self, ptr: *mut u8) { - unsafe { Simd256i::storeu(&self.0.0, ptr) }; - unsafe { Simd256i::storeu(&self.0.1, ptr.add(Simd256i::LANES)) }; - } - #[inline(always)] - fn eq(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.eq(&rhs.0.0); - let hi = self.0.1.eq(&rhs.0.1); - Mask512((lo, hi)) - } - #[inline(always)] - fn splat(elem: i8) -> Self { - Simd512i((Simd256i::splat(elem), Simd256i::splat(elem))) - } - #[inline(always)] - fn le(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.le(&rhs.0.0); - let hi = self.0.1.le(&rhs.0.1); - Mask512((lo, hi)) - } - #[inline(always)] - fn gt(&self, rhs: &Self) -> Self::Mask { - let lo = self.0.0.gt(&rhs.0.0); - let hi = self.0.1.gt(&rhs.0.1); - Mask512((lo, hi)) - } - } - } - pub use self::traits::{BitMask, Mask, Simd}; -} -#[inline(always)] -unsafe fn load(ptr: *const u8) -> V { - let chunk = unsafe { from_raw_parts(ptr, V::LANES) }; - unsafe { V::from_slice_unaligned_unchecked(chunk) } -} -const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ - (6, *b"\\u0000\0\0"), - (6, *b"\\u0001\0\0"), - (6, *b"\\u0002\0\0"), - (6, *b"\\u0003\0\0"), - (6, *b"\\u0004\0\0"), - (6, *b"\\u0005\0\0"), - (6, *b"\\u0006\0\0"), - (6, *b"\\u0007\0\0"), - (2, *b"\\b\0\0\0\0\0\0"), - (2, *b"\\t\0\0\0\0\0\0"), - (2, *b"\\n\0\0\0\0\0\0"), - (6, 
*b"\\u000b\0\0"), - (2, *b"\\f\0\0\0\0\0\0"), - (2, *b"\\r\0\0\0\0\0\0"), - (6, *b"\\u000e\0\0"), - (6, *b"\\u000f\0\0"), - (6, *b"\\u0010\0\0"), - (6, *b"\\u0011\0\0"), - (6, *b"\\u0012\0\0"), - (6, *b"\\u0013\0\0"), - (6, *b"\\u0014\0\0"), - (6, *b"\\u0015\0\0"), - (6, *b"\\u0016\0\0"), - (6, *b"\\u0017\0\0"), - (6, *b"\\u0018\0\0"), - (6, *b"\\u0019\0\0"), - (6, *b"\\u001a\0\0"), - (6, *b"\\u001b\0\0"), - (6, *b"\\u001c\0\0"), - (6, *b"\\u001d\0\0"), - (6, *b"\\u001e\0\0"), - (6, *b"\\u001f\0\0"), - (0, [0; 8]), - (0, [0; 8]), - (2, *b"\\\"\0\0\0\0\0\0"), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (2, *b"\\\\\0\0\0\0\0\0"), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, 
[0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), - (0, [0; 8]), -]; -const NEED_ESCAPED: [u8; 256] = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, -]; -#[inline(always)] -unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { - if true { - if !(*nb >= 1) { - ::core::panicking::panic("assertion failed: *nb >= 1") - } - } - loop { - let ch = unsafe { *(*src) }; - let cnt = QUOTE_TAB[ch as usize].0 as usize; - if true { - if !(cnt != 0) { - { - ::core::panicking::panic_fmt( - format_args!( - "char is {0}, cnt is {1}, NEED_ESCAPED is {2}", - ch as char, - cnt, - NEED_ESCAPED[ch as usize], - ), - ); - } - } - } - unsafe { - std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8) - }; - unsafe { (*dst) = (*dst).add(cnt) }; - unsafe { (*src) = (*src).add(1) }; - (*nb) -= 1; - if (*nb) == 0 || unsafe { NEED_ESCAPED[*(*src) as usize] == 0 } { - return; - } - } -} -#[inline(always)] -fn check_cross_page(ptr: *const u8, step: usize) -> bool { - let page_size = 4096; - ((ptr as usize & (page_size - 1)) + step) > page_size -} -const LANES: usize = 16; -#[inline] -fn escaped_mask_generic(v: simd::v128::Simd128u) -> u16 { - use simd::v128::Simd128u as u8x16; - let x1f = u8x16::splat(0x1f); - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() -} -#[inline] -fn escaped_mask_neon(v: simd::neon::Simd128u) -> simd::bits::NeonBits { - use simd::neon::Simd128u as u8x16; - let x1f = u8x16::splat(0x1f); - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() -} -#[inline(always)] -fn format_string(value: &str, dst: &mut [u8]) -> usize { - let mut v_neon: simd::neon::Simd128u; - let mut v_generic: simd::v128::Simd128u; - unsafe { - let slice = value.as_bytes(); - let mut sptr = slice.as_ptr(); - let mut dptr = dst.as_mut_ptr(); - let dstart = dptr; - let mut nb: usize = slice.len(); - *dptr = b'"'; - dptr = dptr.add(1); - while nb >= 
LANES { - { - if true || (true || ::std_detect::detect::__is_feature_detected::asimd()) - { - v_neon = load(sptr); - v_neon - .write_to_slice_unaligned_unchecked( - std::slice::from_raw_parts_mut(dptr, LANES), - ); - let mask = escaped_mask_neon(v_neon); - if mask.all_zero() { - nb -= LANES; - dptr = dptr.add(LANES); - sptr = sptr.add(LANES); - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - }; - } else { - v_generic = load(sptr); - v_generic - .write_to_slice_unaligned_unchecked( - std::slice::from_raw_parts_mut(dptr, LANES), - ); - let mask = escaped_mask_generic(v_generic); - if mask.all_zero() { - nb -= LANES; - dptr = dptr.add(LANES); - sptr = sptr.add(LANES); - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - }; - } - } - } - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit() - .assume_init(); - while nb > 0 { - { - if true || (true || ::std_detect::detect::__is_feature_detected::asimd()) - { - v_neon = { - { - if check_cross_page(sptr, LANES) { - std::ptr::copy_nonoverlapping( - sptr, - placeholder[..].as_mut_ptr(), - nb, - ); - load(placeholder[..].as_ptr()) - } else { - { - std::ptr::copy_nonoverlapping( - sptr, - placeholder[..].as_mut_ptr(), - nb, - ); - load(placeholder[..].as_ptr()) - } - } - } - }; - v_neon - .write_to_slice_unaligned_unchecked( - std::slice::from_raw_parts_mut(dptr, LANES), - ); - let mask = escaped_mask_neon(v_neon); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } else { - v_generic = { - { - if check_cross_page(sptr, LANES) { - std::ptr::copy_nonoverlapping( - sptr, - placeholder[..].as_mut_ptr(), - nb, - ); - load(placeholder[..].as_ptr()) - } else { - { - std::ptr::copy_nonoverlapping( - sptr, - placeholder[..].as_mut_ptr(), - nb, - ); - load(placeholder[..].as_ptr()) - } - } - } - }; - v_generic - .write_to_slice_unaligned_unchecked( - std::slice::from_raw_parts_mut(dptr, LANES), - ); - let mask = escaped_mask_generic(v_generic); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } - } - } - *dptr = b'"'; - dptr = dptr.add(1); - dptr as usize - dstart as usize - } -} -pub fn escape(value: &str) -> String { - let capacity = value.len() * 6 + 32 + 3; - let mut buf = Vec::with_capacity(capacity); - unsafe { buf.set_len(capacity) }; - let cnt = format_string(value, &mut buf); - unsafe { buf.set_len(cnt) }; - unsafe { String::from_utf8_unchecked(buf) } -} -pub fn escape_into>(value: S, dst: &mut Vec) -> usize { - let value = value.as_ref(); - let needed_capacity = value.len() * 6 + 32 + 3; - dst.reserve(needed_capacity); - let old_len = dst.len(); - unsafe { - let spare = std::slice::from_raw_parts_mut( - dst.as_mut_ptr().add(old_len), - dst.capacity() - old_len, - ); - let cnt = format_string(value, spare); - dst.set_len(old_len + cnt); - cnt - } -} From 26691509183faf5f57a6d3ec9f9d1b8599141fa7 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 10:55:12 +0800 Subject: [PATCH 07/14] remove unsafe --- src/lib.rs | 40 
+++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f18ff7d..d95a284 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -460,6 +460,9 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] let has_sse2 = is_x86_feature_detected!("sse2"); + #[cfg(target_arch = "aarch64")] + let has_neon = cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon"); + let mut v_generic: simd::v128::Simd128u; #[cfg(all( @@ -491,9 +494,9 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { *dptr = b'"'; dptr = dptr.add(1); while nb >= LANES { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[cfg(target_arch = "aarch64")] { - if cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon") { + if has_neon { v_neon = load(sptr); v_neon.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( dptr, LANES, @@ -543,15 +546,10 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } } - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[cfg(target_arch = "aarch64")] { - if cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon") { + if has_neon { const LANES: usize = simd::neon::Simd128u::LANES; - // Scratch buffer reused for mask materialisation; stay uninitialised. - #[cfg(not(miri))] - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); - #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { v_neon = load_v!(placeholder, sptr, nb); @@ -572,11 +570,6 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } } else { const LANES: usize = simd::v128::Simd128u::LANES; - // Scratch buffer reused for mask materialisation; stay uninitialised. - #[cfg(not(miri))] - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); - #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { v_generic = load_v!(placeholder, sptr, nb); @@ -601,11 +594,6 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { { if has_avx512 { const LANES: usize = simd::avx512::Simd512u::LANES; - // Scratch buffer reused for mask materialisation; stay uninitialised. - #[cfg(not(miri))] - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); - #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { v_avx512 = load_v!(placeholder, sptr, nb); @@ -627,10 +615,6 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } else if has_avx2 { const LANES: usize = simd::avx2::Simd256u::LANES; // Scratch buffer reused for mask materialisation; stay uninitialised. - #[cfg(not(miri))] - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); - #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { v_avx2 = load_v!(placeholder, sptr, nb); @@ -651,11 +635,6 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } } else if has_sse2 { const LANES: usize = simd::sse2::Simd128u::LANES; - // Scratch buffer reused for mask materialisation; stay uninitialised. 
- #[cfg(not(miri))] - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); - #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { v_sse2 = load_v!(placeholder, sptr, nb); @@ -676,11 +655,6 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } } else { const LANES: usize = simd::v128::Simd128u::LANES; - // Scratch buffer reused for mask materialisation; stay uninitialised. - #[cfg(not(miri))] - #[allow(invalid_value, clippy::uninit_assumed_init)] - let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); - #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { v_generic = load_v!(placeholder, sptr, nb); From b45105344f18c8390293f0d3324feae30eebf630 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 12:21:14 +0800 Subject: [PATCH 08/14] inline --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index d95a284..f3056bb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -339,7 +339,7 @@ fn check_cross_page(ptr: *const u8, step: usize) -> bool { ((ptr as usize & (page_size - 1)) + step) > page_size } -#[inline] +#[inline(always)] fn escaped_mask_generic(v: simd::v128::Simd128u) -> u16 { use simd::v128::Simd128u as u8x16; From 6e84cdd4553a500583f6a2dffc821df70c65cbe0 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 12:21:46 +0800 Subject: [PATCH 09/14] bench debuginfo --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index db691f7..4073608 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,8 @@ sonic-rs = "0.5" [profile.bench] lto = true codegen-units = 1 +debug = true +strip = false [profile.instruments] inherits = "release" From 62eb23d568af8936aa8d9aa66f88bc184e91171b Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 12:26:19 +0800 Subject: [PATCH 10/14] remove unsafe --- src/lib.rs | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f3056bb..40baa23 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -427,16 +427,7 @@ macro_rules! load_v { std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); load($placeholder[..].as_ptr()) } else { - #[cfg(not(debug_assertions))] - { - // disable memory sanitizer here - load($sptr) - } - #[cfg(debug_assertions)] - { - std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); - load($placeholder[..].as_ptr()) - } + load($sptr) } } }}; @@ -614,7 +605,6 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } } else if has_avx2 { const LANES: usize = simd::avx2::Simd256u::LANES; - // Scratch buffer reused for mask materialisation; stay uninitialised. let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { v_avx2 = load_v!(placeholder, sptr, nb); From 70b51da93d5010568ac641b72dd2beb1f35d988f Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 12:37:41 +0800 Subject: [PATCH 11/14] miri --- src/lib.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 40baa23..b1840b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -427,7 +427,15 @@ macro_rules! 
load_v { std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); load($placeholder[..].as_ptr()) } else { - load($sptr) + #[cfg(miri)] + { + std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); + load($placeholder[..].as_ptr()) + } + #[cfg(not(miri))] + { + load($sptr) + } } } }}; From 0afd5dc11a302e928375882d0ecdb0196a414dd3 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 12:59:54 +0800 Subject: [PATCH 12/14] miri --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b1840b5..7be2b40 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -427,12 +427,12 @@ macro_rules! load_v { std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); load($placeholder[..].as_ptr()) } else { - #[cfg(miri)] + #[cfg(any(debug_assertions, miri))] { std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); load($placeholder[..].as_ptr()) } - #[cfg(not(miri))] + #[cfg(not(any(debug_assertions, miri)))] { load($sptr) } From 2d86d69a1052f2a8362c0896f9ce1e9e1156b5d2 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 13:03:44 +0800 Subject: [PATCH 13/14] remove fn gt --- src/simd/avx2.rs | 5 ----- src/simd/avx512.rs | 5 ----- src/simd/neon.rs | 6 ------ src/simd/sse2.rs | 5 ----- src/simd/traits.rs | 8 ++------ src/simd/v128.rs | 8 -------- 6 files changed, 2 insertions(+), 35 deletions(-) diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs index 60cde12..a1f2040 100644 --- a/src/simd/avx2.rs +++ b/src/simd/avx2.rs @@ -92,9 +92,4 @@ impl Simd for Simd256u { Mask256(eq) } } - - #[inline(always)] - fn gt(&self, _rhs: &Self) -> Self::Mask { - todo!() - } } diff --git a/src/simd/avx512.rs b/src/simd/avx512.rs index e798044..eec1a1d 100644 --- a/src/simd/avx512.rs +++ b/src/simd/avx512.rs @@ -84,9 +84,4 @@ impl Simd for Simd512u { fn le(&self, rhs: &Self) -> Self::Mask { unsafe { Mask512(_mm512_cmple_epu8_mask(self.0, rhs.0)) } } - - #[inline(always)] - fn gt(&self, rhs: &Self) -> Self::Mask { - unsafe { Mask512(_mm512_cmpgt_epu8_mask(self.0, rhs.0)) } - } } diff --git a/src/simd/neon.rs b/src/simd/neon.rs index cc6bb99..15f7773 100644 --- a/src/simd/neon.rs +++ b/src/simd/neon.rs @@ -36,12 +36,6 @@ impl Simd for Simd128u { fn le(&self, lhs: &Self) -> Self::Mask { unsafe { Mask128(vcleq_u8(self.0, lhs.0)) } } - - // greater than - #[inline(always)] - fn gt(&self, lhs: &Self) -> Self::Mask { - unsafe { Mask128(vcgtq_u8(self.0, lhs.0)) } - } } #[derive(Debug)] diff --git a/src/simd/sse2.rs b/src/simd/sse2.rs index 63a95b8..28e6cb9 100644 --- a/src/simd/sse2.rs +++ b/src/simd/sse2.rs @@ -89,9 +89,4 @@ impl Simd for Simd128u { Mask128(eq) } } - - #[inline(always)] - fn gt(&self, _rhs: &Self) -> Self::Mask { - todo!() - } } diff --git a/src/simd/traits.rs b/src/simd/traits.rs index b3f77f6..3d0d45b 100644 --- a/src/simd/traits.rs +++ b/src/simd/traits.rs @@ -1,6 +1,6 @@ use std::ops::{BitAnd, BitOr, BitOrAssign}; -/// Portbal SIMD traits +/// Portable SIMD traits pub trait Simd: Sized { const LANES: usize; @@ -29,15 +29,11 @@ pub trait Simd: Sized { fn splat(elem: Self::Element) -> Self; - #[allow(unused)] - /// greater than - fn gt(&self, rhs: &Self) -> Self::Mask; - /// less or equal fn le(&self, rhs: &Self) -> Self::Mask; } -/// Portbal SIMD mask traits +/// Portable SIMD mask traits pub trait Mask: Sized + BitOr + BitOrAssign + BitAnd { type Element; type BitMask: BitMask; diff --git a/src/simd/v128.rs b/src/simd/v128.rs index 448dece..7137a39 100644 --- a/src/simd/v128.rs 
+++ b/src/simd/v128.rs @@ -44,14 +44,6 @@ impl Simd for Simd128u { } Mask128(mask) } - - fn gt(&self, rhs: &Self) -> Self::Mask { - let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] > rhs.0[i] { 1 } else { 0 }; - } - Mask128(mask) - } } impl Mask for Mask128 { From 6a3bf553d4d45757150f43e47fdfa281267627d5 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 13:04:52 +0800 Subject: [PATCH 14/14] splat --- src/simd/avx2.rs | 6 ------ src/simd/avx512.rs | 5 ----- src/simd/neon.rs | 6 ------ src/simd/sse2.rs | 6 ------ src/simd/traits.rs | 3 --- src/simd/v128.rs | 4 ---- 6 files changed, 30 deletions(-) diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs index a1f2040..cbad942 100644 --- a/src/simd/avx2.rs +++ b/src/simd/avx2.rs @@ -23,12 +23,6 @@ impl Mask for Mask256 { fn bitmask(self) -> Self::BitMask { unsafe { _mm256_movemask_epi8(self.0) as u32 } } - - #[inline(always)] - fn splat(b: bool) -> Self { - let v: i8 = if b { -1 } else { 0 }; - unsafe { Mask256(_mm256_set1_epi8(v)) } - } } impl BitAnd for Mask256 { diff --git a/src/simd/avx512.rs b/src/simd/avx512.rs index eec1a1d..98efdb6 100644 --- a/src/simd/avx512.rs +++ b/src/simd/avx512.rs @@ -23,11 +23,6 @@ impl Mask for Mask512 { fn bitmask(self) -> Self::BitMask { self.0 } - - #[inline(always)] - fn splat(b: bool) -> Self { - if b { Mask512(u64::MAX) } else { Mask512(0) } - } } impl BitOr for Mask512 { diff --git a/src/simd/neon.rs b/src/simd/neon.rs index 15f7773..d365062 100644 --- a/src/simd/neon.rs +++ b/src/simd/neon.rs @@ -57,12 +57,6 @@ impl Mask for Mask128 { NeonBits::new(vget_lane_u64(v64, 0)) } } - - #[inline(always)] - fn splat(b: bool) -> Self { - let v: i8 = if b { -1 } else { 0 }; - unsafe { Self(vdupq_n_u8(v as u8)) } - } } // Bitwise AND for Mask128 diff --git a/src/simd/sse2.rs b/src/simd/sse2.rs index 28e6cb9..543bc21 100644 --- a/src/simd/sse2.rs +++ b/src/simd/sse2.rs @@ -23,12 +23,6 @@ impl Mask for Mask128 { fn bitmask(self) -> Self::BitMask { unsafe { _mm_movemask_epi8(self.0) as u16 } } - - #[inline(always)] - fn splat(b: bool) -> Self { - let v: i8 = if b { -1 } else { 0 }; - unsafe { Mask128(_mm_set1_epi8(v)) } - } } impl BitAnd for Mask128 { diff --git a/src/simd/traits.rs b/src/simd/traits.rs index 3d0d45b..985e262 100644 --- a/src/simd/traits.rs +++ b/src/simd/traits.rs @@ -39,9 +39,6 @@ pub trait Mask: Sized + BitOr + BitOrAssign + BitAnd { type BitMask: BitMask; fn bitmask(self) -> Self::BitMask; - - #[allow(unused)] - fn splat(b: bool) -> Self; } /// Trait for the bitmask of a vector Mask. diff --git a/src/simd/v128.rs b/src/simd/v128.rs index 7137a39..8e03fa1 100644 --- a/src/simd/v128.rs +++ b/src/simd/v128.rs @@ -66,10 +66,6 @@ impl Mask for Mask128 { .fold(0, |acc, (i, &b)| acc | ((b as u16) << (15 - i))) } } - - fn splat(b: bool) -> Self { - Mask128([b as u8; 16]) - } } impl BitAnd for Mask128 {
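
For readers following the series, the essence of the change is: probe the CPU's SIMD features once at runtime, cache the answer, pick the widest available kernel (AVX-512 / AVX2 / SSE2 on x86, NEON on aarch64), and fall back to a portable scalar path otherwise. The sketch below is a minimal, self-contained illustration of that dispatch pattern only; the names (`has_avx2`, `escape_scalar`) and the 32-vs-16 byte chunk choice are invented for the example and are not this crate's actual API, and the scalar escaper is simplified (it emits `\u0008`/`\u000c` where the crate's QUOTE_TAB uses the shorter `\b`/`\f` forms).

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn has_avx2() -> bool {
    // Cached runtime probe: `is_x86_feature_detected!` already memoizes
    // internally, but hoisting the result keeps the branch out of the
    // per-chunk loop, which is what patch 06 ("cache feature detect")
    // does with its local `has_avx512` / `has_avx2` / `has_sse2` flags.
    use std::sync::OnceLock;
    static HAS_AVX2: OnceLock<bool> = OnceLock::new();
    *HAS_AVX2.get_or_init(|| std::arch::is_x86_feature_detected!("avx2"))
}

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
fn has_avx2() -> bool {
    false // a non-x86 target would probe NEON here instead
}

// Portable scalar reference; any vectorized path must produce identical
// output. Patch 02 adds the analogous sanity check to the benches, comparing
// `escape(first)` against serde_json / sonic-rs before timing anything.
fn escape_scalar(input: &str) -> String {
    let mut out = String::with_capacity(input.len() + 2);
    out.push('"');
    for ch in input.chars() {
        match ch {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)),
            c => out.push(c),
        }
    }
    out.push('"');
    out
}

fn main() {
    // The real code branches into the AVX-512 / AVX2 / SSE2 / NEON kernels at
    // this point and consumes `lanes` bytes per iteration; the sketch only
    // shows where the cached probe is consulted.
    let lanes = if has_avx2() { 32 } else { 16 };
    println!("chunk size: {lanes}");
    println!("{}", escape_scalar("say \"hi\"\n"));
}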