From 6becdc3064492acf852175166e087cd8d8e89a10 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 14:51:21 +0800 Subject: [PATCH 1/2] refactor: split impls into arch --- src/lib.rs | 411 +++------------------------------------------ src/simd/avx2.rs | 184 +++++++++++++++++++- src/simd/avx512.rs | 181 +++++++++++++++++++- src/simd/bits.rs | 10 -- src/simd/mod.rs | 8 +- src/simd/neon.rs | 184 +++++++++++++++++++- src/simd/sse2.rs | 184 +++++++++++++++++++- src/simd/traits.rs | 16 -- src/simd/util.rs | 31 ++++ src/simd/v128.rs | 93 +++++++++- 10 files changed, 868 insertions(+), 434 deletions(-) create mode 100644 src/simd/util.rs diff --git a/src/lib.rs b/src/lib.rs index 7be2b40..e473c39 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,19 +4,10 @@ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use std::arch::is_x86_feature_detected; -use std::slice::from_raw_parts; - -use simd::{BitMask, Mask, Simd}; mod simd; -#[inline(always)] -unsafe fn load(ptr: *const u8) -> V { - let chunk = unsafe { from_raw_parts(ptr, V::LANES) }; - unsafe { V::from_slice_unaligned_unchecked(chunk) } -} - -const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ +pub(crate) const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ // 0x00 ~ 0x1f (6, *b"\\u0000\0\0"), (6, *b"\\u0001\0\0"), @@ -281,7 +272,7 @@ const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ (0, [0; 8]), ]; -const NEED_ESCAPED: [u8; 256] = [ +pub(crate) const NEED_ESCAPED: [u8; 256] = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, @@ -292,390 +283,34 @@ const NEED_ESCAPED: [u8; 256] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; -#[cfg(all( - any(target_arch = "x86", target_arch = "x86_64"), - not(feature = "codspeed") -))] -static COMPUTE_LANES: std::sync::Once = std::sync::Once::new(); -#[cfg(all( - any(target_arch = "x86", target_arch = "x86_64"), - not(feature = "codspeed") -))] -static mut LANES: usize = simd::avx2::Simd256u::LANES; -#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "codspeed"))] -const LANES: usize = simd::avx2::Simd256u::LANES; - -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] -const LANES: usize = 16; - -// only check the src length. -#[inline(always)] -unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { - debug_assert!(*nb >= 1); - loop { - let ch = unsafe { *(*src) }; - let cnt = QUOTE_TAB[ch as usize].0 as usize; - debug_assert!( - cnt != 0, - "char is {}, cnt is {}, NEED_ESCAPED is {}", - ch as char, - cnt, - NEED_ESCAPED[ch as usize] - ); - unsafe { std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8) }; - unsafe { (*dst) = (*dst).add(cnt) }; - unsafe { (*src) = (*src).add(1) }; - (*nb) -= 1; - if (*nb) == 0 || unsafe { NEED_ESCAPED[*(*src) as usize] == 0 } { - return; - } - } -} - -#[cfg(any(target_os = "linux", target_os = "macos"))] -#[inline(always)] -fn check_cross_page(ptr: *const u8, step: usize) -> bool { - let page_size = 4096; - ((ptr as usize & (page_size - 1)) + step) > page_size -} - -#[inline(always)] -fn escaped_mask_generic(v: simd::v128::Simd128u) -> u16 { - use simd::v128::Simd128u as u8x16; - - let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() -} - -#[cfg(target_arch = "aarch64")] -#[inline(always)] -fn escaped_mask_neon(v: simd::neon::Simd128u) -> simd::bits::NeonBits { - use simd::neon::Simd128u as u8x16; - - let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -fn escaped_mask_sse2(v: simd::sse2::Simd128u) -> u16 { - use simd::sse2::Simd128u as u8x16; - - let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -fn escaped_mask_avx2(v: simd::avx2::Simd256u) -> u32 { - use simd::avx2::Simd256u as u8x32; - - let x1f = u8x32::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x32::splat(b'\\'); - let quote = u8x32::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -fn escaped_mask_avx512(v: simd::avx512::Simd512u) -> u64 { - use simd::avx512::Simd512u as u8x64; - - let x1f = u8x64::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x64::splat(b'\\'); - let quote = u8x64::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() -} - -macro_rules! escape { - ($mask:expr, $nb:expr, $dptr:expr, $sptr:expr) => { - if $mask.all_zero() { - $nb -= LANES; - $dptr = $dptr.add(LANES); - $sptr = $sptr.add(LANES); - } else { - let cn = $mask.first_offset(); - $nb -= cn; - $dptr = $dptr.add(cn); - $sptr = $sptr.add(cn); - escape_unchecked(&mut $sptr, &mut $nb, &mut $dptr); - } - }; -} - -macro_rules! load_v { - ($placeholder:expr, $sptr:expr, $nb:expr) => {{ - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); - load($placeholder[..].as_ptr()) - } - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - if check_cross_page($sptr, LANES) { - std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); - load($placeholder[..].as_ptr()) - } else { - #[cfg(any(debug_assertions, miri))] - { - std::ptr::copy_nonoverlapping($sptr, $placeholder[..].as_mut_ptr(), $nb); - load($placeholder[..].as_ptr()) - } - #[cfg(not(any(debug_assertions, miri)))] - { - load($sptr) - } - } - } - }}; -} - #[inline(always)] fn format_string(value: &str, dst: &mut [u8]) -> usize { #[cfg(target_arch = "aarch64")] - let mut v_neon: simd::neon::Simd128u; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let mut v_sse2: simd::sse2::Simd128u; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let mut v_avx2: simd::avx2::Simd256u; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let mut v_avx512: simd::avx512::Simd512u; - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let has_avx512 = is_x86_feature_detected!("avx512f"); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let has_avx2 = is_x86_feature_detected!("avx2"); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - let has_sse2 = is_x86_feature_detected!("sse2"); - - #[cfg(target_arch = "aarch64")] - let has_neon = cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon"); - - let mut v_generic: simd::v128::Simd128u; - - #[cfg(all( - any(target_arch = "x86", target_arch = "x86_64"), - not(feature = "codspeed") - ))] - COMPUTE_LANES.call_once(|| { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - if is_x86_feature_detected!("avx512f") { - unsafe { - LANES = simd::avx512::Simd512u::LANES; - } - } else if !is_x86_feature_detected!("avx2") { - unsafe { - LANES = simd::sse2::Simd128u::LANES; - } - } + { + let has_neon = cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon"); + if has_neon { + unsafe { simd::neon::format_string(value, dst) } + } else { + simd::v128::format_string(value, dst) } - }); + } - unsafe { - let slice = value.as_bytes(); - let mut sptr = slice.as_ptr(); - let mut dptr = dst.as_mut_ptr(); - let dstart = dptr; - let mut nb: usize = slice.len(); - - *dptr = b'"'; - dptr = dptr.add(1); - while nb >= LANES { - #[cfg(target_arch = "aarch64")] - { - if has_neon { - v_neon = load(sptr); - v_neon.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_neon(v_neon); - escape!(mask, nb, dptr, sptr); - } else { - v_generic = load(sptr); - v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_generic(v_generic); - escape!(mask, nb, dptr, sptr); - } - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - { - if has_avx512 { - v_avx512 = load(sptr); - v_avx512.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_avx512(v_avx512); - escape!(mask, nb, dptr, sptr); - } else if has_avx2 { - v_avx2 = load(sptr); - v_avx2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_avx2(v_avx2); - escape!(mask, nb, dptr, sptr); - } else if has_sse2 { - v_sse2 = load(sptr); - v_sse2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_sse2(v_sse2); - escape!(mask, nb, dptr, sptr); - } else { - v_generic = load(sptr); - v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_generic(v_generic); - escape!(mask, nb, dptr, sptr); - } - } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx512f") { + unsafe { simd::avx512::format_string(value, dst) } + } else if is_x86_feature_detected!("avx2") { + unsafe { simd::avx2::format_string(value, dst) } + } else if is_x86_feature_detected!("sse2") { + unsafe { simd::sse2::format_string(value, dst) } + } else { + simd::v128::format_string(value, dst) } + } - #[cfg(target_arch = "aarch64")] - { - if has_neon { - const LANES: usize = simd::neon::Simd128u::LANES; - let mut placeholder: [u8; LANES] = [0; LANES]; - while nb > 0 { - v_neon = load_v!(placeholder, sptr, nb); - v_neon.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_neon(v_neon).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } - } else { - const LANES: usize = simd::v128::Simd128u::LANES; - let mut placeholder: [u8; LANES] = [0; LANES]; - while nb > 0 { - v_generic = load_v!(placeholder, sptr, nb); - v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_generic(v_generic).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } - } - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - { - if has_avx512 { - const LANES: usize = simd::avx512::Simd512u::LANES; - let mut placeholder: [u8; LANES] = [0; LANES]; - while nb > 0 { - v_avx512 = load_v!(placeholder, sptr, nb); - v_avx512.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_avx512(v_avx512).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } - } else if has_avx2 { - const LANES: usize = simd::avx2::Simd256u::LANES; - let mut placeholder: [u8; LANES] = [0; LANES]; - while nb > 0 { - v_avx2 = load_v!(placeholder, sptr, nb); - v_avx2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_avx2(v_avx2).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } - } else if has_sse2 { - const LANES: usize = simd::sse2::Simd128u::LANES; - let mut placeholder: [u8; LANES] = [0; LANES]; - while nb > 0 { - v_sse2 = load_v!(placeholder, sptr, nb); - v_sse2.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_sse2(v_sse2).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } - } else { - const LANES: usize = simd::v128::Simd128u::LANES; - let mut placeholder: [u8; LANES] = [0; LANES]; - while nb > 0 { - v_generic = load_v!(placeholder, sptr, nb); - v_generic.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut( - dptr, LANES, - )); - let mask = escaped_mask_generic(v_generic).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } - } - } - } - *dptr = b'"'; - dptr = dptr.add(1); - dptr as usize - dstart as usize + #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))] + { + simd::v128::format_string(value, dst) } } diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs index cbad942..94a4546 100644 --- a/src/simd/avx2.rs +++ b/src/simd/avx2.rs @@ -5,13 +5,19 @@ use std::arch::x86_64::*; use std::ops::{BitAnd, BitOr, BitOrAssign}; -use super::{Mask, Simd}; +use super::{Mask, Simd, traits::BitMask, util::escape_unchecked}; -#[derive(Debug)] +#[cfg(any(target_os = "linux", target_os = "macos"))] +use super::util::check_cross_page; + +const LANES: usize = 32; +const CHUNK: usize = LANES * 4; + +#[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct Simd256u(__m256i); -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct Mask256(__m256i); @@ -51,7 +57,7 @@ impl BitOrAssign for Mask256 { } impl Simd for Simd256u { - const LANES: usize = 32; + const LANES: usize = LANES; type Mask = Mask256; type Element = u8; @@ -87,3 +93,173 @@ impl Simd for Simd256u { } } } + +#[inline(always)] +fn escaped_mask(v: Simd256u) -> u32 { + let x1f = Simd256u::splat(0x1f); // 0x00 ~ 0x20 + let blash = Simd256u::splat(b'\\'); + let quote = Simd256u::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[target_feature(enable = "avx2")] +pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize { + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + + *dptr = b'"'; + dptr = dptr.add(1); + + // Process CHUNK (4 * LANES = 128 bytes) at a time + while nb >= CHUNK { + // Load 4 SIMD vectors + let v1 = Simd256u::loadu(sptr); + let v2 = Simd256u::loadu(sptr.add(LANES)); + let v3 = Simd256u::loadu(sptr.add(LANES * 2)); + let v4 = Simd256u::loadu(sptr.add(LANES * 3)); + + // Check all 4 masks + let mask1 = escaped_mask(v1); + let mask2 = escaped_mask(v2); + let mask3 = escaped_mask(v3); + let mask4 = escaped_mask(v4); + + // Fast path: if all vectors are clean, write the entire chunk + if mask1.all_zero() && mask2.all_zero() && mask3.all_zero() && mask4.all_zero() { + v1.storeu(dptr); + v2.storeu(dptr.add(LANES)); + v3.storeu(dptr.add(LANES * 2)); + v4.storeu(dptr.add(LANES * 3)); + nb -= CHUNK; + dptr = dptr.add(CHUNK); + sptr = sptr.add(CHUNK); + } else { + // Slow path: handle escape character + // Process v1 + v1.storeu(dptr); + if !mask1.all_zero() { + let cn = mask1.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v2 + v2.storeu(dptr); + if !mask2.all_zero() { + let cn = mask2.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v3 + v3.storeu(dptr); + if !mask3.all_zero() { + let cn = mask3.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v4 + v4.storeu(dptr); + if !mask4.all_zero() { + let cn = mask4.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } + } + + // Process remaining LANES bytes at a time + while nb >= LANES { + let v = Simd256u::loadu(sptr); + v.storeu(dptr); + let mask = escaped_mask(v); + + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + // Handle remaining bytes + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + let v = { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd256u::loadu(placeholder.as_ptr()) + }; + #[cfg(any(target_os = "linux", target_os = "macos"))] + let v = { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd256u::loadu(placeholder.as_ptr()) + } else { + #[cfg(any(debug_assertions, miri))] + { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd256u::loadu(placeholder.as_ptr()) + } + #[cfg(not(any(debug_assertions, miri)))] + { + Simd256u::loadu(sptr) + } + } + }; + + v.storeu(dptr); + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize + } +} diff --git a/src/simd/avx512.rs b/src/simd/avx512.rs index 98efdb6..c6ad65c 100644 --- a/src/simd/avx512.rs +++ b/src/simd/avx512.rs @@ -5,9 +5,14 @@ use std::arch::x86_64::*; use std::ops::{BitAnd, BitOr, BitOrAssign}; -use super::{Mask, Simd}; +#[cfg(any(target_os = "linux", target_os = "macos"))] +use super::util::check_cross_page; +use super::{Mask, Simd, traits::BitMask, util::escape_unchecked}; -#[derive(Debug)] +const LANES: usize = 64; +const CHUNK: usize = LANES * 4; + +#[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct Simd512u(__m512i); @@ -51,7 +56,7 @@ impl BitAnd for Mask512 { } impl Simd for Simd512u { - const LANES: usize = 64; + const LANES: usize = LANES; type Element = u8; type Mask = Mask512; @@ -80,3 +85,173 @@ impl Simd for Simd512u { unsafe { Mask512(_mm512_cmple_epu8_mask(self.0, rhs.0)) } } } + +#[inline(always)] +fn escaped_mask(v: Simd512u) -> u64 { + let x1f = Simd512u::splat(0x1f); // 0x00 ~ 0x20 + let blash = Simd512u::splat(b'\\'); + let quote = Simd512u::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize { + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + + *dptr = b'"'; + dptr = dptr.add(1); + + // Process CHUNK (4 * LANES = 256 bytes) at a time + while nb >= CHUNK { + // Load 4 SIMD vectors + let v1 = Simd512u::loadu(sptr); + let v2 = Simd512u::loadu(sptr.add(LANES)); + let v3 = Simd512u::loadu(sptr.add(LANES * 2)); + let v4 = Simd512u::loadu(sptr.add(LANES * 3)); + + // Check all 4 masks + let mask1 = escaped_mask(v1); + let mask2 = escaped_mask(v2); + let mask3 = escaped_mask(v3); + let mask4 = escaped_mask(v4); + + // Fast path: if all vectors are clean, write the entire chunk + if mask1.all_zero() && mask2.all_zero() && mask3.all_zero() && mask4.all_zero() { + v1.storeu(dptr); + v2.storeu(dptr.add(LANES)); + v3.storeu(dptr.add(LANES * 2)); + v4.storeu(dptr.add(LANES * 3)); + nb -= CHUNK; + dptr = dptr.add(CHUNK); + sptr = sptr.add(CHUNK); + } else { + // Slow path: handle escape character + // Process v1 + v1.storeu(dptr); + if !mask1.all_zero() { + let cn = mask1.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v2 + v2.storeu(dptr); + if !mask2.all_zero() { + let cn = mask2.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v3 + v3.storeu(dptr); + if !mask3.all_zero() { + let cn = mask3.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v4 + v4.storeu(dptr); + if !mask4.all_zero() { + let cn = mask4.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } + } + + // Process remaining LANES bytes at a time + while nb >= LANES { + let v = Simd512u::loadu(sptr); + v.storeu(dptr); + let mask = escaped_mask(v); + + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + // Handle remaining bytes + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + let v = { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd512u::loadu(placeholder.as_ptr()) + }; + #[cfg(any(target_os = "linux", target_os = "macos"))] + let v = { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd512u::loadu(placeholder.as_ptr()) + } else { + #[cfg(any(debug_assertions, miri))] + { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd512u::loadu(placeholder.as_ptr()) + } + #[cfg(not(any(debug_assertions, miri)))] + { + Simd512u::loadu(sptr) + } + } + }; + + v.storeu(dptr); + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize + } +} diff --git a/src/simd/bits.rs b/src/simd/bits.rs index 3bdb694..ac031c7 100644 --- a/src/simd/bits.rs +++ b/src/simd/bits.rs @@ -7,11 +7,6 @@ macro_rules! impl_bits { impl BitMask for $ty { const LEN: usize = std::mem::size_of::<$ty>() * 8; - #[inline] - fn before(&self, rhs: &Self) -> bool { - (self.as_little_endian() & rhs.as_little_endian().wrapping_sub(1)) != 0 - } - #[inline] fn first_offset(&self) -> usize { self.as_little_endian().trailing_zeros() as usize @@ -75,11 +70,6 @@ impl BitMask for NeonBits { (self.as_little_endian().0.trailing_zeros() as usize) >> 2 } - #[inline] - fn before(&self, rhs: &Self) -> bool { - (self.as_little_endian().0 & rhs.as_little_endian().0.wrapping_sub(1)) != 0 - } - #[inline] fn as_little_endian(&self) -> Self { #[cfg(target_endian = "little")] diff --git a/src/simd/mod.rs b/src/simd/mod.rs index a4c80ff..419f7e2 100644 --- a/src/simd/mod.rs +++ b/src/simd/mod.rs @@ -1,16 +1,16 @@ #![allow(non_camel_case_types)] -pub mod bits; -mod traits; - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub(crate) mod avx2; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub(crate) mod avx512; +pub mod bits; #[cfg(target_arch = "aarch64")] pub(crate) mod neon; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub(crate) mod sse2; +mod traits; +mod util; pub(crate) mod v128; -pub use self::traits::{BitMask, Mask, Simd}; +pub use self::traits::{Mask, Simd}; diff --git a/src/simd/neon.rs b/src/simd/neon.rs index d365062..c3e1f85 100644 --- a/src/simd/neon.rs +++ b/src/simd/neon.rs @@ -1,13 +1,19 @@ use std::arch::aarch64::*; -use super::{Mask, Simd, bits::NeonBits}; +use super::{Mask, Simd, bits::NeonBits, traits::BitMask, util::escape_unchecked}; -#[derive(Debug)] +#[cfg(any(target_os = "linux", target_os = "macos"))] +use super::util::check_cross_page; + +const LANES: usize = 16; +const CHUNK: usize = LANES * 4; + +#[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct Simd128u(uint8x16_t); impl Simd for Simd128u { - const LANES: usize = 16; + const LANES: usize = LANES; type Mask = Mask128; type Element = u8; @@ -38,7 +44,7 @@ impl Simd for Simd128u { } } -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct Mask128(pub(crate) uint8x16_t); @@ -88,3 +94,173 @@ impl std::ops::BitOrAssign for Mask128 { } } } + +#[inline(always)] +fn escaped_mask(v: Simd128u) -> NeonBits { + let x1f = Simd128u::splat(0x1f); // 0x00 ~ 0x20 + let blash = Simd128u::splat(b'\\'); + let quote = Simd128u::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[target_feature(enable = "neon")] +pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize { + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + + *dptr = b'"'; + dptr = dptr.add(1); + + // Process CHUNK (4 * LANES = 64 bytes) at a time + while nb >= CHUNK { + // Load 4 SIMD vectors + let v1 = Simd128u::loadu(sptr); + let v2 = Simd128u::loadu(sptr.add(LANES)); + let v3 = Simd128u::loadu(sptr.add(LANES * 2)); + let v4 = Simd128u::loadu(sptr.add(LANES * 3)); + + // Check all 4 masks + let mask1 = escaped_mask(v1); + let mask2 = escaped_mask(v2); + let mask3 = escaped_mask(v3); + let mask4 = escaped_mask(v4); + + // Fast path: if all vectors are clean, write the entire chunk + if mask1.all_zero() && mask2.all_zero() && mask3.all_zero() && mask4.all_zero() { + v1.storeu(dptr); + v2.storeu(dptr.add(LANES)); + v3.storeu(dptr.add(LANES * 2)); + v4.storeu(dptr.add(LANES * 3)); + nb -= CHUNK; + dptr = dptr.add(CHUNK); + sptr = sptr.add(CHUNK); + } else { + // Slow path: handle escape character + // Process v1 + v1.storeu(dptr); + if !mask1.all_zero() { + let cn = mask1.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v2 + v2.storeu(dptr); + if !mask2.all_zero() { + let cn = mask2.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v3 + v3.storeu(dptr); + if !mask3.all_zero() { + let cn = mask3.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v4 + v4.storeu(dptr); + if !mask4.all_zero() { + let cn = mask4.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } + } + + // Process remaining LANES bytes at a time + while nb >= LANES { + let v = Simd128u::loadu(sptr); + v.storeu(dptr); + let mask = escaped_mask(v); + + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + // Handle remaining bytes + let mut placeholder: [u8; 16] = [0; 16]; + while nb > 0 { + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + let v = { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + }; + #[cfg(any(target_os = "linux", target_os = "macos"))] + let v = { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + } else { + #[cfg(any(debug_assertions, miri))] + { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + } + #[cfg(not(any(debug_assertions, miri)))] + { + Simd128u::loadu(sptr) + } + } + }; + + v.storeu(dptr); + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize + } +} diff --git a/src/simd/sse2.rs b/src/simd/sse2.rs index 543bc21..c433f21 100644 --- a/src/simd/sse2.rs +++ b/src/simd/sse2.rs @@ -5,13 +5,19 @@ use std::arch::x86_64::*; use std::ops::{BitAnd, BitOr, BitOrAssign}; -use super::{Mask, Simd}; +use super::{Mask, Simd, traits::BitMask, util::escape_unchecked}; -#[derive(Debug)] +#[cfg(any(target_os = "linux", target_os = "macos"))] +use super::util::check_cross_page; + +const LANES: usize = 16; +const CHUNK: usize = LANES * 4; + +#[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct Simd128u(__m128i); -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] #[repr(transparent)] pub struct Mask128(__m128i); @@ -51,7 +57,7 @@ impl BitOrAssign for Mask128 { } impl Simd for Simd128u { - const LANES: usize = 16; + const LANES: usize = LANES; type Mask = Mask128; type Element = u8; @@ -84,3 +90,173 @@ impl Simd for Simd128u { } } } + +#[inline(always)] +fn escaped_mask(v: Simd128u) -> u16 { + let x1f = Simd128u::splat(0x1f); // 0x00 ~ 0x20 + let blash = Simd128u::splat(b'\\'); + let quote = Simd128u::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +#[target_feature(enable = "sse2")] +pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize { + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + + *dptr = b'"'; + dptr = dptr.add(1); + + // Process CHUNK (4 * LANES = 64 bytes) at a time + while nb >= CHUNK { + // Load 4 SIMD vectors + let v1 = Simd128u::loadu(sptr); + let v2 = Simd128u::loadu(sptr.add(LANES)); + let v3 = Simd128u::loadu(sptr.add(LANES * 2)); + let v4 = Simd128u::loadu(sptr.add(LANES * 3)); + + // Check all 4 masks + let mask1 = escaped_mask(v1); + let mask2 = escaped_mask(v2); + let mask3 = escaped_mask(v3); + let mask4 = escaped_mask(v4); + + // Fast path: if all vectors are clean, write the entire chunk + if mask1.all_zero() && mask2.all_zero() && mask3.all_zero() && mask4.all_zero() { + v1.storeu(dptr); + v2.storeu(dptr.add(LANES)); + v3.storeu(dptr.add(LANES * 2)); + v4.storeu(dptr.add(LANES * 3)); + nb -= CHUNK; + dptr = dptr.add(CHUNK); + sptr = sptr.add(CHUNK); + } else { + // Slow path: handle escape character + // Process v1 + v1.storeu(dptr); + if !mask1.all_zero() { + let cn = mask1.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v2 + v2.storeu(dptr); + if !mask2.all_zero() { + let cn = mask2.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v3 + v3.storeu(dptr); + if !mask3.all_zero() { + let cn = mask3.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + + // Process v4 + v4.storeu(dptr); + if !mask4.all_zero() { + let cn = mask4.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + continue; + } + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } + } + + // Process remaining LANES bytes at a time + while nb >= LANES { + let v = Simd128u::loadu(sptr); + v.storeu(dptr); + let mask = escaped_mask(v); + + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + // Handle remaining bytes + let mut placeholder: [u8; 16] = [0; 16]; + while nb > 0 { + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + let v = { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + }; + #[cfg(any(target_os = "linux", target_os = "macos"))] + let v = { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + } else { + #[cfg(any(debug_assertions, miri))] + { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + } + #[cfg(not(any(debug_assertions, miri)))] + { + Simd128u::loadu(sptr) + } + } + }; + + v.storeu(dptr); + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.trailing_zeros() as usize; + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize + } +} diff --git a/src/simd/traits.rs b/src/simd/traits.rs index 985e262..e92d95e 100644 --- a/src/simd/traits.rs +++ b/src/simd/traits.rs @@ -7,18 +7,6 @@ pub trait Simd: Sized { type Element; type Mask: Mask; - /// # Safety - unsafe fn from_slice_unaligned_unchecked(slice: &[u8]) -> Self { - debug_assert!(slice.len() >= Self::LANES); - unsafe { Self::loadu(slice.as_ptr()) } - } - - /// # Safety - unsafe fn write_to_slice_unaligned_unchecked(&self, slice: &mut [u8]) { - debug_assert!(slice.len() >= Self::LANES); - unsafe { self.storeu(slice.as_mut_ptr()) } - } - /// # Safety unsafe fn loadu(ptr: *const u8) -> Self; @@ -49,10 +37,6 @@ pub trait BitMask { /// get the offset of the first `1` bit. fn first_offset(&self) -> usize; - #[allow(unused)] - /// check if this bitmask is before the other bitmask. - fn before(&self, rhs: &Self) -> bool; - /// convert bitmask as little endian fn as_little_endian(&self) -> Self; diff --git a/src/simd/util.rs b/src/simd/util.rs new file mode 100644 index 0000000..e8bb901 --- /dev/null +++ b/src/simd/util.rs @@ -0,0 +1,31 @@ +use crate::{NEED_ESCAPED, QUOTE_TAB}; + +#[inline(always)] +pub(crate) unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { + debug_assert!(*nb >= 1); + loop { + let ch = unsafe { *(*src) }; + let cnt = QUOTE_TAB[ch as usize].0 as usize; + debug_assert!( + cnt != 0, + "char is {}, cnt is {}, NEED_ESCAPED is {}", + ch as char, + cnt, + NEED_ESCAPED[ch as usize] + ); + unsafe { std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8) }; + unsafe { (*dst) = (*dst).add(cnt) }; + unsafe { (*src) = (*src).add(1) }; + (*nb) -= 1; + if (*nb) == 0 || unsafe { NEED_ESCAPED[*(*src) as usize] == 0 } { + return; + } + } +} + +#[cfg(any(target_os = "linux", target_os = "macos"))] +#[inline(always)] +pub(crate) fn check_cross_page(ptr: *const u8, step: usize) -> bool { + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size +} diff --git a/src/simd/v128.rs b/src/simd/v128.rs index 8e03fa1..f23009c 100644 --- a/src/simd/v128.rs +++ b/src/simd/v128.rs @@ -1,6 +1,9 @@ use std::ops::{BitAnd, BitOr, BitOrAssign}; -use super::{Mask, Simd}; +use super::{Mask, Simd, util::escape_unchecked}; + +#[cfg(any(target_os = "linux", target_os = "macos"))] +use super::util::check_cross_page; #[derive(Debug)] pub struct Simd128u([u8; 16]); @@ -99,3 +102,91 @@ impl BitOrAssign for Mask128 { } } } + +#[inline(always)] +fn escaped_mask(v: Simd128u) -> u16 { + let x1f = Simd128u::splat(0x1f); // 0x00 ~ 0x20 + let blash = Simd128u::splat(b'\\'); + let quote = Simd128u::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() +} + +pub fn format_string(value: &str, dst: &mut [u8]) -> usize { + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + + *dptr = b'"'; + dptr = dptr.add(1); + + // Main loop: process LANES bytes at a time + while nb >= Simd128u::LANES { + let v = Simd128u::loadu(sptr); + v.storeu(dptr); + let mask = escaped_mask(v); + + if mask == 0 { + nb -= Simd128u::LANES; + dptr = dptr.add(Simd128u::LANES); + sptr = sptr.add(Simd128u::LANES); + } else { + let cn = mask.trailing_zeros() as usize; + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + // Handle remaining bytes + let mut placeholder: [u8; 16] = [0; 16]; + while nb > 0 { + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + let v = { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + }; + #[cfg(any(target_os = "linux", target_os = "macos"))] + let v = { + if check_cross_page(sptr, Simd128u::LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + } else { + #[cfg(any(debug_assertions, miri))] + { + std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb); + Simd128u::loadu(placeholder.as_ptr()) + } + #[cfg(not(any(debug_assertions, miri)))] + { + Simd128u::loadu(sptr) + } + } + }; + + v.storeu(dptr); + let mut mask = escaped_mask(v); + // Clear high bits for partial vector + mask &= (1u16 << nb) - 1; + + if mask == 0 { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.trailing_zeros() as usize; + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize + } +} From 911a84e9f9b1f9b5550411d2c7fdd0889676f165 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Mon, 13 Oct 2025 15:10:27 +0800 Subject: [PATCH 2/2] clippy --- benches/escape.rs | 2 ++ src/lib.rs | 10 ++++------ src/simd/v128.rs | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benches/escape.rs b/benches/escape.rs index a60916a..79bc213 100644 --- a/benches/escape.rs +++ b/benches/escape.rs @@ -2,9 +2,11 @@ use std::{fs, hint::black_box}; use criterion::{Criterion, criterion_group, criterion_main}; +#[cfg(not(feature = "codspeed"))] use generic::escape_generic; use json_escape_simd::escape; +#[cfg(not(feature = "codspeed"))] mod generic; fn get_rxjs_sources() -> Vec { diff --git a/src/lib.rs b/src/lib.rs index e473c39..a3e2e8f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -563,9 +563,9 @@ mod tests { .iter() .take(if cfg!(miri) { 10 } else { sources.len() }) { - assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); + assert_eq!(escape(source), serde_json::to_string(&source).unwrap()); let mut output = String::new(); - escape_into(&source, unsafe { output.as_mut_vec() }); + escape_into(source, unsafe { output.as_mut_vec() }); assert_eq!(output, serde_json::to_string(&source).unwrap()); } } @@ -603,10 +603,8 @@ mod tests { for entry in dir { let p = entry?; let metadata = std::fs::metadata(p.path())?; - if metadata.is_file() { - if f(p.path()) { - sources.push(std::fs::read_to_string(p.path())?); - } + if metadata.is_file() && f(p.path()) { + sources.push(std::fs::read_to_string(p.path())?); } if metadata.is_dir() { read_dir_recursive(p.path(), sources, f)?; diff --git a/src/simd/v128.rs b/src/simd/v128.rs index f23009c..f75157f 100644 --- a/src/simd/v128.rs +++ b/src/simd/v128.rs @@ -42,8 +42,8 @@ impl Simd for Simd128u { fn le(&self, rhs: &Self) -> Self::Mask { let mut mask = [0u8; 16]; - for i in 0..Self::LANES { - mask[i] = if self.0[i] <= rhs.0[i] { 1 } else { 0 }; + for (i, item) in mask.iter_mut().enumerate().take(Self::LANES) { + *item = if self.0[i] <= rhs.0[i] { 1 } else { 0 }; } Mask128(mask) } @@ -76,8 +76,8 @@ impl BitAnd for Mask128 { fn bitand(self, rhs: Self) -> Self::Output { let mut result = [0u8; 16]; - for i in 0..16 { - result[i] = self.0[i] & rhs.0[i]; + for (i, item) in result.iter_mut().enumerate() { + *item = self.0[i] & rhs.0[i]; } Mask128(result) } @@ -88,8 +88,8 @@ impl BitOr for Mask128 { fn bitor(self, rhs: Self) -> Self::Output { let mut result = [0u8; 16]; - for i in 0..16 { - result[i] = self.0[i] | rhs.0[i]; + for (i, item) in result.iter_mut().enumerate() { + *item = self.0[i] | rhs.0[i]; } Mask128(result) }