diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3c0667f..8dfcac6 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -45,6 +45,30 @@ jobs: - name: Run tests run: cargo test + miri: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + with: + targets: x86_64-unknown-linux-gnu + components: miri + toolchain: nightly + - uses: actions/setup-node@v5 + with: + node-version: 22 + cache: 'yarn' + - name: Install dependencies + run: yarn install + - name: Download fixtures + run: node download-fixtures.js + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Run miri + run: cargo miri test + env: + MIRIFLAGS: "-Zmiri-disable-isolation" bench: strategy: matrix: @@ -53,6 +77,8 @@ jobs: os: ubuntu-latest - target: x86_64-pc-windows-msvc os: windows-latest + - target: aarch64-pc-windows-msvc + os: windows-11-arm - target: aarch64-unknown-linux-gnu os: ubuntu-24.04-arm - target: aarch64-apple-darwin diff --git a/Cargo.lock b/Cargo.lock index 1602506..9684540 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,25 +3,23 @@ version = 4 [[package]] -name = "aho-corasick" -version = "1.1.3" +name = "ahash" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ - "memchr", + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", ] [[package]] name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - -[[package]] -name = "anstyle" -version = "1.0.11" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "dc43e46599f3d77fcf2f2ca89e4d962910b0c19c44e7b58679cbbdfd1820a662" [[package]] name = "anyhow" @@ -59,12 +57,24 @@ version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +[[package]] +name = "bpaf" +version = "0.9.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "473976d7a8620bb1e06dcdd184407c2363fe4fec8e983ee03ed9197222634a31" + [[package]] name = "bumpalo" version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "cast" version = "0.3.0" @@ -110,31 +120,6 @@ dependencies = [ "half", ] -[[package]] -name = "clap" -version = "4.5.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" -dependencies = [ - "clap_builder", -] - -[[package]] -name = "clap_builder" -version = "4.5.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" -dependencies = [ - "anstyle", - "clap_lex", -] - -[[package]] -name = "clap_lex" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" - [[package]] name = "codspeed" version = "3.0.5" @@ -143,7 +128,7 @@ checksum = "35584c5fcba8059780748866387fb97c5a203bcfc563fc3d0790af406727a117" dependencies = [ "anyhow", "bincode", - "colored", + "colored 2.2.0", "glob", "libc", "nix", @@ -154,50 +139,21 @@ dependencies = [ ] [[package]] -name = "codspeed-criterion-compat" -version = "3.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78f6c1c6bed5fd84d319e8b0889da051daa361c79b7709c9394dfe1a882bba67" -dependencies = [ - "codspeed", - "codspeed-criterion-compat-walltime", - "colored", -] - -[[package]] -name = "codspeed-criterion-compat-walltime" -version = "3.0.5" +name = "colored" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c989289ce6b1cbde72ed560496cb8fbf5aa14d5ef5666f168e7f87751038352e" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "codspeed", - "criterion-plot", - "is-terminal", - "itertools", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", + "lazy_static", + "windows-sys 0.59.0", ] [[package]] name = "colored" -version = "2.2.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +checksum = "fde0e0ec90c9dfb3b4b1a0891a7dcd0e2bffde2f7efed5fe7c9bb00e5bfb915e" dependencies = [ - "lazy_static", "windows-sys 0.59.0", ] @@ -206,13 +162,23 @@ name = "cpu-features" version = "0.1.0" [[package]] -name = "criterion-plot" -version = "0.5.0" +name = "criterion2" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +checksum = "77cd1059d67baa066c334993d8d6e757ad257d21030db6a9a945dddbb559d4fe" dependencies = [ + "anes", + "bpaf", "cast", - "itertools", + "ciborium", + "codspeed", + "colored 3.0.0", + "num-traits", + "oorandom", + "rayon", + "serde", + "serde_json", + "walkdir", ] [[package]] @@ -252,6 +218,24 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "faststr" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baec6a0289d7f1fe5665586ef7340af82e3037207bef60f5785e57569776f0c8" +dependencies = [ + "bytes", + "rkyv", + "serde", + "simdutf8", +] + [[package]] name = "getrandom" version = "0.3.3" @@ -281,29 +265,25 @@ dependencies = [ ] [[package]] -name = "hermit-abi" -version = "0.5.2" +name = "hashbrown" +version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" [[package]] -name = "is-terminal" -version = "0.4.16" +name = "hashbrown" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.59.0", -] +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" [[package]] -name = "itertools" -version = "0.10.5" +name = "indexmap" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ - "either", + "equivalent", + "hashbrown 0.16.0", ] [[package]] @@ -324,9 +304,9 @@ dependencies = [ [[package]] name = "json-escape" -version = "0.1.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3100a96840f2d70dd1c336ec5f76f160edfd56d8e9de2a949d440c0805459cce" +checksum = "b330a4975b953eb01659ab24f612fa9461a71e4279c25e02689d0c5db03c20e5" dependencies = [ "memchr", ] @@ -336,10 +316,15 @@ name = "json-escape-simd" version = "1.1.0" dependencies = [ "anyhow", - "codspeed-criterion-compat", + "criterion2", "glob", "json-escape", + "rand", + "serde", "serde_json", + "sonic-rs", + "sonic-simd", + "thiserror", "v_jsonescape", ] @@ -367,6 +352,26 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "munge" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e17401f259eba956ca16491461b6e8f72913a0a114e39736ce404410f915a0c" +dependencies = [ + "munge_macro", +] + +[[package]] +name = "munge_macro" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "nix" version = "0.29.0" @@ -401,40 +406,41 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" [[package]] -name = "plotters" -version = "0.3.7" +name = "ppv-lite86" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", + "zerocopy", ] [[package]] -name = "plotters-backend" -version = "0.3.7" +name = "proc-macro2" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] [[package]] -name = "plotters-svg" -version = "0.3.7" +name = "ptr_meta" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +checksum = "0b9a0cf95a1196af61d4f1cbdab967179516d9a4a4312af1f31948f8f6224a79" dependencies = [ - "plotters-backend", + "ptr_meta_derive", ] [[package]] -name = "proc-macro2" -version = "1.0.101" +name = "ptr_meta_derive" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ - "unicode-ident", + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -452,6 +458,44 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rancor" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a063ea72381527c2a0561da9c80000ef822bdd7c3241b1cc1b12100e3df081ee" +dependencies = [ + "ptr_meta", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom", +] + [[package]] name = "rayon" version = "1.11.0" @@ -473,33 +517,59 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.11.2" +name = "ref-cast" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", + "ref-cast-impl", ] [[package]] -name = "regex-automata" -version = "0.4.10" +name = "ref-cast-impl" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "regex-syntax" -version = "0.8.6" +name = "rend" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "cadadef317c2f20755a64d7fdc48f9e7178ee6b0e1f7fce33fa60f1d68a276e6" + +[[package]] +name = "rkyv" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35a640b26f007713818e9a9b65d34da1cf58538207b052916a83d80e43f3ffa4" +dependencies = [ + "bytes", + "hashbrown 0.15.5", + "indexmap", + "munge", + "ptr_meta", + "rancor", + "rend", + "rkyv_derive", + "tinyvec", + "uuid", +] + +[[package]] +name = "rkyv_derive" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd83f5f173ff41e00337d97f6572e416d022ef8a19f371817259ae960324c482" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "rustversion" @@ -565,6 +635,51 @@ dependencies = [ "serde_core", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "sonic-number" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a74044c092f4f43ca7a6cfd62854cf9fb5ac8502b131347c990bf22bef1dfe" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "sonic-rs" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22540d56ba14521e4878ad436d498518c59698c39a89d5905c694932f0bf7134" +dependencies = [ + "ahash", + "bumpalo", + "bytes", + "cfg-if", + "faststr", + "itoa", + "ref-cast", + "ryu", + "serde", + "simdutf8", + "sonic-number", + "sonic-simd", + "thiserror", +] + +[[package]] +name = "sonic-simd" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b421f7b6aa4a5de8f685aaf398dfaa828346ee639d2b1c1061ab43d40baa6223" +dependencies = [ + "cfg-if", +] + [[package]] name = "statrs" version = "0.18.0" @@ -587,15 +702,40 @@ dependencies = [ ] [[package]] -name = "tinytemplate" -version = "1.2.1" +name = "thiserror" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "serde", - "serde_json", + "thiserror-impl", ] +[[package]] +name = "thiserror-impl" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "unicode-ident" version = "1.0.19" @@ -619,6 +759,12 @@ version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be8219cc464ba10c48c3231a6871f11d26d831c5c45a47467eea387ea7bb10e8" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" @@ -706,16 +852,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "web-sys" -version = "0.3.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbe734895e869dc429d78c4b433f8d17d95f8d05317440b4fad5ab2d33e596dc" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "winapi-util" version = "0.1.11" @@ -818,3 +954,23 @@ name = "wit-bindgen" version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "zerocopy" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index c9d15bb..243e350 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,8 +17,7 @@ name = "escape" path = "examples/escape.rs" [features] -force_aarch64_neon = [] # Force use of neon implementation on aarch64 -codspeed = [] +codspeed = ["criterion2/codspeed"] [[bench]] name = "escape" @@ -26,13 +25,18 @@ harness = false [dependencies] anyhow = "1" +sonic-simd = "0.1" +thiserror = "2" [dev-dependencies] -criterion = { version = "3.0.5", package = "codspeed-criterion-compat" } +criterion2 = "3" glob = "0.3" +rand = "0.9" +serde = "1" serde_json = "1" v_jsonescape = "0.7" json-escape = "0.3.0" +sonic-rs = "0.5" [profile.bench] lto = true diff --git a/README.md b/README.md index 54fa944..de6b290 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,7 @@ ![docs.rs](https://img.shields.io/docsrs/json-escape-simd) [![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/napi-rs/json-escape-simd) -Optimized SIMD routines for escaping JSON strings. This repository contains the `json-escape-simd` crate, comparison fixtures, and Criterion benches against commonly used alternatives. - -> [!IMPORTANT] -> -> On aarch64 NEON hosts the available register width is **128** bits, which is narrower than the lookup table this implementation prefers. As a result the SIMD path may not outperform the generic fallback, which is reflected in the benchmark numbers below. -> -> On some modern macOS devices with larger register numbers, the SIMD path may outperform the generic fallback, see the [M3 max benchmark](#apple-m3-max) below. - -> [!NOTE] -> -> The `force_aarch64_neon` feature flag can be used to force use of the neon implementation on aarch64. This is useful for the benchmark. +Optimized SIMD routines for escaping JSON strings. The implementation is from [sonic-rs](https://github.com/cloudwego/sonic-rs), we only take the string escaping part to avoid the abstraction overhead. ## Benchmarks @@ -98,18 +88,31 @@ Neon enabled. | Implementation | Median time | vs fastest | | --------------------- | ------------- | ---------- | -| **`escape simd`** | **307.20 µs** | **1.00×** | -| `escape generic` | 490.00 µs | 1.60× | -| `serde_json` | 570.35 µs | 1.86× | -| `escape v_jsonescape` | 599.72 µs | 1.95× | -| `json-escape` | 644.73 µs | 2.10× | +| **`escape simd`** | **196.07 µs** | **1.00×** | +| `escape sonic` | 196.32 µs | 1.00× | +| `escape generic` | 488.37 µs | 2.49× | +| `serde_json` | 553.08 µs | 2.82× | +| `escape v_jsonescape` | 618.31 µs | 3.15× | +| `json-escape` | 446.94 µs | 2.28× | **Fixtures payload (~300 iterations)** | Implementation | Median time | vs fastest | | --------------------- | ------------ | ---------- | -| **`escape generic`** | **17.89 ms** | **1.00×** | -| **`escape simd`** | **17.92 ms** | **1.00×** | -| `serde_json` | 19.78 ms | 1.11× | -| `escape v_jsonescape` | 21.09 ms | 1.18× | -| `json-escape` | 22.43 ms | 1.25× | +| **`escape simd`** | **10.36 ms** | **1.00×** | +| `escape sonic` | 10.57 ms | 1.02× | +| `escape generic` | 17.61 ms | 1.70× | +| `json-escape` | 18.01 ms | 1.74× | +| `serde_json` | 19.00 ms | 1.83× | +| `escape v_jsonescape` | 21.38 ms | 2.06× | + +**Short string benchmark** + +| Implementation | Median time | vs fastest | +| --------------------- | ------------- | ---------- | +| **`escape simd`** | **90.58 ns** | **1.00×** | +| `serde_json` | 139.23 ns | 1.54× | +| `escape generic` | 146.15 ns | 1.61× | +| `json-escape` | 173.60 ns | 1.92× | +| `escape v_jsonescape` | 198.60 ns | 2.19× | +| `escape sonic` | 199.27 ns | 2.20× | diff --git a/benches/escape.rs b/benches/escape.rs index ef36350..fdb1fd8 100644 --- a/benches/escape.rs +++ b/benches/escape.rs @@ -2,7 +2,10 @@ use std::{fs, hint::black_box}; use criterion::{Criterion, criterion_group, criterion_main}; -use json_escape_simd::{escape, escape_generic}; +use generic::escape_generic; +use json_escape_simd::escape; + +mod generic; fn get_rxjs_sources() -> Vec { let rxjs_paths = glob::glob("node_modules/rxjs/src/**/*.ts").unwrap(); @@ -46,6 +49,14 @@ fn run_benchmarks(c: &mut Criterion, sources: &[String], prefix: &str) { }) }); #[cfg(not(feature = "codspeed"))] + c.bench_function(&format!("{} escape sonic", prefix), |b| { + b.iter(|| { + for source in sources { + black_box(sonic_rs::to_string(source).unwrap()); + } + }) + }); + #[cfg(not(feature = "codspeed"))] c.bench_function(&format!("{} escape v_jsonescape", prefix), |b| { b.iter(|| { for source in sources { @@ -61,6 +72,7 @@ fn run_benchmarks(c: &mut Criterion, sources: &[String], prefix: &str) { } }) }); + #[cfg(not(feature = "codspeed"))] c.bench_function(&format!("{} escape generic", prefix), |b| { b.iter(|| { for source in sources { @@ -78,6 +90,16 @@ fn run_benchmarks(c: &mut Criterion, sources: &[String], prefix: &str) { }); } +fn short_string_benchmark(c: &mut Criterion) { + let sources = vec![ + "Hello, world!".to_string(), + r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#.to_string(), + "normal string 🥹".to_string(), + "中文 English 🚀 \n❓ 𝄞".to_string(), + ]; + run_benchmarks(c, &sources, "short string"); +} + fn rxjs_benchmark(c: &mut Criterion) { let sources = get_rxjs_sources(); if !sources.is_empty() { @@ -92,5 +114,10 @@ fn fixtures_benchmark(c: &mut Criterion) { } } -criterion_group!(benches, rxjs_benchmark, fixtures_benchmark); +criterion_group!( + benches, + short_string_benchmark, + rxjs_benchmark, + fixtures_benchmark +); criterion_main!(benches); diff --git a/src/generic.rs b/benches/generic.rs similarity index 95% rename from src/generic.rs rename to benches/generic.rs index 8e73929..ec0db08 100644 --- a/src/generic.rs +++ b/benches/generic.rs @@ -13,15 +13,6 @@ pub fn escape_generic>(s: S) -> String { unsafe { String::from_utf8_unchecked(result) } } -#[inline] -pub fn escape_into_generic>(s: S, output: &mut Vec) { - let s = s.as_ref(); - let bytes = s.as_bytes(); - output.push(b'"'); - escape_inner(bytes, output); - output.push(b'"'); -} - #[inline] // Slightly modified version of // diff --git a/examples/escape.rs b/examples/escape.rs index 7614380..5bd993a 100644 --- a/examples/escape.rs +++ b/examples/escape.rs @@ -1,12 +1,12 @@ use std::fs; -use json_escape_simd::{escape, escape_generic}; +use json_escape_simd::escape; fn main() { for fixture in get_rxjs_sources() { let encoded = escape(&fixture); - let encoded_fallback = escape_generic(&fixture); - assert_eq!(encoded, encoded_fallback); + assert_eq!(encoded, sonic_rs::to_string(&fixture).unwrap()); + assert_eq!(encoded, serde_json::to_string(&fixture).unwrap()); } } diff --git a/src/aarch64.rs b/src/aarch64.rs deleted file mode 100644 index 03a9f4e..0000000 --- a/src/aarch64.rs +++ /dev/null @@ -1,119 +0,0 @@ -use std::arch::aarch64::{ - vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8, -}; - -use crate::generic::{ESCAPE, HEX_BYTES, UU}; - -const CHUNK: usize = 64; -// 128 bytes ahead -const PREFETCH_DISTANCE: usize = CHUNK * 2; -const SLASH_SENTINEL: u8 = 0xFF; - -#[inline] -pub fn escape_neon(bytes: &[u8], output: &mut Vec) { - let n = bytes.len(); - - unsafe { - let tbl = vld1q_u8_x4(ESCAPE.as_ptr()); - let slash = vdupq_n_u8(b'\\'); - let mut i = 0usize; - - // Scratch buffer reused for mask materialisation; stay uninitialised. - #[allow(invalid_value)] - let mut placeholder: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init(); - - while i + CHUNK <= n { - let ptr = bytes.as_ptr().add(i); - - core::arch::asm!( - "prfm pldl1keep, [{0}]", - in(reg) ptr.add(PREFETCH_DISTANCE), - ); - - let quad = vld1q_u8_x4(ptr); - - let a = quad.0; - let b = quad.1; - let c = quad.2; - let d = quad.3; - - let mask_1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a)); - let mask_2 = vorrq_u8(vqtbl4q_u8(tbl, b), vceqq_u8(slash, b)); - let mask_3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c)); - let mask_4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d)); - - let mask_r_1 = vmaxvq_u8(mask_1); - let mask_r_2 = vmaxvq_u8(mask_2); - let mask_r_3 = vmaxvq_u8(mask_3); - let mask_r_4 = vmaxvq_u8(mask_4); - - if mask_r_1 | mask_r_2 | mask_r_3 | mask_r_4 == 0 { - output.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK)); - i += CHUNK; - continue; - } - - macro_rules! handle { - ($mask:expr, $mask_r:expr, $off:expr) => { - if $mask_r == 0 { - output.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16)); - } else { - vst1q_u8(placeholder.as_mut_ptr(), $mask); - handle_block(&bytes[i + $off..i + $off + 16], &placeholder, output); - } - }; - } - - handle!(mask_1, mask_r_1, 0); - handle!(mask_2, mask_r_2, 16); - handle!(mask_3, mask_r_3, 32); - handle!(mask_4, mask_r_4, 48); - - i += CHUNK; - } - - if i < n { - handle_tail(&bytes[i..], output); - } - } -} - -#[inline(always)] -fn handle_tail(src: &[u8], dst: &mut Vec) { - for &c in src { - let escape_byte = ESCAPE[c as usize]; - if escape_byte == 0 { - dst.push(c); - } else { - write_escape(dst, escape_byte, c); - } - } -} - -#[inline(always)] -fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec) { - for (j, &m) in mask.iter().enumerate() { - let c = src[j]; - if m == 0 { - dst.push(c); - } else if m == SLASH_SENTINEL { - dst.push(b'\\'); - dst.push(b'\\'); - } else { - write_escape(dst, m, c); - } - } -} - -#[inline(always)] -fn write_escape(dst: &mut Vec, escape_byte: u8, c: u8) { - dst.push(b'\\'); - if escape_byte == UU { - dst.extend_from_slice(b"u00"); - let hex = &HEX_BYTES[c as usize]; - dst.push(hex.0); - dst.push(hex.1); - } else { - dst.push(escape_byte); - } -} diff --git a/src/lib.rs b/src/lib.rs index 0dbdc3e..e0114ec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,472 +1,733 @@ -//! Optimized SIMD routines for escaping JSON strings. +//! Borrowed from //! -//! ##
Important
-//! -//! On aarch64 NEON hosts the available register width is **128** bits, which is narrower than the lookup table this implementation prefers. As a result the SIMD path may not outperform the generic fallback, which is reflected in the benchmark numbers below. -//! -//! On some modern macOS devices with larger register numbers, the SIMD path may outperform the generic fallback, see the [M3 max benchmark](#apple-m3-max) below. -//! -//! ### Note -//! -//! The `force_aarch64_neon` feature flag can be used to force use of the neon implementation on aarch64. This is useful for the benchmark. -//! -//! ## Benchmarks -//! -//! Numbers below come from `cargo bench` runs on GitHub Actions hardware. Criterion reports are summarized to make it easier to spot relative performance. "vs fastest" shows how much slower each implementation is compared to the fastest entry in the table (1.00× means fastest). -//! -//! ### GitHub Actions x86_64 (`ubuntu-latest`) -//! -//! `AVX2` enabled. -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape simd`** | **341.18 µs** | **1.00×** | -//! | `escape v_jsonescape` | 555.47 µs | 1.63× | -//! | `escape generic` | 656.85 µs | 1.93× | -//! | `serde_json` | 744.75 µs | 2.18× | -//! | `json-escape` | 777.15 µs | 2.28× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`escape simd`** | **12.67 ms** | **1.00×** | -//! | `escape v_jsonescape` | 20.58 ms | 1.62× | -//! | `escape generic` | 22.57 ms | 1.78× | -//! | `serde_json` | 24.52 ms | 1.94× | -//! | `json-escape` | 26.97 ms | 2.13× | -//! -//! ### GitHub Actions aarch64 (`ubuntu-24.04-arm`) -//! -//! Neon enabled. -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape generic`** | **546.89 µs** | **1.00×** | -//! | `escape simd` | 589.29 µs | 1.08× | -//! | `serde_json` | 612.33 µs | 1.12× | -//! | `json-escape` | 624.66 µs | 1.14× | -//! | `escape v_jsonescape` | 789.14 µs | 1.44× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`escape generic`** | **17.81 ms** | **1.00×** | -//! | `serde_json` | 19.77 ms | 1.11× | -//! | `json-escape` | 20.84 ms | 1.17× | -//! | `escape simd` | 21.04 ms | 1.18× | -//! | `escape v_jsonescape` | 25.57 ms | 1.44× | -//! -//! ### GitHub Actions macOS (`macos-latest`) -//! -//! Apple M1 chip -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape generic`** | **759.07 µs** | **1.00×** | -//! | `escape simd` | 764.98 µs | 1.01× | -//! | `serde_json` | 793.91 µs | 1.05× | -//! | `json-escape` | 868.21 µs | 1.14× | -//! | `escape v_jsonescape` | 926.00 µs | 1.22× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`serde_json`** | **26.41 ms** | **1.00×** | -//! | `escape generic` | 26.43 ms | 1.00× | -//! | `escape simd` | 26.42 ms | 1.00× | -//! | `json-escape` | 28.94 ms | 1.10× | -//! | `escape v_jsonescape` | 29.22 ms | 1.11× | -//! -//! ### Apple M3 Max -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape simd`** | **307.20 µs** | **1.00×** | -//! | `escape generic` | 490.00 µs | 1.60× | -//! | `serde_json` | 570.35 µs | 1.86× | -//! | `escape v_jsonescape` | 599.72 µs | 1.95× | -//! | `json-escape` | 644.73 µs | 2.10× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`escape generic`** | **17.89 ms** | **1.00×** | -//! | **`escape simd`** | **17.92 ms** | **1.00×** | -//! | `serde_json` | 19.78 ms | 1.11× | -//! | `escape v_jsonescape` | 21.09 ms | 1.18× | -//! | `json-escape` | 22.43 ms | 1.25× | - -#[cfg(target_arch = "aarch64")] -mod aarch64; -mod generic; -#[cfg(target_arch = "x86_64")] -mod x86; - -pub use generic::{escape_generic, escape_into_generic}; - -/// Main entry point for JSON string escaping with SIMD acceleration -/// If the platform is supported, the SIMD path will be used. Otherwise, the generic fallback will be used. -pub fn escape>(input: S) -> String { - #[cfg(not(feature = "force_aarch64_neon"))] - use generic::escape_inner; - - let mut result = Vec::with_capacity(input.as_ref().len() + input.as_ref().len() / 2 + 2); - result.push(b'"'); - let s = input.as_ref(); - let bytes = s.as_bytes(); - #[cfg(target_arch = "x86_64")] - { - let len = bytes.len(); - // Runtime CPU feature detection for x86_64 - if is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512bw") - && len >= x86::LOOP_SIZE_AVX512 - { - unsafe { x86::escape_avx512(bytes, &mut result) } - } else if is_x86_feature_detected!("avx2") && len >= x86::LOOP_SIZE_AVX2 { - unsafe { x86::escape_avx2(bytes, &mut result) } - } else if is_x86_feature_detected!("sse2") - && /* if len < 128, no need to use simd */ - len >= x86::LOOP_SIZE_AVX2 - { - unsafe { x86::escape_sse2(bytes, &mut result) } - } else { - escape_inner(bytes, &mut result); - } - } +//! Only takes the string escaping part to avoid the abstraction overhead. - #[cfg(target_arch = "aarch64")] - { - #[cfg(feature = "force_aarch64_neon")] - { - aarch64::escape_neon(bytes, &mut result); - } - #[cfg(not(feature = "force_aarch64_neon"))] - { - // on Apple M2 and later, the `bf16` feature is available - // it means they have more registers and can significantly benefit from the SIMD path - // TODO: add support for sve2 chips with wider registers - // github actions ubuntu-24.04-arm runner has 128 bits sve2 registers, it's not enough for the SIMD path - if cfg!(target_os = "macos") && std::arch::is_aarch64_feature_detected!("bf16") { - aarch64::escape_neon(bytes, &mut result); - } else { - escape_inner(bytes, &mut result); - } +use std::slice::from_raw_parts; + +#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] +use sonic_simd::u8x32; +use sonic_simd::{BitMask, Mask, Simd}; +#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] +use sonic_simd::{bits::NeonBits, u8x16}; + +#[inline(always)] +unsafe fn load(ptr: *const u8) -> V { + let chunk = unsafe { from_raw_parts(ptr, V::LANES) }; + unsafe { V::from_slice_unaligned_unchecked(chunk) } +} + +const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ + // 0x00 ~ 0x1f + (6, *b"\\u0000\0\0"), + (6, *b"\\u0001\0\0"), + (6, *b"\\u0002\0\0"), + (6, *b"\\u0003\0\0"), + (6, *b"\\u0004\0\0"), + (6, *b"\\u0005\0\0"), + (6, *b"\\u0006\0\0"), + (6, *b"\\u0007\0\0"), + (2, *b"\\b\0\0\0\0\0\0"), + (2, *b"\\t\0\0\0\0\0\0"), + (2, *b"\\n\0\0\0\0\0\0"), + (6, *b"\\u000b\0\0"), + (2, *b"\\f\0\0\0\0\0\0"), + (2, *b"\\r\0\0\0\0\0\0"), + (6, *b"\\u000e\0\0"), + (6, *b"\\u000f\0\0"), + (6, *b"\\u0010\0\0"), + (6, *b"\\u0011\0\0"), + (6, *b"\\u0012\0\0"), + (6, *b"\\u0013\0\0"), + (6, *b"\\u0014\0\0"), + (6, *b"\\u0015\0\0"), + (6, *b"\\u0016\0\0"), + (6, *b"\\u0017\0\0"), + (6, *b"\\u0018\0\0"), + (6, *b"\\u0019\0\0"), + (6, *b"\\u001a\0\0"), + (6, *b"\\u001b\0\0"), + (6, *b"\\u001c\0\0"), + (6, *b"\\u001d\0\0"), + (6, *b"\\u001e\0\0"), + (6, *b"\\u001f\0\0"), + // 0x20 ~ 0x2f + (0, [0; 8]), + (0, [0; 8]), + (2, *b"\\\"\0\0\0\0\0\0"), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x30 ~ 0x3f + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x40 ~ 0x4f + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x50 ~ 0x5f + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (2, *b"\\\\\0\0\0\0\0\0"), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x60 ~ 0xff + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), +]; + +const NEED_ESCAPED: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +]; + +// only check the src length. +#[inline(always)] +unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { + debug_assert!(*nb >= 1); + loop { + let ch = unsafe { *(*src) }; + let cnt = QUOTE_TAB[ch as usize].0 as usize; + debug_assert!( + cnt != 0, + "char is {}, cnt is {}, NEED_ESCAPED is {}", + ch as char, + cnt, + NEED_ESCAPED[ch as usize] + ); + unsafe { std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8) }; + unsafe { (*dst) = (*dst).add(cnt) }; + unsafe { (*src) = (*src).add(1) }; + (*nb) -= 1; + if (*nb) == 0 || unsafe { NEED_ESCAPED[*(*src) as usize] == 0 } { + return; } } +} - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] - { - escape_inner(bytes, &mut result); - } - result.push(b'"'); - // SAFETY: We only pushed valid UTF-8 bytes (original string bytes and ASCII escape sequences) - unsafe { String::from_utf8_unchecked(result) } +#[cfg(any(target_os = "linux", target_os = "macos"))] +#[inline(always)] +fn check_cross_page(ptr: *const u8, step: usize) -> bool { + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size } -/// Main entry point for JSON string escaping with SIMD acceleration -/// If the platform is supported, the SIMD path will be used. Otherwise, the generic fallback will be used. -pub fn escape_into>(input: S, output: &mut Vec) { - #[cfg(not(feature = "force_aarch64_neon"))] - use generic::escape_inner; - - output.push(b'"'); - let s = input.as_ref(); - let bytes = s.as_bytes(); - #[cfg(target_arch = "x86_64")] - { - let len = bytes.len(); - // Runtime CPU feature detection for x86_64 - if is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512bw") - && len >= x86::LOOP_SIZE_AVX512 - { - unsafe { x86::escape_avx512(bytes, output) } - } else if is_x86_feature_detected!("avx2") && len >= x86::LOOP_SIZE_AVX2 { - unsafe { x86::escape_avx2(bytes, output) } - } else if is_x86_feature_detected!("sse2") - && /* if len < 128, no need to use simd */ - len >= x86::LOOP_SIZE_AVX2 - { - unsafe { x86::escape_sse2(bytes, output) } - } else { - escape_inner(bytes, output); - } +#[inline(always)] +fn format_string(value: &str, dst: &mut [u8]) -> usize { + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + let mut v: u8x16; + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + let mut v: u8x32; + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + const LANES: usize = 16; + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + const LANES: usize = 32; + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[inline] + fn escaped_mask(v: u8x16) -> NeonBits { + let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() } - #[cfg(target_arch = "aarch64")] - { - #[cfg(feature = "force_aarch64_neon")] - { - aarch64::escape_neon(bytes, output); - } - #[cfg(not(feature = "force_aarch64_neon"))] - { - // on Apple M2 and later, the `bf16` feature is available - // it means they have more registers and can significantly benefit from the SIMD path - // TODO: add support for sve2 chips with wider registers - // github actions ubuntu-24.04-arm runner has 128 bits sve2 registers, it's not enough for the SIMD path - if cfg!(target_os = "macos") && std::arch::is_aarch64_feature_detected!("bf16") { - aarch64::escape_neon(bytes, output); + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + #[inline] + fn escaped_mask(v: u8x32) -> u32 { + let x1f = u8x32::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x32::splat(b'\\'); + let quote = u8x32::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() + } + + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + + *dptr = b'"'; + dptr = dptr.add(1); + while nb >= LANES { + v = load(sptr); + v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); + let mask = escaped_mask(v); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); } else { - escape_inner(bytes, output); + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); } } - } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] - { - escape_into_generic(input, output); + // Scratch buffer reused for mask materialisation; stay uninitialised. + #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v = { + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); + load(placeholder[..].as_ptr()) + } + #[cfg(any(target_os = "linux", target_os = "macos"))] + { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); + load(placeholder[..].as_ptr()) + } else { + #[cfg(not(debug_assertions))] + { + // disable memory sanitizer here + load(sptr) + } + #[cfg(debug_assertions)] + { + std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); + load(placeholder[..].as_ptr()) + } + } + } + }; + v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); + + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize } - output.push(b'"'); } -#[test] -fn test_escape_ascii_json_string() { - let fixture = r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#; - assert_eq!(escape(fixture), serde_json::to_string(fixture).unwrap()); +pub fn escape(value: &str) -> String { + let capacity = value.len() * 6 + 32 + 3; + let mut buf = Vec::with_capacity(capacity); + unsafe { buf.set_len(capacity) }; + let cnt = format_string(value, &mut buf); + unsafe { buf.set_len(cnt) }; + unsafe { String::from_utf8_unchecked(buf) } } -#[test] -fn test_escape_json_string() { - let mut fixture = String::new(); - for i in 0u8..=0x1F { - fixture.push(i as char); - } - fixture.push('\t'); - fixture.push('\x08'); - fixture.push('\x09'); - fixture.push('\x0A'); - fixture.push('\x0C'); - fixture.push('\x0D'); - fixture.push('\x22'); - fixture.push('\x5C'); - fixture.push_str("normal string"); - fixture.push('😊'); - fixture.push_str("中文 English 🚀 \n❓ 𝄞"); - escape(fixture.as_str()); - assert_eq!( - escape(fixture.as_str()), - serde_json::to_string(fixture.as_str()).unwrap(), - "fixture: {:?}", - fixture - ); -} +pub fn escape_into>(value: S, dst: &mut Vec) -> usize { + let value = value.as_ref(); + let needed_capacity = value.len() * 6 + 32 + 3; -// Test cases for various string sizes to cover different SIMD paths + // Ensure we have enough capacity + dst.reserve(needed_capacity); -#[test] -fn test_empty_string() { - assert_eq!(escape(""), r#""""#); -} + let old_len = dst.len(); -#[test] -fn test_very_small_strings() { - // Less than 16 bytes (SSE register size) - assert_eq!(escape("a"), r#""a""#); - assert_eq!(escape("ab"), r#""ab""#); - assert_eq!(escape("hello"), r#""hello""#); - assert_eq!(escape("hello\n"), r#""hello\n""#); - assert_eq!(escape("\""), r#""\"""#); - assert_eq!(escape("\\"), r#""\\""#); - assert_eq!(escape("\t"), r#""\t""#); - assert_eq!(escape("\r\n"), r#""\r\n""#); + // SAFETY: We've reserved enough capacity above, and format_string will + // write valid UTF-8 bytes. We'll set the correct length after. + unsafe { + // Get a slice that includes the spare capacity + let spare = + std::slice::from_raw_parts_mut(dst.as_mut_ptr().add(old_len), dst.capacity() - old_len); + let cnt = format_string(value, spare); + dst.set_len(old_len + cnt); + cnt + } } -#[test] -fn test_small_strings_16_bytes() { - // Exactly 16 bytes - SSE register boundary - let s16 = "0123456789abcdef"; - assert_eq!(s16.len(), 16); - assert_eq!(escape(s16), serde_json::to_string(s16).unwrap()); - - // 16 bytes with escapes - let s16_esc = "01234567\t9abcde"; - assert_eq!(s16_esc.len(), 15); // \t is 1 byte - assert_eq!(escape(s16_esc), serde_json::to_string(s16_esc).unwrap()); -} +#[cfg(test)] +mod tests { + use std::fs::read_dir; + use std::path::{Path, PathBuf}; -#[test] -fn test_medium_strings_32_bytes() { - // Exactly 32 bytes - AVX2 register boundary - let s32 = "0123456789abcdef0123456789abcdef"; - assert_eq!(s32.len(), 32); - assert_eq!(escape(s32), serde_json::to_string(s32).unwrap()); + use rand::seq::SliceRandom; - // 32 bytes with escapes at different positions - let s32_esc = "0123456789abcde\"0123456789abcde"; - assert_eq!(escape(s32_esc), serde_json::to_string(s32_esc).unwrap()); -} + use super::*; + + #[test] + fn test_escape_ascii_json_string() { + let fixture = r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#; + assert_eq!(escape(fixture), serde_json::to_string(fixture).unwrap()); + } -#[test] -fn test_large_strings_128_bytes() { - // Exactly 128 bytes - main loop size - let s128 = "0123456789abcdef".repeat(8); - assert_eq!(s128.len(), 128); - assert_eq!(escape(&s128), serde_json::to_string(&s128).unwrap()); - - // 128 bytes with escapes spread throughout - let mut s128_esc = String::new(); - for i in 0..8 { - if i % 2 == 0 { - s128_esc.push_str("0123456789abcd\n"); - } else { - s128_esc.push_str("0123456789abcd\""); + #[test] + fn test_escape_json_string() { + let mut fixture = String::new(); + for i in 0u8..=0x1F { + fixture.push(i as char); } + fixture.push('\t'); + fixture.push('\x08'); + fixture.push('\x09'); + fixture.push('\x0A'); + fixture.push('\x0C'); + fixture.push('\x0D'); + fixture.push('\x22'); + fixture.push('\x5C'); + fixture.push_str("normal string"); + fixture.push('😊'); + fixture.push_str("中文 English 🚀 \n❓ 𝄞"); + escape(fixture.as_str()); + assert_eq!( + escape(fixture.as_str()), + serde_json::to_string(fixture.as_str()).unwrap(), + "fixture: {:?}", + fixture + ); } - assert_eq!(escape(&s128_esc), serde_json::to_string(&s128_esc).unwrap()); -} -#[test] -fn test_unaligned_data() { - // Test strings that start at various alignments - for offset in 0..32 { - let padding = " ".repeat(offset); - let test_str = format!("{}{}", padding, "test\nstring\"with\\escapes"); - let result = escape(&test_str[offset..]); - let expected = serde_json::to_string(&test_str[offset..]).unwrap(); - assert_eq!(result, expected, "Failed at offset {}", offset); + // Test cases for various string sizes to cover different SIMD paths + + #[test] + fn test_empty_string() { + assert_eq!(escape(""), r#""""#); } -} -#[test] -fn test_sparse_escapes() { - // Large string with escapes only at the beginning and end - let mut s = String::new(); - s.push('"'); - s.push_str(&"a".repeat(500)); - s.push('\\'); - assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); -} + #[test] + fn test_very_small_strings() { + // Less than 16 bytes (SSE register size) + assert_eq!(escape("a"), r#""a""#); + assert_eq!(escape("ab"), r#""ab""#); + assert_eq!(escape("hello"), r#""hello""#); + assert_eq!(escape("hello\n"), r#""hello\n""#); + assert_eq!(escape("\""), r#""\"""#); + assert_eq!(escape("\\"), r#""\\""#); + assert_eq!(escape("\t"), r#""\t""#); + assert_eq!(escape("\r\n"), r#""\r\n""#); + } -#[test] -fn test_dense_escapes() { - // String with many escapes - let s = "\"\\\"\\\"\\\"\\".repeat(50); - assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); + #[test] + fn test_small_strings_16_bytes() { + // Exactly 16 bytes - SSE register boundary + let s16 = "0123456789abcdef"; + assert_eq!(s16.len(), 16); + assert_eq!(escape(s16), serde_json::to_string(s16).unwrap()); + + // 16 bytes with escapes + let s16_esc = "01234567\t9abcde"; + assert_eq!(s16_esc.len(), 15); // \t is 1 byte + assert_eq!(escape(s16_esc), serde_json::to_string(s16_esc).unwrap()); + } - // All control characters - let mut ctrl = String::new(); - for _ in 0..10 { - for i in 0u8..32 { - ctrl.push(i as char); + #[test] + fn test_medium_strings_32_bytes() { + // Exactly 32 bytes - AVX2 register boundary + let s32 = "0123456789abcdef0123456789abcdef"; + assert_eq!(s32.len(), 32); + assert_eq!(escape(s32), serde_json::to_string(s32).unwrap()); + + // 32 bytes with escapes at different positions + let s32_esc = "0123456789abcde\"0123456789abcde"; + assert_eq!(escape(s32_esc), serde_json::to_string(s32_esc).unwrap()); + } + + #[test] + fn test_large_strings_128_bytes() { + // Exactly 128 bytes - main loop size + let s128 = "0123456789abcdef".repeat(8); + assert_eq!(s128.len(), 128); + assert_eq!(escape(&s128), serde_json::to_string(&s128).unwrap()); + + // 128 bytes with escapes spread throughout + let mut s128_esc = String::new(); + for i in 0..8 { + if i % 2 == 0 { + s128_esc.push_str("0123456789abcd\n"); + } else { + s128_esc.push_str("0123456789abcd\""); + } + } + assert_eq!(escape(&s128_esc), serde_json::to_string(&s128_esc).unwrap()); + } + + #[test] + fn test_unaligned_data() { + // Test strings that start at various alignments + for offset in 0..32 { + let padding = " ".repeat(offset); + let test_str = format!("{}{}", padding, "test\nstring\"with\\escapes"); + let result = escape(&test_str[offset..]); + let expected = serde_json::to_string(&test_str[offset..]).unwrap(); + assert_eq!(result, expected, "Failed at offset {}", offset); } } - assert_eq!(escape(&ctrl), serde_json::to_string(&ctrl).unwrap()); -} -#[test] -fn test_boundary_conditions() { - // Test around 256 byte boundary (common cache line multiple) - for size in 250..260 { - let s = "a".repeat(size); + #[test] + fn test_sparse_escapes() { + // Large string with escapes only at the beginning and end + let mut s = String::new(); + s.push('"'); + s.push_str(&"a".repeat(500)); + s.push('\\'); + assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); + } + + #[test] + fn test_dense_escapes() { + // String with many escapes + let s = "\"\\\"\\\"\\\"\\".repeat(50); assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); - // With escape at the end - let mut s_esc = "a".repeat(size - 1); - s_esc.push('"'); - assert_eq!(escape(&s_esc), serde_json::to_string(&s_esc).unwrap()); + // All control characters + let mut ctrl = String::new(); + for _ in 0..10 { + for i in 0u8..32 { + ctrl.push(i as char); + } + } + assert_eq!(escape(&ctrl), serde_json::to_string(&ctrl).unwrap()); } -} -#[test] -fn test_all_escape_types() { - // Test each escape type individually - assert_eq!(escape("\x00"), r#""\u0000""#); - assert_eq!(escape("\x08"), r#""\b""#); - assert_eq!(escape("\x09"), r#""\t""#); - assert_eq!(escape("\x0A"), r#""\n""#); - assert_eq!(escape("\x0C"), r#""\f""#); - assert_eq!(escape("\x0D"), r#""\r""#); - assert_eq!(escape("\x1F"), r#""\u001f""#); - assert_eq!(escape("\""), r#""\"""#); - assert_eq!(escape("\\"), r#""\\""#); - - // Test all control characters - for i in 0u8..32 { - let s = String::from_utf8(vec![i]).unwrap(); - let result = escape(&s); - let expected = serde_json::to_string(&s).unwrap(); - assert_eq!(result, expected, "Failed for byte 0x{:02x}", i); + #[test] + fn test_boundary_conditions() { + // Test around 256 byte boundary (common cache line multiple) + for size in 250..260 { + let s = "a".repeat(size); + assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); + + // With escape at the end + let mut s_esc = "a".repeat(size - 1); + s_esc.push('"'); + assert_eq!(escape(&s_esc), serde_json::to_string(&s_esc).unwrap()); + } + } + + #[test] + fn test_all_escape_types() { + // Test each escape type individually + assert_eq!(escape("\x00"), r#""\u0000""#); + assert_eq!(escape("\x08"), r#""\b""#); + assert_eq!(escape("\x09"), r#""\t""#); + assert_eq!(escape("\x0A"), r#""\n""#); + assert_eq!(escape("\x0C"), r#""\f""#); + assert_eq!(escape("\x0D"), r#""\r""#); + assert_eq!(escape("\x1F"), r#""\u001f""#); + assert_eq!(escape("\""), r#""\"""#); + assert_eq!(escape("\\"), r#""\\""#); + + // Test all control characters + for i in 0u8..32 { + let s = String::from_utf8(vec![i]).unwrap(); + let result = escape(&s); + let expected = serde_json::to_string(&s).unwrap(); + assert_eq!(result, expected, "Failed for byte 0x{:02x}", i); + } } -} -#[test] -fn test_mixed_content() { - // Mix of ASCII, escapes, and multi-byte UTF-8 - let mixed = r#"Hello "World"! + #[test] + fn test_mixed_content() { + // Mix of ASCII, escapes, and multi-byte UTF-8 + let mixed = r#"Hello "World"! Tab: Here Emoji: 😀 Chinese: 中文 Math: ∑∫∂ Music: 𝄞 Escape: \" \\ \n \r \t"#; - assert_eq!(escape(mixed), serde_json::to_string(mixed).unwrap()); -} - -#[test] -fn test_repeated_patterns() { - // Patterns that might benefit from or confuse SIMD operations - let pattern1 = "abcd".repeat(100); - assert_eq!(escape(&pattern1), serde_json::to_string(&pattern1).unwrap()); + assert_eq!(escape(mixed), serde_json::to_string(mixed).unwrap()); + } - let pattern2 = "a\"b\"".repeat(100); - assert_eq!(escape(&pattern2), serde_json::to_string(&pattern2).unwrap()); + #[test] + fn test_repeated_patterns() { + // Patterns that might benefit from or confuse SIMD operations + let pattern1 = "abcd".repeat(100); + assert_eq!(escape(&pattern1), serde_json::to_string(&pattern1).unwrap()); - let pattern3 = "\t\n".repeat(100); - assert_eq!(escape(&pattern3), serde_json::to_string(&pattern3).unwrap()); -} + let pattern2 = "a\"b\"".repeat(100); + assert_eq!(escape(&pattern2), serde_json::to_string(&pattern2).unwrap()); -#[test] -fn test_rxjs() { - let dir = glob::glob("node_modules/rxjs/src/**/*.ts").unwrap(); - let mut sources = Vec::new(); - for entry in dir { - sources.push(std::fs::read_to_string(entry.unwrap()).unwrap()); + let pattern3 = "\t\n".repeat(100); + assert_eq!(escape(&pattern3), serde_json::to_string(&pattern3).unwrap()); } - assert!(!sources.is_empty()); - for source in sources { - assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); - let mut output = String::new(); - escape_into(&source, unsafe { output.as_mut_vec() }); - assert_eq!(output, serde_json::to_string(&source).unwrap()); + + #[test] + fn test_rxjs() { + let mut sources = Vec::new(); + read_dir_recursive("node_modules/rxjs/src", &mut sources, |p| { + matches!(p.extension().and_then(|e| e.to_str()), Some("ts")) + }) + .unwrap(); + assert!(!sources.is_empty()); + sources.shuffle(&mut rand::rng()); + for source in sources + .iter() + .take(if cfg!(miri) { 10 } else { sources.len() }) + { + assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); + let mut output = String::new(); + escape_into(&source, unsafe { output.as_mut_vec() }); + assert_eq!(output, serde_json::to_string(&source).unwrap()); + } } -} -#[test] -fn test_sources() { - let ts_paths = glob::glob("fixtures/**/*.ts").unwrap(); - let tsx_paths = glob::glob("fixtures/**/*.tsx").unwrap(); - let js_paths = glob::glob("fixtures/**/*.js").unwrap(); - let mjs_paths = glob::glob("fixtures/**/*.mjs").unwrap(); - let cjs_paths = glob::glob("fixtures/**/*.cjs").unwrap(); - let mut sources = Vec::new(); - for entry in ts_paths - .chain(tsx_paths) - .chain(js_paths) - .chain(mjs_paths) - .chain(cjs_paths) - { - let p = entry.unwrap(); - if std::fs::metadata(&p).unwrap().is_file() { - sources.push(std::fs::read_to_string(&p).unwrap()); + #[test] + fn test_sources() { + for source in load_affine_sources().unwrap() { + assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); + let mut output = String::with_capacity(source.len() * 6 + 32 + 3); + escape_into(&source, unsafe { output.as_mut_vec() }); + assert_eq!(output, serde_json::to_string(&source).unwrap()); } } - assert!(!sources.is_empty()); - for source in sources { - assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); - let mut output = String::new(); - escape_into(&source, unsafe { output.as_mut_vec() }); - assert_eq!(output, serde_json::to_string(&source).unwrap()); + + fn load_affine_sources() -> Result, std::io::Error> { + let mut sources = Vec::new(); + read_dir_recursive("fixtures", &mut sources, |p| { + matches!( + p.extension().and_then(|e| e.to_str()), + Some("ts") | Some("tsx") | Some("js") | Some("mjs") | Some("cjs") + ) + })?; + assert!(!sources.is_empty()); + let len = sources.len(); + sources.shuffle(&mut rand::rng()); + Ok(sources.into_iter().take(if cfg!(miri) { 10 } else { len })) + } + + fn read_dir_recursive, F: Fn(PathBuf) -> bool + Copy>( + dir: P, + sources: &mut Vec, + f: F, + ) -> Result<(), std::io::Error> { + let dir = read_dir(dir)?; + for entry in dir { + let p = entry?; + let metadata = std::fs::metadata(p.path())?; + if metadata.is_file() { + if f(p.path()) { + sources.push(std::fs::read_to_string(p.path())?); + } + } + if metadata.is_dir() { + read_dir_recursive(p.path(), sources, f)?; + } + } + Ok(()) } } diff --git a/src/x86.rs b/src/x86.rs deleted file mode 100644 index c43ae22..0000000 --- a/src/x86.rs +++ /dev/null @@ -1,677 +0,0 @@ -#![allow(unsafe_op_in_unsafe_fn)] - -use std::arch::x86_64::{ - __m128i, __m256i, __m512i, _MM_HINT_T0, _mm_add_epi8, _mm_cmpeq_epi8, _mm_cmpgt_epi8, - _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, - _mm256_add_epi8, _mm256_cmpeq_epi8, _mm256_cmpgt_epi8, _mm256_load_si256, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_set1_epi8, _mm512_cmpeq_epi8_mask, - _mm512_cmplt_epu8_mask, _mm512_load_si512, _mm512_loadu_si512, _mm512_set1_epi8, -}; - -use crate::generic::{ESCAPE, HEX_BYTES, UU}; - -// Constants for control character detection using signed comparison trick -const TRANSLATION_A: i8 = i8::MAX - 31i8; -const BELOW_A: i8 = i8::MAX - (31i8 - 0i8) - 1; -const B: i8 = 34i8; // '"' -const C: i8 = 92i8; // '\\' - -const M512_VECTOR_SIZE: usize = std::mem::size_of::<__m512i>(); -const M256_VECTOR_SIZE: usize = std::mem::size_of::<__m256i>(); -const M128_VECTOR_SIZE: usize = std::mem::size_of::<__m128i>(); -pub(crate) const LOOP_SIZE_AVX2: usize = 4 * M256_VECTOR_SIZE; // Process 128 bytes at a time -pub(crate) const LOOP_SIZE_AVX512: usize = 4 * M512_VECTOR_SIZE; // Process 256 bytes at a time -const PREFETCH_DISTANCE_AVX2: usize = 256; // Prefetch 256 bytes ahead for AVX2 -const PREFETCH_DISTANCE_AVX512: usize = 512; // Prefetch 512 bytes ahead for AVX512 - -#[inline(always)] -fn sub(a: *const u8, b: *const u8) -> usize { - debug_assert!(b <= a); - (a as usize) - (b as usize) -} - -#[target_feature(enable = "avx512f", enable = "avx512bw")] -#[inline] -pub unsafe fn escape_avx512(bytes: &[u8], result: &mut Vec) { - let len = bytes.len(); - - let start_ptr = bytes.as_ptr(); - let end_ptr = bytes[len..].as_ptr(); - let mut ptr = start_ptr; - let mut start = 0; - - let v_b = _mm512_set1_epi8(B); - let v_c = _mm512_set1_epi8(C); - let v_ctrl_limit = _mm512_set1_epi8(0x20); - - // Handle alignment - skip if already aligned - const M512_VECTOR_ALIGN: usize = M512_VECTOR_SIZE - 1; - let misalignment = start_ptr as usize & M512_VECTOR_ALIGN; - if misalignment != 0 { - let align = M512_VECTOR_SIZE - misalignment; - let a = _mm512_loadu_si512(ptr as *const __m512i); - - // Check for quotes, backslash, and control characters - let quote_mask = _mm512_cmpeq_epi8_mask(a, v_b); - let slash_mask = _mm512_cmpeq_epi8_mask(a, v_c); - let ctrl_mask = _mm512_cmplt_epu8_mask(a, v_ctrl_limit); - - let mut mask = (quote_mask | slash_mask | ctrl_mask) as u64; - if align < 64 { - mask &= (1u64 << align) - 1; - } - - if mask != 0 { - let at = sub(ptr, start_ptr); - while mask != 0 { - let cur = mask.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - mask &= mask - 1; - } - } - ptr = ptr.add(align); - } - - // Main loop processing 256 bytes at a time - while ptr <= end_ptr.sub(LOOP_SIZE_AVX512) { - debug_assert_eq!(0, (ptr as usize) % M512_VECTOR_SIZE); - - // Prefetch next iteration's data - if ptr.add(LOOP_SIZE_AVX512 + PREFETCH_DISTANCE_AVX512) < end_ptr { - _mm_prefetch( - ptr.add(LOOP_SIZE_AVX512 + PREFETCH_DISTANCE_AVX512) as *const i8, - _MM_HINT_T0, - ); - } - - // Load all 4 vectors at once for better pipelining - let a0 = _mm512_load_si512(ptr as *const __m512i); - let a1 = _mm512_load_si512(ptr.add(M512_VECTOR_SIZE) as *const __m512i); - let a2 = _mm512_load_si512(ptr.add(M512_VECTOR_SIZE * 2) as *const __m512i); - let a3 = _mm512_load_si512(ptr.add(M512_VECTOR_SIZE * 3) as *const __m512i); - - // Check for quotes (") in all vectors - let quote_0 = _mm512_cmpeq_epi8_mask(a0, v_b); - let quote_1 = _mm512_cmpeq_epi8_mask(a1, v_b); - let quote_2 = _mm512_cmpeq_epi8_mask(a2, v_b); - let quote_3 = _mm512_cmpeq_epi8_mask(a3, v_b); - - // Check for backslash (\) in all vectors - let slash_0 = _mm512_cmpeq_epi8_mask(a0, v_c); - let slash_1 = _mm512_cmpeq_epi8_mask(a1, v_c); - let slash_2 = _mm512_cmpeq_epi8_mask(a2, v_c); - let slash_3 = _mm512_cmpeq_epi8_mask(a3, v_c); - - // Check for control characters (< 0x20) in all vectors - let ctrl_0 = _mm512_cmplt_epu8_mask(a0, v_ctrl_limit); - let ctrl_1 = _mm512_cmplt_epu8_mask(a1, v_ctrl_limit); - let ctrl_2 = _mm512_cmplt_epu8_mask(a2, v_ctrl_limit); - let ctrl_3 = _mm512_cmplt_epu8_mask(a3, v_ctrl_limit); - - // Combine all masks - let mask_a = quote_0 | slash_0 | ctrl_0; - let mask_b = quote_1 | slash_1 | ctrl_1; - let mask_c = quote_2 | slash_2 | ctrl_2; - let mask_d = quote_3 | slash_3 | ctrl_3; - - // Fast path: check if any escaping needed - let any_escape = mask_a | mask_b | mask_c | mask_d; - - if any_escape == 0 { - // No escapes needed, copy whole chunk - if start < sub(ptr, start_ptr) { - result.extend_from_slice(&bytes[start..sub(ptr, start_ptr)]); - } - result.extend_from_slice(std::slice::from_raw_parts(ptr, LOOP_SIZE_AVX512)); - start = sub(ptr, start_ptr) + LOOP_SIZE_AVX512; - } else { - // Process each 64-byte chunk that has escapes - process_mask_avx512(ptr, start_ptr, result, &mut start, bytes, mask_a, 0); - process_mask_avx512( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_b, - M512_VECTOR_SIZE, - ); - process_mask_avx512( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_c, - M512_VECTOR_SIZE * 2, - ); - process_mask_avx512( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_d, - M512_VECTOR_SIZE * 3, - ); - } - - ptr = ptr.add(LOOP_SIZE_AVX512); - } - - // Process remaining aligned chunks - while ptr <= end_ptr.sub(M512_VECTOR_SIZE) { - debug_assert_eq!(0, (ptr as usize) % M512_VECTOR_SIZE); - let a = _mm512_load_si512(ptr as *const __m512i); - - let quote_mask = _mm512_cmpeq_epi8_mask(a, v_b); - let slash_mask = _mm512_cmpeq_epi8_mask(a, v_c); - let ctrl_mask = _mm512_cmplt_epu8_mask(a, v_ctrl_limit); - - let mut mask = (quote_mask | slash_mask | ctrl_mask) as u64; - - if mask != 0 { - let at = sub(ptr, start_ptr); - while mask != 0 { - let cur = mask.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - mask &= mask - 1; - } - } - ptr = ptr.add(M512_VECTOR_SIZE); - } - - // Handle tail - if ptr < end_ptr { - let d = M512_VECTOR_SIZE - sub(end_ptr, ptr); - let a = _mm512_loadu_si512(ptr.sub(d) as *const __m512i); - - let quote_mask = _mm512_cmpeq_epi8_mask(a, v_b); - let slash_mask = _mm512_cmpeq_epi8_mask(a, v_c); - let ctrl_mask = _mm512_cmplt_epu8_mask(a, v_ctrl_limit); - - let mut mask = ((quote_mask | slash_mask | ctrl_mask) as u64).wrapping_shr(d as u32); - - if mask != 0 { - let at = sub(ptr, start_ptr); - while mask != 0 { - let cur = mask.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - mask &= mask - 1; - } - } - } - - // Copy any remaining bytes - if start < len { - result.extend_from_slice(&bytes[start..]); - } -} - -#[target_feature(enable = "avx2")] -#[inline] -pub unsafe fn escape_avx2(bytes: &[u8], result: &mut Vec) { - let len = bytes.len(); - - let start_ptr = bytes.as_ptr(); - let end_ptr = bytes[len..].as_ptr(); - let mut ptr = start_ptr; - let mut start = 0; - - let v_translation_a = _mm256_set1_epi8(TRANSLATION_A); - let v_below_a = _mm256_set1_epi8(BELOW_A); - let v_b = _mm256_set1_epi8(B); - let v_c = _mm256_set1_epi8(C); - - // Handle alignment - skip if already aligned - const M256_VECTOR_ALIGN: usize = M256_VECTOR_SIZE - 1; - let misalignment = start_ptr as usize & M256_VECTOR_ALIGN; - if misalignment != 0 { - let align = M256_VECTOR_SIZE - misalignment; - let mut mask = { - let a = _mm256_loadu_si256(ptr as *const __m256i); - _mm256_movemask_epi8(_mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - while cur < align { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(align); - } - - // Main loop processing 128 bytes at a time - while ptr <= end_ptr.sub(LOOP_SIZE_AVX2) { - debug_assert_eq!(0, (ptr as usize) % M256_VECTOR_SIZE); - - // Prefetch next iteration's data - if ptr.add(LOOP_SIZE_AVX2 + PREFETCH_DISTANCE_AVX2) < end_ptr { - _mm_prefetch( - ptr.add(LOOP_SIZE_AVX2 + PREFETCH_DISTANCE_AVX2) as *const i8, - _MM_HINT_T0, - ); - } - - // Load all 4 vectors at once for better pipelining - let a0 = _mm256_load_si256(ptr as *const __m256i); - let a1 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE) as *const __m256i); - let a2 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE * 2) as *const __m256i); - let a3 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE * 3) as *const __m256i); - - // Check for quotes (") in all vectors - let quote_0 = _mm256_cmpeq_epi8(a0, v_b); - let quote_1 = _mm256_cmpeq_epi8(a1, v_b); - let quote_2 = _mm256_cmpeq_epi8(a2, v_b); - let quote_3 = _mm256_cmpeq_epi8(a3, v_b); - - // Check for backslash (\) in all vectors - let slash_0 = _mm256_cmpeq_epi8(a0, v_c); - let slash_1 = _mm256_cmpeq_epi8(a1, v_c); - let slash_2 = _mm256_cmpeq_epi8(a2, v_c); - let slash_3 = _mm256_cmpeq_epi8(a3, v_c); - - // Check for control characters (< 0x20) in all vectors - let ctrl_0 = _mm256_cmpgt_epi8(_mm256_add_epi8(a0, v_translation_a), v_below_a); - let ctrl_1 = _mm256_cmpgt_epi8(_mm256_add_epi8(a1, v_translation_a), v_below_a); - let ctrl_2 = _mm256_cmpgt_epi8(_mm256_add_epi8(a2, v_translation_a), v_below_a); - let ctrl_3 = _mm256_cmpgt_epi8(_mm256_add_epi8(a3, v_translation_a), v_below_a); - - // Combine all masks - let cmp_a = _mm256_or_si256(_mm256_or_si256(quote_0, slash_0), ctrl_0); - let cmp_b = _mm256_or_si256(_mm256_or_si256(quote_1, slash_1), ctrl_1); - let cmp_c = _mm256_or_si256(_mm256_or_si256(quote_2, slash_2), ctrl_2); - let cmp_d = _mm256_or_si256(_mm256_or_si256(quote_3, slash_3), ctrl_3); - - // Fast path: check if any escaping needed - let any_escape = - _mm256_or_si256(_mm256_or_si256(cmp_a, cmp_b), _mm256_or_si256(cmp_c, cmp_d)); - - if _mm256_movemask_epi8(any_escape) == 0 { - // No escapes needed, copy whole chunk - if start < sub(ptr, start_ptr) { - result.extend_from_slice(&bytes[start..sub(ptr, start_ptr)]); - } - result.extend_from_slice(std::slice::from_raw_parts(ptr, LOOP_SIZE_AVX2)); - start = sub(ptr, start_ptr) + LOOP_SIZE_AVX2; - } else { - // Get individual masks only when needed - let mask_a = _mm256_movemask_epi8(cmp_a); - let mask_b = _mm256_movemask_epi8(cmp_b); - let mask_c = _mm256_movemask_epi8(cmp_c); - let mask_d = _mm256_movemask_epi8(cmp_d); - - // Process each 32-byte chunk that has escapes - process_mask_avx(ptr, start_ptr, result, &mut start, bytes, mask_a, 0); - process_mask_avx( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_b, - M256_VECTOR_SIZE, - ); - process_mask_avx( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_c, - M256_VECTOR_SIZE * 2, - ); - process_mask_avx( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_d, - M256_VECTOR_SIZE * 3, - ); - } - - ptr = ptr.add(LOOP_SIZE_AVX2); - } - - // Process remaining aligned chunks - while ptr <= end_ptr.sub(M256_VECTOR_SIZE) { - debug_assert_eq!(0, (ptr as usize) % M256_VECTOR_SIZE); - let mut mask = { - let a = _mm256_load_si256(ptr as *const __m256i); - _mm256_movemask_epi8(_mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(M256_VECTOR_SIZE); - } - - // Handle tail - if ptr < end_ptr { - let d = M256_VECTOR_SIZE - sub(end_ptr, ptr); - let mut mask = ({ - let a = _mm256_loadu_si256(ptr.sub(d) as *const __m256i); - _mm256_movemask_epi8(_mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) - } as u32) - .wrapping_shr(d as u32); - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - } - - // Copy any remaining bytes - if start < len { - result.extend_from_slice(&bytes[start..]); - } -} - -#[target_feature(enable = "sse2")] -#[inline] -pub unsafe fn escape_sse2(bytes: &[u8], result: &mut Vec) { - let len = bytes.len(); - - let start_ptr = bytes.as_ptr(); - let end_ptr = bytes[len..].as_ptr(); - let mut ptr = start_ptr; - let mut start = 0; - - const M128_VECTOR_ALIGN: usize = M128_VECTOR_SIZE - 1; - - let v_translation_a = _mm_set1_epi8(TRANSLATION_A); - let v_below_a = _mm_set1_epi8(BELOW_A); - let v_b = _mm_set1_epi8(B); - let v_c = _mm_set1_epi8(C); - - // Handle alignment - skip if already aligned - let misalignment = start_ptr as usize & M128_VECTOR_ALIGN; - if misalignment != 0 { - let align = M128_VECTOR_SIZE - misalignment; - let mut mask = { - let a = _mm_loadu_si128(ptr as *const __m128i); - _mm_movemask_epi8(_mm_or_si128( - _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), - _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - while cur < align { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(align); - } - - // Main loop - while ptr <= end_ptr.sub(M128_VECTOR_SIZE) { - debug_assert_eq!(0, (ptr as usize) % M128_VECTOR_SIZE); - let mut mask = { - let a = _mm_load_si128(ptr as *const __m128i); - _mm_movemask_epi8(_mm_or_si128( - _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), - _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(M128_VECTOR_SIZE); - } - - // Handle tail - if ptr < end_ptr { - let d = M128_VECTOR_SIZE - sub(end_ptr, ptr); - let mut mask = ({ - let a = _mm_loadu_si128(ptr.sub(d) as *const __m128i); - _mm_movemask_epi8(_mm_or_si128( - _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), - _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) - } as u16) - .wrapping_shr(d as u32); - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - } - - // Copy any remaining bytes - if start < len { - result.extend_from_slice(&bytes[start..]); - } -} - -#[inline(always)] -unsafe fn process_mask_avx( - ptr: *const u8, - start_ptr: *const u8, - result: &mut Vec, - start: &mut usize, - bytes: &[u8], - mask: i32, - offset: usize, -) { - if mask == 0 { - return; - } - - let ptr = ptr.add(offset); - let at = sub(ptr, start_ptr); - - // Process mask bits using bit manipulation - let mut remaining = mask as u32; - while remaining != 0 { - let cur = remaining.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - - let i = at + cur; - // Copy unescaped portion if needed - if *start < i { - result.extend_from_slice(&bytes[*start..i]); - } - // Write escape sequence - write_escape(result, escape_byte, c); - *start = i + 1; - - // Clear the lowest set bit - remaining &= remaining - 1; - } -} - -#[inline(always)] -unsafe fn process_mask_avx512( - ptr: *const u8, - start_ptr: *const u8, - result: &mut Vec, - start: &mut usize, - bytes: &[u8], - mask: u64, - offset: usize, -) { - if mask == 0 { - return; - } - - let ptr = ptr.add(offset); - let at = sub(ptr, start_ptr); - - // Process mask bits using bit manipulation - let mut remaining = mask; - while remaining != 0 { - let cur = remaining.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - - let i = at + cur; - // Copy unescaped portion if needed - if *start < i { - result.extend_from_slice(&bytes[*start..i]); - } - // Write escape sequence - write_escape(result, escape_byte, c); - *start = i + 1; - - // Clear the lowest set bit - remaining &= remaining - 1; - } -} - -#[inline(always)] -fn write_escape(result: &mut Vec, escape_byte: u8, c: u8) { - result.push(b'\\'); - if escape_byte == UU { - // Unicode escape for control characters - result.extend_from_slice(b"u00"); - let hex_digits = &HEX_BYTES[c as usize]; - result.push(hex_digits.0); - result.push(hex_digits.1); - } else { - // Simple escape - result.push(escape_byte); - } -}