From 46cc6e61d6ddff1f8d6bb107d067c30d846011e7 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sat, 11 Oct 2025 14:40:44 +0800 Subject: [PATCH 01/15] bench: add sonic-rs benchmark --- Cargo.lock | 413 +++++++++++++++++++++++++++++---------------- Cargo.toml | 6 +- benches/escape.rs | 8 + examples/escape.rs | 2 + 4 files changed, 278 insertions(+), 151 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1602506..fa43b7a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,25 +3,23 @@ version = 4 [[package]] -name = "aho-corasick" -version = "1.1.3" +name = "ahash" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ - "memchr", + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", ] [[package]] name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - -[[package]] -name = "anstyle" -version = "1.0.11" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "dc43e46599f3d77fcf2f2ca89e4d962910b0c19c44e7b58679cbbdfd1820a662" [[package]] name = "anyhow" @@ -59,12 +57,24 @@ version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +[[package]] +name = "bpaf" +version = "0.9.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "473976d7a8620bb1e06dcdd184407c2363fe4fec8e983ee03ed9197222634a31" + [[package]] name = "bumpalo" version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "cast" version = "0.3.0" @@ -110,31 +120,6 @@ dependencies = [ "half", ] -[[package]] -name = "clap" -version = "4.5.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" -dependencies = [ - "clap_builder", -] - -[[package]] -name = "clap_builder" -version = "4.5.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" -dependencies = [ - "anstyle", - "clap_lex", -] - -[[package]] -name = "clap_lex" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" - [[package]] name = "codspeed" version = "3.0.5" @@ -143,7 +128,7 @@ checksum = "35584c5fcba8059780748866387fb97c5a203bcfc563fc3d0790af406727a117" dependencies = [ "anyhow", "bincode", - "colored", + "colored 2.2.0", "glob", "libc", "nix", @@ -154,50 +139,21 @@ dependencies = [ ] [[package]] -name = "codspeed-criterion-compat" -version = "3.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78f6c1c6bed5fd84d319e8b0889da051daa361c79b7709c9394dfe1a882bba67" -dependencies = [ - "codspeed", - "codspeed-criterion-compat-walltime", - "colored", -] - 
-[[package]] -name = "codspeed-criterion-compat-walltime" -version = "3.0.5" +name = "colored" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c989289ce6b1cbde72ed560496cb8fbf5aa14d5ef5666f168e7f87751038352e" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "codspeed", - "criterion-plot", - "is-terminal", - "itertools", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", + "lazy_static", + "windows-sys 0.59.0", ] [[package]] name = "colored" -version = "2.2.0" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +checksum = "fde0e0ec90c9dfb3b4b1a0891a7dcd0e2bffde2f7efed5fe7c9bb00e5bfb915e" dependencies = [ - "lazy_static", "windows-sys 0.59.0", ] @@ -206,13 +162,23 @@ name = "cpu-features" version = "0.1.0" [[package]] -name = "criterion-plot" -version = "0.5.0" +name = "criterion2" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +checksum = "77cd1059d67baa066c334993d8d6e757ad257d21030db6a9a945dddbb559d4fe" dependencies = [ + "anes", + "bpaf", "cast", - "itertools", + "ciborium", + "codspeed", + "colored 3.0.0", + "num-traits", + "oorandom", + "rayon", + "serde", + "serde_json", + "walkdir", ] [[package]] @@ -252,6 +218,24 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "faststr" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baec6a0289d7f1fe5665586ef7340af82e3037207bef60f5785e57569776f0c8" +dependencies = [ + "bytes", + "rkyv", + "serde", + "simdutf8", +] + [[package]] name = "getrandom" version = "0.3.3" @@ -281,29 +265,25 @@ dependencies = [ ] [[package]] -name = "hermit-abi" -version = "0.5.2" +name = "hashbrown" +version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" [[package]] -name = "is-terminal" -version = "0.4.16" +name = "hashbrown" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" -dependencies = [ - "hermit-abi", - "libc", - "windows-sys 0.59.0", -] +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" [[package]] -name = "itertools" -version = "0.10.5" +name = "indexmap" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ - "either", + "equivalent", + "hashbrown 0.16.0", ] [[package]] @@ -324,9 +304,9 @@ dependencies = [ [[package]] name = "json-escape" -version = "0.1.1" +version = "0.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3100a96840f2d70dd1c336ec5f76f160edfd56d8e9de2a949d440c0805459cce" +checksum = "b330a4975b953eb01659ab24f612fa9461a71e4279c25e02689d0c5db03c20e5" dependencies = [ "memchr", ] @@ -336,10 +316,12 @@ name = "json-escape-simd" version = "1.1.0" dependencies = [ "anyhow", - "codspeed-criterion-compat", + "criterion2", "glob", "json-escape", + "serde", "serde_json", + "sonic-rs", "v_jsonescape", ] @@ -367,6 +349,26 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "munge" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e17401f259eba956ca16491461b6e8f72913a0a114e39736ce404410f915a0c" +dependencies = [ + "munge_macro", +] + +[[package]] +name = "munge_macro" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "nix" version = "0.29.0" @@ -401,40 +403,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" [[package]] -name = "plotters" -version = "0.3.7" +name = "proc-macro2" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", + "unicode-ident", ] [[package]] -name = "plotters-backend" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" - -[[package]] -name = "plotters-svg" -version = "0.3.7" +name = "ptr_meta" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +checksum = "0b9a0cf95a1196af61d4f1cbdab967179516d9a4a4312af1f31948f8f6224a79" dependencies = [ - "plotters-backend", + "ptr_meta_derive", ] [[package]] -name = "proc-macro2" -version = "1.0.101" +name = "ptr_meta_derive" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ - "unicode-ident", + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -452,6 +446,15 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rancor" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a063ea72381527c2a0561da9c80000ef822bdd7c3241b1cc1b12100e3df081ee" +dependencies = [ + "ptr_meta", +] + [[package]] name = "rayon" version = "1.11.0" @@ -473,33 +476,59 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.11.2" +name = "ref-cast" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = 
"f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", + "ref-cast-impl", ] [[package]] -name = "regex-automata" -version = "0.4.10" +name = "ref-cast-impl" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "regex-syntax" -version = "0.8.6" +name = "rend" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cadadef317c2f20755a64d7fdc48f9e7178ee6b0e1f7fce33fa60f1d68a276e6" + +[[package]] +name = "rkyv" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35a640b26f007713818e9a9b65d34da1cf58538207b052916a83d80e43f3ffa4" +dependencies = [ + "bytes", + "hashbrown 0.15.5", + "indexmap", + "munge", + "ptr_meta", + "rancor", + "rend", + "rkyv_derive", + "tinyvec", + "uuid", +] + +[[package]] +name = "rkyv_derive" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "bd83f5f173ff41e00337d97f6572e416d022ef8a19f371817259ae960324c482" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "rustversion" @@ -565,6 +594,51 @@ dependencies = [ "serde_core", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "sonic-number" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a74044c092f4f43ca7a6cfd62854cf9fb5ac8502b131347c990bf22bef1dfe" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "sonic-rs" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22540d56ba14521e4878ad436d498518c59698c39a89d5905c694932f0bf7134" +dependencies = [ + "ahash", + "bumpalo", + "bytes", + "cfg-if", + "faststr", + "itoa", + "ref-cast", + "ryu", + "serde", + "simdutf8", + "sonic-number", + "sonic-simd", + "thiserror", +] + +[[package]] +name = "sonic-simd" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b421f7b6aa4a5de8f685aaf398dfaa828346ee639d2b1c1061ab43d40baa6223" +dependencies = [ + "cfg-if", +] + [[package]] name = "statrs" version = "0.18.0" @@ -587,15 +661,40 @@ dependencies = [ ] [[package]] -name = "tinytemplate" -version = "1.2.1" +name = "thiserror" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "serde", - "serde_json", + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "unicode-ident" version = "1.0.19" @@ -619,6 +718,12 @@ version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be8219cc464ba10c48c3231a6871f11d26d831c5c45a47467eea387ea7bb10e8" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" @@ -706,16 +811,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "web-sys" -version = "0.3.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbe734895e869dc429d78c4b433f8d17d95f8d05317440b4fad5ab2d33e596dc" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "winapi-util" version = "0.1.11" @@ -818,3 +913,23 @@ name = "wit-bindgen" version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + +[[package]] +name = "zerocopy" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index c9d15bb..1f88176 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ path = "examples/escape.rs" [features] force_aarch64_neon = [] # Force use of neon implementation on aarch64 -codspeed = [] +codspeed = ["criterion2/codspeed"] [[bench]] name = "escape" @@ -28,11 +28,13 @@ harness = false anyhow = "1" [dev-dependencies] -criterion = { version = "3.0.5", package = "codspeed-criterion-compat" } +criterion2 = "3" glob = "0.3" +serde = "1" serde_json = "1" v_jsonescape = "0.7" json-escape = "0.3.0" +sonic-rs = "0.5" [profile.bench] lto = true diff --git a/benches/escape.rs b/benches/escape.rs index ef36350..e5a6435 100644 --- a/benches/escape.rs +++ b/benches/escape.rs @@ -46,6 +46,14 @@ fn run_benchmarks(c: &mut Criterion, sources: &[String], prefix: &str) { }) }); #[cfg(not(feature = "codspeed"))] + c.bench_function(&format!("{} escape sonic", prefix), |b| { + b.iter(|| { + for source in sources { + black_box(sonic_rs::to_string(source).unwrap()); + } + }) + }); + #[cfg(not(feature = "codspeed"))] c.bench_function(&format!("{} escape v_jsonescape", prefix), |b| { b.iter(|| { for source in sources { diff --git a/examples/escape.rs b/examples/escape.rs index 7614380..18b4ea4 100644 --- a/examples/escape.rs +++ b/examples/escape.rs @@ -7,6 +7,8 @@ fn main() { let encoded = escape(&fixture); let encoded_fallback = escape_generic(&fixture); assert_eq!(encoded, encoded_fallback); + assert_eq!(encoded, sonic_rs::to_string(&fixture).unwrap()); + assert_eq!(encoded, serde_json::to_string(&fixture).unwrap()); } } From 64bdd07f54ff7fc3b18282b83f5bd3b2105a0886 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sat, 11 Oct 2025 
15:01:31 +0800
Subject: [PATCH 02/15] cross-page check

---
 plan.md        | 193 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/aarch64.rs |  40 ++++++++--
 src/x86.rs     |  99 ++++++++++++++++++++-----
 3 files changed, 310 insertions(+), 22 deletions(-)
 create mode 100644 plan.md

diff --git a/plan.md b/plan.md
new file mode 100644
index 0000000..5b2d3e0
--- /dev/null
+++ b/plan.md
@@ -0,0 +1,193 @@
+# Performance Analysis: sonic-rs vs json-escape-simd
+
+## Benchmark Results
+- **json-escape-simd**: 333.21 - 348.88 µs (median: ~341 µs)
+- **sonic-rs**: 205.62 - 210.19 µs (median: ~208 µs)
+- **Performance Gap**: sonic-rs is ~40% faster
+
+## Key Differences in Implementation
+
+### 1. Copy-First Strategy with Deferred Escaping
+
+**sonic-rs Approach:**
+```rust
+// Always copy the SIMD chunk first
+v.write_to_slice_unaligned_unchecked(dst_slice);
+let mask = escaped_mask(v);
+if mask.all_zero() {
+    // Fast path: no escapes, just advance pointers
+    advance_pointers();
+} else {
+    // Found escape, backtrack to handle it
+    let escape_pos = mask.first_offset();
+    adjust_and_escape();
+}
+```
+
+**json-escape-simd Approach:**
+```rust
+// Check for escapes first, copy only if clean
+if any_escape == 0 {
+    // Copy whole chunk
+    result.extend_from_slice(chunk);
+} else {
+    // Process each escape individually
+    process_mask_avx(...);
+}
+```
+
+**Why Copy-First is Faster:**
+- Reduces branches in the common case (most chunks have no escapes)
+- Better CPU pipeline utilization
+- Simpler control flow
+- Memory writes are buffered and can be overlapped with mask checking
+
+### 2. Pre-allocated Output Buffer with MaybeUninit
+
+**sonic-rs:**
+```rust
+// Pre-reserves worst-case buffer size upfront
+let buf = writer.reserve_with(value.len() * 6 + 32 + 3)?;
+// Works with MaybeUninit to avoid initialization overhead
+pub fn format_string(value: &str, dst: &mut [MaybeUninit<u8>], ...) -> usize
+```
+
+**json-escape-simd:**
+```rust
+// Uses Vec with potential dynamic growth
+let mut result = Vec::with_capacity(estimated_capacity);
+// Multiple extend_from_slice calls may trigger reallocation
+result.extend_from_slice(&bytes[start..i]);
+```
+
+**Benefits:**
+- No reallocation overhead during processing
+- No initialization cost for unused buffer space
+- Direct pointer arithmetic instead of Vec method calls
+
+### 3. Compact Escape Handling with Lookup Table
+
+**sonic-rs:**
+```rust
+// Pre-formatted escape sequences in 8-byte blocks
+pub const QUOTE_TAB: [(u8, [u8; 8]); 256] = [
+    (6, *b"\\u0000\0\0"), // Length and padded sequence
+    (2, *b"\\t\0\0\0\0\0\0"),
+    // ...
+];
+// Single memcpy for any escape type
+std::ptr::copy_nonoverlapping(QUOTE_TAB[ch].1.as_ptr(), dst, 8);
+dst += QUOTE_TAB[ch].0;
+```
+
+**json-escape-simd:**
+```rust
+// Conditional logic for each escape type
+fn write_escape(result: &mut Vec<u8>, escape_byte: u8, c: u8) {
+    result.push(b'\\');
+    if escape_byte == UU {
+        result.extend_from_slice(b"u00");
+        result.push(hex_digits.0);
+        result.push(hex_digits.1);
+    } else {
+        result.push(escape_byte);
+    }
+}
+```
+
+**Advantages:**
+- Single memory operation vs multiple pushes
+- No conditional branches in escape writing
+- Better memory locality (8-byte aligned writes)
+
+### 4. Simpler Mask Processing
+
+**sonic-rs:**
+- Uses `first_offset()` to find only the first escape
+- Handles escapes sequentially from that point
+- Minimal bit manipulation
+
+**json-escape-simd:**
+- Processes every set bit in the mask using `trailing_zeros()`
+- Complex bit manipulation loop (`mask &= mask - 1`)
+- More branches and iterations
+
+### 5. Cross-Page Boundary Optimization
+
+**sonic-rs includes page boundary checks:**
+```rust
+if check_cross_page(sptr, LANES) {
+    // Use temporary buffer to avoid potential page faults
+    std::ptr::copy_nonoverlapping(sptr, temp.as_mut_ptr(), remaining);
+    load(temp.as_ptr())
+}
+```
+
+This prevents potential page faults when reading past the end of allocated memory.
+
+## Optimization Recommendations
+
+### Priority 1: Adopt Copy-First Strategy
+- Modify the SIMD loops to always write chunks first
+- Only check for escapes after copying
+- Backtrack when escapes are found
+
+### Priority 2: Use Pre-allocated MaybeUninit Buffer
+```rust
+pub fn escape_into_uninit(input: &str, output: &mut [MaybeUninit<u8>]) -> usize {
+    // Work directly with MaybeUninit buffer
+    // Return actual bytes written
+}
+```
+
+### Priority 3: Implement Compact Escape Table
+```rust
+const ESCAPE_TABLE: [(u8, [u8; 8]); 256] = [
+    // Pre-format all escape sequences
+    // Use single memcpy for writing
+];
+```
+
+### Priority 4: Simplify Mask Processing
+- Process only first escape per chunk
+- Continue sequentially from escape point
+- Reduce bit manipulation overhead
+
+### Priority 5: Add Page Boundary Handling
+- Implement cross-page detection for tail processing
+- Use temporary buffer when crossing boundaries
+
+## Expected Performance Improvements
+
+Based on the analysis, implementing these optimizations should:
+1. **Copy-First Strategy**: 15-20% improvement
+2. **MaybeUninit Buffer**: 5-10% improvement
+3. **Compact Escape Table**: 10-15% improvement
+4. **Simplified Mask Processing**: 5-10% improvement
+5. **Page Boundary Handling**: 2-3% improvement (safety/stability)
+
+Combined, these changes could potentially close most of the 40% performance gap with sonic-rs.
+
+## Implementation Strategy
+
+1. **Phase 1**: Implement copy-first strategy (biggest impact)
+2. **Phase 2**: Add compact escape table
+3. **Phase 3**: Switch to MaybeUninit buffer
+4. **Phase 4**: Optimize mask processing
+5. **Phase 5**: Add page boundary handling ✅ **COMPLETED**
+
+Each phase should be benchmarked independently to measure impact.
+
+## Completed Optimizations
+
+### Page Boundary Handling (Phase 5) - COMPLETED
+
+Added page boundary checking to prevent potential page faults when reading past the end of input:
+
+- Added `check_cross_page` function with conditional compilation for Linux/macOS
+- Updated AVX512, AVX2, SSE2 tail handling to use temporary buffers when crossing page boundaries
+- Updated aarch64 NEON implementation with page boundary checks
+- On Linux/macOS: checks if reading would cross 4096-byte page boundary
+- On other platforms: always uses safe path with temporary buffer
+
+This optimization improves safety and stability without significant performance impact.
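+
+A minimal sketch of the boundary check (mirroring the `check_cross_page` function added in this patch; the 4096-byte page size is an assumption that holds for typical aarch64/x86_64 Linux and macOS configurations):
+
+```rust
+#[inline(always)]
+fn check_cross_page(ptr: *const u8, step: usize) -> bool {
+    const PAGE_SIZE: usize = 4096;
+    // True when reading `step` bytes from `ptr` would spill into the next page.
+    ((ptr as usize & (PAGE_SIZE - 1)) + step) > PAGE_SIZE
+}
+```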
\ No newline at end of file
diff --git a/src/aarch64.rs b/src/aarch64.rs
index 03a9f4e..e376c27 100644
--- a/src/aarch64.rs
+++ b/src/aarch64.rs
@@ -9,6 +9,24 @@ const CHUNK: usize = 64;
 const PREFETCH_DISTANCE: usize = CHUNK * 2;
 const SLASH_SENTINEL: u8 = 0xFF;
 
+#[inline(always)]
+fn check_cross_page(ptr: *const u8, step: usize) -> bool {
+    #[cfg(any(target_os = "linux", target_os = "macos"))]
+    {
+        // Check if reading 'step' bytes from 'ptr' would cross a page boundary
+        // Page size is typically 4096 bytes on aarch64 Linux and macOS
+        const PAGE_SIZE: usize = 4096;
+        ((ptr as usize & (PAGE_SIZE - 1)) + step) > PAGE_SIZE
+    }
+
+    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+    {
+        // On other platforms, always use the safe path with temporary buffer
+        // to avoid potential page faults
+        true
+    }
+}
+
 #[inline]
 pub fn escape_neon(bytes: &[u8], output: &mut Vec<u8>) {
     let n = bytes.len();
@@ -25,12 +43,24 @@ pub fn escape_neon(bytes: &[u8], output: &mut Vec<u8>) {
     while i + CHUNK <= n {
         let ptr = bytes.as_ptr().add(i);
 
-        core::arch::asm!(
-            "prfm pldl1keep, [{0}]",
-            in(reg) ptr.add(PREFETCH_DISTANCE),
-        );
+        // Only prefetch if we won't go past the end
+        if i + CHUNK + PREFETCH_DISTANCE <= n {
+            core::arch::asm!(
+                "prfm pldl1keep, [{0}]",
+                in(reg) ptr.add(PREFETCH_DISTANCE),
+            );
+        }
 
-        let quad = vld1q_u8_x4(ptr);
+        // Use temporary buffer if reading would cross page boundary
+        let quad = if i + CHUNK == n || !check_cross_page(ptr, CHUNK) {
+            // Safe to read directly
+            vld1q_u8_x4(ptr)
+        } else {
+            // Need to use temporary buffer
+            let mut temp = [0u8; CHUNK];
+            std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), CHUNK);
+            vld1q_u8_x4(temp.as_ptr())
+        };
 
         let a = quad.0;
         let b = quad.1;
diff --git a/src/x86.rs b/src/x86.rs
index c43ae22..9025cf7 100644
--- a/src/x86.rs
+++ b/src/x86.rs
@@ -30,6 +30,24 @@ fn sub(a: *const u8, b: *const u8) -> usize {
     (a as usize) - (b as usize)
 }
 
+#[inline(always)]
+fn check_cross_page(ptr: *const u8, step: usize) -> bool {
+    #[cfg(any(target_os = "linux", target_os = "macos"))]
+    {
+        // Check if reading 'step' bytes from 'ptr' would cross a page boundary
+        // Page size is typically 4096 bytes on x86_64 Linux and macOS
+        const PAGE_SIZE: usize = 4096;
+        ((ptr as usize & (PAGE_SIZE - 1)) + step) > PAGE_SIZE
+    }
+
+    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+    {
+        // On other platforms, always use the safe path with temporary buffer
+        // to avoid potential page faults
+        true
+    }
+}
+
 #[target_feature(enable = "avx512f", enable = "avx512bw")]
 #[inline]
 pub unsafe fn escape_avx512(bytes: &[u8], result: &mut Vec<u8>) {
@@ -199,14 +217,29 @@ pub unsafe fn escape_avx512(bytes: &[u8], result: &mut Vec<u8>) {
 
     // Handle tail
     if ptr < end_ptr {
-        let d = M512_VECTOR_SIZE - sub(end_ptr, ptr);
-        let a = _mm512_loadu_si512(ptr.sub(d) as *const __m512i);
+        let remaining = sub(end_ptr, ptr);
+        let d = M512_VECTOR_SIZE - remaining;
+
+        // Use temporary buffer if reading would cross page boundary
+        let a = if check_cross_page(ptr.sub(d), M512_VECTOR_SIZE) {
+            let mut temp = [0u8; M512_VECTOR_SIZE];
+            // Copy remaining bytes to the beginning of temp buffer
+            std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), remaining);
+            _mm512_loadu_si512(temp.as_ptr() as *const __m512i)
+        } else {
+            _mm512_loadu_si512(ptr.sub(d) as *const __m512i)
+        };
 
         let quote_mask = _mm512_cmpeq_epi8_mask(a, v_b);
         let slash_mask = _mm512_cmpeq_epi8_mask(a, v_c);
         let ctrl_mask = _mm512_cmplt_epu8_mask(a, v_ctrl_limit);
-        let mut mask = ((quote_mask | slash_mask | ctrl_mask) as u64).wrapping_shr(d as u32);
+        let mut mask = if check_cross_page(ptr.sub(d), M512_VECTOR_SIZE) {
+            // When using temp buffer, only check the valid bytes
+            (quote_mask | slash_mask | ctrl_mask) as u64 & ((1u64 << remaining) - 1)
+        } else {
+            ((quote_mask | slash_mask | ctrl_mask) as u64).wrapping_shr(d as u32)
+        };
 
         if mask != 0 {
             let at = sub(ptr, start_ptr);
@@ -415,15 +448,31 @@ pub unsafe fn escape_avx2(bytes: &[u8], result: &mut Vec<u8>) {
 
     // Handle tail
     if ptr < end_ptr {
-        let d = M256_VECTOR_SIZE - sub(end_ptr, ptr);
-        let mut mask = ({
-            let a = _mm256_loadu_si256(ptr.sub(d) as *const __m256i);
-            _mm256_movemask_epi8(_mm256_or_si256(
+        let remaining = sub(end_ptr, ptr);
+        let d = M256_VECTOR_SIZE - remaining;
+
+        // Use temporary buffer if reading would cross page boundary
+        let a = if check_cross_page(ptr.sub(d), M256_VECTOR_SIZE) {
+            let mut temp = [0u8; M256_VECTOR_SIZE];
+            // Copy remaining bytes to the beginning of temp buffer
+            std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), remaining);
+            _mm256_loadu_si256(temp.as_ptr() as *const __m256i)
+        } else {
+            _mm256_loadu_si256(ptr.sub(d) as *const __m256i)
+        };
+
+        let mut mask = if check_cross_page(ptr.sub(d), M256_VECTOR_SIZE) {
+            // When using temp buffer, only check the valid bytes
+            (_mm256_movemask_epi8(_mm256_or_si256(
                 _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)),
                 _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a),
-            ))
-        } as u32)
-        .wrapping_shr(d as u32);
+            )) as u32) & ((1u32 << remaining) - 1)
+        } else {
+            (_mm256_movemask_epi8(_mm256_or_si256(
+                _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)),
+                _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a),
+            )) as u32).wrapping_shr(d as u32)
+        };
 
         if mask != 0 {
             let at = sub(ptr, start_ptr);
@@ -544,15 +593,31 @@ pub unsafe fn escape_sse2(bytes: &[u8], result: &mut Vec<u8>) {
 
     // Handle tail
    if ptr < end_ptr {
-        let d = M128_VECTOR_SIZE - sub(end_ptr, ptr);
-        let mut mask = ({
-            let a = _mm_loadu_si128(ptr.sub(d) as *const __m128i);
-            _mm_movemask_epi8(_mm_or_si128(
+        let remaining = sub(end_ptr, ptr);
+        let d = M128_VECTOR_SIZE - remaining;
+
+        // Use temporary buffer if reading would cross page boundary
+        let a = if check_cross_page(ptr.sub(d), M128_VECTOR_SIZE) {
+            let mut temp = [0u8; M128_VECTOR_SIZE];
+            // Copy remaining bytes to the beginning of temp buffer
+            std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), remaining);
+            _mm_loadu_si128(temp.as_ptr() as *const __m128i)
+        } else {
+            _mm_loadu_si128(ptr.sub(d) as *const __m128i)
+        };
+
+        let mut mask = if check_cross_page(ptr.sub(d), M128_VECTOR_SIZE) {
+            // When using temp buffer, only check the valid bytes
+            (_mm_movemask_epi8(_mm_or_si128(
                 _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)),
                 _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a),
-            ))
-        } as u16)
-        .wrapping_shr(d as u32);
+            )) as u16) & ((1u16 << remaining) - 1)
+        } else {
+            (_mm_movemask_epi8(_mm_or_si128(
+                _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)),
+                _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a),
+            )) as u16).wrapping_shr(d as u32)
+        };
 
         if mask != 0 {
             let at = sub(ptr, start_ptr);

From 53d9beb460b452fbbfff484880f1a92b321c9d03 Mon Sep 17 00:00:00 2001
From: LongYinan 
Date: Sat, 11 Oct 2025 15:07:45 +0800
Subject: [PATCH 03/15] save

---
 plan.md        |  26 ++++++++++++-
 src/aarch64.rs |  43 ++++++++++++++++-----
 src/x86.rs     | 103 ++++++++++++++++++++++++++++---------------------
 3 files changed, 116 insertions(+), 56 deletions(-)

diff 
--git a/plan.md b/plan.md index 5b2d3e0..7b458f3 100644 --- a/plan.md +++ b/plan.md @@ -173,7 +173,7 @@ Combined, these changes could potentially close most of the 40% performance gap 1. **Phase 1**: Implement copy-first strategy (biggest impact) 2. **Phase 2**: Add compact escape table 3. **Phase 3**: Switch to MaybeUninit buffer -4. **Phase 4**: Optimize mask processing +4. **Phase 4**: Optimize mask processing ✅ **COMPLETED** 5. **Phase 5**: Add page boundary handling ✅ **COMPLETED** Each phase should be benchmarked independently to measure impact. @@ -190,4 +190,26 @@ Added page boundary checking to prevent potential page faults when reading past - On Linux/macOS: checks if reading would cross 4096-byte page boundary - On other platforms: always uses safe path with temporary buffer -This optimization improves safety and stability without significant performance impact. \ No newline at end of file +This optimization improves safety and stability without significant performance impact (~1.5% improvement). + +### Simplified Mask Processing (Phase 4) - COMPLETED + +Optimized how escape characters are processed when found in SIMD chunks: + +**Previous approach:** +- Used bit manipulation loop with `trailing_zeros()` and `mask &= mask - 1` +- Processed every set bit in the mask individually +- Multiple branches and iterations + +**New approach:** +- Find first escape position with single `trailing_zeros()` call +- Copy everything before first escape in one operation +- Process bytes sequentially from first escape position +- Reduced bit manipulation overhead + +**Changes made:** +- Updated `process_mask_avx` and `process_mask_avx512` helper functions +- Simplified AVX512, AVX2, SSE2 tail handling mask processing +- Optimized aarch64 `handle_block` function with same approach + +This reduces CPU cycles spent on bit manipulation and improves branch prediction. 
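+
+Condensed sketch of the new per-chunk flow (names simplified; `chunk` and `out` stand in for the real pointer arithmetic in the SIMD paths):
+
+```rust
+let first = mask.trailing_zeros() as usize; // single scan for the first escape
+out.extend_from_slice(&chunk[..first]);     // bulk-copy the clean prefix
+for &c in &chunk[first..] {                 // finish the chunk sequentially
+    match ESCAPE[c as usize] {
+        0 => out.push(c),
+        e => write_escape(&mut out, e, c),
+    }
+}
+```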
\ No newline at end of file
diff --git a/src/aarch64.rs b/src/aarch64.rs
index e376c27..40c0c26 100644
--- a/src/aarch64.rs
+++ b/src/aarch64.rs
@@ -122,15 +122,40 @@ fn handle_tail(src: &[u8], dst: &mut Vec<u8>) {
 
 #[inline(always)]
 fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec<u8>) {
-    for (j, &m) in mask.iter().enumerate() {
-        let c = src[j];
-        if m == 0 {
-            dst.push(c);
-        } else if m == SLASH_SENTINEL {
-            dst.push(b'\\');
-            dst.push(b'\\');
-        } else {
-            write_escape(dst, m, c);
+    // Find first escape position
+    let mut first_escape = None;
+    for (i, &m) in mask.iter().enumerate() {
+        if m != 0 {
+            first_escape = Some(i);
+            break;
+        }
+    }
+
+    match first_escape {
+        None => {
+            // No escapes, copy all bytes
+            dst.extend_from_slice(src);
+        }
+        Some(pos) => {
+            // Copy everything before first escape
+            if pos > 0 {
+                dst.extend_from_slice(&src[0..pos]);
+            }
+
+            // Process from first escape position
+            for j in pos..16 {
+                let c = src[j];
+                let m = mask[j];
+
+                if m == 0 {
+                    dst.push(c);
+                } else if m == SLASH_SENTINEL {
+                    dst.push(b'\\');
+                    dst.push(b'\\');
+                } else {
+                    write_escape(dst, m, c);
+                }
+            }
         }
     }
 }
diff --git a/src/x86.rs b/src/x86.rs
index 9025cf7..af9ad74 100644
--- a/src/x86.rs
+++ b/src/x86.rs
@@ -621,24 +621,25 @@ pub unsafe fn escape_sse2(bytes: &[u8], result: &mut Vec<u8>) {
 
     if mask != 0 {
         let at = sub(ptr, start_ptr);
-        let mut cur = mask.trailing_zeros() as usize;
-        loop {
-            let c = *ptr.add(cur);
+        let first_escape = mask.trailing_zeros() as usize;
+
+        // Copy everything before the first escape
+        let i = at + first_escape;
+        if start < i {
+            result.extend_from_slice(&bytes[start..i]);
+        }
+
+        // Process bytes sequentially from the first escape position
+        for pos in first_escape..remaining {
+            let c = *ptr.add(pos);
             let escape_byte = ESCAPE[c as usize];
             if escape_byte != 0 {
-                let i = at + cur;
-                if start < i {
-                    result.extend_from_slice(&bytes[start..i]);
-                }
                 write_escape(result, escape_byte, c);
-                start = i + 1;
-            }
-            mask ^= 1 << cur;
-            if mask == 0 {
-                break;
+            } else {
+                result.push(c);
             }
-            cur = mask.trailing_zeros() as usize;
         }
+        start = at + remaining;
     }
 }
 
@@ -665,26 +666,32 @@ unsafe fn process_mask_avx(
     let ptr = ptr.add(offset);
     let at = sub(ptr, start_ptr);
 
-    // Process mask bits using bit manipulation
-    let mut remaining = mask as u32;
-    while remaining != 0 {
-        let cur = remaining.trailing_zeros() as usize;
-        let c = *ptr.add(cur);
+    // Find the first escape position
+    let first_escape = (mask as u32).trailing_zeros() as usize;
+
+    // Copy everything before the first escape
+    let i = at + first_escape;
+    if *start < i {
+        result.extend_from_slice(&bytes[*start..i]);
+    }
+
+    // Process bytes sequentially from the first escape position
+    let mut pos = first_escape;
+    let end = at + M256_VECTOR_SIZE;
+
+    while pos < M256_VECTOR_SIZE {
+        let c = *ptr.add(pos);
         let escape_byte = ESCAPE[c as usize];
-        debug_assert!(escape_byte != 0);
 
-        let i = at + cur;
-        // Copy unescaped portion if needed
-        if *start < i {
-            result.extend_from_slice(&bytes[*start..i]);
+        if escape_byte != 0 {
+            write_escape(result, escape_byte, c);
+        } else {
+            result.push(c);
         }
 
-        // Write escape sequence
-        write_escape(result, escape_byte, c);
-        *start = i + 1;
-
-        // Clear the lowest set bit
-        remaining &= remaining - 1;
+        pos += 1;
    }
+
+    *start = end;
 }
 
#[inline(always)]
@@ -704,26 +711,32 @@ unsafe fn process_mask_avx512(
     let ptr = ptr.add(offset);
     let at = sub(ptr, start_ptr);
 
-    // Process mask bits using bit manipulation
-    let mut remaining = mask;
-    while remaining != 0 {
-        let cur = 
remaining.trailing_zeros() as usize; - let c = *ptr.add(cur); + // Find the first escape position + let first_escape = mask.trailing_zeros() as usize; + + // Copy everything before the first escape + let i = at + first_escape; + if *start < i { + result.extend_from_slice(&bytes[*start..i]); + } + + // Process bytes sequentially from the first escape position + let mut pos = first_escape; + let end = at + M512_VECTOR_SIZE; + + while pos < M512_VECTOR_SIZE { + let c = *ptr.add(pos); let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - // Copy unescaped portion if needed - if *start < i { - result.extend_from_slice(&bytes[*start..i]); + if escape_byte != 0 { + write_escape(result, escape_byte, c); + } else { + result.push(c); } - // Write escape sequence - write_escape(result, escape_byte, c); - *start = i + 1; - - // Clear the lowest set bit - remaining &= remaining - 1; + pos += 1; } + + *start = end; } #[inline(always)] From 95e2d8cd1a649c295888473c14dd84115979370a Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sat, 11 Oct 2025 15:14:37 +0800 Subject: [PATCH 04/15] Revert "save" This reverts commit 53d9beb460b452fbbfff484880f1a92b321c9d03. --- plan.md | 26 +------------ src/aarch64.rs | 43 +++++---------------- src/x86.rs | 103 +++++++++++++++++++++---------------------------- 3 files changed, 56 insertions(+), 116 deletions(-) diff --git a/plan.md b/plan.md index 7b458f3..5b2d3e0 100644 --- a/plan.md +++ b/plan.md @@ -173,7 +173,7 @@ Combined, these changes could potentially close most of the 40% performance gap 1. **Phase 1**: Implement copy-first strategy (biggest impact) 2. **Phase 2**: Add compact escape table 3. **Phase 3**: Switch to MaybeUninit buffer -4. **Phase 4**: Optimize mask processing ✅ **COMPLETED** +4. **Phase 4**: Optimize mask processing 5. **Phase 5**: Add page boundary handling ✅ **COMPLETED** Each phase should be benchmarked independently to measure impact. @@ -190,26 +190,4 @@ Added page boundary checking to prevent potential page faults when reading past - On Linux/macOS: checks if reading would cross 4096-byte page boundary - On other platforms: always uses safe path with temporary buffer -This optimization improves safety and stability without significant performance impact (~1.5% improvement). - -### Simplified Mask Processing (Phase 4) - COMPLETED - -Optimized how escape characters are processed when found in SIMD chunks: - -**Previous approach:** -- Used bit manipulation loop with `trailing_zeros()` and `mask &= mask - 1` -- Processed every set bit in the mask individually -- Multiple branches and iterations - -**New approach:** -- Find first escape position with single `trailing_zeros()` call -- Copy everything before first escape in one operation -- Process bytes sequentially from first escape position -- Reduced bit manipulation overhead - -**Changes made:** -- Updated `process_mask_avx` and `process_mask_avx512` helper functions -- Simplified AVX512, AVX2, SSE2 tail handling mask processing -- Optimized aarch64 `handle_block` function with same approach - -This reduces CPU cycles spent on bit manipulation and improves branch prediction. \ No newline at end of file +This optimization improves safety and stability without significant performance impact. 
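+
+One detail worth keeping for the tail path: when the temporary buffer is used, the escape mask must be truncated to the `remaining` valid bytes (sketch; `raw_mask` stands for the combined movemask):
+
+```rust
+let valid = (1u64 << remaining) - 1; // keep only the low `remaining` bits
+let mask = raw_mask & valid;         // lanes past the end of input are ignored
+```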
\ No newline at end of file
diff --git a/src/aarch64.rs b/src/aarch64.rs
index 40c0c26..e376c27 100644
--- a/src/aarch64.rs
+++ b/src/aarch64.rs
@@ -122,40 +122,15 @@ fn handle_tail(src: &[u8], dst: &mut Vec<u8>) {
 
 #[inline(always)]
 fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec<u8>) {
-    // Find first escape position
-    let mut first_escape = None;
-    for (i, &m) in mask.iter().enumerate() {
-        if m != 0 {
-            first_escape = Some(i);
-            break;
-        }
-    }
-
-    match first_escape {
-        None => {
-            // No escapes, copy all bytes
-            dst.extend_from_slice(src);
-        }
-        Some(pos) => {
-            // Copy everything before first escape
-            if pos > 0 {
-                dst.extend_from_slice(&src[0..pos]);
-            }
-
-            // Process from first escape position
-            for j in pos..16 {
-                let c = src[j];
-                let m = mask[j];
-
-                if m == 0 {
-                    dst.push(c);
-                } else if m == SLASH_SENTINEL {
-                    dst.push(b'\\');
-                    dst.push(b'\\');
-                } else {
-                    write_escape(dst, m, c);
-                }
-            }
+    for (j, &m) in mask.iter().enumerate() {
+        let c = src[j];
+        if m == 0 {
+            dst.push(c);
+        } else if m == SLASH_SENTINEL {
+            dst.push(b'\\');
+            dst.push(b'\\');
+        } else {
+            write_escape(dst, m, c);
         }
     }
 }
diff --git a/src/x86.rs b/src/x86.rs
index af9ad74..9025cf7 100644
--- a/src/x86.rs
+++ b/src/x86.rs
@@ -621,25 +621,24 @@ pub unsafe fn escape_sse2(bytes: &[u8], result: &mut Vec<u8>) {
 
     if mask != 0 {
         let at = sub(ptr, start_ptr);
-        let first_escape = mask.trailing_zeros() as usize;
-
-        // Copy everything before the first escape
-        let i = at + first_escape;
-        if start < i {
-            result.extend_from_slice(&bytes[start..i]);
-        }
-
-        // Process bytes sequentially from the first escape position
-        for pos in first_escape..remaining {
-            let c = *ptr.add(pos);
+        let mut cur = mask.trailing_zeros() as usize;
+        loop {
+            let c = *ptr.add(cur);
             let escape_byte = ESCAPE[c as usize];
             if escape_byte != 0 {
+                let i = at + cur;
+                if start < i {
+                    result.extend_from_slice(&bytes[start..i]);
+                }
                 write_escape(result, escape_byte, c);
-            } else {
-                result.push(c);
+                start = i + 1;
+            }
+            mask ^= 1 << cur;
+            if mask == 0 {
+                break;
             }
+            cur = mask.trailing_zeros() as usize;
         }
-        start = at + remaining;
     }
 }
 
@@ -666,32 +665,26 @@ unsafe fn process_mask_avx(
     let ptr = ptr.add(offset);
     let at = sub(ptr, start_ptr);
 
-    // Find the first escape position
-    let first_escape = (mask as u32).trailing_zeros() as usize;
-
-    // Copy everything before the first escape
-    let i = at + first_escape;
-    if *start < i {
-        result.extend_from_slice(&bytes[*start..i]);
-    }
-
-    // Process bytes sequentially from the first escape position
-    let mut pos = first_escape;
-    let end = at + M256_VECTOR_SIZE;
-
-    while pos < M256_VECTOR_SIZE {
-        let c = *ptr.add(pos);
+    // Process mask bits using bit manipulation
+    let mut remaining = mask as u32;
+    while remaining != 0 {
+        let cur = remaining.trailing_zeros() as usize;
+        let c = *ptr.add(cur);
         let escape_byte = ESCAPE[c as usize];
+        debug_assert!(escape_byte != 0);
 
-        if escape_byte != 0 {
-            write_escape(result, escape_byte, c);
-        } else {
-            result.push(c);
+        let i = at + cur;
+        // Copy unescaped portion if needed
+        if *start < i {
+            result.extend_from_slice(&bytes[*start..i]);
         }
 
-        pos += 1;
-    }
+        // Write escape sequence
+        write_escape(result, escape_byte, c);
+        *start = i + 1;
 
-    *start = end;
+        // Clear the lowest set bit
+        remaining &= remaining - 1;
+    }
 }
 
 #[inline(always)]
@@ -711,32 +704,26 @@ unsafe fn process_mask_avx512(
     let ptr = ptr.add(offset);
     let at = sub(ptr, start_ptr);
 
-    // Find the first escape position
-    let first_escape = mask.trailing_zeros() as usize;
-
-    // Copy everything before the first escape
-    let i = at + first_escape;
-    if *start < i {
-        result.extend_from_slice(&bytes[*start..i]);
-    }
-
-    // Process bytes sequentially from the first escape position
-    let mut pos = first_escape;
-    let end = at + M512_VECTOR_SIZE;
-
-    while pos < M512_VECTOR_SIZE {
-        let c = *ptr.add(pos);
+    // Process mask bits using bit manipulation
+    let mut remaining = mask;
+    while remaining != 0 {
+        let cur = remaining.trailing_zeros() as usize;
+        let c = *ptr.add(cur);
         let escape_byte = ESCAPE[c as usize];
+        debug_assert!(escape_byte != 0);
 
-        if escape_byte != 0 {
-            write_escape(result, escape_byte, c);
-        } else {
-            result.push(c);
+        let i = at + cur;
+        // Copy unescaped portion if needed
+        if *start < i {
+            result.extend_from_slice(&bytes[*start..i]);
         }
 
-        pos += 1;
-    }
+        // Write escape sequence
+        write_escape(result, escape_byte, c);
+        *start = i + 1;
 
-    *start = end;
+        // Clear the lowest set bit
+        remaining &= remaining - 1;
+    }
 }
 
 #[inline(always)]

From 846c1a7dfcd02d6e7bdcaa934039a0a4563406a4 Mon Sep 17 00:00:00 2001
From: LongYinan 
Date: Sat, 11 Oct 2025 15:46:31 +0800
Subject: [PATCH 05/15] optimize mask processing

---
 src/aarch64.rs |  39 ++++++-----
 src/x86.rs     | 150 ++++++++++++++++++++-----------------
 2 files changed, 106 insertions(+), 83 deletions(-)

diff --git a/src/aarch64.rs b/src/aarch64.rs
index e376c27..e457d9d 100644
--- a/src/aarch64.rs
+++ b/src/aarch64.rs
@@ -83,21 +83,34 @@ pub fn escape_neon(bytes: &[u8], output: &mut Vec<u8>) {
             continue;
         }
 
-        macro_rules! handle {
-            ($mask:expr, $mask_r:expr, $off:expr) => {
-                if $mask_r == 0 {
-                    output.extend_from_slice(std::slice::from_raw_parts(ptr.add($off), 16));
-                } else {
-                    vst1q_u8(placeholder.as_mut_ptr(), $mask);
-                    handle_block(&bytes[i + $off..i + $off + 16], &placeholder, output);
-                }
-            };
+        // Process each 16-byte chunk that has escapes
+        if mask_r_1 != 0 {
+            vst1q_u8(placeholder.as_mut_ptr(), mask_1);
+            handle_block(&bytes[i..i + 16], &placeholder, output);
+        } else {
+            output.extend_from_slice(std::slice::from_raw_parts(ptr, 16));
         }
 
-        handle!(mask_1, mask_r_1, 0);
-        handle!(mask_2, mask_r_2, 16);
-        handle!(mask_3, mask_r_3, 32);
-        handle!(mask_4, mask_r_4, 48);
+        if mask_r_2 != 0 {
+            vst1q_u8(placeholder.as_mut_ptr(), mask_2);
+            handle_block(&bytes[i + 16..i + 32], &placeholder, output);
+        } else {
+            output.extend_from_slice(std::slice::from_raw_parts(ptr.add(16), 16));
+        }
+
+        if mask_r_3 != 0 {
+            vst1q_u8(placeholder.as_mut_ptr(), mask_3);
+            handle_block(&bytes[i + 32..i + 48], &placeholder, output);
+        } else {
+            output.extend_from_slice(std::slice::from_raw_parts(ptr.add(32), 16));
+        }
+
+        if mask_r_4 != 0 {
+            vst1q_u8(placeholder.as_mut_ptr(), mask_4);
+            handle_block(&bytes[i + 48..i + 64], &placeholder, output);
+        } else {
+            output.extend_from_slice(std::slice::from_raw_parts(ptr.add(48), 16));
+        }
 
         i += CHUNK;
     }
diff --git a/src/x86.rs b/src/x86.rs
index 9025cf7..c3ea6e1 100644
--- a/src/x86.rs
+++ b/src/x86.rs
@@ -145,41 +145,50 @@ pub unsafe fn escape_avx512(bytes: &[u8], result: &mut Vec<u8>) {
 
         if any_escape == 0 {
             // No escapes needed, copy whole chunk
-            if start < sub(ptr, start_ptr) {
-                result.extend_from_slice(&bytes[start..sub(ptr, start_ptr)]);
+            let at = sub(ptr, start_ptr);
+            if start < at {
+                result.extend_from_slice(&bytes[start..at]);
             }
             result.extend_from_slice(std::slice::from_raw_parts(ptr, LOOP_SIZE_AVX512));
-            start = sub(ptr, start_ptr) + LOOP_SIZE_AVX512;
+            start = at + LOOP_SIZE_AVX512;
         } else {
             // Process each 64-byte chunk that has escapes
-            process_mask_avx512(ptr, start_ptr, result, &mut start, bytes, mask_a, 0);
-            process_mask_avx512(
-                ptr,
-                start_ptr,
-                result,
-                &mut start,
-                bytes,
-                mask_b,
-                M512_VECTOR_SIZE,
-            );
-            process_mask_avx512(
-                ptr,
-                start_ptr,
-                result,
-                &mut start,
-                bytes,
-                mask_c,
-                M512_VECTOR_SIZE * 2,
-            );
-            process_mask_avx512(
-                ptr,
-                start_ptr,
-                result,
-                &mut start,
-                bytes,
-                mask_d,
-                M512_VECTOR_SIZE * 3,
-            );
+            if mask_a != 0 {
+                process_mask_avx512(ptr, start_ptr, result, &mut start, bytes, mask_a, 0);
+            }
+            if mask_b != 0 {
+                process_mask_avx512(
+                    ptr,
+                    start_ptr,
+                    result,
+                    &mut start,
+                    bytes,
+                    mask_b,
+                    M512_VECTOR_SIZE,
+                );
+            }
+            if mask_c != 0 {
+                process_mask_avx512(
+                    ptr,
+                    start_ptr,
+                    result,
+                    &mut start,
+                    bytes,
+                    mask_c,
+                    M512_VECTOR_SIZE * 2,
+                );
+            }
+            if mask_d != 0 {
+                process_mask_avx512(
+                    ptr,
+                    start_ptr,
+                    result,
+                    &mut start,
+                    bytes,
+                    mask_d,
+                    M512_VECTOR_SIZE * 3,
+                );
+            }
         }
 
         ptr = ptr.add(LOOP_SIZE_AVX512);
@@ -365,11 +374,12 @@ pub unsafe fn escape_avx2(bytes: &[u8], result: &mut Vec<u8>) {
 
         if _mm256_movemask_epi8(any_escape) == 0 {
             // No escapes needed, copy whole chunk
-            if start < sub(ptr, start_ptr) {
-                result.extend_from_slice(&bytes[start..sub(ptr, start_ptr)]);
+            let at = sub(ptr, start_ptr);
+            if start < at {
+                result.extend_from_slice(&bytes[start..at]);
             }
             result.extend_from_slice(std::slice::from_raw_parts(ptr, LOOP_SIZE_AVX2));
-            start = sub(ptr, start_ptr) + LOOP_SIZE_AVX2;
+            start = at + LOOP_SIZE_AVX2;
         } else {
             // Get individual masks only when needed
             let mask_a = _mm256_movemask_epi8(cmp_a);
             let mask_b = _mm256_movemask_epi8(cmp_b);
             let mask_c = _mm256_movemask_epi8(cmp_c);
             let mask_d = _mm256_movemask_epi8(cmp_d);
 
             // Process each 32-byte chunk that has escapes
-            process_mask_avx(ptr, start_ptr, result, &mut start, bytes, mask_a, 0);
-            process_mask_avx(
-                ptr,
-                start_ptr,
-                result,
-                &mut start,
-                bytes,
-                mask_b,
-                M256_VECTOR_SIZE,
-            );
-            process_mask_avx(
-                ptr,
-                start_ptr,
-                result,
-                &mut start,
-                bytes,
-                mask_c,
-                M256_VECTOR_SIZE * 2,
-            );
-            process_mask_avx(
-                ptr,
-                start_ptr,
-                result,
-                &mut start,
-                bytes,
-                mask_d,
-                M256_VECTOR_SIZE * 3,
-            );
+            if mask_a != 0 {
+                process_mask_avx(ptr, start_ptr, result, &mut start, bytes, mask_a, 0);
+            }
+            if mask_b != 0 {
+                process_mask_avx(
+                    ptr,
+                    start_ptr,
+                    result,
+                    &mut start,
+                    bytes,
+                    mask_b,
+                    M256_VECTOR_SIZE,
+                );
+            }
+            if mask_c != 0 {
+                process_mask_avx(
+                    ptr,
+                    start_ptr,
+                    result,
+                    &mut start,
+                    bytes,
+                    mask_c,
+                    M256_VECTOR_SIZE * 2,
+                );
+            }
+            if mask_d != 0 {
+                process_mask_avx(
+                    ptr,
+                    start_ptr,
+                    result,
+                    &mut start,
+                    bytes,
+                    mask_d,
+                    M256_VECTOR_SIZE * 3,
+                );
+            }
         }
 
         ptr = ptr.add(LOOP_SIZE_AVX2);
@@ -658,10 +676,6 @@ unsafe fn process_mask_avx(
     mask: i32,
     offset: usize,
 ) {
-    if mask == 0 {
-        return;
-    }
-
     let ptr = ptr.add(offset);
     let at = sub(ptr, start_ptr);
 
@@ -697,10 +711,6 @@ unsafe fn process_mask_avx512(
     mask: u64,
     offset: usize,
 ) {
-    if mask == 0 {
-        return;
-    }
-
     let ptr = ptr.add(offset);
     let at = sub(ptr, start_ptr);
 

From e100960a8688ea70d5b61356a2c4c1c3e259001e Mon Sep 17 00:00:00 2001
From: LongYinan 
Date: Sat, 11 Oct 2025 15:58:25 +0800
Subject: [PATCH 06/15] copy_nonoverlapping

---
 src/aarch64.rs | 31 ++++++++++++++------
 src/generic.rs | 46 +++++++++++++++++++++++++++++
 src/x86.rs     | 78 +++++++++++++++++++++++++++---------------------
 3 files changed, 114 insertions(+), 41 deletions(-)

diff --git a/src/aarch64.rs b/src/aarch64.rs
index e457d9d..e746a70 100644
--- a/src/aarch64.rs
+++ b/src/aarch64.rs
@@ -2,7 +2,7 @@ use std::arch::aarch64::{
     vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8,
 };
 
-use crate::generic::{ESCAPE, HEX_BYTES, UU};
+use crate::generic::{ESCAPE, ESCAPE_TABLE, HEX_BYTES, UU};
 
 const CHUNK: usize = 64;
 // 128 bytes ahead
@@ -150,13 +150,28 @@ fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec<u8>) {
 
 #[inline(always)]
 fn write_escape(dst: &mut Vec<u8>, escape_byte: u8, c: u8) {
-    dst.push(b'\\');
-    if escape_byte == UU {
-        dst.extend_from_slice(b"u00");
-        let hex = &HEX_BYTES[c as usize];
-        dst.push(hex.0);
-        dst.push(hex.1);
+    // Use optimized escape table for bulk writing
+    let (len, bytes) = ESCAPE_TABLE[c as usize];
+    if len > 0 {
+        // Ensure we have enough capacity for the escape sequence
+        dst.reserve(len as usize);
+        unsafe {
+            let ptr = dst.as_mut_ptr().add(dst.len());
+            // Use copy_nonoverlapping for fast bulk copy
+            std::ptr::copy_nonoverlapping(bytes.as_ptr(), ptr, 8);
+            // Update the length - only add the actual escape sequence length
+            dst.set_len(dst.len() + len as usize);
+        }
     } else {
-        dst.push(escape_byte);
+        // Fallback to old method for characters not in the table
+        dst.push(b'\\');
+        if escape_byte == UU {
+            dst.extend_from_slice(b"u00");
+            let hex = &HEX_BYTES[c as usize];
+            dst.push(hex.0);
+            dst.push(hex.1);
+        } else {
+            dst.push(escape_byte);
+        }
     }
 }
diff --git a/src/generic.rs b/src/generic.rs
index 8e73929..9ff9d32 100644
--- a/src/generic.rs
+++ b/src/generic.rs
@@ -140,3 +140,49 @@ pub(crate) static HEX_BYTES: [HexPair; 32] = [
     HexPair(b'1', b'e'),
     HexPair(b'1', b'f'),
 ];
+
+// Optimized escape table with 8-byte arrays for fast bulk writing
+// First element is the length of escape sequence, followed by the escape bytes
+pub(crate) static ESCAPE_TABLE: [(u8, [u8; 8]); 256] = {
+    let mut table = [(0u8, [0u8; 8]); 256];
+
+    // Control characters \u0000 - \u001f
+    table[0x00] = (6, *b"\\u0000\0\0");
+    table[0x01] = (6, *b"\\u0001\0\0");
+    table[0x02] = (6, *b"\\u0002\0\0");
+    table[0x03] = (6, *b"\\u0003\0\0");
+    table[0x04] = (6, *b"\\u0004\0\0");
+    table[0x05] = (6, *b"\\u0005\0\0");
+    table[0x06] = (6, *b"\\u0006\0\0");
+    table[0x07] = (6, *b"\\u0007\0\0");
+    table[0x08] = (2, *b"\\b\0\0\0\0\0\0");
+    table[0x09] = (2, *b"\\t\0\0\0\0\0\0");
+    table[0x0A] = (2, *b"\\n\0\0\0\0\0\0");
+    table[0x0B] = (6, *b"\\u000b\0\0");
+    table[0x0C] = (2, *b"\\f\0\0\0\0\0\0");
+    table[0x0D] = (2, *b"\\r\0\0\0\0\0\0");
+    table[0x0E] = (6, *b"\\u000e\0\0");
+    table[0x0F] = (6, *b"\\u000f\0\0");
+    table[0x10] = (6, *b"\\u0010\0\0");
+    table[0x11] = (6, *b"\\u0011\0\0");
+    table[0x12] = (6, *b"\\u0012\0\0");
+    table[0x13] = (6, *b"\\u0013\0\0");
+    table[0x14] = (6, *b"\\u0014\0\0");
+    table[0x15] = (6, *b"\\u0015\0\0");
+    table[0x16] = (6, *b"\\u0016\0\0");
+    table[0x17] = (6, *b"\\u0017\0\0");
+    table[0x18] = (6, *b"\\u0018\0\0");
+    table[0x19] = (6, *b"\\u0019\0\0");
+    table[0x1A] = (6, *b"\\u001a\0\0");
+    table[0x1B] = (6, *b"\\u001b\0\0");
+    table[0x1C] = (6, *b"\\u001c\0\0");
+    table[0x1D] = (6, *b"\\u001d\0\0");
+    table[0x1E] = (6, *b"\\u001e\0\0");
+    table[0x1F] = (6, *b"\\u001f\0\0");
+
+    // Special characters
+    table[0x22] = (2, *b"\\\"\0\0\0\0\0\0"); // "
+    table[0x5C] = (2, *b"\\\\\0\0\0\0\0\0"); // \
+
+    table
+};
diff --git a/src/x86.rs b/src/x86.rs
index c3ea6e1..c3898fc 100644
--- a/src/x86.rs
+++ b/src/x86.rs
@@ -8,7 +8,7 @@ use std::arch::x86_64::{
     _mm512_cmplt_epu8_mask, _mm512_load_si512, _mm512_loadu_si512, _mm512_set1_epi8,
 };
 
-use crate::generic::{ESCAPE, HEX_BYTES, UU};
+use crate::generic::{ESCAPE, ESCAPE_TABLE, HEX_BYTES, UU};
 
 // Constants for control character detection using signed comparison trick
 const TRANSLATION_A: i8 = i8::MAX - 31i8;
@@ -344,29 +344,24 @@ pub unsafe fn escape_avx2(bytes: &[u8], result: &mut Vec<u8>) {
         let a2 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE * 2) as *const __m256i);
         let a3 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE * 3) as *const __m256i);
 
-        // Check for quotes (") in all vectors
-        let quote_0 = _mm256_cmpeq_epi8(a0, v_b);
-        let quote_1 = _mm256_cmpeq_epi8(a1, v_b);
-        let quote_2 = _mm256_cmpeq_epi8(a2, v_b);
-        let quote_3 = _mm256_cmpeq_epi8(a3, v_b);
-
-        // Check for backslash (\) in all vectors
-        let slash_0 = _mm256_cmpeq_epi8(a0, v_c);
-        let slash_1 = _mm256_cmpeq_epi8(a1, v_c);
-        let slash_2 = _mm256_cmpeq_epi8(a2, v_c);
-        let slash_3 = _mm256_cmpeq_epi8(a3, v_c);
-
-        // Check for control characters (< 0x20) in all vectors
-        let ctrl_0 = _mm256_cmpgt_epi8(_mm256_add_epi8(a0, v_translation_a), v_below_a);
-        let ctrl_1 = _mm256_cmpgt_epi8(_mm256_add_epi8(a1, v_translation_a), v_below_a);
-        let ctrl_2 = _mm256_cmpgt_epi8(_mm256_add_epi8(a2, v_translation_a), v_below_a);
-        let ctrl_3 = _mm256_cmpgt_epi8(_mm256_add_epi8(a3, v_translation_a), v_below_a);
-
-        // Combine all masks
-        let cmp_a = _mm256_or_si256(_mm256_or_si256(quote_0, slash_0), ctrl_0);
-        let cmp_b = _mm256_or_si256(_mm256_or_si256(quote_1, slash_1), ctrl_1);
-        let cmp_c = _mm256_or_si256(_mm256_or_si256(quote_2, slash_2), ctrl_2);
-        let cmp_d = _mm256_or_si256(_mm256_or_si256(quote_3, slash_3), ctrl_3);
+        // Combined mask computation - all escape conditions in one operation
+        // This reduces instruction count and improves pipelining
+        let cmp_a = _mm256_or_si256(
+            _mm256_or_si256(_mm256_cmpeq_epi8(a0, v_b), _mm256_cmpeq_epi8(a0, v_c)),
+            _mm256_cmpgt_epi8(_mm256_add_epi8(a0, v_translation_a), v_below_a),
+        );
+        let cmp_b = _mm256_or_si256(
+            _mm256_or_si256(_mm256_cmpeq_epi8(a1, v_b), _mm256_cmpeq_epi8(a1, v_c)),
+            _mm256_cmpgt_epi8(_mm256_add_epi8(a1, v_translation_a), v_below_a),
+        );
+        let cmp_c = _mm256_or_si256(
+            _mm256_or_si256(_mm256_cmpeq_epi8(a2, v_b), _mm256_cmpeq_epi8(a2, v_c)),
+            _mm256_cmpgt_epi8(_mm256_add_epi8(a2, v_translation_a), v_below_a),
+        );
+        let cmp_d = _mm256_or_si256(
+            _mm256_or_si256(_mm256_cmpeq_epi8(a3, v_b), _mm256_cmpeq_epi8(a3, v_c)),
+            _mm256_cmpgt_epi8(_mm256_add_epi8(a3, v_translation_a), v_below_a),
+        );
 
         // Fast path: check if any escaping needed
         let any_escape =
@@ -738,15 +733,32 @@
 #[inline(always)]
 fn write_escape(result: &mut Vec<u8>, escape_byte: u8, c: u8) {
-    result.push(b'\\');
-    if escape_byte == UU {
-        // Unicode escape for control characters
-        result.extend_from_slice(b"u00");
-        let hex_digits = &HEX_BYTES[c as usize];
-        result.push(hex_digits.0);
-        result.push(hex_digits.1);
+    // Use optimized escape table for bulk writing
+    let (len, bytes) = ESCAPE_TABLE[c as usize];
+    if len > 0 {
+        // Ensure we have enough capacity for the escape sequence
+        result.reserve(len as usize);
+        let dst = result.as_mut_ptr().add(result.len());
+        // Use copy_nonoverlapping for fast bulk copy
+        unsafe {
+            std::ptr::copy_nonoverlapping(bytes.as_ptr(), dst, 8);
+        }
+        // Update the length - only add the actual escape sequence length
+        unsafe {
+            result.set_len(result.len() + len as usize);
+        }
    } else {
-        // Simple escape
-        result.push(escape_byte);
+        // Fallback to old method for characters not in the table
+        result.push(b'\\');
+        if escape_byte == UU {
+            // Unicode escape for control characters
+            result.extend_from_slice(b"u00");
+            let hex_digits = &HEX_BYTES[c as usize];
result.push(hex_digits.0); + result.push(hex_digits.1); + } else { + // Simple escape + result.push(escape_byte); + } } } From f9d7bb2c366cd8d1ea2e49137222f118089896ba Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sat, 11 Oct 2025 19:43:40 +0800 Subject: [PATCH 07/15] optimize more --- src/aarch64.rs | 154 +++++++++++++++++++++++++++----------------- src/lib.rs | 9 ++- src/util.rs | 17 +++++ src/x86.rs | 169 ++++++++++++++++++++++++++++++++++++------------- 4 files changed, 245 insertions(+), 104 deletions(-) create mode 100644 src/util.rs diff --git a/src/aarch64.rs b/src/aarch64.rs index e746a70..7f7749d 100644 --- a/src/aarch64.rs +++ b/src/aarch64.rs @@ -3,30 +3,13 @@ use std::arch::aarch64::{ }; use crate::generic::{ESCAPE, ESCAPE_TABLE, HEX_BYTES, UU}; +use crate::util::check_cross_page; const CHUNK: usize = 64; // 128 bytes ahead const PREFETCH_DISTANCE: usize = CHUNK * 2; const SLASH_SENTINEL: u8 = 0xFF; -#[inline(always)] -fn check_cross_page(ptr: *const u8, step: usize) -> bool { - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - // Check if reading 'step' bytes from 'ptr' would cross a page boundary - // Page size is typically 4096 bytes on aarch64 Linux and macOS - const PAGE_SIZE: usize = 4096; - ((ptr as usize & (PAGE_SIZE - 1)) + step) > PAGE_SIZE - } - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - // On other platforms, always use the safe path with temporary buffer - // to avoid potential page faults - true - } -} - #[inline] pub fn escape_neon(bytes: &[u8], output: &mut Vec) { let n = bytes.len(); @@ -123,55 +106,112 @@ pub fn escape_neon(bytes: &[u8], output: &mut Vec) { #[inline(always)] fn handle_tail(src: &[u8], dst: &mut Vec) { - for &c in src { - let escape_byte = ESCAPE[c as usize]; - if escape_byte == 0 { - dst.push(c); - } else { - write_escape(dst, escape_byte, c); + unsafe { + let mut dst_ptr = dst.as_mut_ptr().add(dst.len()); + let dst_start = dst_ptr; + let mut i = 0; + + while i < src.len() { + let c = src[i]; + let escape_byte = ESCAPE[c as usize]; + + if escape_byte == 0 { + // No escape needed + *dst_ptr = c; + dst_ptr = dst_ptr.add(1); + i += 1; + } else { + // Handle continuous escapes + let consumed = escape_continuous(src, &mut dst_ptr, i); + i += consumed; + } } + + let bytes_written = dst_ptr as usize - dst_start as usize; + dst.set_len(dst.len() + bytes_written); } } +/// Process continuous escaped characters efficiently +/// Returns the number of source bytes consumed #[inline(always)] -fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec) { - for (j, &m) in mask.iter().enumerate() { - let c = src[j]; - if m == 0 { - dst.push(c); - } else if m == SLASH_SENTINEL { - dst.push(b'\\'); - dst.push(b'\\'); - } else { - write_escape(dst, m, c); +unsafe fn escape_continuous(src: &[u8], dst: &mut *mut u8, start_idx: usize) -> usize { + let mut i = start_idx; + + while i < src.len() { + let c = src[i]; + let escape_byte = ESCAPE[c as usize]; + + if escape_byte == 0 { + break; + } + + let (len, escape_bytes) = ESCAPE_TABLE[c as usize]; + + unsafe { + if len > 0 { + // Copy 8 bytes at once (actual escape + padding) + std::ptr::copy_nonoverlapping(escape_bytes.as_ptr(), *dst, 8); + *dst = dst.add(len as usize); + } else { + // Rare fallback for characters not in table + **dst = b'\\'; + *dst = dst.add(1); + if escape_byte == UU { + std::ptr::copy_nonoverlapping(b"u00".as_ptr(), *dst, 3); + *dst = dst.add(3); + let hex = &HEX_BYTES[c as usize]; + **dst = hex.0; + *dst = dst.add(1); + **dst = hex.1; + *dst = dst.add(1); + } 
else { + **dst = escape_byte; + *dst = dst.add(1); + } + } + } + + i += 1; + + // Check if next character also needs escaping + if i < src.len() && ESCAPE[src[i] as usize] == 0 { + break; } } + + i - start_idx } #[inline(always)] -fn write_escape(dst: &mut Vec, escape_byte: u8, c: u8) { - // Use optimized escape table for bulk writing - let (len, bytes) = ESCAPE_TABLE[c as usize]; - if len > 0 { - // Ensure we have enough capacity for the escape sequence - dst.reserve(len as usize); - unsafe { - let ptr = dst.as_mut_ptr().add(dst.len()); - // Use copy_nonoverlapping for fast bulk copy - std::ptr::copy_nonoverlapping(bytes.as_ptr(), ptr, 8); - // Update the length - only add the actual escape sequence length - dst.set_len(dst.len() + len as usize); - } - } else { - // Fallback to old method for characters not in the table - dst.push(b'\\'); - if escape_byte == UU { - dst.extend_from_slice(b"u00"); - let hex = &HEX_BYTES[c as usize]; - dst.push(hex.0); - dst.push(hex.1); - } else { - dst.push(escape_byte); +fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec) { + unsafe { + let mut dst_ptr = dst.as_mut_ptr().add(dst.len()); + let dst_start = dst_ptr; + let mut j = 0; + + while j < 16 { + let m = mask[j]; + let c = src[j]; + + if m == 0 { + // No escape needed, copy directly + *dst_ptr = c; + dst_ptr = dst_ptr.add(1); + j += 1; + } else if m == SLASH_SENTINEL { + // Backslash escape + std::ptr::copy_nonoverlapping(b"\\\\".as_ptr(), dst_ptr, 2); + dst_ptr = dst_ptr.add(2); + j += 1; + } else { + // Handle continuous escapes + let consumed = escape_continuous(src, &mut dst_ptr, j); + j += consumed; + } } + + let bytes_written = dst_ptr as usize - dst_start as usize; + dst.set_len(dst.len() + bytes_written); } } diff --git a/src/lib.rs b/src/lib.rs index 0dbdc3e..dd2e395 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,6 +111,7 @@ #[cfg(target_arch = "aarch64")] mod aarch64; mod generic; +mod util; #[cfg(target_arch = "x86_64")] mod x86; @@ -122,7 +123,7 @@ pub fn escape>(input: S) -> String { #[cfg(not(feature = "force_aarch64_neon"))] use generic::escape_inner; - let mut result = Vec::with_capacity(input.as_ref().len() + input.as_ref().len() / 2 + 2); + let mut result = Vec::with_capacity(input.as_ref().len() * 6 + 32 + 3); result.push(b'"'); let s = input.as_ref(); let bytes = s.as_bytes(); @@ -182,6 +183,8 @@ pub fn escape_into>(input: S, output: &mut Vec) { #[cfg(not(feature = "force_aarch64_neon"))] use generic::escape_inner; + output.reserve(input.as_ref().len() * 6 + 32 + 3); + output.push(b'"'); let s = input.as_ref(); let bytes = s.as_bytes(); @@ -437,7 +440,7 @@ fn test_rxjs() { assert!(!sources.is_empty()); for source in sources { assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); - let mut output = String::new(); + let mut output = String::with_capacity(source.len() * 6 + 32 + 3); escape_into(&source, unsafe { output.as_mut_vec() }); assert_eq!(output, serde_json::to_string(&source).unwrap()); } @@ -465,7 +468,7 @@ fn test_sources() { assert!(!sources.is_empty()); for source in sources { assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); - let mut output = String::new(); + let mut output = String::with_capacity(source.len() * 6 + 32 + 3); escape_into(&source, unsafe { output.as_mut_vec() }); assert_eq!(output, serde_json::to_string(&source).unwrap()); } diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..a2e2fd6 --- /dev/null +++ b/src/util.rs @@ -0,0 +1,17 @@ +#[inline(always)] +pub(crate) fn check_cross_page(ptr: 
*const u8, step: usize) -> bool { + #[cfg(any(target_os = "linux", target_os = "macos"))] + { + // Check if reading 'step' bytes from 'ptr' would cross a page boundary + // Page size is typically 4096 bytes on x86_64 Linux and macOS + const PAGE_SIZE: usize = 4096; + ((ptr as usize & (PAGE_SIZE - 1)) + step) > PAGE_SIZE + } + + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + // On other platforms, always use the safe path with temporary buffer + // to avoid potential page faults + true + } +} diff --git a/src/x86.rs b/src/x86.rs index c3898fc..7cd8b21 100644 --- a/src/x86.rs +++ b/src/x86.rs @@ -9,6 +9,7 @@ use std::arch::x86_64::{ }; use crate::generic::{ESCAPE, ESCAPE_TABLE, HEX_BYTES, UU}; +use crate::util::check_cross_page; // Constants for control character detection using signed comparison trick const TRANSLATION_A: i8 = i8::MAX - 31i8; @@ -30,24 +31,6 @@ fn sub(a: *const u8, b: *const u8) -> usize { (a as usize) - (b as usize) } -#[inline(always)] -fn check_cross_page(ptr: *const u8, step: usize) -> bool { - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - // Check if reading 'step' bytes from 'ptr' would cross a page boundary - // Page size is typically 4096 bytes on x86_64 Linux and macOS - const PAGE_SIZE: usize = 4096; - ((ptr as usize & (PAGE_SIZE - 1)) + step) > PAGE_SIZE - } - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - // On other platforms, always use the safe path with temporary buffer - // to avoid potential page faults - true - } -} - #[target_feature(enable = "avx512f", enable = "avx512bw")] #[inline] pub unsafe fn escape_avx512(bytes: &[u8], result: &mut Vec) { @@ -479,12 +462,14 @@ pub unsafe fn escape_avx2(bytes: &[u8], result: &mut Vec) { (_mm256_movemask_epi8(_mm256_or_si256( _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) as u32) & ((1u32 << remaining) - 1) + )) as u32) + & ((1u32 << remaining) - 1) } else { (_mm256_movemask_epi8(_mm256_or_si256( _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) as u32).wrapping_shr(d as u32) + )) as u32) + .wrapping_shr(d as u32) }; if mask != 0 { @@ -624,12 +609,14 @@ pub unsafe fn escape_sse2(bytes: &[u8], result: &mut Vec) { (_mm_movemask_epi8(_mm_or_si128( _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) as u16) & ((1u16 << remaining) - 1) + )) as u16) + & ((1u16 << remaining) - 1) } else { (_mm_movemask_epi8(_mm_or_si128( _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) as u16).wrapping_shr(d as u32) + )) as u16) + .wrapping_shr(d as u32) }; if mask != 0 { @@ -674,25 +661,45 @@ unsafe fn process_mask_avx( let ptr = ptr.add(offset); let at = sub(ptr, start_ptr); + // Reserve space upfront to reduce allocations + // Worst case: each byte needs 6 bytes (e.g., \u001f) + let max_needed = 32 * 6; + result.reserve(max_needed); + // Process mask bits using bit manipulation let mut remaining = mask as u32; while remaining != 0 { let cur = remaining.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - // Copy unescaped portion if needed + + // Copy unescaped portion using copy_nonoverlapping if *start < i { - 
result.extend_from_slice(&bytes[*start..i]); + let src = bytes.as_ptr().add(*start); + let len = i - *start; + let dst = result.as_mut_ptr().add(result.len()); + std::ptr::copy_nonoverlapping(src, dst, len); + result.set_len(result.len() + len); } - // Write escape sequence - write_escape(result, escape_byte, c); - *start = i + 1; - // Clear the lowest set bit - remaining &= remaining - 1; + // Handle continuous escapes starting from current position + let escape_src = ptr.add(cur); + let mut dst = result.as_mut_ptr().add(result.len()); + let new_src = escape_continuous(escape_src, &mut dst, bytes, start_ptr); + let bytes_written = sub(dst, result.as_mut_ptr().add(result.len())); + result.set_len(result.len() + bytes_written); + + // Update start position + let chars_processed = sub(new_src, escape_src); + *start = i + chars_processed; + + // Clear processed bits from mask + // We need to clear all bits up to and including the last processed character + let bits_to_clear = cur + chars_processed; + if bits_to_clear < 32 { + remaining &= !((1u32 << bits_to_clear) - 1); + } else { + remaining = 0; + } } } @@ -709,28 +716,102 @@ unsafe fn process_mask_avx512( let ptr = ptr.add(offset); let at = sub(ptr, start_ptr); + // Reserve space upfront to reduce allocations + // Worst case: each byte needs 6 bytes (e.g., \u001f) + let max_needed = 64 * 6; + result.reserve(max_needed); + // Process mask bits using bit manipulation let mut remaining = mask; while remaining != 0 { let cur = remaining.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - // Copy unescaped portion if needed + + // Copy unescaped portion using copy_nonoverlapping if *start < i { - result.extend_from_slice(&bytes[*start..i]); + let src = bytes.as_ptr().add(*start); + let len = i - *start; + let dst = result.as_mut_ptr().add(result.len()); + std::ptr::copy_nonoverlapping(src, dst, len); + result.set_len(result.len() + len); } - // Write escape sequence - write_escape(result, escape_byte, c); - *start = i + 1; - // Clear the lowest set bit - remaining &= remaining - 1; + // Handle continuous escapes starting from current position + let escape_src = ptr.add(cur); + let mut dst = result.as_mut_ptr().add(result.len()); + let new_src = escape_continuous(escape_src, &mut dst, bytes, start_ptr); + let bytes_written = sub(dst, result.as_mut_ptr().add(result.len())); + result.set_len(result.len() + bytes_written); + + // Update start position + let chars_processed = sub(new_src, escape_src); + *start = i + chars_processed; + + // Clear processed bits from mask + // We need to clear all bits up to and including the last processed character + let bits_to_clear = cur + chars_processed; + if bits_to_clear < 64 { + remaining &= !((1u64 << bits_to_clear) - 1); + } else { + remaining = 0; + } } } +/// Process continuous escaped characters efficiently +/// Returns the new source pointer position +#[inline(always)] +unsafe fn escape_continuous( + src: *const u8, + dst: &mut *mut u8, + bytes: &[u8], + start_ptr: *const u8, +) -> *const u8 { + let mut ptr = src; + + loop { + let c = *ptr; + let escape_byte = ESCAPE[c as usize]; + + if escape_byte == 0 { + break; + } + + let (len, escape_bytes) = ESCAPE_TABLE[c as usize]; + + if len > 0 { + // Copy 8 bytes at once (actual escape + padding) + std::ptr::copy_nonoverlapping(escape_bytes.as_ptr(), *dst, 8); + *dst = dst.add(len as usize); + } else { + // Rare fallback for characters not in table + **dst = 
b'\\'; + *dst = dst.add(1); + if escape_byte == UU { + std::ptr::copy_nonoverlapping(b"u00".as_ptr(), *dst, 3); + *dst = dst.add(3); + let hex = &HEX_BYTES[c as usize]; + **dst = hex.0; + *dst = dst.add(1); + **dst = hex.1; + *dst = dst.add(1); + } else { + **dst = escape_byte; + *dst = dst.add(1); + } + } + + ptr = ptr.add(1); + + // Check if next character also needs escaping to continue the loop + if ptr >= bytes.as_ptr().add(bytes.len()) || ESCAPE[*ptr as usize] == 0 { + break; + } + } + + ptr +} + #[inline(always)] fn write_escape(result: &mut Vec, escape_byte: u8, c: u8) { // Use optimized escape table for bulk writing From a9762effd39e147d8a798a6a48cd6ce19dce8983 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sat, 11 Oct 2025 19:43:50 +0800 Subject: [PATCH 08/15] rm plan --- plan.md | 193 -------------------------------------------------------- 1 file changed, 193 deletions(-) delete mode 100644 plan.md diff --git a/plan.md b/plan.md deleted file mode 100644 index 5b2d3e0..0000000 --- a/plan.md +++ /dev/null @@ -1,193 +0,0 @@ -# Performance Analysis: sonic-rs vs json-escape-simd - -## Benchmark Results -- **json-escape-simd**: 333.21 - 348.88 µs (median: ~341 µs) -- **sonic-rs**: 205.62 - 210.19 µs (median: ~208 µs) -- **Performance Gap**: sonic-rs is ~40% faster - -## Key Differences in Implementation - -### 1. Copy-First Strategy with Deferred Escaping - -**sonic-rs Approach:** -```rust -// Always copy the SIMD chunk first -v.write_to_slice_unaligned_unchecked(dst_slice); -let mask = escaped_mask(v); -if mask.all_zero() { - // Fast path: no escapes, just advance pointers - advance_pointers(); -} else { - // Found escape, backtrack to handle it - let escape_pos = mask.first_offset(); - adjust_and_escape(); -} -``` - -**json-escape-simd Approach:** -```rust -// Check for escapes first, copy only if clean -if any_escape == 0 { - // Copy whole chunk - result.extend_from_slice(chunk); -} else { - // Process each escape individually - process_mask_avx(...); -} -``` - -**Why Copy-First is Faster:** -- Reduces branches in the common case (most chunks have no escapes) -- Better CPU pipeline utilization -- Simpler control flow -- Memory writes are buffered and can be overlapped with mask checking - -### 2. Pre-allocated Output Buffer with MaybeUninit - -**sonic-rs:** -```rust -// Pre-reserves worst-case buffer size upfront -let buf = writer.reserve_with(value.len() * 6 + 32 + 3)?; -// Works with MaybeUninit to avoid initialization overhead -pub fn format_string(value: &str, dst: &mut [MaybeUninit], ...) -> usize -``` - -**json-escape-simd:** -```rust -// Uses Vec with potential dynamic growth -let mut result = Vec::with_capacity(estimated_capacity); -// Multiple extend_from_slice calls may trigger reallocation -result.extend_from_slice(&bytes[start..i]); -``` - -**Benefits:** -- No reallocation overhead during processing -- No initialization cost for unused buffer space -- Direct pointer arithmetic instead of Vec method calls - -### 3. Compact Escape Handling with Lookup Table - -**sonic-rs:** -```rust -// Pre-formatted escape sequences in 8-byte blocks -pub const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ - (6, *b"\\u0000\0\0"), // Length and padded sequence - (2, *b"\\t\0\0\0\0\0\0"), - // ... 
-]; -// Single memcpy for any escape type -std::ptr::copy_nonoverlapping(QUOTE_TAB[ch].1.as_ptr(), dst, 8); -dst += QUOTE_TAB[ch].0; -``` - -**json-escape-simd:** -```rust -// Conditional logic for each escape type -fn write_escape(result: &mut Vec, escape_byte: u8, c: u8) { - result.push(b'\\'); - if escape_byte == UU { - result.extend_from_slice(b"u00"); - result.push(hex_digits.0); - result.push(hex_digits.1); - } else { - result.push(escape_byte); - } -} -``` - -**Advantages:** -- Single memory operation vs multiple pushes -- No conditional branches in escape writing -- Better memory locality (8-byte aligned writes) - -### 4. Simpler Mask Processing - -**sonic-rs:** -- Uses `first_offset()` to find only the first escape -- Handles escapes sequentially from that point -- Minimal bit manipulation - -**json-escape-simd:** -- Processes every set bit in the mask using `trailing_zeros()` -- Complex bit manipulation loop (`mask &= mask - 1`) -- More branches and iterations - -### 5. Cross-Page Boundary Optimization - -**sonic-rs includes page boundary checks:** -```rust -if check_cross_page(sptr, LANES) { - // Use temporary buffer to avoid potential page faults - std::ptr::copy_nonoverlapping(sptr, temp.as_mut_ptr(), remaining); - load(temp.as_ptr()) -} -``` - -This prevents potential page faults when reading past the end of allocated memory. - -## Optimization Recommendations - -### Priority 1: Adopt Copy-First Strategy -- Modify the SIMD loops to always write chunks first -- Only check for escapes after copying -- Backtrack when escapes are found - -### Priority 2: Use Pre-allocated MaybeUninit Buffer -```rust -pub fn escape_into_uninit(input: &str, output: &mut [MaybeUninit]) -> usize { - // Work directly with MaybeUninit buffer - // Return actual bytes written -} -``` - -### Priority 3: Implement Compact Escape Table -```rust -const ESCAPE_TABLE: [(u8, [u8; 8]); 256] = [ - // Pre-format all escape sequences - // Use single memcpy for writing -]; -``` - -### Priority 4: Simplify Mask Processing -- Process only first escape per chunk -- Continue sequentially from escape point -- Reduce bit manipulation overhead - -### Priority 5: Add Page Boundary Handling -- Implement cross-page detection for tail processing -- Use temporary buffer when crossing boundaries - -## Expected Performance Improvements - -Based on the analysis, implementing these optimizations should: -1. **Copy-First Strategy**: 15-20% improvement -2. **MaybeUninit Buffer**: 5-10% improvement -3. **Compact Escape Table**: 10-15% improvement -4. **Simplified Mask Processing**: 5-10% improvement -5. **Page Boundary Handling**: 2-3% improvement (safety/stability) - -Combined, these changes could potentially close most of the 40% performance gap with sonic-rs. - -## Implementation Strategy - -1. **Phase 1**: Implement copy-first strategy (biggest impact) -2. **Phase 2**: Add compact escape table -3. **Phase 3**: Switch to MaybeUninit buffer -4. **Phase 4**: Optimize mask processing -5. **Phase 5**: Add page boundary handling ✅ **COMPLETED** - -Each phase should be benchmarked independently to measure impact. 
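[Editor's note] For readers skimming the plan: below is a minimal, dependency-free sketch of the Priority 1 copy-first loop, using a scalar 32-byte chunk and a hypothetical `needs_escape` classifier in place of the SIMD compare-and-movemask. It emits `\uXXXX` for every escape rather than the short forms purely to keep the sketch small; none of these names are the crate's API.

```rust
/// Copy-first sketch: optimistically copy each chunk, then backtrack to the
/// first byte that needs escaping only when the chunk's mask is non-zero.
const CHUNK: usize = 32;

fn needs_escape(b: u8) -> bool {
    b < 0x20 || b == b'"' || b == b'\\'
}

fn escape_copy_first(src: &[u8], out: &mut Vec<u8>) {
    let mut i = 0;
    while i + CHUNK <= src.len() {
        let chunk = &src[i..i + CHUNK];
        // 1. Copy the whole chunk unconditionally (a SIMD store in the real code).
        out.extend_from_slice(chunk);
        // 2. Build the escape bitmask (SIMD compare + movemask in the real code).
        let mask: u32 = chunk
            .iter()
            .enumerate()
            .fold(0, |m, (j, &b)| m | ((needs_escape(b) as u32) << j));
        if mask == 0 {
            i += CHUNK; // fast path: clean chunk, nothing to undo
        } else {
            // 3. Backtrack: drop the bytes written past the first escape...
            let first = mask.trailing_zeros() as usize;
            out.truncate(out.len() - (CHUNK - first));
            i += first;
            // ...then escape the run byte by byte before resuming.
            while i < src.len() && needs_escape(src[i]) {
                out.extend_from_slice(format!("\\u{:04x}", src[i]).as_bytes());
                i += 1;
            }
        }
    }
    for &b in &src[i..] {
        if needs_escape(b) {
            out.extend_from_slice(format!("\\u{:04x}", b).as_bytes());
        } else {
            out.push(b);
        }
    }
}
```

The point of the shape is that the common case (no escapes) costs one copy and one mask test per chunk with no per-byte branching; the backtrack only pays when an escape is actually present.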
- -## Completed Optimizations - -### Page Boundary Handling (Phase 5) - COMPLETED - -Added page boundary checking to prevent potential page faults when reading past the end of input: - -- Added `check_cross_page` function with conditional compilation for Linux/macOS -- Updated AVX512, AVX2, SSE2 tail handling to use temporary buffers when crossing page boundaries -- Updated aarch64 NEON implementation with page boundary checks -- On Linux/macOS: checks if reading would cross 4096-byte page boundary -- On other platforms: always uses safe path with temporary buffer - -This optimization improves safety and stability without significant performance impact. \ No newline at end of file From 4167bf36da4eaf345d9dcddbe70e3a961c64921e Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 13:50:43 +0800 Subject: [PATCH 09/15] borrow sonic-rs impl --- .github/workflows/CI.yml | 24 + Cargo.lock | 41 ++ Cargo.toml | 4 +- README.md | 12 +- benches/escape.rs | 22 +- {src => benches}/generic.rs | 55 -- examples/escape.rs | 4 +- src/aarch64.rs | 217 ------- src/lib.rs | 1082 +++++++++++++++++++++-------------- src/util.rs | 17 - src/x86.rs | 845 --------------------------- 11 files changed, 758 insertions(+), 1565 deletions(-) rename {src => benches}/generic.rs (70%) delete mode 100644 src/aarch64.rs delete mode 100644 src/util.rs delete mode 100644 src/x86.rs diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3c0667f..ed8e7d3 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -45,6 +45,30 @@ jobs: - name: Run tests run: cargo test + miri: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + with: + targets: x86_64-unknown-linux-gnu + components: miri + toolchain: nightly + - uses: actions/setup-node@v5 + with: + node-version: 22 + cache: 'yarn' + - name: Install dependencies + run: yarn install + - name: Download fixtures + run: node download-fixtures.js + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Run miri + run: cargo miri test + env: + MIRIFLAGS: "-Zmiri-disable-isolation" bench: strategy: matrix: diff --git a/Cargo.lock b/Cargo.lock index fa43b7a..9684540 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -319,9 +319,12 @@ dependencies = [ "criterion2", "glob", "json-escape", + "rand", "serde", "serde_json", "sonic-rs", + "sonic-simd", + "thiserror", "v_jsonescape", ] @@ -402,6 +405,15 @@ version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.101" @@ -455,6 +467,35 @@ dependencies = [ "ptr_meta", ] +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom", +] + [[package]] name = "rayon" version = "1.11.0" diff --git a/Cargo.toml b/Cargo.toml index 1f88176..243e350 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,6 @@ name = "escape" path = "examples/escape.rs" [features] -force_aarch64_neon = [] # Force use of neon implementation on aarch64 codspeed = ["criterion2/codspeed"] [[bench]] @@ -26,10 +25,13 @@ harness = false [dependencies] anyhow = "1" +sonic-simd = "0.1" +thiserror = "2" [dev-dependencies] criterion2 = "3" glob = "0.3" +rand = "0.9" serde = "1" serde_json = "1" v_jsonescape = "0.7" diff --git a/README.md b/README.md index 54fa944..9d6773d 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,7 @@ ![docs.rs](https://img.shields.io/docsrs/json-escape-simd) [![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/napi-rs/json-escape-simd) -Optimized SIMD routines for escaping JSON strings. This repository contains the `json-escape-simd` crate, comparison fixtures, and Criterion benches against commonly used alternatives. - -> [!IMPORTANT] -> -> On aarch64 NEON hosts the available register width is **128** bits, which is narrower than the lookup table this implementation prefers. As a result the SIMD path may not outperform the generic fallback, which is reflected in the benchmark numbers below. -> -> On some modern macOS devices with larger register numbers, the SIMD path may outperform the generic fallback, see the [M3 max benchmark](#apple-m3-max) below. - -> [!NOTE] -> -> The `force_aarch64_neon` feature flag can be used to force use of the neon implementation on aarch64. This is useful for the benchmark. +Optimized SIMD routines for escaping JSON strings. The implementation is from [sonic-rs](https://github.com/cloudwego/sonic-rs), we only take the string escaping part to avoid the abstraction overhead. ## Benchmarks diff --git a/benches/escape.rs b/benches/escape.rs index e5a6435..4ff2226 100644 --- a/benches/escape.rs +++ b/benches/escape.rs @@ -2,7 +2,10 @@ use std::{fs, hint::black_box}; use criterion::{Criterion, criterion_group, criterion_main}; -use json_escape_simd::{escape, escape_generic}; +use generic::escape_generic; +use json_escape_simd::escape; + +mod generic; fn get_rxjs_sources() -> Vec { let rxjs_paths = glob::glob("node_modules/rxjs/src/**/*.ts").unwrap(); @@ -86,6 +89,16 @@ fn run_benchmarks(c: &mut Criterion, sources: &[String], prefix: &str) { }); } +fn short_string_benchmark(c: &mut Criterion) { + let sources = vec![ + "Hello, world!".to_string(), + r#"abcdefghijklmnopqrstuvwxyz .*? 
hello world escape json string"#.to_string(), + "normal string 🥹".to_string(), + "中文 English 🚀 \n❓ 𝄞".to_string(), + ]; + run_benchmarks(c, &sources, "short string"); +} + fn rxjs_benchmark(c: &mut Criterion) { let sources = get_rxjs_sources(); if !sources.is_empty() { @@ -100,5 +113,10 @@ fn fixtures_benchmark(c: &mut Criterion) { } } -criterion_group!(benches, rxjs_benchmark, fixtures_benchmark); +criterion_group!( + benches, + short_string_benchmark, + rxjs_benchmark, + fixtures_benchmark +); criterion_main!(benches); diff --git a/src/generic.rs b/benches/generic.rs similarity index 70% rename from src/generic.rs rename to benches/generic.rs index 9ff9d32..ec0db08 100644 --- a/src/generic.rs +++ b/benches/generic.rs @@ -13,15 +13,6 @@ pub fn escape_generic>(s: S) -> String { unsafe { String::from_utf8_unchecked(result) } } -#[inline] -pub fn escape_into_generic>(s: S, output: &mut Vec) { - let s = s.as_ref(); - let bytes = s.as_bytes(); - output.push(b'"'); - escape_inner(bytes, output); - output.push(b'"'); -} - #[inline] // Slightly modified version of // @@ -140,49 +131,3 @@ pub(crate) static HEX_BYTES: [HexPair; 32] = [ HexPair(b'1', b'e'), HexPair(b'1', b'f'), ]; - -// Optimized escape table with 8-byte arrays for fast bulk writing -// First element is the length of escape sequence, followed by the escape bytes -pub(crate) static ESCAPE_TABLE: [(u8, [u8; 8]); 256] = { - let mut table = [(0u8, [0u8; 8]); 256]; - - // Control characters \u0000 - \u001f - table[0x00] = (6, *b"\\u0000\0\0"); - table[0x01] = (6, *b"\\u0001\0\0"); - table[0x02] = (6, *b"\\u0002\0\0"); - table[0x03] = (6, *b"\\u0003\0\0"); - table[0x04] = (6, *b"\\u0004\0\0"); - table[0x05] = (6, *b"\\u0005\0\0"); - table[0x06] = (6, *b"\\u0006\0\0"); - table[0x07] = (6, *b"\\u0007\0\0"); - table[0x08] = (2, *b"\\b\0\0\0\0\0\0"); - table[0x09] = (2, *b"\\t\0\0\0\0\0\0"); - table[0x0A] = (2, *b"\\n\0\0\0\0\0\0"); - table[0x0B] = (6, *b"\\u000b\0\0"); - table[0x0C] = (2, *b"\\f\0\0\0\0\0\0"); - table[0x0D] = (2, *b"\\r\0\0\0\0\0\0"); - table[0x0E] = (6, *b"\\u000e\0\0"); - table[0x0F] = (6, *b"\\u000f\0\0"); - table[0x10] = (6, *b"\\u0010\0\0"); - table[0x11] = (6, *b"\\u0011\0\0"); - table[0x12] = (6, *b"\\u0012\0\0"); - table[0x13] = (6, *b"\\u0013\0\0"); - table[0x14] = (6, *b"\\u0014\0\0"); - table[0x15] = (6, *b"\\u0015\0\0"); - table[0x16] = (6, *b"\\u0016\0\0"); - table[0x17] = (6, *b"\\u0017\0\0"); - table[0x18] = (6, *b"\\u0018\0\0"); - table[0x19] = (6, *b"\\u0019\0\0"); - table[0x1A] = (6, *b"\\u001a\0\0"); - table[0x1B] = (6, *b"\\u001b\0\0"); - table[0x1C] = (6, *b"\\u001c\0\0"); - table[0x1D] = (6, *b"\\u001d\0\0"); - table[0x1E] = (6, *b"\\u001e\0\0"); - table[0x1F] = (6, *b"\\u001f\0\0"); - - // Special characters - table[0x22] = (2, *b"\\\"\0\0\0\0\0\0"); // " - table[0x5C] = (2, *b"\\\\\0\0\0\0\0\0"); // \ - - table -}; diff --git a/examples/escape.rs b/examples/escape.rs index 18b4ea4..5bd993a 100644 --- a/examples/escape.rs +++ b/examples/escape.rs @@ -1,12 +1,10 @@ use std::fs; -use json_escape_simd::{escape, escape_generic}; +use json_escape_simd::escape; fn main() { for fixture in get_rxjs_sources() { let encoded = escape(&fixture); - let encoded_fallback = escape_generic(&fixture); - assert_eq!(encoded, encoded_fallback); assert_eq!(encoded, sonic_rs::to_string(&fixture).unwrap()); assert_eq!(encoded, serde_json::to_string(&fixture).unwrap()); } diff --git a/src/aarch64.rs b/src/aarch64.rs deleted file mode 100644 index 7f7749d..0000000 --- a/src/aarch64.rs +++ /dev/null @@ -1,217 +0,0 @@ -use 
std::arch::aarch64::{ - vceqq_u8, vdupq_n_u8, vld1q_u8_x4, vmaxvq_u8, vorrq_u8, vqtbl4q_u8, vst1q_u8, -}; - -use crate::generic::{ESCAPE, ESCAPE_TABLE, HEX_BYTES, UU}; -use crate::util::check_cross_page; - -const CHUNK: usize = 64; -// 128 bytes ahead -const PREFETCH_DISTANCE: usize = CHUNK * 2; -const SLASH_SENTINEL: u8 = 0xFF; - -#[inline] -pub fn escape_neon(bytes: &[u8], output: &mut Vec) { - let n = bytes.len(); - - unsafe { - let tbl = vld1q_u8_x4(ESCAPE.as_ptr()); - let slash = vdupq_n_u8(b'\\'); - let mut i = 0usize; - - // Scratch buffer reused for mask materialisation; stay uninitialised. - #[allow(invalid_value)] - let mut placeholder: [u8; 16] = core::mem::MaybeUninit::uninit().assume_init(); - - while i + CHUNK <= n { - let ptr = bytes.as_ptr().add(i); - - // Only prefetch if we won't go past the end - if i + CHUNK + PREFETCH_DISTANCE <= n { - core::arch::asm!( - "prfm pldl1keep, [{0}]", - in(reg) ptr.add(PREFETCH_DISTANCE), - ); - } - - // Use temporary buffer if reading would cross page boundary - let quad = if i + CHUNK == n || !check_cross_page(ptr, CHUNK) { - // Safe to read directly - vld1q_u8_x4(ptr) - } else { - // Need to use temporary buffer - let mut temp = [0u8; CHUNK]; - std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), CHUNK); - vld1q_u8_x4(temp.as_ptr()) - }; - - let a = quad.0; - let b = quad.1; - let c = quad.2; - let d = quad.3; - - let mask_1 = vorrq_u8(vqtbl4q_u8(tbl, a), vceqq_u8(slash, a)); - let mask_2 = vorrq_u8(vqtbl4q_u8(tbl, b), vceqq_u8(slash, b)); - let mask_3 = vorrq_u8(vqtbl4q_u8(tbl, c), vceqq_u8(slash, c)); - let mask_4 = vorrq_u8(vqtbl4q_u8(tbl, d), vceqq_u8(slash, d)); - - let mask_r_1 = vmaxvq_u8(mask_1); - let mask_r_2 = vmaxvq_u8(mask_2); - let mask_r_3 = vmaxvq_u8(mask_3); - let mask_r_4 = vmaxvq_u8(mask_4); - - if mask_r_1 | mask_r_2 | mask_r_3 | mask_r_4 == 0 { - output.extend_from_slice(std::slice::from_raw_parts(ptr, CHUNK)); - i += CHUNK; - continue; - } - - // Process each 16-byte chunk that has escapes - if mask_r_1 != 0 { - vst1q_u8(placeholder.as_mut_ptr(), mask_1); - handle_block(&bytes[i..i + 16], &placeholder, output); - } else { - output.extend_from_slice(std::slice::from_raw_parts(ptr, 16)); - } - - if mask_r_2 != 0 { - vst1q_u8(placeholder.as_mut_ptr(), mask_2); - handle_block(&bytes[i + 16..i + 32], &placeholder, output); - } else { - output.extend_from_slice(std::slice::from_raw_parts(ptr.add(16), 16)); - } - - if mask_r_3 != 0 { - vst1q_u8(placeholder.as_mut_ptr(), mask_3); - handle_block(&bytes[i + 32..i + 48], &placeholder, output); - } else { - output.extend_from_slice(std::slice::from_raw_parts(ptr.add(32), 16)); - } - - if mask_r_4 != 0 { - vst1q_u8(placeholder.as_mut_ptr(), mask_4); - handle_block(&bytes[i + 48..i + 64], &placeholder, output); - } else { - output.extend_from_slice(std::slice::from_raw_parts(ptr.add(48), 16)); - } - - i += CHUNK; - } - - if i < n { - handle_tail(&bytes[i..], output); - } - } -} - -#[inline(always)] -fn handle_tail(src: &[u8], dst: &mut Vec) { - unsafe { - let mut dst_ptr = dst.as_mut_ptr().add(dst.len()); - let dst_start = dst_ptr; - let mut i = 0; - - while i < src.len() { - let c = src[i]; - let escape_byte = ESCAPE[c as usize]; - - if escape_byte == 0 { - // No escape needed - *dst_ptr = c; - dst_ptr = dst_ptr.add(1); - i += 1; - } else { - // Handle continuous escapes - let consumed = escape_continuous(src, &mut dst_ptr, i); - i += consumed; - } - } - - let bytes_written = dst_ptr as usize - dst_start as usize; - dst.set_len(dst.len() + bytes_written); - } -} - -/// Process 
continuous escaped characters efficiently -/// Returns the number of source bytes consumed -#[inline(always)] -unsafe fn escape_continuous(src: &[u8], dst: &mut *mut u8, start_idx: usize) -> usize { - let mut i = start_idx; - - while i < src.len() { - let c = src[i]; - let escape_byte = ESCAPE[c as usize]; - - if escape_byte == 0 { - break; - } - - let (len, escape_bytes) = ESCAPE_TABLE[c as usize]; - - unsafe { - if len > 0 { - // Copy 8 bytes at once (actual escape + padding) - std::ptr::copy_nonoverlapping(escape_bytes.as_ptr(), *dst, 8); - *dst = dst.add(len as usize); - } else { - // Rare fallback for characters not in table - **dst = b'\\'; - *dst = dst.add(1); - if escape_byte == UU { - std::ptr::copy_nonoverlapping(b"u00".as_ptr(), *dst, 3); - *dst = dst.add(3); - let hex = &HEX_BYTES[c as usize]; - **dst = hex.0; - *dst = dst.add(1); - **dst = hex.1; - *dst = dst.add(1); - } else { - **dst = escape_byte; - *dst = dst.add(1); - } - } - } - - i += 1; - - // Check if next character also needs escaping - if i < src.len() && ESCAPE[src[i] as usize] == 0 { - break; - } - } - - i - start_idx -} - -#[inline(always)] -fn handle_block(src: &[u8], mask: &[u8; 16], dst: &mut Vec) { - unsafe { - let mut dst_ptr = dst.as_mut_ptr().add(dst.len()); - let dst_start = dst_ptr; - let mut j = 0; - - while j < 16 { - let m = mask[j]; - let c = src[j]; - - if m == 0 { - // No escape needed, copy directly - *dst_ptr = c; - dst_ptr = dst_ptr.add(1); - j += 1; - } else if m == SLASH_SENTINEL { - // Backslash escape - std::ptr::copy_nonoverlapping(b"\\\\".as_ptr(), dst_ptr, 2); - dst_ptr = dst_ptr.add(2); - j += 1; - } else { - // Handle continuous escapes - let consumed = escape_continuous(src, &mut dst_ptr, j); - j += consumed; - } - } - - let bytes_written = dst_ptr as usize - dst_start as usize; - dst.set_len(dst.len() + bytes_written); - } -} diff --git a/src/lib.rs b/src/lib.rs index dd2e395..f525423 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,475 +1,729 @@ -//! Optimized SIMD routines for escaping JSON strings. +//! Borrowed from //! -//! ##
Important
-//! -//! On aarch64 NEON hosts the available register width is **128** bits, which is narrower than the lookup table this implementation prefers. As a result the SIMD path may not outperform the generic fallback, which is reflected in the benchmark numbers below. -//! -//! On some modern macOS devices with larger register numbers, the SIMD path may outperform the generic fallback, see the [M3 max benchmark](#apple-m3-max) below. -//! -//! ### Note -//! -//! The `force_aarch64_neon` feature flag can be used to force use of the neon implementation on aarch64. This is useful for the benchmark. -//! -//! ## Benchmarks -//! -//! Numbers below come from `cargo bench` runs on GitHub Actions hardware. Criterion reports are summarized to make it easier to spot relative performance. "vs fastest" shows how much slower each implementation is compared to the fastest entry in the table (1.00× means fastest). -//! -//! ### GitHub Actions x86_64 (`ubuntu-latest`) -//! -//! `AVX2` enabled. -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape simd`** | **341.18 µs** | **1.00×** | -//! | `escape v_jsonescape` | 555.47 µs | 1.63× | -//! | `escape generic` | 656.85 µs | 1.93× | -//! | `serde_json` | 744.75 µs | 2.18× | -//! | `json-escape` | 777.15 µs | 2.28× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`escape simd`** | **12.67 ms** | **1.00×** | -//! | `escape v_jsonescape` | 20.58 ms | 1.62× | -//! | `escape generic` | 22.57 ms | 1.78× | -//! | `serde_json` | 24.52 ms | 1.94× | -//! | `json-escape` | 26.97 ms | 2.13× | -//! -//! ### GitHub Actions aarch64 (`ubuntu-24.04-arm`) -//! -//! Neon enabled. -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape generic`** | **546.89 µs** | **1.00×** | -//! | `escape simd` | 589.29 µs | 1.08× | -//! | `serde_json` | 612.33 µs | 1.12× | -//! | `json-escape` | 624.66 µs | 1.14× | -//! | `escape v_jsonescape` | 789.14 µs | 1.44× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`escape generic`** | **17.81 ms** | **1.00×** | -//! | `serde_json` | 19.77 ms | 1.11× | -//! | `json-escape` | 20.84 ms | 1.17× | -//! | `escape simd` | 21.04 ms | 1.18× | -//! | `escape v_jsonescape` | 25.57 ms | 1.44× | -//! -//! ### GitHub Actions macOS (`macos-latest`) -//! -//! Apple M1 chip -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape generic`** | **759.07 µs** | **1.00×** | -//! | `escape simd` | 764.98 µs | 1.01× | -//! | `serde_json` | 793.91 µs | 1.05× | -//! | `json-escape` | 868.21 µs | 1.14× | -//! | `escape v_jsonescape` | 926.00 µs | 1.22× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`serde_json`** | **26.41 ms** | **1.00×** | -//! | `escape generic` | 26.43 ms | 1.00× | -//! | `escape simd` | 26.42 ms | 1.00× | -//! | `json-escape` | 28.94 ms | 1.10× | -//! | `escape v_jsonescape` | 29.22 ms | 1.11× | -//! -//! 
### Apple M3 Max -//! -//! **RxJS payload (~10k iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------- | ---------- | -//! | **`escape simd`** | **307.20 µs** | **1.00×** | -//! | `escape generic` | 490.00 µs | 1.60× | -//! | `serde_json` | 570.35 µs | 1.86× | -//! | `escape v_jsonescape` | 599.72 µs | 1.95× | -//! | `json-escape` | 644.73 µs | 2.10× | -//! -//! **Fixtures payload (~300 iterations)** -//! -//! | Implementation | Median time | vs fastest | -//! | --------------------- | ------------ | ---------- | -//! | **`escape generic`** | **17.89 ms** | **1.00×** | -//! | **`escape simd`** | **17.92 ms** | **1.00×** | -//! | `serde_json` | 19.78 ms | 1.11× | -//! | `escape v_jsonescape` | 21.09 ms | 1.18× | -//! | `json-escape` | 22.43 ms | 1.25× | - -#[cfg(target_arch = "aarch64")] -mod aarch64; -mod generic; -mod util; -#[cfg(target_arch = "x86_64")] -mod x86; - -pub use generic::{escape_generic, escape_into_generic}; - -/// Main entry point for JSON string escaping with SIMD acceleration -/// If the platform is supported, the SIMD path will be used. Otherwise, the generic fallback will be used. -pub fn escape>(input: S) -> String { - #[cfg(not(feature = "force_aarch64_neon"))] - use generic::escape_inner; - - let mut result = Vec::with_capacity(input.as_ref().len() * 6 + 32 + 3); - result.push(b'"'); - let s = input.as_ref(); - let bytes = s.as_bytes(); - #[cfg(target_arch = "x86_64")] - { - let len = bytes.len(); - // Runtime CPU feature detection for x86_64 - if is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512bw") - && len >= x86::LOOP_SIZE_AVX512 - { - unsafe { x86::escape_avx512(bytes, &mut result) } - } else if is_x86_feature_detected!("avx2") && len >= x86::LOOP_SIZE_AVX2 { - unsafe { x86::escape_avx2(bytes, &mut result) } - } else if is_x86_feature_detected!("sse2") - && /* if len < 128, no need to use simd */ - len >= x86::LOOP_SIZE_AVX2 - { - unsafe { x86::escape_sse2(bytes, &mut result) } - } else { - escape_inner(bytes, &mut result); +//! Only takes the string escaping part to avoid the abstraction overhead. 
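[Editor's note] A note on the `QUOTE_TAB` layout that follows: each entry pairs the real escape length with an 8-byte zero-padded payload, so the write is always a constant-size 8-byte copy and only the cursor advance varies by entry. A hedged, self-contained sketch of that trick (`write_escaped` and the two-entry `TAB` are illustrative, not this crate's API):

```rust
/// Sketch of the padded-entry write: copy a fixed 8 bytes, then advance by
/// the entry's real length. The padding lands in reserved, unused capacity.
fn write_escaped(c: u8, out: &mut Vec<u8>) {
    const TAB: [(u8, [u8; 8]); 2] = [
        (2, *b"\\t\0\0\0\0\0\0"),  // '\t' -> "\t", real length 2
        (2, *b"\\\"\0\0\0\0\0\0"), // '"'  -> "\"", real length 2
    ];
    let (len, bytes) = match c {
        b'\t' => TAB[0],
        b'"' => TAB[1],
        _ => return out.push(c),
    };
    out.reserve(8);
    unsafe {
        // Always copy all 8 bytes; the zero padding is written past the
        // logical end and never observed, because set_len advances by `len`.
        std::ptr::copy_nonoverlapping(bytes.as_ptr(), out.as_mut_ptr().add(out.len()), 8);
        out.set_len(out.len() + len as usize);
    }
}
```

The constant copy length lets the compiler emit a single unconditional 8-byte store instead of branching on whether a sequence is 2 or 6 bytes long.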
+ +use std::slice::from_raw_parts; + +#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] +use sonic_simd::u8x32; +use sonic_simd::{BitMask, Mask, Simd}; +#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] +use sonic_simd::{bits::NeonBits, u8x16}; + +#[inline(always)] +unsafe fn load(ptr: *const u8) -> V { + let chunk = unsafe { from_raw_parts(ptr, V::LANES) }; + unsafe { V::from_slice_unaligned_unchecked(chunk) } +} + +const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ + // 0x00 ~ 0x1f + (6, *b"\\u0000\0\0"), + (6, *b"\\u0001\0\0"), + (6, *b"\\u0002\0\0"), + (6, *b"\\u0003\0\0"), + (6, *b"\\u0004\0\0"), + (6, *b"\\u0005\0\0"), + (6, *b"\\u0006\0\0"), + (6, *b"\\u0007\0\0"), + (2, *b"\\b\0\0\0\0\0\0"), + (2, *b"\\t\0\0\0\0\0\0"), + (2, *b"\\n\0\0\0\0\0\0"), + (6, *b"\\u000b\0\0"), + (2, *b"\\f\0\0\0\0\0\0"), + (2, *b"\\r\0\0\0\0\0\0"), + (6, *b"\\u000e\0\0"), + (6, *b"\\u000f\0\0"), + (6, *b"\\u0010\0\0"), + (6, *b"\\u0011\0\0"), + (6, *b"\\u0012\0\0"), + (6, *b"\\u0013\0\0"), + (6, *b"\\u0014\0\0"), + (6, *b"\\u0015\0\0"), + (6, *b"\\u0016\0\0"), + (6, *b"\\u0017\0\0"), + (6, *b"\\u0018\0\0"), + (6, *b"\\u0019\0\0"), + (6, *b"\\u001a\0\0"), + (6, *b"\\u001b\0\0"), + (6, *b"\\u001c\0\0"), + (6, *b"\\u001d\0\0"), + (6, *b"\\u001e\0\0"), + (6, *b"\\u001f\0\0"), + // 0x20 ~ 0x2f + (0, [0; 8]), + (0, [0; 8]), + (2, *b"\\\"\0\0\0\0\0\0"), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x30 ~ 0x3f + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x40 ~ 0x4f + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x50 ~ 0x5f + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (2, *b"\\\\\0\0\0\0\0\0"), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + // 0x60 ~ 0xff + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), 
+ (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), + (0, [0; 8]), +]; + +const NEED_ESCAPED: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +]; + +// only check the src length. 
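[Editor's note] One way to read the two tables: `QUOTE_TAB` carries the emission payload, while `NEED_ESCAPED` is the cheap per-byte yes/no classifier the hot loops consult; the `escape_unchecked` routine below then drains a whole run of escapable bytes per call. As a hedged aside, the classifier is derivable from the payload table at compile time, since a byte needs escaping exactly when its `QUOTE_TAB` length is non-zero (the crate simply spells both tables out literally):

```rust
// Illustrative const fn (not part of the crate): derive the classifier
// table from the payload table instead of writing it out by hand.
const fn build_need_escaped(tab: &[(u8, [u8; 8]); 256]) -> [u8; 256] {
    let mut out = [0u8; 256];
    let mut i = 0;
    while i < 256 {
        out[i] = (tab[i].0 != 0) as u8;
        i += 1;
    }
    out
}
```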
+#[inline(always)] +unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { + debug_assert!(*nb >= 1); + loop { + let ch = unsafe { *(*src) }; + let cnt = QUOTE_TAB[ch as usize].0 as usize; + debug_assert!( + cnt != 0, + "char is {}, cnt is {}, NEED_ESCAPED is {}", + ch as char, + cnt, + NEED_ESCAPED[ch as usize] + ); + unsafe { std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8) }; + unsafe { (*dst) = (*dst).add(cnt) }; + unsafe { (*src) = (*src).add(1) }; + (*nb) -= 1; + if (*nb) == 0 || unsafe { NEED_ESCAPED[*(*src) as usize] == 0 } { + return; } } +} - #[cfg(target_arch = "aarch64")] +#[inline(always)] +fn check_cross_page(ptr: *const u8, step: usize) -> bool { + #[cfg(any(target_os = "linux", target_os = "macos"))] { - #[cfg(feature = "force_aarch64_neon")] - { - aarch64::escape_neon(bytes, &mut result); - } - #[cfg(not(feature = "force_aarch64_neon"))] - { - // on Apple M2 and later, the `bf16` feature is available - // it means they have more registers and can significantly benefit from the SIMD path - // TODO: add support for sve2 chips with wider registers - // github actions ubuntu-24.04-arm runner has 128 bits sve2 registers, it's not enough for the SIMD path - if cfg!(target_os = "macos") && std::arch::is_aarch64_feature_detected!("bf16") { - aarch64::escape_neon(bytes, &mut result); - } else { - escape_inner(bytes, &mut result); - } - } + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(not(any(target_os = "linux", target_os = "macos")))] { - escape_inner(bytes, &mut result); + // not check page cross in fallback envs, always true + true } - result.push(b'"'); - // SAFETY: We only pushed valid UTF-8 bytes (original string bytes and ASCII escape sequences) - unsafe { String::from_utf8_unchecked(result) } } -/// Main entry point for JSON string escaping with SIMD acceleration -/// If the platform is supported, the SIMD path will be used. Otherwise, the generic fallback will be used. 
-pub fn escape_into>(input: S, output: &mut Vec) { - #[cfg(not(feature = "force_aarch64_neon"))] - use generic::escape_inner; - - output.reserve(input.as_ref().len() * 6 + 32 + 3); +#[inline(always)] +fn format_string(value: &str, dst: &mut [u8]) -> usize { + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + let mut v: u8x16; + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + let mut v: u8x32; + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + const LANES: usize = 16; + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + const LANES: usize = 32; + + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[inline] + fn escaped_mask(v: u8x16) -> NeonBits { + let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() + } - output.push(b'"'); - let s = input.as_ref(); - let bytes = s.as_bytes(); - #[cfg(target_arch = "x86_64")] - { - let len = bytes.len(); - // Runtime CPU feature detection for x86_64 - if is_x86_feature_detected!("avx512f") - && is_x86_feature_detected!("avx512bw") - && len >= x86::LOOP_SIZE_AVX512 - { - unsafe { x86::escape_avx512(bytes, output) } - } else if is_x86_feature_detected!("avx2") && len >= x86::LOOP_SIZE_AVX2 { - unsafe { x86::escape_avx2(bytes, output) } - } else if is_x86_feature_detected!("sse2") - && /* if len < 128, no need to use simd */ - len >= x86::LOOP_SIZE_AVX2 - { - unsafe { x86::escape_sse2(bytes, output) } - } else { - escape_inner(bytes, output); - } + #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] + #[inline] + fn escaped_mask(v: u8x32) -> u32 { + let x1f = u8x32::splat(0x1f); // 0x00 ~ 0x20 + let blash = u8x32::splat(b'\\'); + let quote = u8x32::splat(b'"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() } - #[cfg(target_arch = "aarch64")] - { - #[cfg(feature = "force_aarch64_neon")] - { - aarch64::escape_neon(bytes, output); - } - #[cfg(not(feature = "force_aarch64_neon"))] - { - // on Apple M2 and later, the `bf16` feature is available - // it means they have more registers and can significantly benefit from the SIMD path - // TODO: add support for sve2 chips with wider registers - // github actions ubuntu-24.04-arm runner has 128 bits sve2 registers, it's not enough for the SIMD path - if cfg!(target_os = "macos") && std::arch::is_aarch64_feature_detected!("bf16") { - aarch64::escape_neon(bytes, output); + unsafe { + let slice = value.as_bytes(); + let mut sptr = slice.as_ptr(); + let mut dptr = dst.as_mut_ptr(); + let dstart = dptr; + let mut nb: usize = slice.len(); + + *dptr = b'"'; + dptr = dptr.add(1); + while nb >= LANES { + v = load(sptr); + v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); + let mask = escaped_mask(v); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); } else { - escape_inner(bytes, output); + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); } } - } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] - { - escape_into_generic(input, output); + // Scratch buffer reused for mask materialisation; stay uninitialised. 
+ #[cfg(not(miri))] + #[allow(invalid_value, clippy::uninit_assumed_init)] + let mut placeholder: [u8; LANES] = core::mem::MaybeUninit::uninit().assume_init(); + #[cfg(miri)] + let mut placeholder: [u8; LANES] = [0; LANES]; + while nb > 0 { + v = if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); + load(placeholder[..].as_ptr()) + } else { + #[cfg(not(debug_assertions))] + { + // disable memory sanitizer here + load(sptr) + } + #[cfg(debug_assertions)] + { + std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); + load(placeholder[..].as_ptr()) + } + }; + v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); + + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + *dptr = b'"'; + dptr = dptr.add(1); + dptr as usize - dstart as usize } - output.push(b'"'); } -#[test] -fn test_escape_ascii_json_string() { - let fixture = r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#; - assert_eq!(escape(fixture), serde_json::to_string(fixture).unwrap()); +pub fn escape(value: &str) -> String { + let mut buf = vec![0; value.len() * 6 + 32 + 3]; + let cnt = format_string(value, &mut buf); + unsafe { buf.set_len(cnt) }; + unsafe { String::from_utf8_unchecked(buf) } } -#[test] -fn test_escape_json_string() { - let mut fixture = String::new(); - for i in 0u8..=0x1F { - fixture.push(i as char); - } - fixture.push('\t'); - fixture.push('\x08'); - fixture.push('\x09'); - fixture.push('\x0A'); - fixture.push('\x0C'); - fixture.push('\x0D'); - fixture.push('\x22'); - fixture.push('\x5C'); - fixture.push_str("normal string"); - fixture.push('😊'); - fixture.push_str("中文 English 🚀 \n❓ 𝄞"); - escape(fixture.as_str()); - assert_eq!( - escape(fixture.as_str()), - serde_json::to_string(fixture.as_str()).unwrap(), - "fixture: {:?}", - fixture - ); -} +pub fn escape_into>(value: S, dst: &mut Vec) -> usize { + let value = value.as_ref(); + let needed_capacity = value.len() * 6 + 32 + 3; -// Test cases for various string sizes to cover different SIMD paths + // Ensure we have enough capacity + dst.reserve(needed_capacity); -#[test] -fn test_empty_string() { - assert_eq!(escape(""), r#""""#); -} + let old_len = dst.len(); -#[test] -fn test_very_small_strings() { - // Less than 16 bytes (SSE register size) - assert_eq!(escape("a"), r#""a""#); - assert_eq!(escape("ab"), r#""ab""#); - assert_eq!(escape("hello"), r#""hello""#); - assert_eq!(escape("hello\n"), r#""hello\n""#); - assert_eq!(escape("\""), r#""\"""#); - assert_eq!(escape("\\"), r#""\\""#); - assert_eq!(escape("\t"), r#""\t""#); - assert_eq!(escape("\r\n"), r#""\r\n""#); + // SAFETY: We've reserved enough capacity above, and format_string will + // write valid UTF-8 bytes. We'll set the correct length after. 
+ unsafe { + // Get a slice that includes the spare capacity + let spare = + std::slice::from_raw_parts_mut(dst.as_mut_ptr().add(old_len), dst.capacity() - old_len); + let cnt = format_string(value, spare); + dst.set_len(old_len + cnt); + cnt + } } -#[test] -fn test_small_strings_16_bytes() { - // Exactly 16 bytes - SSE register boundary - let s16 = "0123456789abcdef"; - assert_eq!(s16.len(), 16); - assert_eq!(escape(s16), serde_json::to_string(s16).unwrap()); - - // 16 bytes with escapes - let s16_esc = "01234567\t9abcde"; - assert_eq!(s16_esc.len(), 15); // \t is 1 byte - assert_eq!(escape(s16_esc), serde_json::to_string(s16_esc).unwrap()); -} +#[cfg(test)] +mod tests { + use std::fs::read_dir; + use std::path::{Path, PathBuf}; -#[test] -fn test_medium_strings_32_bytes() { - // Exactly 32 bytes - AVX2 register boundary - let s32 = "0123456789abcdef0123456789abcdef"; - assert_eq!(s32.len(), 32); - assert_eq!(escape(s32), serde_json::to_string(s32).unwrap()); + use rand::seq::SliceRandom; - // 32 bytes with escapes at different positions - let s32_esc = "0123456789abcde\"0123456789abcde"; - assert_eq!(escape(s32_esc), serde_json::to_string(s32_esc).unwrap()); -} + use super::*; -#[test] -fn test_large_strings_128_bytes() { - // Exactly 128 bytes - main loop size - let s128 = "0123456789abcdef".repeat(8); - assert_eq!(s128.len(), 128); - assert_eq!(escape(&s128), serde_json::to_string(&s128).unwrap()); - - // 128 bytes with escapes spread throughout - let mut s128_esc = String::new(); - for i in 0..8 { - if i % 2 == 0 { - s128_esc.push_str("0123456789abcd\n"); - } else { - s128_esc.push_str("0123456789abcd\""); + #[test] + fn test_escape_ascii_json_string() { + let fixture = r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#; + assert_eq!(escape(fixture), serde_json::to_string(fixture).unwrap()); + } + + #[test] + fn test_escape_json_string() { + let mut fixture = String::new(); + for i in 0u8..=0x1F { + fixture.push(i as char); } + fixture.push('\t'); + fixture.push('\x08'); + fixture.push('\x09'); + fixture.push('\x0A'); + fixture.push('\x0C'); + fixture.push('\x0D'); + fixture.push('\x22'); + fixture.push('\x5C'); + fixture.push_str("normal string"); + fixture.push('😊'); + fixture.push_str("中文 English 🚀 \n❓ 𝄞"); + escape(fixture.as_str()); + assert_eq!( + escape(fixture.as_str()), + serde_json::to_string(fixture.as_str()).unwrap(), + "fixture: {:?}", + fixture + ); } - assert_eq!(escape(&s128_esc), serde_json::to_string(&s128_esc).unwrap()); -} -#[test] -fn test_unaligned_data() { - // Test strings that start at various alignments - for offset in 0..32 { - let padding = " ".repeat(offset); - let test_str = format!("{}{}", padding, "test\nstring\"with\\escapes"); - let result = escape(&test_str[offset..]); - let expected = serde_json::to_string(&test_str[offset..]).unwrap(); - assert_eq!(result, expected, "Failed at offset {}", offset); + // Test cases for various string sizes to cover different SIMD paths + + #[test] + fn test_empty_string() { + assert_eq!(escape(""), r#""""#); } -} -#[test] -fn test_sparse_escapes() { - // Large string with escapes only at the beginning and end - let mut s = String::new(); - s.push('"'); - s.push_str(&"a".repeat(500)); - s.push('\\'); - assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); -} + #[test] + fn test_very_small_strings() { + // Less than 16 bytes (SSE register size) + assert_eq!(escape("a"), r#""a""#); + assert_eq!(escape("ab"), r#""ab""#); + assert_eq!(escape("hello"), r#""hello""#); + assert_eq!(escape("hello\n"), 
r#""hello\n""#); + assert_eq!(escape("\""), r#""\"""#); + assert_eq!(escape("\\"), r#""\\""#); + assert_eq!(escape("\t"), r#""\t""#); + assert_eq!(escape("\r\n"), r#""\r\n""#); + } -#[test] -fn test_dense_escapes() { - // String with many escapes - let s = "\"\\\"\\\"\\\"\\".repeat(50); - assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); + #[test] + fn test_small_strings_16_bytes() { + // Exactly 16 bytes - SSE register boundary + let s16 = "0123456789abcdef"; + assert_eq!(s16.len(), 16); + assert_eq!(escape(s16), serde_json::to_string(s16).unwrap()); + + // 16 bytes with escapes + let s16_esc = "01234567\t9abcde"; + assert_eq!(s16_esc.len(), 15); // \t is 1 byte + assert_eq!(escape(s16_esc), serde_json::to_string(s16_esc).unwrap()); + } - // All control characters - let mut ctrl = String::new(); - for _ in 0..10 { - for i in 0u8..32 { - ctrl.push(i as char); + #[test] + fn test_medium_strings_32_bytes() { + // Exactly 32 bytes - AVX2 register boundary + let s32 = "0123456789abcdef0123456789abcdef"; + assert_eq!(s32.len(), 32); + assert_eq!(escape(s32), serde_json::to_string(s32).unwrap()); + + // 32 bytes with escapes at different positions + let s32_esc = "0123456789abcde\"0123456789abcde"; + assert_eq!(escape(s32_esc), serde_json::to_string(s32_esc).unwrap()); + } + + #[test] + fn test_large_strings_128_bytes() { + // Exactly 128 bytes - main loop size + let s128 = "0123456789abcdef".repeat(8); + assert_eq!(s128.len(), 128); + assert_eq!(escape(&s128), serde_json::to_string(&s128).unwrap()); + + // 128 bytes with escapes spread throughout + let mut s128_esc = String::new(); + for i in 0..8 { + if i % 2 == 0 { + s128_esc.push_str("0123456789abcd\n"); + } else { + s128_esc.push_str("0123456789abcd\""); + } + } + assert_eq!(escape(&s128_esc), serde_json::to_string(&s128_esc).unwrap()); + } + + #[test] + fn test_unaligned_data() { + // Test strings that start at various alignments + for offset in 0..32 { + let padding = " ".repeat(offset); + let test_str = format!("{}{}", padding, "test\nstring\"with\\escapes"); + let result = escape(&test_str[offset..]); + let expected = serde_json::to_string(&test_str[offset..]).unwrap(); + assert_eq!(result, expected, "Failed at offset {}", offset); } } - assert_eq!(escape(&ctrl), serde_json::to_string(&ctrl).unwrap()); -} -#[test] -fn test_boundary_conditions() { - // Test around 256 byte boundary (common cache line multiple) - for size in 250..260 { - let s = "a".repeat(size); + #[test] + fn test_sparse_escapes() { + // Large string with escapes only at the beginning and end + let mut s = String::new(); + s.push('"'); + s.push_str(&"a".repeat(500)); + s.push('\\'); + assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); + } + + #[test] + fn test_dense_escapes() { + // String with many escapes + let s = "\"\\\"\\\"\\\"\\".repeat(50); assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); - // With escape at the end - let mut s_esc = "a".repeat(size - 1); - s_esc.push('"'); - assert_eq!(escape(&s_esc), serde_json::to_string(&s_esc).unwrap()); + // All control characters + let mut ctrl = String::new(); + for _ in 0..10 { + for i in 0u8..32 { + ctrl.push(i as char); + } + } + assert_eq!(escape(&ctrl), serde_json::to_string(&ctrl).unwrap()); + } + + #[test] + fn test_boundary_conditions() { + // Test around 256 byte boundary (common cache line multiple) + for size in 250..260 { + let s = "a".repeat(size); + assert_eq!(escape(&s), serde_json::to_string(&s).unwrap()); + + // With escape at the end + let mut s_esc = "a".repeat(size - 1); + 
s_esc.push('"'); + assert_eq!(escape(&s_esc), serde_json::to_string(&s_esc).unwrap()); + } } -} -#[test] -fn test_all_escape_types() { - // Test each escape type individually - assert_eq!(escape("\x00"), r#""\u0000""#); - assert_eq!(escape("\x08"), r#""\b""#); - assert_eq!(escape("\x09"), r#""\t""#); - assert_eq!(escape("\x0A"), r#""\n""#); - assert_eq!(escape("\x0C"), r#""\f""#); - assert_eq!(escape("\x0D"), r#""\r""#); - assert_eq!(escape("\x1F"), r#""\u001f""#); - assert_eq!(escape("\""), r#""\"""#); - assert_eq!(escape("\\"), r#""\\""#); - - // Test all control characters - for i in 0u8..32 { - let s = String::from_utf8(vec![i]).unwrap(); - let result = escape(&s); - let expected = serde_json::to_string(&s).unwrap(); - assert_eq!(result, expected, "Failed for byte 0x{:02x}", i); + #[test] + fn test_all_escape_types() { + // Test each escape type individually + assert_eq!(escape("\x00"), r#""\u0000""#); + assert_eq!(escape("\x08"), r#""\b""#); + assert_eq!(escape("\x09"), r#""\t""#); + assert_eq!(escape("\x0A"), r#""\n""#); + assert_eq!(escape("\x0C"), r#""\f""#); + assert_eq!(escape("\x0D"), r#""\r""#); + assert_eq!(escape("\x1F"), r#""\u001f""#); + assert_eq!(escape("\""), r#""\"""#); + assert_eq!(escape("\\"), r#""\\""#); + + // Test all control characters + for i in 0u8..32 { + let s = String::from_utf8(vec![i]).unwrap(); + let result = escape(&s); + let expected = serde_json::to_string(&s).unwrap(); + assert_eq!(result, expected, "Failed for byte 0x{:02x}", i); + } } -} -#[test] -fn test_mixed_content() { - // Mix of ASCII, escapes, and multi-byte UTF-8 - let mixed = r#"Hello "World"! + #[test] + fn test_mixed_content() { + // Mix of ASCII, escapes, and multi-byte UTF-8 + let mixed = r#"Hello "World"! Tab: Here Emoji: 😀 Chinese: 中文 Math: ∑∫∂ Music: 𝄞 Escape: \" \\ \n \r \t"#; - assert_eq!(escape(mixed), serde_json::to_string(mixed).unwrap()); -} - -#[test] -fn test_repeated_patterns() { - // Patterns that might benefit from or confuse SIMD operations - let pattern1 = "abcd".repeat(100); - assert_eq!(escape(&pattern1), serde_json::to_string(&pattern1).unwrap()); + assert_eq!(escape(mixed), serde_json::to_string(mixed).unwrap()); + } - let pattern2 = "a\"b\"".repeat(100); - assert_eq!(escape(&pattern2), serde_json::to_string(&pattern2).unwrap()); + #[test] + fn test_repeated_patterns() { + // Patterns that might benefit from or confuse SIMD operations + let pattern1 = "abcd".repeat(100); + assert_eq!(escape(&pattern1), serde_json::to_string(&pattern1).unwrap()); - let pattern3 = "\t\n".repeat(100); - assert_eq!(escape(&pattern3), serde_json::to_string(&pattern3).unwrap()); -} + let pattern2 = "a\"b\"".repeat(100); + assert_eq!(escape(&pattern2), serde_json::to_string(&pattern2).unwrap()); -#[test] -fn test_rxjs() { - let dir = glob::glob("node_modules/rxjs/src/**/*.ts").unwrap(); - let mut sources = Vec::new(); - for entry in dir { - sources.push(std::fs::read_to_string(entry.unwrap()).unwrap()); + let pattern3 = "\t\n".repeat(100); + assert_eq!(escape(&pattern3), serde_json::to_string(&pattern3).unwrap()); } - assert!(!sources.is_empty()); - for source in sources { - assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); - let mut output = String::with_capacity(source.len() * 6 + 32 + 3); - escape_into(&source, unsafe { output.as_mut_vec() }); - assert_eq!(output, serde_json::to_string(&source).unwrap()); + + #[test] + fn test_rxjs() { + let mut sources = Vec::new(); + read_dir_recursive("node_modules/rxjs/src", &mut sources, |p| { + 
matches!(p.extension().and_then(|e| e.to_str()), Some("ts")) + }) + .unwrap(); + assert!(!sources.is_empty()); + sources.shuffle(&mut rand::rng()); + for source in sources + .iter() + .take(if cfg!(miri) { 10 } else { sources.len() }) + { + assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); + let mut output = String::new(); + escape_into(&source, unsafe { output.as_mut_vec() }); + assert_eq!(output, serde_json::to_string(&source).unwrap()); + } } -#[test] -fn test_sources() { - let ts_paths = glob::glob("fixtures/**/*.ts").unwrap(); - let tsx_paths = glob::glob("fixtures/**/*.tsx").unwrap(); - let js_paths = glob::glob("fixtures/**/*.js").unwrap(); - let mjs_paths = glob::glob("fixtures/**/*.mjs").unwrap(); - let cjs_paths = glob::glob("fixtures/**/*.cjs").unwrap(); - let mut sources = Vec::new(); - for entry in ts_paths - .chain(tsx_paths) - .chain(js_paths) - .chain(mjs_paths) - .chain(cjs_paths) - { - let p = entry.unwrap(); - if std::fs::metadata(&p).unwrap().is_file() { - sources.push(std::fs::read_to_string(&p).unwrap()); + #[test] + fn test_sources() { + for source in load_affine_sources().unwrap() { + assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); + let mut output = String::with_capacity(source.len() * 6 + 32 + 3); + escape_into(&source, unsafe { output.as_mut_vec() }); + assert_eq!(output, serde_json::to_string(&source).unwrap()); } } - assert!(!sources.is_empty()); - for source in sources { - assert_eq!(escape(&source), serde_json::to_string(&source).unwrap()); - let mut output = String::with_capacity(source.len() * 6 + 32 + 3); - escape_into(&source, unsafe { output.as_mut_vec() }); - assert_eq!(output, serde_json::to_string(&source).unwrap()); + + fn load_affine_sources() -> Result<impl Iterator<Item = String>, std::io::Error> { + let mut sources = Vec::new(); + read_dir_recursive("fixtures", &mut sources, |p| { + matches!( + p.extension().and_then(|e| e.to_str()), + Some("ts") | Some("tsx") | Some("js") | Some("mjs") | Some("cjs") + ) + })?; + assert!(!sources.is_empty()); + let len = sources.len(); + sources.shuffle(&mut rand::rng()); + Ok(sources.into_iter().take(if cfg!(miri) { 10 } else { len })) + } + + fn read_dir_recursive<P: AsRef<Path>, F: Fn(PathBuf) -> bool + Copy>( + dir: P, + sources: &mut Vec<String>, + f: F, + ) -> Result<(), std::io::Error> { + let dir = read_dir(dir)?; + for entry in dir { + let p = entry?; + let metadata = std::fs::metadata(p.path())?; + if metadata.is_file() { + if f(p.path()) { + sources.push(std::fs::read_to_string(p.path())?); + } + } + if metadata.is_dir() { + read_dir_recursive(p.path(), sources, f)?; + } + } + Ok(()) + } } diff --git a/src/util.rs b/src/util.rs deleted file mode 100644 index a2e2fd6..0000000 --- a/src/util.rs +++ /dev/null @@ -1,17 +0,0 @@ -#[inline(always)] -pub(crate) fn check_cross_page(ptr: *const u8, step: usize) -> bool { - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - // Check if reading 'step' bytes from 'ptr' would cross a page boundary - // Page size is typically 4096 bytes on x86_64 Linux and macOS - const PAGE_SIZE: usize = 4096; - ((ptr as usize & (PAGE_SIZE - 1)) + step) > PAGE_SIZE - } - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - // On other platforms, always use the safe path with temporary buffer - // to avoid potential page faults - true - } -} diff --git a/src/x86.rs b/src/x86.rs deleted file mode 100644 index 7cd8b21..0000000 --- a/src/x86.rs +++ /dev/null @@ -1,845 +0,0 @@ -#![allow(unsafe_op_in_unsafe_fn)] - -use std::arch::x86_64::{ - __m128i, __m256i, __m512i,
_MM_HINT_T0, _mm_add_epi8, _mm_cmpeq_epi8, _mm_cmpgt_epi8, - _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, - _mm256_add_epi8, _mm256_cmpeq_epi8, _mm256_cmpgt_epi8, _mm256_load_si256, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_set1_epi8, _mm512_cmpeq_epi8_mask, - _mm512_cmplt_epu8_mask, _mm512_load_si512, _mm512_loadu_si512, _mm512_set1_epi8, -}; - -use crate::generic::{ESCAPE, ESCAPE_TABLE, HEX_BYTES, UU}; -use crate::util::check_cross_page; - -// Constants for control character detection using signed comparison trick -const TRANSLATION_A: i8 = i8::MAX - 31i8; -const BELOW_A: i8 = i8::MAX - (31i8 - 0i8) - 1; -const B: i8 = 34i8; // '"' -const C: i8 = 92i8; // '\\' - -const M512_VECTOR_SIZE: usize = std::mem::size_of::<__m512i>(); -const M256_VECTOR_SIZE: usize = std::mem::size_of::<__m256i>(); -const M128_VECTOR_SIZE: usize = std::mem::size_of::<__m128i>(); -pub(crate) const LOOP_SIZE_AVX2: usize = 4 * M256_VECTOR_SIZE; // Process 128 bytes at a time -pub(crate) const LOOP_SIZE_AVX512: usize = 4 * M512_VECTOR_SIZE; // Process 256 bytes at a time -const PREFETCH_DISTANCE_AVX2: usize = 256; // Prefetch 256 bytes ahead for AVX2 -const PREFETCH_DISTANCE_AVX512: usize = 512; // Prefetch 512 bytes ahead for AVX512 - -#[inline(always)] -fn sub(a: *const u8, b: *const u8) -> usize { - debug_assert!(b <= a); - (a as usize) - (b as usize) -} - -#[target_feature(enable = "avx512f", enable = "avx512bw")] -#[inline] -pub unsafe fn escape_avx512(bytes: &[u8], result: &mut Vec<u8>) { - let len = bytes.len(); - - let start_ptr = bytes.as_ptr(); - let end_ptr = bytes[len..].as_ptr(); - let mut ptr = start_ptr; - let mut start = 0; - - let v_b = _mm512_set1_epi8(B); - let v_c = _mm512_set1_epi8(C); - let v_ctrl_limit = _mm512_set1_epi8(0x20); - - // Handle alignment - skip if already aligned - const M512_VECTOR_ALIGN: usize = M512_VECTOR_SIZE - 1; - let misalignment = start_ptr as usize & M512_VECTOR_ALIGN; - if misalignment != 0 { - let align = M512_VECTOR_SIZE - misalignment; - let a = _mm512_loadu_si512(ptr as *const __m512i); - - // Check for quotes, backslash, and control characters - let quote_mask = _mm512_cmpeq_epi8_mask(a, v_b); - let slash_mask = _mm512_cmpeq_epi8_mask(a, v_c); - let ctrl_mask = _mm512_cmplt_epu8_mask(a, v_ctrl_limit); - - let mut mask = (quote_mask | slash_mask | ctrl_mask) as u64; - if align < 64 { - mask &= (1u64 << align) - 1; - } - - if mask != 0 { - let at = sub(ptr, start_ptr); - while mask != 0 { - let cur = mask.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - mask &= mask - 1; - } - } - ptr = ptr.add(align); - } - - // Main loop processing 256 bytes at a time - while ptr <= end_ptr.sub(LOOP_SIZE_AVX512) { - debug_assert_eq!(0, (ptr as usize) % M512_VECTOR_SIZE); - - // Prefetch next iteration's data - if ptr.add(LOOP_SIZE_AVX512 + PREFETCH_DISTANCE_AVX512) < end_ptr { - _mm_prefetch( - ptr.add(LOOP_SIZE_AVX512 + PREFETCH_DISTANCE_AVX512) as *const i8, - _MM_HINT_T0, - ); - } - - // Load all 4 vectors at once for better pipelining - let a0 = _mm512_load_si512(ptr as *const __m512i); - let a1 = _mm512_load_si512(ptr.add(M512_VECTOR_SIZE) as *const __m512i); - let a2 = _mm512_load_si512(ptr.add(M512_VECTOR_SIZE * 2) as *const __m512i); - let a3 =
_mm512_load_si512(ptr.add(M512_VECTOR_SIZE * 3) as *const __m512i); - - // Check for quotes (") in all vectors - let quote_0 = _mm512_cmpeq_epi8_mask(a0, v_b); - let quote_1 = _mm512_cmpeq_epi8_mask(a1, v_b); - let quote_2 = _mm512_cmpeq_epi8_mask(a2, v_b); - let quote_3 = _mm512_cmpeq_epi8_mask(a3, v_b); - - // Check for backslash (\) in all vectors - let slash_0 = _mm512_cmpeq_epi8_mask(a0, v_c); - let slash_1 = _mm512_cmpeq_epi8_mask(a1, v_c); - let slash_2 = _mm512_cmpeq_epi8_mask(a2, v_c); - let slash_3 = _mm512_cmpeq_epi8_mask(a3, v_c); - - // Check for control characters (< 0x20) in all vectors - let ctrl_0 = _mm512_cmplt_epu8_mask(a0, v_ctrl_limit); - let ctrl_1 = _mm512_cmplt_epu8_mask(a1, v_ctrl_limit); - let ctrl_2 = _mm512_cmplt_epu8_mask(a2, v_ctrl_limit); - let ctrl_3 = _mm512_cmplt_epu8_mask(a3, v_ctrl_limit); - - // Combine all masks - let mask_a = quote_0 | slash_0 | ctrl_0; - let mask_b = quote_1 | slash_1 | ctrl_1; - let mask_c = quote_2 | slash_2 | ctrl_2; - let mask_d = quote_3 | slash_3 | ctrl_3; - - // Fast path: check if any escaping needed - let any_escape = mask_a | mask_b | mask_c | mask_d; - - if any_escape == 0 { - // No escapes needed, copy whole chunk - let at = sub(ptr, start_ptr); - if start < at { - result.extend_from_slice(&bytes[start..at]); - } - result.extend_from_slice(std::slice::from_raw_parts(ptr, LOOP_SIZE_AVX512)); - start = at + LOOP_SIZE_AVX512; - } else { - // Process each 64-byte chunk that has escapes - if mask_a != 0 { - process_mask_avx512(ptr, start_ptr, result, &mut start, bytes, mask_a, 0); - } - if mask_b != 0 { - process_mask_avx512( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_b, - M512_VECTOR_SIZE, - ); - } - if mask_c != 0 { - process_mask_avx512( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_c, - M512_VECTOR_SIZE * 2, - ); - } - if mask_d != 0 { - process_mask_avx512( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_d, - M512_VECTOR_SIZE * 3, - ); - } - } - - ptr = ptr.add(LOOP_SIZE_AVX512); - } - - // Process remaining aligned chunks - while ptr <= end_ptr.sub(M512_VECTOR_SIZE) { - debug_assert_eq!(0, (ptr as usize) % M512_VECTOR_SIZE); - let a = _mm512_load_si512(ptr as *const __m512i); - - let quote_mask = _mm512_cmpeq_epi8_mask(a, v_b); - let slash_mask = _mm512_cmpeq_epi8_mask(a, v_c); - let ctrl_mask = _mm512_cmplt_epu8_mask(a, v_ctrl_limit); - - let mut mask = (quote_mask | slash_mask | ctrl_mask) as u64; - - if mask != 0 { - let at = sub(ptr, start_ptr); - while mask != 0 { - let cur = mask.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - mask &= mask - 1; - } - } - ptr = ptr.add(M512_VECTOR_SIZE); - } - - // Handle tail - if ptr < end_ptr { - let remaining = sub(end_ptr, ptr); - let d = M512_VECTOR_SIZE - remaining; - - // Use temporary buffer if reading would cross page boundary - let a = if check_cross_page(ptr.sub(d), M512_VECTOR_SIZE) { - let mut temp = [0u8; M512_VECTOR_SIZE]; - // Copy remaining bytes to the beginning of temp buffer - std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), remaining); - _mm512_loadu_si512(temp.as_ptr() as *const __m512i) - } else { - _mm512_loadu_si512(ptr.sub(d) as *const __m512i) - }; - - let quote_mask = _mm512_cmpeq_epi8_mask(a, v_b); - let slash_mask = _mm512_cmpeq_epi8_mask(a, v_c); - let ctrl_mask = 
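- // Tail trick: the non-cross-page load above was backed up by d bytes so it
- // ends flush with end_ptr; shifting the combined mask right by d (below)
- // drops the lanes before ptr, while the cross-page path keeps only the low
- // `remaining` bits instead.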
_mm512_cmplt_epu8_mask(a, v_ctrl_limit); - - let mut mask = if check_cross_page(ptr.sub(d), M512_VECTOR_SIZE) { - // When using temp buffer, only check the valid bytes - (quote_mask | slash_mask | ctrl_mask) as u64 & ((1u64 << remaining) - 1) - } else { - ((quote_mask | slash_mask | ctrl_mask) as u64).wrapping_shr(d as u32) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - while mask != 0 { - let cur = mask.trailing_zeros() as usize; - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - debug_assert!(escape_byte != 0); - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - mask &= mask - 1; - } - } - } - - // Copy any remaining bytes - if start < len { - result.extend_from_slice(&bytes[start..]); - } -} - -#[target_feature(enable = "avx2")] -#[inline] -pub unsafe fn escape_avx2(bytes: &[u8], result: &mut Vec<u8>) { - let len = bytes.len(); - - let start_ptr = bytes.as_ptr(); - let end_ptr = bytes[len..].as_ptr(); - let mut ptr = start_ptr; - let mut start = 0; - - let v_translation_a = _mm256_set1_epi8(TRANSLATION_A); - let v_below_a = _mm256_set1_epi8(BELOW_A); - let v_b = _mm256_set1_epi8(B); - let v_c = _mm256_set1_epi8(C); - - // Handle alignment - skip if already aligned - const M256_VECTOR_ALIGN: usize = M256_VECTOR_SIZE - 1; - let misalignment = start_ptr as usize & M256_VECTOR_ALIGN; - if misalignment != 0 { - let align = M256_VECTOR_SIZE - misalignment; - let mut mask = { - let a = _mm256_loadu_si256(ptr as *const __m256i); - _mm256_movemask_epi8(_mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - while cur < align { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(align); - } - - // Main loop processing 128 bytes at a time - while ptr <= end_ptr.sub(LOOP_SIZE_AVX2) { - debug_assert_eq!(0, (ptr as usize) % M256_VECTOR_SIZE); - - // Prefetch next iteration's data - if ptr.add(LOOP_SIZE_AVX2 + PREFETCH_DISTANCE_AVX2) < end_ptr { - _mm_prefetch( - ptr.add(LOOP_SIZE_AVX2 + PREFETCH_DISTANCE_AVX2) as *const i8, - _MM_HINT_T0, - ); - } - - // Load all 4 vectors at once for better pipelining - let a0 = _mm256_load_si256(ptr as *const __m256i); - let a1 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE) as *const __m256i); - let a2 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE * 2) as *const __m256i); - let a3 = _mm256_load_si256(ptr.add(M256_VECTOR_SIZE * 3) as *const __m256i); - - // Combined mask computation - all escape conditions in one operation - // This reduces instruction count and improves pipelining - let cmp_a = _mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a0, v_b), _mm256_cmpeq_epi8(a0, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a0, v_translation_a), v_below_a), - ); - let cmp_b = _mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a1, v_b), _mm256_cmpeq_epi8(a1, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a1, v_translation_a), v_below_a), - ); - let cmp_c = _mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a2, v_b), _mm256_cmpeq_epi8(a2, v_c)), -
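- // Control-byte trick: the wrapping add maps 0x00..=0x1F to 96..=127, the only
- // values that compare greater than BELOW_A (95) as signed bytes; every other
- // input wraps to a value at or below 95, so one compare flags all controls.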
_mm256_cmpgt_epi8(_mm256_add_epi8(a2, v_translation_a), v_below_a), - ); - let cmp_d = _mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a3, v_b), _mm256_cmpeq_epi8(a3, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a3, v_translation_a), v_below_a), - ); - - // Fast path: check if any escaping needed - let any_escape = - _mm256_or_si256(_mm256_or_si256(cmp_a, cmp_b), _mm256_or_si256(cmp_c, cmp_d)); - - if _mm256_movemask_epi8(any_escape) == 0 { - // No escapes needed, copy whole chunk - let at = sub(ptr, start_ptr); - if start < at { - result.extend_from_slice(&bytes[start..at]); - } - result.extend_from_slice(std::slice::from_raw_parts(ptr, LOOP_SIZE_AVX2)); - start = at + LOOP_SIZE_AVX2; - } else { - // Get individual masks only when needed - let mask_a = _mm256_movemask_epi8(cmp_a); - let mask_b = _mm256_movemask_epi8(cmp_b); - let mask_c = _mm256_movemask_epi8(cmp_c); - let mask_d = _mm256_movemask_epi8(cmp_d); - - // Process each 32-byte chunk that has escapes - if mask_a != 0 { - process_mask_avx(ptr, start_ptr, result, &mut start, bytes, mask_a, 0); - } - if mask_b != 0 { - process_mask_avx( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_b, - M256_VECTOR_SIZE, - ); - } - if mask_c != 0 { - process_mask_avx( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_c, - M256_VECTOR_SIZE * 2, - ); - } - if mask_d != 0 { - process_mask_avx( - ptr, - start_ptr, - result, - &mut start, - bytes, - mask_d, - M256_VECTOR_SIZE * 3, - ); - } - } - - ptr = ptr.add(LOOP_SIZE_AVX2); - } - - // Process remaining aligned chunks - while ptr <= end_ptr.sub(M256_VECTOR_SIZE) { - debug_assert_eq!(0, (ptr as usize) % M256_VECTOR_SIZE); - let mut mask = { - let a = _mm256_load_si256(ptr as *const __m256i); - _mm256_movemask_epi8(_mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(M256_VECTOR_SIZE); - } - - // Handle tail - if ptr < end_ptr { - let remaining = sub(end_ptr, ptr); - let d = M256_VECTOR_SIZE - remaining; - - // Use temporary buffer if reading would cross page boundary - let a = if check_cross_page(ptr.sub(d), M256_VECTOR_SIZE) { - let mut temp = [0u8; M256_VECTOR_SIZE]; - // Copy remaining bytes to the beginning of temp buffer - std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), remaining); - _mm256_loadu_si256(temp.as_ptr() as *const __m256i) - } else { - _mm256_loadu_si256(ptr.sub(d) as *const __m256i) - }; - - let mut mask = if check_cross_page(ptr.sub(d), M256_VECTOR_SIZE) { - // When using temp buffer, only check the valid bytes - (_mm256_movemask_epi8(_mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) as u32) - & ((1u32 << remaining) - 1) - } else { - (_mm256_movemask_epi8(_mm256_or_si256( - _mm256_or_si256(_mm256_cmpeq_epi8(a, v_b), _mm256_cmpeq_epi8(a, v_c)), - _mm256_cmpgt_epi8(_mm256_add_epi8(a, v_translation_a), v_below_a), - )) as u32) - .wrapping_shr(d as u32) - }; - - if mask != 0 { - 
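- // Bit-scan loop: trailing_zeros locates the next byte needing an escape,
- // and `mask ^= 1 << cur` retires that bit once the byte has been handled.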
let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - } - - // Copy any remaining bytes - if start < len { - result.extend_from_slice(&bytes[start..]); - } -} - -#[target_feature(enable = "sse2")] -#[inline] -pub unsafe fn escape_sse2(bytes: &[u8], result: &mut Vec<u8>) { - let len = bytes.len(); - - let start_ptr = bytes.as_ptr(); - let end_ptr = bytes[len..].as_ptr(); - let mut ptr = start_ptr; - let mut start = 0; - - const M128_VECTOR_ALIGN: usize = M128_VECTOR_SIZE - 1; - - let v_translation_a = _mm_set1_epi8(TRANSLATION_A); - let v_below_a = _mm_set1_epi8(BELOW_A); - let v_b = _mm_set1_epi8(B); - let v_c = _mm_set1_epi8(C); - - // Handle alignment - skip if already aligned - let misalignment = start_ptr as usize & M128_VECTOR_ALIGN; - if misalignment != 0 { - let align = M128_VECTOR_SIZE - misalignment; - let mut mask = { - let a = _mm_loadu_si128(ptr as *const __m128i); - _mm_movemask_epi8(_mm_or_si128( - _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), - _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - while cur < align { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(align); - } - - // Main loop - while ptr <= end_ptr.sub(M128_VECTOR_SIZE) { - debug_assert_eq!(0, (ptr as usize) % M128_VECTOR_SIZE); - let mut mask = { - let a = _mm_load_si128(ptr as *const __m128i); - _mm_movemask_epi8(_mm_or_si128( - _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), - _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - ptr = ptr.add(M128_VECTOR_SIZE); - } - - // Handle tail - if ptr < end_ptr { - let remaining = sub(end_ptr, ptr); - let d = M128_VECTOR_SIZE - remaining; - - // Use temporary buffer if reading would cross page boundary - let a = if check_cross_page(ptr.sub(d), M128_VECTOR_SIZE) { - let mut temp = [0u8; M128_VECTOR_SIZE]; - // Copy remaining bytes to the beginning of temp buffer - std::ptr::copy_nonoverlapping(ptr, temp.as_mut_ptr(), remaining); - _mm_loadu_si128(temp.as_ptr() as *const __m128i) - } else { - _mm_loadu_si128(ptr.sub(d) as *const __m128i) - }; - - let mut mask = if check_cross_page(ptr.sub(d), M128_VECTOR_SIZE) { - // When using temp buffer, only check the valid bytes - (_mm_movemask_epi8(_mm_or_si128( - _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), - _mm_cmpgt_epi8(_mm_add_epi8(a,
v_translation_a), v_below_a), - )) as u16) - & ((1u16 << remaining) - 1) - } else { - (_mm_movemask_epi8(_mm_or_si128( - _mm_or_si128(_mm_cmpeq_epi8(a, v_b), _mm_cmpeq_epi8(a, v_c)), - _mm_cmpgt_epi8(_mm_add_epi8(a, v_translation_a), v_below_a), - )) as u16) - .wrapping_shr(d as u32) - }; - - if mask != 0 { - let at = sub(ptr, start_ptr); - let mut cur = mask.trailing_zeros() as usize; - loop { - let c = *ptr.add(cur); - let escape_byte = ESCAPE[c as usize]; - if escape_byte != 0 { - let i = at + cur; - if start < i { - result.extend_from_slice(&bytes[start..i]); - } - write_escape(result, escape_byte, c); - start = i + 1; - } - mask ^= 1 << cur; - if mask == 0 { - break; - } - cur = mask.trailing_zeros() as usize; - } - } - } - - // Copy any remaining bytes - if start < len { - result.extend_from_slice(&bytes[start..]); - } -} - -#[inline(always)] -unsafe fn process_mask_avx( - ptr: *const u8, - start_ptr: *const u8, - result: &mut Vec<u8>, - start: &mut usize, - bytes: &[u8], - mask: i32, - offset: usize, -) { - let ptr = ptr.add(offset); - let at = sub(ptr, start_ptr); - - // Reserve space upfront to reduce allocations - // Worst case: each byte needs 6 bytes (e.g., \u001f) - let max_needed = 32 * 6; - result.reserve(max_needed); - - // Process mask bits using bit manipulation - let mut remaining = mask as u32; - while remaining != 0 { - let cur = remaining.trailing_zeros() as usize; - let i = at + cur; - - // Copy unescaped portion using copy_nonoverlapping - if *start < i { - let src = bytes.as_ptr().add(*start); - let len = i - *start; - let dst = result.as_mut_ptr().add(result.len()); - std::ptr::copy_nonoverlapping(src, dst, len); - result.set_len(result.len() + len); - } - - // Handle continuous escapes starting from current position - let escape_src = ptr.add(cur); - let mut dst = result.as_mut_ptr().add(result.len()); - let new_src = escape_continuous(escape_src, &mut dst, bytes, start_ptr); - let bytes_written = sub(dst, result.as_mut_ptr().add(result.len())); - result.set_len(result.len() + bytes_written); - - // Update start position - let chars_processed = sub(new_src, escape_src); - *start = i + chars_processed; - - // Clear processed bits from mask - // We need to clear all bits up to and including the last processed character - let bits_to_clear = cur + chars_processed; - if bits_to_clear < 32 { - remaining &= !((1u32 << bits_to_clear) - 1); - } else { - remaining = 0; - } - } -} - -#[inline(always)] -unsafe fn process_mask_avx512( - ptr: *const u8, - start_ptr: *const u8, - result: &mut Vec<u8>, - start: &mut usize, - bytes: &[u8], - mask: u64, - offset: usize, -) { - let ptr = ptr.add(offset); - let at = sub(ptr, start_ptr); - - // Reserve space upfront to reduce allocations - // Worst case: each byte needs 6 bytes (e.g., \u001f) - let max_needed = 64 * 6; - result.reserve(max_needed); - - // Process mask bits using bit manipulation - let mut remaining = mask; - while remaining != 0 { - let cur = remaining.trailing_zeros() as usize; - let i = at + cur; - - // Copy unescaped portion using copy_nonoverlapping - if *start < i { - let src = bytes.as_ptr().add(*start); - let len = i - *start; - let dst = result.as_mut_ptr().add(result.len()); - std::ptr::copy_nonoverlapping(src, dst, len); - result.set_len(result.len() + len); - } - - // Handle continuous escapes starting from current position - let escape_src = ptr.add(cur); - let mut dst = result.as_mut_ptr().add(result.len()); - let new_src = escape_continuous(escape_src, &mut dst, bytes, start_ptr); - let bytes_written =
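- // escape_continuous advances dst through the spare capacity; its distance
- // from the still-unchanged logical end of `result` is the number of bytes
- // to commit via set_len below.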
sub(dst, result.as_mut_ptr().add(result.len())); - result.set_len(result.len() + bytes_written); - - // Update start position - let chars_processed = sub(new_src, escape_src); - *start = i + chars_processed; - - // Clear processed bits from mask - // We need to clear all bits up to and including the last processed character - let bits_to_clear = cur + chars_processed; - if bits_to_clear < 64 { - remaining &= !((1u64 << bits_to_clear) - 1); - } else { - remaining = 0; - } - } -} - -/// Process continuous escaped characters efficiently -/// Returns the new source pointer position -#[inline(always)] -unsafe fn escape_continuous( - src: *const u8, - dst: &mut *mut u8, - bytes: &[u8], - start_ptr: *const u8, -) -> *const u8 { - let mut ptr = src; - - loop { - let c = *ptr; - let escape_byte = ESCAPE[c as usize]; - - if escape_byte == 0 { - break; - } - - let (len, escape_bytes) = ESCAPE_TABLE[c as usize]; - - if len > 0 { - // Copy 8 bytes at once (actual escape + padding) - std::ptr::copy_nonoverlapping(escape_bytes.as_ptr(), *dst, 8); - *dst = dst.add(len as usize); - } else { - // Rare fallback for characters not in table - **dst = b'\\'; - *dst = dst.add(1); - if escape_byte == UU { - std::ptr::copy_nonoverlapping(b"u00".as_ptr(), *dst, 3); - *dst = dst.add(3); - let hex = &HEX_BYTES[c as usize]; - **dst = hex.0; - *dst = dst.add(1); - **dst = hex.1; - *dst = dst.add(1); - } else { - **dst = escape_byte; - *dst = dst.add(1); - } - } - - ptr = ptr.add(1); - - // Check if next character also needs escaping to continue the loop - if ptr >= bytes.as_ptr().add(bytes.len()) || ESCAPE[*ptr as usize] == 0 { - break; - } - } - - ptr -} - -#[inline(always)] -fn write_escape(result: &mut Vec<u8>, escape_byte: u8, c: u8) { - // Use optimized escape table for bulk writing - let (len, bytes) = ESCAPE_TABLE[c as usize]; - if len > 0 { - // Ensure we have enough capacity for the escape sequence - result.reserve(len as usize); - let dst = unsafe { result.as_mut_ptr().add(result.len()) }; - // Use copy_nonoverlapping for fast bulk copy - unsafe { - std::ptr::copy_nonoverlapping(bytes.as_ptr(), dst, 8); - } - // Update the length - only add the actual escape sequence length - unsafe { - result.set_len(result.len() + len as usize); - } - } else { - // Fallback to old method for characters not in the table - result.push(b'\\'); - if escape_byte == UU { - // Unicode escape for control characters - result.extend_from_slice(b"u00"); - let hex_digits = &HEX_BYTES[c as usize]; - result.push(hex_digits.0); - result.push(hex_digits.1); - } else { - // Simple escape - result.push(escape_byte); - } - } -} From 62b9e4a32e31d8428a64e19f91e13cc03efe6939 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 14:01:48 +0800 Subject: [PATCH 10/15] allocate only --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index f525423..dc0d912 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -432,7 +432,7 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } pub fn escape(value: &str) -> String { - let mut buf = vec![0; value.len() * 6 + 32 + 3]; + let mut buf = Vec::with_capacity(value.len() * 6 + 32 + 3); let cnt = format_string(value, &mut buf); unsafe { buf.set_len(cnt) }; unsafe { String::from_utf8_unchecked(buf) } From b77636527a460ef076cdd9d08d431973b4c664f9 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 14:13:38 +0800 Subject: [PATCH 11/15] Fix miri --- src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index
dc0d912..2623cbb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -432,7 +432,9 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { } pub fn escape(value: &str) -> String { - let mut buf = Vec::with_capacity(value.len() * 6 + 32 + 3); + let capacity = value.len() * 6 + 32 + 3; + let mut buf = Vec::with_capacity(capacity); + unsafe { buf.set_len(capacity) }; let cnt = format_string(value, &mut buf); unsafe { buf.set_len(cnt) }; unsafe { String::from_utf8_unchecked(buf) } From edcc1803b7e80ed600b7555c674509577a279223 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 14:17:24 +0800 Subject: [PATCH 12/15] ignore generic --- README.md | 33 +++++++++++++++++++++++---------- benches/escape.rs | 1 + 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9d6773d..de6b290 100644 --- a/README.md +++ b/README.md @@ -88,18 +88,31 @@ Neon enabled.

| Implementation | Median time | vs fastest |
| --------------------- | ------------- | ---------- |
-| **`escape simd`** | **307.20 µs** | **1.00×** |
-| `escape generic` | 490.00 µs | 1.60× |
-| `serde_json` | 570.35 µs | 1.86× |
-| `escape v_jsonescape` | 599.72 µs | 1.95× |
-| `json-escape` | 644.73 µs | 2.10× |
+| **`escape simd`** | **196.07 µs** | **1.00×** |
+| `escape sonic` | 196.32 µs | 1.00× |
+| `escape generic` | 488.37 µs | 2.49× |
+| `serde_json` | 553.08 µs | 2.82× |
+| `escape v_jsonescape` | 618.31 µs | 3.15× |
+| `json-escape` | 446.94 µs | 2.28× |

**Fixtures payload (~300 iterations)**

| Implementation | Median time | vs fastest |
| --------------------- | ------------ | ---------- |
-| **`escape generic`** | **17.89 ms** | **1.00×** |
-| **`escape simd`** | **17.92 ms** | **1.00×** |
-| `serde_json` | 19.78 ms | 1.11× |
-| `escape v_jsonescape` | 21.09 ms | 1.18× |
-| `json-escape` | 22.43 ms | 1.25× |
+| **`escape simd`** | **10.36 ms** | **1.00×** |
+| `escape sonic` | 10.57 ms | 1.02× |
+| `escape generic` | 17.61 ms | 1.70× |
+| `json-escape` | 18.01 ms | 1.74× |
+| `serde_json` | 19.00 ms | 1.83× |
+| `escape v_jsonescape` | 21.38 ms | 2.06× |
+
+**Short string benchmark**
+
+| Implementation | Median time | vs fastest |
+| --------------------- | ------------- | ---------- |
+| **`escape simd`** | **90.58 ns** | **1.00×** |
+| `serde_json` | 139.23 ns | 1.54× |
+| `escape generic` | 146.15 ns | 1.61× |
+| `json-escape` | 173.60 ns | 1.92× |
+| `escape v_jsonescape` | 198.60 ns | 2.19× |
+| `escape sonic` | 199.27 ns | 2.20× |
diff --git a/benches/escape.rs b/benches/escape.rs index 4ff2226..fdb1fd8 100644 --- a/benches/escape.rs +++ b/benches/escape.rs @@ -72,6 +72,7 @@ fn run_benchmarks(c: &mut Criterion, sources: &[String], prefix: &str) { } }) }); + #[cfg(not(feature = "codspeed"))] c.bench_function(&format!("{} escape generic", prefix), |b| { b.iter(|| { for source in sources { From 92f12e5f84869fccca7da9680db7b70975dd8cf9 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 14:22:19 +0800 Subject: [PATCH 13/15] compile warning --- src/lib.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2623cbb..69213d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -316,19 +316,11 @@ unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u } } +#[cfg(any(target_os = "linux", target_os = "macos"))] #[inline(always)] fn check_cross_page(ptr: *const u8, step: usize) -> bool { - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - let page_size = 4096; - ((ptr as usize & (page_size - 1))
+ step) > page_size - } - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - // not check page cross in fallback envs, always true - true - } + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size } #[inline(always)] @@ -396,7 +388,9 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { - v = if check_cross_page(sptr, LANES) { + v = if cfg!(not(any(target_os = "linux", target_os = "macos"))) + || check_cross_page(sptr, LANES) + { std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); load(placeholder[..].as_ptr()) } else { From 217cb2fcd4cb5db578f4664b7cddd0306c72169c Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 14:27:16 +0800 Subject: [PATCH 14/15] compile error --- src/lib.rs | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 69213d1..e0114ec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -388,22 +388,30 @@ fn format_string(value: &str, dst: &mut [u8]) -> usize { #[cfg(miri)] let mut placeholder: [u8; LANES] = [0; LANES]; while nb > 0 { - v = if cfg!(not(any(target_os = "linux", target_os = "macos"))) - || check_cross_page(sptr, LANES) - { - std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); - load(placeholder[..].as_ptr()) - } else { - #[cfg(not(debug_assertions))] - { - // disable memory sanitizer here - load(sptr) - } - #[cfg(debug_assertions)] + v = { + #[cfg(not(any(target_os = "linux", target_os = "macos")))] { std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); load(placeholder[..].as_ptr()) } + #[cfg(any(target_os = "linux", target_os = "macos"))] + { + if check_cross_page(sptr, LANES) { + std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); + load(placeholder[..].as_ptr()) + } else { + #[cfg(not(debug_assertions))] + { + // disable memory sanitizer here + load(sptr) + } + #[cfg(debug_assertions)] + { + std::ptr::copy_nonoverlapping(sptr, placeholder[..].as_mut_ptr(), nb); + load(placeholder[..].as_ptr()) + } + } + } }; v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); From 605d6da5322db251d417d519c3801c73f072e8e7 Mon Sep 17 00:00:00 2001 From: LongYinan Date: Sun, 12 Oct 2025 14:27:50 +0800 Subject: [PATCH 15/15] add woa --- .github/workflows/CI.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ed8e7d3..8dfcac6 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -77,6 +77,8 @@ jobs: os: ubuntu-latest - target: x86_64-pc-windows-msvc os: windows-latest + - target: aarch64-pc-windows-msvc + os: windows-11-arm - target: aarch64-unknown-linux-gnu os: ubuntu-24.04-arm - target: aarch64-apple-darwin