oxc-project · dyxushuai · Feb 7, 2024 · Feb 7, 2024 · Feb 7, 2024 · Feb 7, 2024
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+  "rust-analyzer.showUnlinkedFileNotification": false
+}
diff --git a/Cargo.toml b/Cargo.toml
@@ -137,6 +137,7 @@ tsify                     = { version = "0.4.5" }
 wasm-bindgen              = { version = "0.2" }
 serde-wasm-bindgen        = { version = "0.6.3" }
 
+
 [profile.release.package.oxc_wasm]
 opt-level = 'z'
 

diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs
@@ -1,29 +1,36 @@
 use super::{
     cold_branch,
-    search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    search::{byte_search, simd_byte_match_table, SimdByteMatchTable, SEARCH_BATCH_SIZE},
     Kind, Lexer, SourcePosition,
 };
 use crate::diagnostics;
 
-use std::cmp::max;
-
 use oxc_allocator::String;
 use oxc_span::Span;
 use oxc_syntax::identifier::{
     is_identifier_part, is_identifier_part_unicode, is_identifier_start_unicode,
 };
+use std::{borrow::Cow, cmp::max};
 
 const MIN_ESCAPED_STR_LEN: usize = 16;
 
-static ASCII_ID_START_TABLE: SafeByteMatchTable =
-    safe_byte_match_table!(|b| b.is_ascii_alphabetic() || b == b'_' || b == b'$');
+static ASCII_ID_START_TABLE: SimdByteMatchTable =
+    simd_byte_match_table!(|b| b.is_ascii_alphabetic() || b == b'_' || b == b'$', false);
 
-static NOT_ASCII_ID_CONTINUE_TABLE: SafeByteMatchTable =
-    safe_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'));
+static NOT_ASCII_ID_CONTINUE_TABLE: SimdByteMatchTable =
+    simd_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'), true);
 
 #[inline]
-fn is_identifier_start_ascii_byte(byte: u8) -> bool {
-    ASCII_ID_START_TABLE.matches(byte)
+fn is_identifier_start_ascii_byte(data: Option<(Cow<[u8; SEARCH_BATCH_SIZE]>, usize)>) -> bool {
+    let data = match data {
+        Some(data) => data,
+        None => return false,
+    };
+    let mut iter = ASCII_ID_START_TABLE.matches(data.0.as_ref(), data.1);
+    match iter.next() {
+        Some((offset, _)) => offset == 0,
+        None => false,
+    }
 }
 
 impl<'a> Lexer<'a> {
@@ -224,10 +231,11 @@ impl<'a> Lexer<'a> {
             });
         }
 
+        let pos = self.source.position();
         // Handle if not an ASCII identifier byte.
         // SAFETY: Not at EOF, so safe to read a byte.
-        let b = unsafe { start_pos.read() };
-        if !is_identifier_start_ascii_byte(b) {
+        let data = unsafe { pos.peek_n_with_padding::<SEARCH_BATCH_SIZE>(self.source.end_addr()) };
+        if !is_identifier_start_ascii_byte(data) {
             return self.private_identifier_not_ascii_id();
         }
 

diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
@@ -17,6 +17,7 @@ mod numeric;
 mod punctuation;
 mod regex;
 mod search;
+mod simd;
 mod source;
 mod string;
 mod string_builder;

diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs
@@ -4,8 +4,52 @@
 //! * `byte_match_table!` and `safe_byte_match_table!` macros create those tables at compile time.
 //! * `byte_search!` macro searches source text for first byte matching a byte table.
 
+use super::simd;
+
 /// Batch size for searching
-pub const SEARCH_BATCH_SIZE: usize = 32;
+pub const SEARCH_BATCH_SIZE: usize = simd::ALIGNMENT;
+
+pub struct SimdByteMatchTable(simd::MatchTable);
+
+#[allow(dead_code)]
+impl SimdByteMatchTable {
+    /// Create new `SimdByteMatchTable`.
+    pub const fn new(bytes: [bool; 256], reverse: bool) -> Self {
+        Self(simd::MatchTable::new(bytes, reverse))
+    }
+
+    /// Declare that this table is being used for searching.
+    /// This is a safe no-op here, just as it is for `SafeByteMatchTable`.
+    /// `byte_search!` macro calls `.use_table()` on whatever table it's provided, so this
+    /// method lets a `SimdByteMatchTable` be passed to that macro like the other table types.
+    #[allow(clippy::unused_self)]
+    #[inline]
+    pub const fn use_table(&self) {}
+
+    /// Search a batch of bytes with this `SimdByteMatchTable`, yielding `(offset, byte)` matches.
+    #[inline]
+    pub fn matches<'a>(
+        &'a self,
+        data: &'a [u8; SEARCH_BATCH_SIZE],
+        actual_len: usize,
+    ) -> impl Iterator<Item = (usize, u8)> + 'a {
+        self.0.matches(data, actual_len)
+    }
+}
+
+macro_rules! simd_byte_match_table {
+    (|$byte:ident| $res:expr, $reverse:expr) => {{
+        use crate::lexer::search::SimdByteMatchTable;
+        // Clippy creates warnings because e.g. `simd_byte_match_table!(|b| b == 0, false)`
+        // is expanded to `SimdByteMatchTable::new([(0 == 0), ... ], false)`
+        #[allow(clippy::eq_op)]
+        const TABLE: SimdByteMatchTable = seq_macro::seq!($byte in 0u8..=255 {
+            SimdByteMatchTable::new([ #($res,)* ], $reverse)
+        });
+        TABLE
+    }};
+}
+pub(crate) use simd_byte_match_table;
 
 /// Byte matcher lookup table.
 ///
@@ -158,6 +202,7 @@ pub(crate) use byte_match_table;
 ///   }
 /// }
 /// ```
+#[derive(Debug)]
 #[repr(C, align(64))]
 pub struct SafeByteMatchTable([bool; 256]);
 
@@ -207,10 +252,96 @@ impl SafeByteMatchTable {
     #[inline]
     pub const fn use_table(&self) {}
 
-    /// Test a value against this `SafeByteMatchTable`.
+    /// Yields `(offset, byte)` per match; `offset` is relative to the end of the previous match.
     #[inline]
-    pub const fn matches(&self, b: u8) -> bool {
-        self.0[b as usize]
+    pub fn matches<'a>(
+        &'a self,
+        data: &'a [u8; SEARCH_BATCH_SIZE],
+        actual_len: usize,
+    ) -> impl Iterator<Item = (usize, u8)> + 'a {
+        SafeByteMatchTableIter { table: self, data, actual_len, offset: 0 }
+    }
+}
+
+struct SafeByteMatchTableIter<'a> {
+    table: &'a SafeByteMatchTable,
+    data: &'a [u8; SEARCH_BATCH_SIZE],
+    actual_len: usize,
+    offset: usize,
+}
+
+impl Iterator for SafeByteMatchTableIter<'_> {
+    type Item = (usize, u8);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        for (i, &b) in self.data[self.offset..self.actual_len].iter().enumerate() {
+            self.offset += 1;
+            if self.table.0[b as usize] {
+                return Some((i, b));
+            }
+        }
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::SafeByteMatchTable;
+    use crate::lexer::{source::Source, UniquePromise};
+
+    const SEARCH_BATCH_SIZE: usize = 16;
+    #[test]
+    fn neon_find_non_ascii() {
+        let table = seq_macro::seq!(b in 0u8..=255 {
+            SafeByteMatchTable::new([#(!(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'),)*])
+        });
+        let data = [
+            "AAAAAAAA\"\rAAAAAA",
+            "AAAAAAAAAAAAAAA\"",
+            "AAAAAAAAAAAAAAAA",
+            "AAAAAAAA",
+            "AAAAAAAA\r",
+            "AAAAAAAAAAAAAAA\r",
+        ]
+        .map(|x| Source::new(x, UniquePromise::new_for_tests()));
+        let expected = [
+            (vec![Some((8, b'"')), Some((0, b'\r')), None], SEARCH_BATCH_SIZE),
+            (vec![Some((15, b'"')), None], SEARCH_BATCH_SIZE),
+            (vec![None], SEARCH_BATCH_SIZE),
+            (vec![None], 8),
+            (vec![Some((8, b'\r')), None], 9),
+            (vec![Some((15, b'\r')), None], SEARCH_BATCH_SIZE),
+        ];
+
+        for (idx, d) in data.into_iter().enumerate() {
+            let pos = d.position();
+            let (data, actual_len) =
+                unsafe { pos.peek_n_with_padding::<SEARCH_BATCH_SIZE>(d.end_addr()) }.unwrap();
+            let mut result = table.matches(&data, actual_len);
+            for val in &expected[idx].0 {
+                assert_eq!(result.next(), *val);
+            }
+            assert_eq!(actual_len, expected[idx].1);
+        }
+    }
+
+    #[test]
+    fn neon_find_single_quote_string() {
+        let table = seq_macro::seq!(b in 0u8..=255 {
+            // find `'`, `\r`, `\n` or `\`
+            SafeByteMatchTable::new([#(matches!(b, b'\'' | b'\r' | b'\n' | b'\\'),)*])
+        });
+        let s1 = String::from(138u8 as char);
+        let data = [&s1].map(|x| Source::new(x, UniquePromise::new_for_tests()));
+        let expected = [(None, 2)];
+
+        for (idx, d) in data.into_iter().enumerate() {
+            let pos = d.position();
+            let (data, actual_len) =
+                unsafe { pos.peek_n_with_padding::<SEARCH_BATCH_SIZE>(d.end_addr()) }.unwrap();
+            let mut result = table.matches(&data, actual_len);
+            assert_eq!((result.next(), actual_len), expected[idx]);
+        }
     }
 }
 
@@ -495,93 +626,54 @@ macro_rules! byte_search {
 
         let mut $pos = $start;
         #[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code
-        loop {
-            if $pos.addr() <= $lexer.source.end_for_batch_search_addr() {
-                // Search a batch of `SEARCH_BATCH_SIZE` bytes.
-                // The compiler unrolls this loop.
-                // SAFETY:
-                // `$pos.addr() > lexer.source.end_for_batch_search_addr()` check above ensures there are
-                // at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
-                // So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds.
-                for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE {
-                    // SAFETY: `$pos` cannot go out of bounds in this loop (see above).
-                    let $match_byte = unsafe { $pos.read() };
-                    if $table.matches($match_byte) {
-                        // Found match.
-                        // Check if should continue.
-                        {
-                            let $continue_byte = $match_byte;
-                            if $should_continue {
-                                // Not a match after all - continue searching.
-                                // SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
-                                // See above about UTF-8 character boundaries invariant.
-                                $pos = unsafe { $pos.add(1) };
-                                continue;
-                            }
-                        }
-
-                        // Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes.
-                        // SAFETY: See above about UTF-8 character boundaries invariant.
-                        $lexer.source.set_position($pos);
-
-                        let $match_start = $start;
-                        return $match_handler;
-                    }
-
-                    // No match - continue searching
-                    // SAFETY: `$pos` cannot go out of bounds in this loop (see above).
-                    // Also see above about UTF-8 character boundaries invariant.
-                    $pos = unsafe { $pos.add(1) };
-                }
-                // No match in batch - loop round and searching next batch
-            } else {
-                // Not enough bytes remaining to process as a batch.
-                // This branch marked `#[cold]` as should be very uncommon in normal-length JS files.
-                // Very short JS files will be penalized, but they'll be very fast to parse anyway.
-                // TODO: Could extend very short files with padding during parser initialization
-                // to remove that problem.
-                return crate::lexer::cold_branch(|| {
-                    let end_addr = $lexer.source.end_addr();
-                    while $pos.addr() < end_addr {
-                        // SAFETY: `pos` is not at end of source, so safe to read a byte
-                        let $match_byte = unsafe { $pos.read() };
-                        if $table.matches($match_byte) {
-                            // Found match.
-                            // Check if should continue.
-                            {
-                                let $continue_byte = $match_byte;
-                                if $should_continue {
-                                    // Not a match after all - continue searching.
-                                    // SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
-                                    // See above about UTF-8 character boundaries invariant.
-                                    $pos = unsafe { $pos.add(1) };
-                                    continue;
-                                }
-                            }
-
-                            // Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes.
-                            // SAFETY: See above about UTF-8 character boundaries invariant.
-                            $lexer.source.set_position($pos);
-
-                            let $match_start = $start;
-                            return $match_handler;
-                        }
-
-                        // No match - continue searching
+        while let Some((data, actual_len)) = unsafe {
+            $pos.peek_n_with_padding::<{ crate::lexer::search::SEARCH_BATCH_SIZE }>(
+                $lexer.source.end_addr(),
+            )
+        } {
+            let mut iter = $table.matches(&data, actual_len);
+            let mut remaining = actual_len;
+            while let Some((offset, b)) = iter.next() {
+                // Advance `$pos` to the matched byte within the batch.
+                // SAFETY: A match was found `offset` bytes ahead, so safe to advance `offset` bytes.
+                // See above about UTF-8 character boundaries invariant.
+                $pos = unsafe { $pos.add(offset) };
+                remaining -= offset;
+                // The matched byte was already read by the table's iterator - no unsafe read here.
+                let $match_byte = b;
+                // Found match.
+                // Check if should continue.
+                {
+                    let $continue_byte = $match_byte;
+                    if $should_continue {
+                        // Not a match after all - continue searching.
                         // SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
                         // See above about UTF-8 character boundaries invariant.
                         $pos = unsafe { $pos.add(1) };
+                        remaining -= 1;
+                        continue;
                     }
+                }
+                // Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes.
+                // SAFETY: See above about UTF-8 character boundaries invariant.
+                $lexer.source.set_position($pos);
 
-                    // EOF.
-                    // Advance `lexer.source`'s position to end of file.
-                    $lexer.source.set_position($pos);
-
-                    let $eof_start = $start;
-                    $eof_handler
-                });
+                let $match_start = $start;
+                return $match_handler;
             }
+            // No further match in batch - loop round and search next batch
+
+            // No match - continue searching
+            // SAFETY: `$pos` cannot go out of bounds in this loop (see above).
+            // Also see above about UTF-8 character boundaries invariant.
+            $pos = unsafe { $pos.add(remaining) };
         }
+
+        // EOF.
+        // Advance `lexer.source`'s position to end of file.
+        $lexer.source.set_position($pos);
+        let $eof_start = $start;
+        return $eof_handler;
     }};
 }
 pub(crate) use byte_search;