Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(parser): lexer byte handlers consume ASCII chars faster #2046

Merged
merged 2 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ oxc_prettier = { path = "crates/oxc_prettier" }
oxc_tasks_common = { path = "tasks/common" }
oxc_language_server = { path = "crates/oxc_language_server" }

assert-unchecked = { version = "0.1.2" }
bpaf = { version = "0.9.8" }
bitflags = { version = "2.4.1" }
bumpalo = { version = "3.14.0" }
Expand Down
7 changes: 4 additions & 3 deletions crates/oxc_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ oxc_syntax = { workspace = true }
oxc_diagnostics = { workspace = true }
oxc_index = { workspace = true }

bitflags = { workspace = true }
rustc-hash = { workspace = true }
num-bigint = { workspace = true }
assert-unchecked = { workspace = true }
bitflags = { workspace = true }
rustc-hash = { workspace = true }
num-bigint = { workspace = true }

[dev-dependencies]
oxc_ast = { workspace = true, features = ["serde"] }
Expand Down
125 changes: 89 additions & 36 deletions crates/oxc_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ mod string_builder;
mod token;
mod trivia_builder;

use assert_unchecked::assert_unchecked;
use rustc_hash::FxHashMap;
use std::{collections::VecDeque, str::Chars};

Expand Down Expand Up @@ -270,6 +271,20 @@ impl<'a> Lexer<'a> {
self.current.chars.next().unwrap()
}

/// Consume the current char when it's known to be ASCII, and return it.
///
/// This compiles down to a single instruction, just incrementing `chars` iterator's pointer.
///
/// # Preconditions
///
/// NOTE: Caller must ensure both of the following before calling:
/// * the lexer is not at EOF (the remaining source text is non-empty), and
/// * the first byte of the remaining source text is ASCII (`< 128`).
///
/// NOTE(review): this is declared as a safe `fn`, yet breaking the preconditions above
/// reaches the `unsafe` block below. Consider whether it should be an `unsafe fn`;
/// confirm against the call sites before changing the signature.
#[inline]
fn consume_ascii_char(&mut self) -> char {
// Look at the not-yet-consumed text without advancing the iterator.
let s = self.current.chars.as_str();
// SAFETY: Caller must ensure not at EOF and current char is ASCII.
unsafe {
// Both preconditions are stated as unchecked assertions. Presumably these let the
// optimizer drop the `None` branch and the multi-byte decoding path of `next()`
// below. TODO confirm against the generated code.
assert_unchecked!(!s.is_empty());
assert_unchecked!(s.as_bytes()[0] < 128);
}
// Advance past the char and hand it back to the caller.
self.current.chars.next().unwrap()
}

/// Peek the next char without advancing the position
#[inline]
fn peek(&self) -> Option<char> {
Expand Down Expand Up @@ -1315,28 +1330,33 @@ static BYTE_HANDLERS: [ByteHandler; 128] = [
L_P, IDT, L_R, L_S, L_T, L_U, L_V, L_W, IDT, L_Y, IDT, BEO, PIP, BEC, TLD, ERR, // 7
];

// `\0` `\1` etc
const ERR: ByteHandler = |lexer| {
let c = lexer.consume_char();
// Next char is an ASCII char e.g. `\0`
let c = lexer.consume_ascii_char();
lexer.error(diagnostics::InvalidCharacter(c, lexer.unterminated_range()));
Kind::Undetermined
};

// <TAB> <VT> <FF>
// <SPACE> <TAB> <VT> <FF>
const SPS: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is an ASCII space character
lexer.consume_ascii_char();
Kind::WhiteSpace
};

// '\r' '\n'
const LIN: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `\r` or `\n`, which are both ASCII
lexer.consume_ascii_char();
lexer.current.token.is_on_new_line = true;
Kind::NewLine
};

// !
const EXL: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `!`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') {
if lexer.next_eq('=') {
Kind::Neq2
Expand All @@ -1350,7 +1370,8 @@ const EXL: ByteHandler = |lexer| {

// ' "
const QOT: ByteHandler = |lexer| {
let c = lexer.consume_char();
// Next char is `'` or `"`, which are both ASCII
let c = lexer.consume_ascii_char();
if lexer.context == LexerContext::JsxAttributeValue {
lexer.read_jsx_string_literal(c)
} else {
Expand All @@ -1360,7 +1381,8 @@ const QOT: ByteHandler = |lexer| {

// #
const HAS: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `#`, which is ASCII
lexer.consume_ascii_char();
// HashbangComment ::
// `#!` SingleLineCommentChars?
if lexer.current.token.start == 0 && lexer.next_eq('!') {
Expand All @@ -1377,7 +1399,8 @@ const IDT: ByteHandler = |lexer| {

// %
const PRC: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `%`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') {
Kind::PercentEq
} else {
Expand All @@ -1387,7 +1410,8 @@ const PRC: ByteHandler = |lexer| {

// &
const AMP: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `&`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('&') {
if lexer.next_eq('=') {
Kind::Amp2Eq
Expand All @@ -1403,19 +1427,22 @@ const AMP: ByteHandler = |lexer| {

// (
const PNO: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `(`, which is ASCII
lexer.consume_ascii_char();
Kind::LParen
};

// )
const PNC: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `)`, which is ASCII
lexer.consume_ascii_char();
Kind::RParen
};

// *
const ATR: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `*`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('*') {
if lexer.next_eq('=') {
Kind::Star2Eq
Expand All @@ -1431,7 +1458,8 @@ const ATR: ByteHandler = |lexer| {

// +
const PLS: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `+`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('+') {
Kind::Plus2
} else if lexer.next_eq('=') {
Expand All @@ -1443,25 +1471,29 @@ const PLS: ByteHandler = |lexer| {

// ,
const COM: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `,`, which is ASCII
lexer.consume_ascii_char();
Kind::Comma
};

// -
const MIN: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `-`, which is ASCII
lexer.consume_ascii_char();
lexer.read_minus().unwrap_or_else(|| lexer.skip_single_line_comment())
};

// .
const PRD: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `.`, which is ASCII
lexer.consume_ascii_char();
lexer.read_dot()
};

// /
const SLH: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `/`, which is ASCII
lexer.consume_ascii_char();
match lexer.peek() {
Some('/') => {
lexer.current.chars.next();
Expand All @@ -1484,37 +1516,43 @@ const SLH: ByteHandler = |lexer| {

// 0
const ZER: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `0`, which is ASCII
lexer.consume_ascii_char();
lexer.read_zero()
};

// 1 to 9
const DIG: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is an ASCII digit
lexer.consume_ascii_char();
lexer.decimal_literal_after_first_digit()
};

// :
const COL: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `:`, which is ASCII
lexer.consume_ascii_char();
Kind::Colon
};

// ;
const SEM: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `;`, which is ASCII
lexer.consume_ascii_char();
Kind::Semicolon
};

// <
const LSS: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `<`, which is ASCII
lexer.consume_ascii_char();
lexer.read_left_angle().unwrap_or_else(|| lexer.skip_single_line_comment())
};

// =
const EQL: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `=`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') {
if lexer.next_eq('=') {
Kind::Eq3
Expand All @@ -1530,14 +1568,16 @@ const EQL: ByteHandler = |lexer| {

// >
const GTR: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `>`, which is ASCII
lexer.consume_ascii_char();
// `>=` is re-lexed with [Lexer::next_jsx_child]
Kind::RAngle
};

// ?
const QST: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `?`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('?') {
if lexer.next_eq('=') {
Kind::Question2Eq
Expand All @@ -1559,20 +1599,26 @@ const QST: ByteHandler = |lexer| {

// @
const AT_: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `@`, which is ASCII
lexer.consume_ascii_char();
Kind::At
};

// [
const BTO: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `[`, which is ASCII
lexer.consume_ascii_char();
Kind::LBrack
};

// \
const ESC: ByteHandler = |lexer| {
let mut builder = AutoCow::new(lexer);
lexer.consume_char();
let lexer_ref = lexer as &Lexer<'_>;
let mut builder = AutoCow::new(lexer_ref);
// Next char at start of this function was `\`, which is ASCII.
// `AutoCow::new` cannot have changed the state of `lexer.current.chars` iterator,
// as we explicitly passed it only an immutable reference.
lexer.consume_ascii_char();
builder.force_allocation_without_current_ascii_char(lexer);
lexer.identifier_unicode_escape_sequence(&mut builder, true);
let text = lexer.identifier_name(builder);
Expand All @@ -1581,13 +1627,15 @@ const ESC: ByteHandler = |lexer| {

// ]
const BTC: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `]`, which is ASCII
lexer.consume_ascii_char();
Kind::RBrack
};

// ^
const CRT: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `^`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('=') {
Kind::CaretEq
} else {
Expand All @@ -1597,19 +1645,22 @@ const CRT: ByteHandler = |lexer| {

// `
const TPL: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is '`', which is ASCII
lexer.consume_ascii_char();
lexer.read_template_literal(Kind::TemplateHead, Kind::NoSubstitutionTemplate)
};

// {
const BEO: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `{`, which is ASCII
lexer.consume_ascii_char();
Kind::LCurly
};

// |
const PIP: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `|`, which is ASCII
lexer.consume_ascii_char();
if lexer.next_eq('|') {
if lexer.next_eq('=') {
Kind::Pipe2Eq
Expand All @@ -1625,13 +1676,15 @@ const PIP: ByteHandler = |lexer| {

// }
const BEC: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `}`, which is ASCII
lexer.consume_ascii_char();
Kind::RCurly
};

// ~
const TLD: ByteHandler = |lexer| {
lexer.consume_char();
// Next char is `~`, which is ASCII
lexer.consume_ascii_char();
Kind::Tilde
};

Expand Down