From ff0de242434fce97202bec4cd2cea8a6a73327a0 Mon Sep 17 00:00:00 2001 From: martinohmann Date: Fri, 16 Jun 2023 20:08:43 +0200 Subject: [PATCH] perf(parser): handle unescaping of escaped markers in parser code directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The changes in https://github.com/martinohmann/hcl-rs/pull/247 and https://github.com/martinohmann/hcl-rs/pull/249 enabled some improvements to the string parsing code, which this change implements. Namely, the unescaping of escaped template markers (`$${` -> `${` and `%%{` -> `%{`) is now done directly when they are encountered. This speeds up common cases quite a bit. Before: ``` parse/hcl-edit/deeply_nested.tf time: [19.631 µs 19.647 µs 19.668 µs] thrpt: [35.929 MiB/s 35.968 MiB/s 35.997 MiB/s] parse/hcl-edit/large.tf time: [2.1344 ms 2.1389 ms 2.1445 ms] thrpt: [38.015 MiB/s 38.114 MiB/s 38.194 MiB/s] parse/hcl-edit/medium.tf time: [449.06 µs 451.22 µs 453.42 µs] thrpt: [31.667 MiB/s 31.821 MiB/s 31.975 MiB/s] parse/hcl-edit/small.tf time: [27.485 µs 27.600 µs 27.744 µs] thrpt: [34.133 MiB/s 34.311 MiB/s 34.456 MiB/s] ``` After: ``` parse/hcl-edit/deeply_nested.tf time: [18.461 µs 18.488 µs 18.526 µs] thrpt: [38.146 MiB/s 38.223 MiB/s 38.280 MiB/s] change: time: [-5.8990% -4.6557% -2.0981%] (p = 0.00 < 0.05) thrpt: [+2.1431% +4.8830% +6.2688%] Performance has improved. parse/hcl-edit/large.tf time: [1.7640 ms 1.7699 ms 1.7787 ms] thrpt: [45.833 MiB/s 46.061 MiB/s 46.216 MiB/s] change: time: [-17.613% -17.253% -16.777%] (p = 0.00 < 0.05) thrpt: [+20.160% +20.850% +21.379%] Performance has improved. parse/hcl-edit/medium.tf time: [407.78 µs 408.77 µs 409.99 µs] thrpt: [35.022 MiB/s 35.127 MiB/s 35.212 MiB/s] change: time: [-9.9006% -9.4090% -8.9185%] (p = 0.00 < 0.05) thrpt: [+9.7918% +10.386% +10.988%] Performance has improved. 
parse/hcl-edit/small.tf time: [24.139 µs 24.249 µs 24.385 µs] thrpt: [38.835 MiB/s 39.053 MiB/s 39.230 MiB/s] change: time: [-12.375% -12.060% -11.751%] (p = 0.00 < 0.05) thrpt: [+13.316% +13.713% +14.123%] Performance has improved. ``` --- crates/hcl-edit/src/parser/string.rs | 127 +++++++++++++++++-------- crates/hcl-edit/src/parser/template.rs | 24 +++-- 2 files changed, 103 insertions(+), 48 deletions(-) diff --git a/crates/hcl-edit/src/parser/string.rs b/crates/hcl-edit/src/parser/string.rs index 6d1c821f..9afd4af4 100644 --- a/crates/hcl-edit/src/parser/string.rs +++ b/crates/hcl-edit/src/parser/string.rs @@ -5,64 +5,106 @@ use super::{ IResult, Input, }; use crate::{Decorated, Ident, RawString}; -use hcl_primitives::template::unescape_markers; use std::borrow::Cow; use winnow::{ combinator::{alt, cut_err, delimited, fail, not, opt, preceded, repeat, success}, dispatch, stream::AsChar, - token::{any, one_of, tag, take_while}, + token::{any, one_of, take, take_while}, Parser, }; pub(super) fn string(input: Input) -> IResult { - delimited(b'"', opt(build_string), b'"') + delimited(b'"', opt(build_string(quoted_string_fragment)), b'"') .map(Option::unwrap_or_default) - .map(|s| unescape_markers(&s).into()) + .output_into() .parse_next(input) } -pub(super) fn build_string(input: Input) -> IResult> { - let (mut input, mut string) = match string_fragment(input) { - Ok((input, fragment)) => match fragment { - StringFragment::Literal(s) => (input, Cow::Borrowed(s)), - StringFragment::EscapedChar(c) => (input, Cow::Owned(String::from(c))), - }, - Err(err) => return Err(err), - }; - - loop { - match string_fragment(input) { - Ok((rest, fragment)) => { - match fragment { - StringFragment::Literal(s) => string.to_mut().push_str(s), - StringFragment::EscapedChar(c) => string.to_mut().push(c), - }; - input = rest; +pub(super) fn build_string<'a, F>( + mut fragment_parser: F, +) -> impl Parser, Cow<'a, str>, ParseError>> +where + F: Parser, StringFragment<'a>, ParseError>>, 
+{ + move |input: Input<'a>| { + let (mut input, mut string) = match fragment_parser.parse_next(input) { + Ok((input, fragment)) => match fragment { + StringFragment::Literal(s) => (input, Cow::Borrowed(s)), + StringFragment::EscapedChar(c) => (input, Cow::Owned(String::from(c))), + StringFragment::EscapedMarker(m) => (input, Cow::Borrowed(m.unescape())), + }, + Err(err) => return Err(err), + }; + + loop { + match fragment_parser.parse_next(input) { + Ok((rest, fragment)) => { + match fragment { + StringFragment::Literal(s) => string.to_mut().push_str(s), + StringFragment::EscapedChar(c) => string.to_mut().push(c), + StringFragment::EscapedMarker(m) => string.to_mut().push_str(m.unescape()), + }; + input = rest; + } + Err(_) => return Ok((input, string)), } - Err(_) => return Ok((input, string)), } } } /// A string fragment contains a fragment of a string being parsed: either -/// a non-empty Literal (a series of non-escaped characters) or a single -/// parsed escaped character. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum StringFragment<'a> { +/// a non-empty Literal (a series of non-escaped characters), a single +/// parsed escaped character or an escaped template start marker. +#[derive(Clone)] +pub(super) enum StringFragment<'a> { Literal(&'a str), EscapedChar(char), + EscapedMarker(EscapedMarker), } -fn string_fragment(input: Input) -> IResult { +/// An escaped marker which would start a template interpolation or directive if unescaped. +#[derive(Clone)] +pub(super) enum EscapedMarker { + Interpolation, + Directive, +} + +impl EscapedMarker { + // Returns the unescaped form of the escaped marker. 
+ fn unescape(&self) -> &'static str { + match self { + EscapedMarker::Interpolation => "${", + EscapedMarker::Directive => "%{", + } + } +} + +pub(super) fn quoted_string_fragment(input: Input) -> IResult { alt(( + escaped_marker.map(StringFragment::EscapedMarker), string_literal.map(StringFragment::Literal), escaped_char.map(StringFragment::EscapedChar), )) .parse_next(input) } -/// Parse a non-empty block of text that doesn't include `\`, `"` or non-escaped template +pub(super) fn template_string_fragment<'a, F, T>( + mut literal_end: F, +) -> impl Parser, StringFragment<'a>, ParseError>> +where + F: Parser, T, ParseError>>, +{ + move |input: Input<'a>| { + alt(( + escaped_marker.map(StringFragment::EscapedMarker), + any_until(literal_end.by_ref()).map(StringFragment::Literal), + )) + .parse_next(input) + } +} + +/// Parse a non-empty block of text that doesn't include `"` or non-escaped template /// interpolation/directive start markers. fn string_literal(input: Input) -> IResult { let literal_end = dispatch! {any; @@ -70,25 +112,26 @@ fn string_literal(input: Input) -> IResult { b'$' | b'%' => b'{'.value(true), _ => fail, }; - literal_until(literal_end).parse_next(input) + any_until(literal_end).parse_next(input) } -pub(super) fn literal_until<'a, F, T>( - literal_end: F, -) -> impl Parser, &'a str, ParseError>> +fn any_until<'a, F, T>(literal_end: F) -> impl Parser, &'a str, ParseError>> where F: Parser, T, ParseError>>, { - void(repeat( - 1.., - alt(( - tag("$${"), - tag("%%{"), - preceded(not(literal_end), any).recognize(), - )), - )) - .recognize() - .try_map(std::str::from_utf8) + void(repeat(1.., preceded(not(literal_end), any))) + .recognize() + .try_map(std::str::from_utf8) +} + +/// Parse an escaped start marker for a template interpolation or directive. +fn escaped_marker(input: Input) -> IResult { + dispatch! 
{take::<_, Input, _>(3usize); + b"$${" => success(EscapedMarker::Interpolation), + b"%%{" => success(EscapedMarker::Directive), + _ => fail, + } + .parse_next(input) } /// Parse an escaped character: `\n`, `\t`, `\r`, `\u00AC`, etc. diff --git a/crates/hcl-edit/src/parser/template.rs b/crates/hcl-edit/src/parser/template.rs index 26fe3079..737411bf 100644 --- a/crates/hcl-edit/src/parser/template.rs +++ b/crates/hcl-edit/src/parser/template.rs @@ -3,7 +3,10 @@ use super::{ error::ParseError, expr::expr, repr::{decorated, spanned}, - string::{build_string, from_utf8_unchecked, literal_until, raw_string}, + string::{ + build_string, from_utf8_unchecked, quoted_string_fragment, raw_string, + template_string_fragment, + }, trivia::ws, IResult, Input, }; @@ -15,7 +18,6 @@ use crate::{ }, SetSpan, Span, Spanned, }; -use hcl_primitives::template::unescape_markers; use std::borrow::Cow; use winnow::{ ascii::{line_ending, space0}, @@ -24,14 +26,14 @@ use winnow::{ }; pub(super) fn string_template(input: Input) -> IResult { - delimited(b'"', elements(build_string), b'"') + delimited(b'"', elements(build_string(quoted_string_fragment)), b'"') .output_into() .parse_next(input) } pub(super) fn template(input: Input) -> IResult { let literal_end = alt((b"${", b"%{")); - let literal = literal_until(literal_end).output_into(); + let literal = template_literal(literal_end); elements(literal).output_into().parse_next(input) } @@ -51,7 +53,7 @@ pub(super) fn heredoc_template<'a>( // the line ending to the last template element below. let heredoc_end = (line_ending, space0, delim).recognize(); let literal_end = alt((b"${", b"%{", heredoc_end)); - let literal = literal_until(literal_end).output_into(); + let literal = template_literal(literal_end); // Use `opt` to handle an empty template. 
opt((elements(literal), line_ending.with_span()).map( @@ -79,6 +81,16 @@ pub(super) fn heredoc_template<'a>( } } +#[inline] +fn template_literal<'a, F, T>( + literal_end: F, +) -> impl Parser, Cow<'a, str>, ParseError>> +where + F: Parser, T, ParseError>>, +{ + build_string(template_string_fragment(literal_end)) +} + fn elements<'a, P>(literal: P) -> impl Parser, Vec, ParseError>> where P: Parser, Cow<'a, str>, ParseError>>, @@ -86,7 +98,7 @@ where repeat( 0.., spanned(alt(( - literal.map(|s| Element::Literal(Spanned::new(unescape_markers(&s).into()))), + literal.map(|s| Element::Literal(Spanned::new(s.into()))), interpolation.map(Element::Interpolation), directive.map(Element::Directive), ))),