[xt but not xc] Delay string literal unescaping.

Currently string literals are unescaped twice. - Once during lexing in `cook_quoted`/`cook_c_string`/`cook_common`. This one just checks for errors. - Again in `LitKind::from_token_lit`, which is called when lowering AST to HIR, and also in a few other places during expansion. This one actually constructs the unescaped string. It also has error checking code, but that part of the code is actually dead (and has several bugs) because the check during lexing catches all errors! This commit removes the error-check-only unescaping during lexing, and fixes up `LitKind::from_token_lit` so it properly does both checking and construction. This is a backwards-compatible language change: some programs now compile that previously did not. For example, it is now possible for macros to consume "invalid" string literals like "\a\b\c". This is a continuation of a trend of delaying semantic error checking of literals to after expansion: - rust-lang#102944 did this for some cases for numeric literals - The detection of NUL chars in C string literals is already delayed in this way. Notes about test changes: - `ignore-block-help.rs`: this requires a parse error for the test to work. The error used was an unescaping error, which is now delayed to after parsing. So the commit changes it to an "unterminated character literal" error which still occurs during parsing. - `tests/ui/lexer/error-stage.rs`: this shows the newly allowed cases, due to delayed literal unescaping. - Several tests had unescaping errors combined with unterminated literal errors. The former are now delayed but the latter remain as lexing errors. So the unterminated literal part needed to be split into a separate test file otherwise compilation would end before the other errors were reported. - issue-62913.rs: The structure and output changed a bit. Issue rust-lang#62913 was about an ICE due to an unterminated string literal, so the new version should be good enough. - literals-are-validated-before-expansion.rs: this tests exactly the behaviour that has been changed, and so was removed - A couple of other test produce the same errors, just in a different order.
nnethercote · Dec 12, 2023 · f9e3632 · f9e3632
1 parent aa39bd0
commit f9e3632
Show file tree

Hide file tree

Showing 37 changed files with 615 additions and 444 deletions.
diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
diff --git a/compiler/rustc_ast_lowering/src/expr.rs b/compiler/rustc_ast_lowering/src/expr.rs
@@ -14,7 +14,7 @@ use rustc_data_structures::stack::ensure_sufficient_stack;
 use rustc_hir as hir;
 use rustc_hir::def::{DefKind, Res};
 use rustc_middle::span_bug;
-use rustc_parse::parser::report_lit_error;
+use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
 use rustc_span::source_map::{respan, Spanned};
 use rustc_span::symbol::{kw, sym, Ident, Symbol};
 use rustc_span::DUMMY_SP;
@@ -119,13 +119,12 @@ impl<'hir> LoweringContext<'_, 'hir> {
                     hir::ExprKind::Unary(op, ohs)
                 }
                 ExprKind::Lit(token_lit) => {
-                    let lit_kind = match LitKind::from_token_lit(*token_lit) {
-                        Ok(lit_kind) => lit_kind,
-                        Err(err) => {
-                            report_lit_error(&self.tcx.sess.parse_sess, err, *token_lit, e.span);
-                            LitKind::Err
-                        }
-                    };
+                    let lit_kind = token_lit_to_lit_kind_and_report_errs(
+                        &self.tcx.sess.parse_sess,
+                        *token_lit,
+                        e.span,
+                    )
+                    .unwrap_or(LitKind::Err);
                     let lit = self.arena.alloc(respan(self.lower_span(e.span), lit_kind));
                     hir::ExprKind::Lit(lit)
                 }

diff --git a/compiler/rustc_builtin_macros/src/concat.rs b/compiler/rustc_builtin_macros/src/concat.rs
@@ -1,7 +1,7 @@
 use rustc_ast as ast;
 use rustc_ast::tokenstream::TokenStream;
 use rustc_expand::base::{self, DummyResult};
-use rustc_parse::parser::report_lit_error;
+use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
 use rustc_span::symbol::Symbol;
 
 use crate::errors;
@@ -19,44 +19,43 @@ pub fn expand_concat(
     let mut has_errors = false;
     for e in es {
         match e.kind {
-            ast::ExprKind::Lit(token_lit) => match ast::LitKind::from_token_lit(token_lit) {
-                Ok(ast::LitKind::Str(s, _) | ast::LitKind::Float(s, _)) => {
-                    accumulator.push_str(s.as_str());
-                }
-                Ok(ast::LitKind::Char(c)) => {
-                    accumulator.push(c);
-                }
-                Ok(ast::LitKind::Int(i, _)) => {
-                    accumulator.push_str(&i.to_string());
-                }
-                Ok(ast::LitKind::Bool(b)) => {
-                    accumulator.push_str(&b.to_string());
-                }
-                Ok(ast::LitKind::CStr(..)) => {
-                    cx.emit_err(errors::ConcatCStrLit { span: e.span });
-                    has_errors = true;
-                }
-                Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
-                    cx.emit_err(errors::ConcatBytestr { span: e.span });
-                    has_errors = true;
-                }
-                Ok(ast::LitKind::Err) => {
-                    has_errors = true;
-                }
-                Err(err) => {
-                    report_lit_error(&cx.sess.parse_sess, err, token_lit, e.span);
-                    has_errors = true;
+            ast::ExprKind::Lit(token_lit) => {
+                match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, e.span)
+                {
+                    Ok(ast::LitKind::Str(s, _) | ast::LitKind::Float(s, _)) => {
+                        accumulator.push_str(s.as_str());
+                    }
+                    Ok(ast::LitKind::Char(c)) => {
+                        accumulator.push(c);
+                    }
+                    Ok(ast::LitKind::Int(i, _)) => {
+                        accumulator.push_str(&i.to_string());
+                    }
+                    Ok(ast::LitKind::Bool(b)) => {
+                        accumulator.push_str(&b.to_string());
+                    }
+                    Ok(ast::LitKind::CStr(..)) => {
+                        cx.emit_err(errors::ConcatCStrLit { span: e.span });
+                        has_errors = true;
+                    }
+                    Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
+                        cx.emit_err(errors::ConcatBytestr { span: e.span });
+                        has_errors = true;
+                    }
+                    Ok(ast::LitKind::Err) | Err(()) => {
+                        has_errors = true;
+                    }
                 }
-            },
+            }
             // We also want to allow negative numeric literals.
             ast::ExprKind::Unary(ast::UnOp::Neg, ref expr)
                 if let ast::ExprKind::Lit(token_lit) = expr.kind =>
             {
-                match ast::LitKind::from_token_lit(token_lit) {
+                match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, e.span)
+                {
                     Ok(ast::LitKind::Int(i, _)) => accumulator.push_str(&format!("-{i}")),
                     Ok(ast::LitKind::Float(f, _)) => accumulator.push_str(&format!("-{f}")),
-                    Err(err) => {
-                        report_lit_error(&cx.sess.parse_sess, err, token_lit, e.span);
+                    Err(()) => {
                         has_errors = true;
                     }
                     _ => missing_literal.push(e.span),

diff --git a/compiler/rustc_builtin_macros/src/concat_bytes.rs b/compiler/rustc_builtin_macros/src/concat_bytes.rs
@@ -1,7 +1,7 @@
 use rustc_ast as ast;
 use rustc_ast::{ptr::P, tokenstream::TokenStream};
 use rustc_expand::base::{self, DummyResult};
-use rustc_parse::parser::report_lit_error;
+use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
 use rustc_span::Span;
 
 use crate::errors;
@@ -17,7 +17,7 @@ fn invalid_type_err(
         ConcatBytesInvalid, ConcatBytesInvalidSuggestion, ConcatBytesNonU8, ConcatBytesOob,
     };
     let snippet = cx.sess.source_map().span_to_snippet(span).ok();
-    match ast::LitKind::from_token_lit(token_lit) {
+    match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, span) {
         Ok(ast::LitKind::CStr(_, _)) => {
             // Avoid ambiguity in handling of terminal `NUL` by refusing to
             // concatenate C string literals as bytes.
@@ -60,9 +60,7 @@ fn invalid_type_err(
             cx.emit_err(ConcatBytesNonU8 { span });
         }
         Ok(ast::LitKind::ByteStr(..) | ast::LitKind::Byte(_)) => unreachable!(),
-        Err(err) => {
-            report_lit_error(&cx.sess.parse_sess, err, token_lit, span);
-        }
+        Err(()) => {}
     }
 }
 

diff --git a/compiler/rustc_expand/src/base.rs b/compiler/rustc_expand/src/base.rs
@@ -1235,26 +1235,30 @@ pub fn expr_to_spanned_string<'a>(
     let expr = cx.expander().fully_expand_fragment(AstFragment::Expr(expr)).make_expr();
 
     Err(match expr.kind {
-        ast::ExprKind::Lit(token_lit) => match ast::LitKind::from_token_lit(token_lit) {
-            Ok(ast::LitKind::Str(s, style)) => return Ok((s, style, expr.span)),
-            Ok(ast::LitKind::ByteStr(..)) => {
-                let mut err = cx.struct_span_err(expr.span, err_msg);
-                let span = expr.span.shrink_to_lo();
-                err.span_suggestion(
-                    span.with_hi(span.lo() + BytePos(1)),
-                    "consider removing the leading `b`",
-                    "",
-                    Applicability::MaybeIncorrect,
-                );
-                Some((err, true))
-            }
-            Ok(ast::LitKind::Err) => None,
-            Err(err) => {
-                parser::report_lit_error(&cx.sess.parse_sess, err, token_lit, expr.span);
-                None
-            }
-            _ => Some((cx.struct_span_err(expr.span, err_msg), false)),
-        },
+        ast::ExprKind::Lit(token_lit) => {
+            let res = match parser::token_lit_to_lit_kind_and_report_errs(
+                &cx.sess.parse_sess,
+                token_lit,
+                expr.span,
+            ) {
+                Ok(ast::LitKind::Str(s, style)) => return Ok((s, style, expr.span)),
+                Ok(ast::LitKind::ByteStr(..)) => {
+                    let mut err = cx.struct_span_err(expr.span, err_msg);
+                    let span = expr.span.shrink_to_lo();
+                    err.span_suggestion(
+                        span.with_hi(span.lo() + BytePos(1)),
+                        "consider removing the leading `b`",
+                        "",
+                        Applicability::MaybeIncorrect,
+                    );
+                    Some((err, true))
+                }
+                Ok(ast::LitKind::Err) => None,
+                Err(()) => None,
+                _ => Some((cx.struct_span_err(expr.span, err_msg), false)),
+            };
+            res
+        }
         ast::ExprKind::Err => None,
         _ => Some((cx.struct_span_err(expr.span, err_msg), false)),
     })

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
@@ -347,7 +347,7 @@ where
     // them in the range computation.
     while let Some(c) = chars.next() {
         let start = src.len() - chars.as_str().len() - c.len_utf8();
-        let res = match c {
+        let res: Result<T, EscapeError> = match c {
             '\\' => {
                 match chars.clone().next() {
                     Some('\n') => {

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
@@ -1,5 +1,3 @@
-use std::ops::Range;
-
 use crate::errors;
 use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use crate::make_unclosed_delims_error;
@@ -8,7 +6,6 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
 use rustc_ast::tokenstream::TokenStream;
 use rustc_ast::util::unicode::contains_text_flow_control_chars;
 use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey};
-use rustc_lexer::unescape::{self, EscapeError, Mode};
 use rustc_lexer::{Base, DocStyle, RawStrError};
 use rustc_lexer::{Cursor, LiteralKind};
 use rustc_session::lint::builtin::{
@@ -21,10 +18,10 @@ use rustc_span::{edition::Edition, BytePos, Pos, Span};
 
 mod diagnostics;
 mod tokentrees;
-mod unescape_error_reporting;
+pub(crate) mod unescape_error_reporting;
 mod unicode_chars;
 
-use unescape_error_reporting::{emit_unescape_error, escaped_char};
+use unescape_error_reporting::escaped_char;
 
 // This type is used a lot. Make sure it doesn't unintentionally get bigger.
 //
@@ -409,7 +406,7 @@ impl<'a> StringReader<'a> {
                         error_code!(E0762),
                     )
                 }
-                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
+                self.cook_quoted(token::Char, start, end, 1, 1) // ' '
             }
             rustc_lexer::LiteralKind::Byte { terminated } => {
                 if !terminated {
@@ -419,7 +416,7 @@ impl<'a> StringReader<'a> {
                         error_code!(E0763),
                     )
                 }
-                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
+                self.cook_quoted(token::Byte, start, end, 2, 1) // b' '
             }
             rustc_lexer::LiteralKind::Str { terminated } => {
                 if !terminated {
@@ -429,7 +426,7 @@ impl<'a> StringReader<'a> {
                         error_code!(E0765),
                     )
                 }
-                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
+                self.cook_quoted(token::Str, start, end, 1, 1) // " "
             }
             rustc_lexer::LiteralKind::ByteStr { terminated } => {
                 if !terminated {
@@ -439,7 +436,7 @@ impl<'a> StringReader<'a> {
                         error_code!(E0766),
                     )
                 }
-                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
+                self.cook_quoted(token::ByteStr, start, end, 2, 1) // b" "
             }
             rustc_lexer::LiteralKind::CStr { terminated } => {
                 if !terminated {
@@ -449,13 +446,13 @@ impl<'a> StringReader<'a> {
                         error_code!(E0767),
                     )
                 }
-                self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
+                self.cook_quoted(token::CStr, start, end, 2, 1) // c" "
             }
             rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::StrRaw(n_hashes);
-                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
+                    self.cook_quoted(kind, start, end, 2 + n, 1 + n) // r##" "##
                 } else {
                     self.report_raw_str_error(start, 1);
                 }
@@ -464,7 +461,7 @@ impl<'a> StringReader<'a> {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::ByteStrRaw(n_hashes);
-                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
+                    self.cook_quoted(kind, start, end, 3 + n, 1 + n) // br##" "##
                 } else {
                     self.report_raw_str_error(start, 2);
                 }
@@ -473,7 +470,7 @@ impl<'a> StringReader<'a> {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::CStrRaw(n_hashes);
-                    self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
+                    self.cook_quoted(kind, start, end, 3 + n, 1 + n) // cr##" "##
                 } else {
                     self.report_raw_str_error(start, 2);
                 }
@@ -693,82 +690,18 @@ impl<'a> StringReader<'a> {
         self.sess.emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
     }
 
-    fn cook_common(
+    fn cook_quoted(
         &self,
         kind: token::LitKind,
-        mode: Mode,
         start: BytePos,
         end: BytePos,
         prefix_len: u32,
         postfix_len: u32,
-        unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
     ) -> (token::LitKind, Symbol) {
-        let mut has_fatal_err = false;
         let content_start = start + BytePos(prefix_len);
         let content_end = end - BytePos(postfix_len);
         let lit_content = self.str_from_to(content_start, content_end);
-        unescape(lit_content, mode, &mut |range, result| {
-            // Here we only check for errors. The actual unescaping is done later.
-            if let Err(err) = result {
-                let span_with_quotes = self.mk_sp(start, end);
-                let (start, end) = (range.start as u32, range.end as u32);
-                let lo = content_start + BytePos(start);
-                let hi = lo + BytePos(end - start);
-                let span = self.mk_sp(lo, hi);
-                if err.is_fatal() {
-                    has_fatal_err = true;
-                }
-                emit_unescape_error(
-                    &self.sess.span_diagnostic,
-                    lit_content,
-                    span_with_quotes,
-                    span,
-                    mode,
-                    range,
-                    err,
-                );
-            }
-        });
-
-        // We normally exclude the quotes for the symbol, but for errors we
-        // include it because it results in clearer error messages.
-        if !has_fatal_err {
-            (kind, Symbol::intern(lit_content))
-        } else {
-            (token::Err, self.symbol_from_to(start, end))
-        }
-    }
-
-    fn cook_quoted(
-        &self,
-        kind: token::LitKind,
-        mode: Mode,
-        start: BytePos,
-        end: BytePos,
-        prefix_len: u32,
-        postfix_len: u32,
-    ) -> (token::LitKind, Symbol) {
-        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
-            unescape::unescape_literal(src, mode, &mut |span, result| {
-                callback(span, result.map(drop))
-            })
-        })
-    }
-
-    fn cook_c_string(
-        &self,
-        kind: token::LitKind,
-        mode: Mode,
-        start: BytePos,
-        end: BytePos,
-        prefix_len: u32,
-        postfix_len: u32,
-    ) -> (token::LitKind, Symbol) {
-        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
-            unescape::unescape_c_string(src, mode, &mut |span, result| {
-                callback(span, result.map(drop))
-            })
-        })
+        (kind, Symbol::intern(lit_content))
     }
 }