From 126aefede57dce170c054e9906af8ce22944080b Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Fri, 9 Jun 2023 21:39:01 +0100 Subject: [PATCH] gh-105549: Tokenize separately NUMBER and NAME tokens and allow 0-prefixed literals (GH-105555) (cherry picked from commit b047fa5e56ba725362c64ca3d6fccbdcf51d0cab) Co-authored-by: Pablo Galindo Salgado --- Lib/test/test_tokenize.py | 33 +++++++++++++++++++ ...-06-09-12-59-18.gh-issue-105549.PYfTNp.rst | 2 ++ Parser/tokenizer.c | 13 ++++++-- 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 2c124f062e7fd6..df9c9db322dc94 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -284,7 +284,12 @@ def number_token(s): # this won't work with compound complex inputs continue self.assertEqual(number_token(lit), lit) + # Valid cases with extra underscores in the tokenize module + # See gh-105549 for context + extra_valid_cases = {"0_7", "09_99"} for lit in INVALID_UNDERSCORE_LITERALS: + if lit in extra_valid_cases: + continue try: number_token(lit) except TokenError: @@ -1873,6 +1878,34 @@ def test_indentation_semantics_retained(self): self.check_roundtrip(code) +class InvalidPythonTests(TestCase): + def test_number_followed_by_name(self): + # See issue #gh-105549 + source = "2sin(x)" + expected_tokens = [ + TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'), + TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'), + TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'), + TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'), + TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'), + TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'), + TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + + tokens = list(generate_tokens(StringIO(source).readline)) + self.assertEqual(tokens, expected_tokens) + + def test_number_starting_with_zero(self): + source = "01234" + expected_tokens = [ + TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'), + TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'), + TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + + tokens = list(generate_tokens(StringIO(source).readline)) + self.assertEqual(tokens, expected_tokens) + class CTokenizeTest(TestCase): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst new file mode 100644 index 00000000000000..c3dcaeea7a11cd --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst @@ -0,0 +1,2 @@ +Tokenize separately `NUMBER` and `NAME` tokens that are not ambiguous. Patch +by Pablo Galindo diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 89594e6974fe04..57c98fe5fcaa40 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1600,8 +1600,12 @@ lookahead(struct tok_state *tok, const char *test) } static int -verify_end_of_number(struct tok_state *tok, int c, const char *kind) -{ +verify_end_of_number(struct tok_state *tok, int c, const char *kind) { + if (tok->tok_extra_tokens) { + // When we are parsing extra tokens, we don't want to emit warnings + // about invalid literals, because we want to be a bit more liberal. + return 1; + } /* Emit a deprecation warning only if the numeric literal is immediately * followed by one of keywords which can occur after a numeric literal * in valid code: "and", "else", "for", "if", "in", "is" and "or". @@ -1659,6 +1663,9 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind) static int verify_identifier(struct tok_state *tok) { + if (tok->tok_extra_tokens) { + return 1; + } PyObject *s; if (tok->decoding_erred) return 0; @@ -2318,7 +2325,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t else if (c == 'j' || c == 'J') { goto imaginary; } - else if (nonzero) { + else if (nonzero && !tok->tok_extra_tokens) { /* Old-style octal: now disallowed. */ tok_backup(tok, c); return MAKE_TOKEN(syntaxerror_known_range(