From 46119059f680cd18d9a1d373d8b20a6268a1f3f7 Mon Sep 17 00:00:00 2001 From: Tyson Andre Date: Mon, 3 Aug 2020 14:04:49 -0400 Subject: [PATCH] [Proposal] Bigint shorthand (123n) for GMP objects (i.e. Arbitrary-Precision integers) Motivations: + https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/BigInt is a similar approach that another dynamically typed language recently took to support convenient arbitrary precision. + Supporting bigints as anything other than objects in PHP's type system seemed from the discussion thread to have several drawbacks: - Native bigints by default would cause a B.C. break for extensions or PHP user code relying on float behavior. - Decrease in performance - Require updating opcache's inferences to support big integers + GMP objects already override numeric operators. Implementation: This effectively makes `123_456n` a shorthand for `gmp_init('123456')` Related to https://externals.io/message/77863 TODO: 1. Support LibTomMath or another library backend to implement http://php.net/gmp, (related to the original RFC PR thread) GMP is LGPL, which would cause issues with packagers. 2. Make GMP always-on and use the C library LibTomMath instead of GMP by default, unless that is impossible/impractical. (It wouldn't make sense to me to have a special syntax for big integers that only worked some of the time) Disable any PHP functions that do not have equivalents in LibTomMath. (or make BigInt be a distinct class that always uses LibTomMath) 3. Consider a class_alias to make BigInt an alias of GMP TODO: Support hexadecimal literals and binary literals, if there is interest in this. ------- Drawbacks: + Objects cannot be used as array keys ("Illegal offset type") even if they define `__toString()` + Many developers/users may want arbitrary precision for all integers in a future major version and for that to continue working with scalar type hints, and that would be incompatible with this proposal. 
However, there may also be objections to changing the default. It seems likely that keeping integers as finite precision would be useful for opcache and the JIT to continue to efficiently optimize code. --- Zend/tests/bigint/add.phpt | 23 +++++++++++++++ Zend/tests/bigint/ast.phpt | 16 ++++++++++ Zend/tests/bigint/parse.phpt | 25 ++++++++++++++++ Zend/zend_ast.c | 9 ++++++ Zend/zend_ast.h | 1 + Zend/zend_compile.c | 54 ++++++++++++++++++++++++++++++++++ Zend/zend_language_parser.y | 2 ++ Zend/zend_language_scanner.l | 27 +++++++++++++++++ ext/tokenizer/tokenizer_data.c | 2 ++ 9 files changed, 159 insertions(+) create mode 100644 Zend/tests/bigint/add.phpt create mode 100644 Zend/tests/bigint/ast.phpt create mode 100644 Zend/tests/bigint/parse.phpt diff --git a/Zend/tests/bigint/add.phpt b/Zend/tests/bigint/add.phpt new file mode 100644 index 0000000000000..5181a72ffe17b --- /dev/null +++ b/Zend/tests/bigint/add.phpt @@ -0,0 +1,23 @@ +--TEST-- +Bigint addition tests +--SKIPIF-- + +--FILE-- + +--EXPECTF-- +124 +object(GMP)#1 (1) { + ["num"]=> + string(3) "452" +} +1267650600228229401496703205376 +729000000000000000000000000000000000027 diff --git a/Zend/tests/bigint/ast.phpt b/Zend/tests/bigint/ast.phpt new file mode 100644 index 0000000000000..7974931b8c2ec --- /dev/null +++ b/Zend/tests/bigint/ast.phpt @@ -0,0 +1,16 @@ +--TEST-- +Bigint AST representation in assert() +--SKIPIF-- + +--FILE-- +getMessage() . 
/* {{{ strip_underscores
 * Removes every '_' digit separator from a numeric literal, in place.
 *
 * str: writable buffer; at least *len bytes are readable and the byte at
 *      index *len is writable (room for the terminator).
 * len: in  - number of bytes of str that belong to the literal;
 *      out - number of bytes kept after the separators were removed.
 *
 * Scanning stops at *len bytes or at the first NUL, whichever comes first,
 * so bytes that follow the literal are never inspected. The new length is
 * taken from the write cursor rather than by decrementing *len once per
 * underscore, which means it can never wrap below zero and always equals
 * strlen(str) on return.
 */
static void strip_underscores(char *str, size_t *len)
{
	const size_t in_len = *len;
	size_t kept = 0;
	size_t i;

	for (i = 0; i < in_len && str[i] != '\0'; i++) {
		if (str[i] != '_') {
			str[kept++] = str[i];
		}
	}

	/* kept <= in_len, so this write stays inside the caller's buffer. */
	str[kept] = '\0';
	*len = kept;
}
/* }}} */
const zval *expr_zv = zend_ast_get_zval(expr_ast); + char* new_string = Z_STRVAL_P(expr_zv); + zval new_expr_zv; + size_t len = Z_STRLEN_P(expr_zv) - 1; /* Remove 'n' suffix */ + zend_bool contains_underscores = (memchr(new_string, '_', len) != NULL); + + zval fn_name; + zend_ast *name_ast, *args_ast, *call_ast; + ZEND_ASSERT(Z_TYPE_P(expr_zv) == IS_STRING); + if (contains_underscores) { + new_string = estrndup(new_string, len); + strip_underscores(new_string, &len); + } + ZVAL_STRINGL(&new_expr_zv, new_string, len); + if (contains_underscores) { + efree(new_string); + } + + ZVAL_STRING(&fn_name, "gmp_init"); + new_expr_ast = zend_ast_create_zval(&new_expr_zv); + name_ast = zend_ast_create_zval(&fn_name); + args_ast = zend_ast_create_list(1, ZEND_AST_ARG_LIST, new_expr_ast); + call_ast = zend_ast_create(ZEND_AST_CALL, name_ast, args_ast); + + zend_compile_expr(result, call_ast); + + zval_ptr_dtor(&fn_name); + zval_ptr_dtor(&new_expr_zv); +} +/* }}} */ + void zend_compile_array(znode *result, zend_ast *ast) /* {{{ */ { zend_ast_list *list = zend_ast_get_list(ast); @@ -9558,6 +9609,9 @@ static void zend_compile_expr_inner(znode *result, zend_ast *ast) /* {{{ */ case ZEND_AST_SHELL_EXEC: zend_compile_shell_exec(result, ast); return; + case ZEND_AST_BIGINT: + zend_compile_bigint(result, ast); + return; case ZEND_AST_ARRAY: zend_compile_array(result, ast); return; diff --git a/Zend/zend_language_parser.y b/Zend/zend_language_parser.y index 1a566e352d2d9..62778f2ccac38 100644 --- a/Zend/zend_language_parser.y +++ b/Zend/zend_language_parser.y @@ -88,6 +88,7 @@ static YYSIZE_T zend_yytnamerr(char*, const char*); %token T_LNUMBER "integer" %token T_DNUMBER "floating-point number" +%token T_BIGINT "arbitrary precision integer" %token T_STRING "identifier" %token T_NAME_FULLY_QUALIFIED "fully qualified name" %token T_NAME_RELATIVE "namespace-relative name" @@ -1116,6 +1117,7 @@ expr: | T_EXIT exit_expr { $$ = zend_ast_create(ZEND_AST_EXIT, $2); } | '@' expr { $$ = 
zend_ast_create(ZEND_AST_SILENCE, $2); } | scalar { $$ = $1; } + | T_BIGINT { $$ = zend_ast_create(ZEND_AST_BIGINT, $1); } | '`' backticks_expr '`' { $$ = zend_ast_create(ZEND_AST_SHELL_EXEC, $2); } | T_PRINT expr { $$ = zend_ast_create(ZEND_AST_PRINT, $2); } | T_YIELD { $$ = zend_ast_create(ZEND_AST_YIELD, NULL, NULL); CG(extra_fn_flags) |= ZEND_ACC_GENERATOR; } diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index 7f42159b46698..61222d62ba041 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -1971,6 +1971,33 @@ NEWLINE ("\r"|"\n"|"\r\n") } } +/* Bigint support */ +{LNUM}"n" { + /* Length without the "n" suffix */ + size_t len = yyleng - 1; + char *lnum = yytext; + zend_bool is_octal = lnum[0] == '0'; + + /* Digits 8 and 9 are illegal in octal literals. */ + if (is_octal) { + size_t i; + for (i = 0; i < len; i++) { + if (lnum[i] == '8' || lnum[i] == '9') { + zend_throw_exception(zend_ce_parse_error, "Invalid numeric literal", 0); + if (PARSER_MODE()) { + ZVAL_UNDEF(zendlval); + RETURN_TOKEN(T_ERROR); + } + + /* Continue in order to determine if this is T_LNUMBER or T_DNUMBER. 
*/ + len = i; + break; + } + } + } + RETURN_TOKEN_WITH_STR(T_BIGINT, 0); +} + {LNUM} { size_t len = yyleng; char *end, *lnum = yytext; diff --git a/ext/tokenizer/tokenizer_data.c b/ext/tokenizer/tokenizer_data.c index 5699c57566b7d..3473dd94eb336 100644 --- a/ext/tokenizer/tokenizer_data.c +++ b/ext/tokenizer/tokenizer_data.c @@ -76,6 +76,7 @@ void tokenizer_register_constants(INIT_FUNC_ARGS) { REGISTER_LONG_CONSTANT("T_ELSE", T_ELSE, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("T_LNUMBER", T_LNUMBER, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("T_DNUMBER", T_DNUMBER, CONST_CS | CONST_PERSISTENT); + REGISTER_LONG_CONSTANT("T_BIGINT", T_BIGINT, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("T_STRING", T_STRING, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("T_NAME_FULLY_QUALIFIED", T_NAME_FULLY_QUALIFIED, CONST_CS | CONST_PERSISTENT); REGISTER_LONG_CONSTANT("T_NAME_RELATIVE", T_NAME_RELATIVE, CONST_CS | CONST_PERSISTENT); @@ -225,6 +226,7 @@ char *get_token_type_name(int token_type) case T_ELSE: return "T_ELSE"; case T_LNUMBER: return "T_LNUMBER"; case T_DNUMBER: return "T_DNUMBER"; + case T_BIGINT: return "T_BIGINT"; case T_STRING: return "T_STRING"; case T_NAME_FULLY_QUALIFIED: return "T_NAME_FULLY_QUALIFIED"; case T_NAME_RELATIVE: return "T_NAME_RELATIVE";