From 6f9d25bd3234eb167046fbd44b809a7c9dec8d10 Mon Sep 17 00:00:00 2001 From: Tyson Andre Date: Mon, 2 Mar 2020 20:12:53 -0500 Subject: [PATCH] Add a way to override the lexer in a Parser instance Currently, this is possible, but it requires a lot of code duplication and pinning to an exact version of tolerant-php-parser, which discourages doing so. I can think of the following use cases for this. My main reason is to reuse the result of token_get_all() elsewhere, but being able to parse `T_FN` in php < 7.4 is also convenient. - Multiple applications needing to use the result of token_get_all for the same file. If none of them modify the array, it's much faster to reuse the same array than to create it again. For example, Phan's language server mode will potentially use tolerant-php-parser. In addition to that, it also uses token_get_all in InlineHTMLPlugin (to check for misuse of inline HTML) and sometimes in BuiltinSuppressionPlugin (to list T_COMMENT/T_DOC_COMMENT containing `@phan-suppress-*`). Aside: https://wiki.php.net/rfc/token_as_object would be faster and more memory efficient than token_get_all() in php 8.0, if it gets approved. - Needing to call tolerant-php-parser on the same token stream multiple times (e.g. an application that modifies the Microsoft\PhpParser\Node instances (but not tokens), or which creates data structures from the original Node but usually discards them to save memory). --- src/Parser.php | 17 ++++++++++++++++- src/PhpTokenizer.php | 18 +++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/Parser.php b/src/Parser.php index 722c447e..5687d751 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -135,6 +135,21 @@ public function __construct() { $this->returnTypeDeclarationTokens = \array_merge([TokenKind::VoidReservedWord, TokenKind::NullReservedWord, TokenKind::FalseReservedWord, TokenKind::StaticKeyword], $this->parameterTypeDeclarationTokens); } + /** + * This method exists so that it can be overridden in subclasses. 
+ * Any subclass must return a token stream that is equivalent to the contents in $fileContents for this to work properly. + * + * Possible reasons for applications to override the lexer: + * + * - Imitate token stream of a newer/older PHP version (e.g. T_FN is only available in php 7.4) + * - Reuse the result of token_get_all to create a Node again. + * - Reuse the result of token_get_all in a different library. + */ + protected function makeLexer(string $fileContents): TokenStreamProviderInterface + { + return TokenStreamProviderFactory::GetTokenStreamProvider($fileContents); + } + /** * Generates AST from source file contents. Returns an instance of SourceFileNode, which is always the top-most * Node-type of the tree. @@ -143,7 +158,7 @@ public function __construct() { * @return SourceFileNode */ public function parseSourceFile(string $fileContents, string $uri = null) : SourceFileNode { - $this->lexer = TokenStreamProviderFactory::GetTokenStreamProvider($fileContents); + $this->lexer = $this->makeLexer($fileContents); $this->reset(); diff --git a/src/PhpTokenizer.php b/src/PhpTokenizer.php index 756cd159..1cc96754 100644 --- a/src/PhpTokenizer.php +++ b/src/PhpTokenizer.php @@ -74,7 +74,7 @@ public static function getTokensArrayFromContent( $content = $prefix . $content; } - $tokens = @\token_get_all($content); + $tokens = static::tokenGetAll($content, $parseContext); $arr = array(); $fullStart = $start = $pos = $initialPos; @@ -147,6 +147,22 @@ public static function getTokensArrayFromContent( return $arr; } + /** + * @param string $content the raw PHP code to tokenize + * @param ?int $parseContext can be SourceElements when extracting doc comments. Unused by this default implementation. + * Having this available may be useful for subclasses to decide whether or not to post-process results, cache results, etc. + * @return array[]|string[] an array of tokens in the format returned by token_get_all(). When concatenated, the text of these tokens must equal $content. + * + * The default implementation calls token_get_all() with `@` error suppression. This exists so that it can be overridden in subclasses, e.g. 
to cache the result of tokenizing entire files. + * Applications using tolerant-php-parser may often need the token stream for tasks that are hard to do with the resulting AST, + * such as iterating over T_COMMENT tokens, checking for inline HTML, + * or looking up all tokens (including skipped tokens) on a given line. + */ + protected static function tokenGetAll(string $content, $parseContext): array + { + return @\token_get_all($content); + } + const TOKEN_MAP = [ T_CLASS_C => TokenKind::Name, T_DIR => TokenKind::Name,