From 54d9a2d23d8a793a30b383853c248e29ae24f015 Mon Sep 17 00:00:00 2001 From: Markus Staab Date: Thu, 1 Aug 2024 18:21:09 +0200 Subject: [PATCH] Extracted RegexGroupParser from RegexArrayShapeMatcher --- conf/config.neon | 3 + src/Type/Php/RegexArrayShapeMatcher.php | 384 +--------------------- src/Type/Php/RegexGroupParser.php | 404 ++++++++++++++++++++++++ 3 files changed, 410 insertions(+), 381 deletions(-) create mode 100644 src/Type/Php/RegexGroupParser.php diff --git a/conf/config.neon b/conf/config.neon index a6a75e7b59..f649636e98 100644 --- a/conf/config.neon +++ b/conf/config.neon @@ -1500,6 +1500,9 @@ services: - class: PHPStan\Type\Php\RegexArrayShapeMatcher + - + class: PHPStan\Type\Php\RegexGroupParser + - class: PHPStan\Type\Php\RegexExpressionHelper diff --git a/src/Type/Php/RegexArrayShapeMatcher.php b/src/Type/Php/RegexArrayShapeMatcher.php index af24eea428..d14f1ce765 100644 --- a/src/Type/Php/RegexArrayShapeMatcher.php +++ b/src/Type/Php/RegexArrayShapeMatcher.php @@ -2,21 +2,11 @@ namespace PHPStan\Type\Php; -use Hoa\Compiler\Llk\Llk; -use Hoa\Compiler\Llk\Parser; -use Hoa\Compiler\Llk\TreeNode; -use Hoa\Exception\Exception; -use Hoa\File\Read; -use Nette\Utils\RegexpException; -use Nette\Utils\Strings; use PhpParser\Node\Expr; use PHPStan\Analyser\Scope; use PHPStan\Php\PhpVersion; -use PHPStan\ShouldNotHappenException; use PHPStan\TrinaryLogic; use PHPStan\Type\Accessory\AccessoryArrayListType; -use PHPStan\Type\Accessory\AccessoryNonEmptyStringType; -use PHPStan\Type\Accessory\AccessoryNumericStringType; use PHPStan\Type\ArrayType; use PHPStan\Type\Constant\ConstantArrayType; use PHPStan\Type\Constant\ConstantArrayTypeBuilder; @@ -24,7 +14,6 @@ use PHPStan\Type\Constant\ConstantStringType; use PHPStan\Type\IntegerRangeType; use PHPStan\Type\IntegerType; -use PHPStan\Type\IntersectionType; use PHPStan\Type\StringType; use PHPStan\Type\Type; use PHPStan\Type\TypeCombinator; @@ -32,14 +21,7 @@ use function array_reverse; use function count; use function in_array; -use function is_int; use function is_string; -use function rtrim; -use function sscanf; -use function str_contains; -use function str_replace; -use function strlen; -use function substr; use const PREG_OFFSET_CAPTURE; use const PREG_PATTERN_ORDER; use const PREG_SET_ORDER; @@ -56,11 +38,10 @@ final class RegexArrayShapeMatcher */ public const PREG_UNMATCHED_AS_NULL_ON_72_73 = 2048; - private static ?Parser $parser = null; - public function __construct( - private PhpVersion $phpVersion, + private RegexGroupParser $regexGroupParser, private RegexExpressionHelper $regexExpressionHelper, + private PhpVersion $phpVersion, ) { } @@ -131,7 +112,7 @@ private function matchPatternType(Type $patternType, ?Type $flagsType, TrinaryLo */ private function matchRegex(string $regex, ?int $flags, TrinaryLogic $wasMatched, bool $matchesAll): ?Type { - $parseResult = $this->parseGroups($regex); + $parseResult = $this->regexGroupParser->parseGroups($regex); if ($parseResult === null) { // regex could not be parsed by Hoa/Regex return null; @@ -500,365 +481,6 @@ private function getValueType(Type $baseType, int $flags, bool $matchesAll): Typ return $valueType; } - /** - * @return array{array, array>, list}|null - */ - private function parseGroups(string $regex): ?array - { - if (self::$parser === null) { - /** @throws void */ - self::$parser = Llk::load(new Read(__DIR__ . '/../../../resources/RegexGrammar.pp')); - } - - try { - Strings::match('', $regex); - } catch (RegexpException) { - // pattern is invalid, so let the RegularExpressionPatternRule report it - return null; - } - - $rawRegex = $this->regexExpressionHelper->removeDelimitersAndModifiers($regex); - try { - $ast = self::$parser->parse($rawRegex); - } catch (Exception) { - return null; - } - - $captureOnlyNamed = false; - if ($this->phpVersion->supportsPregCaptureOnlyNamedGroups()) { - $modifiers = $this->regexExpressionHelper->getPatternModifiers($regex); - $captureOnlyNamed = str_contains($modifiers ?? '', 'n'); - } - - $capturingGroups = []; - $groupCombinations = []; - $alternationId = -1; - $captureGroupId = 100; - $markVerbs = []; - $this->walkRegexAst( - $ast, - false, - $alternationId, - 0, - false, - null, - $captureGroupId, - $capturingGroups, - $groupCombinations, - $markVerbs, - $captureOnlyNamed, - ); - - return [$capturingGroups, $groupCombinations, $markVerbs]; - } - - /** - * @param array $capturingGroups - * @param array> $groupCombinations - * @param list $markVerbs - */ - private function walkRegexAst( - TreeNode $ast, - bool $inAlternation, - int &$alternationId, - int $combinationIndex, - bool $inOptionalQuantification, - RegexCapturingGroup|RegexNonCapturingGroup|null $parentGroup, - int &$captureGroupId, - array &$capturingGroups, - array &$groupCombinations, - array &$markVerbs, - bool $captureOnlyNamed, - ): void - { - $group = null; - if ($ast->getId() === '#capturing') { - $group = new RegexCapturingGroup( - $captureGroupId++, - null, - $inAlternation ? $alternationId : null, - $inOptionalQuantification, - $parentGroup, - $this->createGroupType($ast), - ); - $parentGroup = $group; - } elseif ($ast->getId() === '#namedcapturing') { - $name = $ast->getChild(0)->getValueValue(); - $group = new RegexCapturingGroup( - $captureGroupId++, - $name, - $inAlternation ? $alternationId : null, - $inOptionalQuantification, - $parentGroup, - $this->createGroupType($ast), - ); - $parentGroup = $group; - } elseif ($ast->getId() === '#noncapturing') { - $group = new RegexNonCapturingGroup( - $inAlternation ? $alternationId : null, - $inOptionalQuantification, - $parentGroup, - false, - ); - $parentGroup = $group; - } elseif ($ast->getId() === '#noncapturingreset') { - $group = new RegexNonCapturingGroup( - $inAlternation ? $alternationId : null, - $inOptionalQuantification, - $parentGroup, - true, - ); - $parentGroup = $group; - } - - $inOptionalQuantification = false; - if ($ast->getId() === '#quantification') { - [$min] = $this->getQuantificationRange($ast); - - if ($min === 0) { - $inOptionalQuantification = true; - } - } - - if ($ast->getId() === '#alternation') { - $alternationId++; - $inAlternation = true; - } - - if ($ast->getId() === '#mark') { - $markVerbs[] = $ast->getChild(0)->getValueValue(); - return; - } - - if ( - $group instanceof RegexCapturingGroup && - (!$captureOnlyNamed || $group->isNamed()) - ) { - $capturingGroups[$group->getId()] = $group; - - if (!array_key_exists($alternationId, $groupCombinations)) { - $groupCombinations[$alternationId] = []; - } - if (!array_key_exists($combinationIndex, $groupCombinations[$alternationId])) { - $groupCombinations[$alternationId][$combinationIndex] = []; - } - $groupCombinations[$alternationId][$combinationIndex][] = $group->getId(); - } - - foreach ($ast->getChildren() as $child) { - $this->walkRegexAst( - $child, - $inAlternation, - $alternationId, - $combinationIndex, - $inOptionalQuantification, - $parentGroup, - $captureGroupId, - $capturingGroups, - $groupCombinations, - $markVerbs, - $captureOnlyNamed, - ); - - if ($ast->getId() !== '#alternation') { - continue; - } - - $combinationIndex++; - } - } - - /** @return array{?int, ?int} */ - private function getQuantificationRange(TreeNode $node): array - { - if ($node->getId() !== '#quantification') { - throw new ShouldNotHappenException(); - } - - $min = null; - $max = null; - - $lastChild = $node->getChild($node->getChildrenNumber() - 1); - $value = $lastChild->getValue(); - - // normalize away possessive and lazy quantifier-modifiers - $token = str_replace(['_possessive', '_lazy'], '', $value['token']); - $value = rtrim($value['value'], '+?'); - - if ($token === 'n_to_m') { - if (sscanf($value, '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) { - throw new ShouldNotHappenException(); - } - - $min = $n; - $max = $m; - } elseif ($token === 'n_or_more') { - if (sscanf($value, '{%d,}', $n) !== 1 || !is_int($n)) { - throw new ShouldNotHappenException(); - } - - $min = $n; - } elseif ($token === 'exactly_n') { - if (sscanf($value, '{%d}', $n) !== 1 || !is_int($n)) { - throw new ShouldNotHappenException(); - } - - $min = $n; - $max = $n; - } elseif ($token === 'zero_or_one') { - $min = 0; - $max = 1; - } elseif ($token === 'zero_or_more') { - $min = 0; - } elseif ($token === 'one_or_more') { - $min = 1; - } - - return [$min, $max]; - } - - private function createGroupType(TreeNode $group): Type - { - $isNonEmpty = TrinaryLogic::createMaybe(); - $isNumeric = TrinaryLogic::createMaybe(); - $inOptionalQuantification = false; - - $this->walkGroupAst($group, $isNonEmpty, $isNumeric, $inOptionalQuantification); - - if ($isNumeric->yes()) { - $result = new IntersectionType([new StringType(), new AccessoryNumericStringType()]); - if (!$isNonEmpty->yes()) { - return TypeCombinator::union(new ConstantStringType(''), $result); - } - return $result; - } elseif ($isNonEmpty->yes()) { - return new IntersectionType([new StringType(), new AccessoryNonEmptyStringType()]); - } - - return new StringType(); - } - - private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryLogic &$isNumeric, bool &$inOptionalQuantification): void - { - $children = $ast->getChildren(); - - if ( - $ast->getId() === '#concatenation' - && count($children) > 0 - ) { - $isNonEmpty = TrinaryLogic::createYes(); - } - - if ($ast->getId() === '#quantification') { - [$min] = $this->getQuantificationRange($ast); - - if ($min === 0) { - $inOptionalQuantification = true; - } - if ($min >= 1) { - $isNonEmpty = TrinaryLogic::createYes(); - $inOptionalQuantification = false; - } - } - - if ($ast->getId() === 'token') { - $literalValue = $this->getLiteralValue($ast); - if ($literalValue !== null) { - if (Strings::match($literalValue, '/^\d+$/') === null) { - $isNumeric = TrinaryLogic::createNo(); - } elseif ($isNumeric->maybe()) { - $isNumeric = TrinaryLogic::createYes(); - } - - if (!$inOptionalQuantification) { - $isNonEmpty = TrinaryLogic::createYes(); - } - } - } - - // [^0-9] should not parse as numeric-string, and [^list-everything-but-numbers] is technically - // doable but really silly compared to just \d so we can safely assume the string is not numeric - // for negative classes - if ($ast->getId() === '#negativeclass') { - $isNumeric = TrinaryLogic::createNo(); - } - - foreach ($children as $child) { - $this->walkGroupAst( - $child, - $isNonEmpty, - $isNumeric, - $inOptionalQuantification, - ); - } - } - - private function getLiteralValue(TreeNode $node): ?string - { - if ($node->getId() !== 'token') { - return null; - } - - // token is the token name from grammar without the namespace so literal and class:literal are both called literal here - $token = $node->getValueToken(); - $value = $node->getValueValue(); - - if (in_array($token, ['literal', 'escaped_end_class'], true)) { - if (strlen($node->getValueValue()) > 1 && $value[0] === '\\') { - return substr($value, 1); - } - - return $value; - } - - // literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range - if ($token === 'range') { - return $value; - } - - // literal "[" or "]" inside character classes '[[]' or '[]]' - if (in_array($token, ['class_', '_class_literal'], true)) { - return $value; - } - - // character escape sequences, just return a fixed string - if (in_array($token, ['character', 'dynamic_character', 'character_type'], true)) { - if ($token === 'character_type' && $value === '\d') { - return '0'; - } - - return $value; - } - - // [:digit:] and the like, more support coming later - if ($token === 'posix_class') { - if ($value === '[:digit:]') { - return '0'; - } - if (in_array($value, ['[:alpha:]', '[:alnum:]', '[:upper:]', '[:lower:]', '[:word:]', '[:ascii:]', '[:print:]', '[:xdigit:]', '[:graph:]'], true)) { - return 'a'; - } - if ($value === '[:blank:]') { - return " \t"; - } - if ($value === '[:cntrl:]') { - return "\x00\x1F"; - } - if ($value === '[:space:]') { - return " \t\r\n\v\f"; - } - if ($value === '[:punct:]') { - return '!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~'; - } - } - - if ($token === 'anchor' || $token === 'match_point_reset') { - return ''; - } - - return null; - } - private function getPatternType(Expr $patternExpr, Scope $scope): Type { if ($patternExpr instanceof Expr\BinaryOp\Concat) { diff --git a/src/Type/Php/RegexGroupParser.php b/src/Type/Php/RegexGroupParser.php new file mode 100644 index 0000000000..c6782c16be --- /dev/null +++ b/src/Type/Php/RegexGroupParser.php @@ -0,0 +1,404 @@ +, array>, list}|null + */ + public function parseGroups(string $regex): ?array + { + if (self::$parser === null) { + /** @throws void */ + self::$parser = Llk::load(new Read(__DIR__ . '/../../../resources/RegexGrammar.pp')); + } + + try { + Strings::match('', $regex); + } catch (RegexpException) { + // pattern is invalid, so let the RegularExpressionPatternRule report it + return null; + } + + $rawRegex = $this->regexExpressionHelper->removeDelimitersAndModifiers($regex); + try { + $ast = self::$parser->parse($rawRegex); + } catch (Exception) { + return null; + } + + $captureOnlyNamed = false; + if ($this->phpVersion->supportsPregCaptureOnlyNamedGroups()) { + $modifiers = $this->regexExpressionHelper->getPatternModifiers($regex); + $captureOnlyNamed = str_contains($modifiers ?? '', 'n'); + } + + $capturingGroups = []; + $groupCombinations = []; + $alternationId = -1; + $captureGroupId = 100; + $markVerbs = []; + $this->walkRegexAst( + $ast, + false, + $alternationId, + 0, + false, + null, + $captureGroupId, + $capturingGroups, + $groupCombinations, + $markVerbs, + $captureOnlyNamed, + ); + + return [$capturingGroups, $groupCombinations, $markVerbs]; + } + + /** + * @param array $capturingGroups + * @param array> $groupCombinations + * @param list $markVerbs + */ + private function walkRegexAst( + TreeNode $ast, + bool $inAlternation, + int &$alternationId, + int $combinationIndex, + bool $inOptionalQuantification, + RegexCapturingGroup|RegexNonCapturingGroup|null $parentGroup, + int &$captureGroupId, + array &$capturingGroups, + array &$groupCombinations, + array &$markVerbs, + bool $captureOnlyNamed, + ): void + { + $group = null; + if ($ast->getId() === '#capturing') { + $group = new RegexCapturingGroup( + $captureGroupId++, + null, + $inAlternation ? $alternationId : null, + $inOptionalQuantification, + $parentGroup, + $this->createGroupType($ast), + ); + $parentGroup = $group; + } elseif ($ast->getId() === '#namedcapturing') { + $name = $ast->getChild(0)->getValueValue(); + $group = new RegexCapturingGroup( + $captureGroupId++, + $name, + $inAlternation ? $alternationId : null, + $inOptionalQuantification, + $parentGroup, + $this->createGroupType($ast), + ); + $parentGroup = $group; + } elseif ($ast->getId() === '#noncapturing') { + $group = new RegexNonCapturingGroup( + $inAlternation ? $alternationId : null, + $inOptionalQuantification, + $parentGroup, + false, + ); + $parentGroup = $group; + } elseif ($ast->getId() === '#noncapturingreset') { + $group = new RegexNonCapturingGroup( + $inAlternation ? $alternationId : null, + $inOptionalQuantification, + $parentGroup, + true, + ); + $parentGroup = $group; + } + + $inOptionalQuantification = false; + if ($ast->getId() === '#quantification') { + [$min] = $this->getQuantificationRange($ast); + + if ($min === 0) { + $inOptionalQuantification = true; + } + } + + if ($ast->getId() === '#alternation') { + $alternationId++; + $inAlternation = true; + } + + if ($ast->getId() === '#mark') { + $markVerbs[] = $ast->getChild(0)->getValueValue(); + return; + } + + if ( + $group instanceof RegexCapturingGroup && + (!$captureOnlyNamed || $group->isNamed()) + ) { + $capturingGroups[$group->getId()] = $group; + + if (!array_key_exists($alternationId, $groupCombinations)) { + $groupCombinations[$alternationId] = []; + } + if (!array_key_exists($combinationIndex, $groupCombinations[$alternationId])) { + $groupCombinations[$alternationId][$combinationIndex] = []; + } + $groupCombinations[$alternationId][$combinationIndex][] = $group->getId(); + } + + foreach ($ast->getChildren() as $child) { + $this->walkRegexAst( + $child, + $inAlternation, + $alternationId, + $combinationIndex, + $inOptionalQuantification, + $parentGroup, + $captureGroupId, + $capturingGroups, + $groupCombinations, + $markVerbs, + $captureOnlyNamed, + ); + + if ($ast->getId() !== '#alternation') { + continue; + } + + $combinationIndex++; + } + } + + /** @return array{?int, ?int} */ + private function getQuantificationRange(TreeNode $node): array + { + if ($node->getId() !== '#quantification') { + throw new ShouldNotHappenException(); + } + + $min = null; + $max = null; + + $lastChild = $node->getChild($node->getChildrenNumber() - 1); + $value = $lastChild->getValue(); + + // normalize away possessive and lazy quantifier-modifiers + $token = str_replace(['_possessive', '_lazy'], '', $value['token']); + $value = rtrim($value['value'], '+?'); + + if ($token === 'n_to_m') { + if (sscanf($value, '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) { + throw new ShouldNotHappenException(); + } + + $min = $n; + $max = $m; + } elseif ($token === 'n_or_more') { + if (sscanf($value, '{%d,}', $n) !== 1 || !is_int($n)) { + throw new ShouldNotHappenException(); + } + + $min = $n; + } elseif ($token === 'exactly_n') { + if (sscanf($value, '{%d}', $n) !== 1 || !is_int($n)) { + throw new ShouldNotHappenException(); + } + + $min = $n; + $max = $n; + } elseif ($token === 'zero_or_one') { + $min = 0; + $max = 1; + } elseif ($token === 'zero_or_more') { + $min = 0; + } elseif ($token === 'one_or_more') { + $min = 1; + } + + return [$min, $max]; + } + + private function createGroupType(TreeNode $group): Type + { + $isNonEmpty = TrinaryLogic::createMaybe(); + $isNumeric = TrinaryLogic::createMaybe(); + $inOptionalQuantification = false; + + $this->walkGroupAst($group, $isNonEmpty, $isNumeric, $inOptionalQuantification); + + if ($isNumeric->yes()) { + $result = new IntersectionType([new StringType(), new AccessoryNumericStringType()]); + if (!$isNonEmpty->yes()) { + return TypeCombinator::union(new ConstantStringType(''), $result); + } + return $result; + } elseif ($isNonEmpty->yes()) { + return new IntersectionType([new StringType(), new AccessoryNonEmptyStringType()]); + } + + return new StringType(); + } + + private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryLogic &$isNumeric, bool &$inOptionalQuantification): void + { + $children = $ast->getChildren(); + + if ( + $ast->getId() === '#concatenation' + && count($children) > 0 + ) { + $isNonEmpty = TrinaryLogic::createYes(); + } + + if ($ast->getId() === '#quantification') { + [$min] = $this->getQuantificationRange($ast); + + if ($min === 0) { + $inOptionalQuantification = true; + } + if ($min >= 1) { + $isNonEmpty = TrinaryLogic::createYes(); + $inOptionalQuantification = false; + } + } + + if ($ast->getId() === 'token') { + $literalValue = $this->getLiteralValue($ast); + if ($literalValue !== null) { + if (Strings::match($literalValue, '/^\d+$/') === null) { + $isNumeric = TrinaryLogic::createNo(); + } elseif ($isNumeric->maybe()) { + $isNumeric = TrinaryLogic::createYes(); + } + + if (!$inOptionalQuantification) { + $isNonEmpty = TrinaryLogic::createYes(); + } + } + } + + // [^0-9] should not parse as numeric-string, and [^list-everything-but-numbers] is technically + // doable but really silly compared to just \d so we can safely assume the string is not numeric + // for negative classes + if ($ast->getId() === '#negativeclass') { + $isNumeric = TrinaryLogic::createNo(); + } + + foreach ($children as $child) { + $this->walkGroupAst( + $child, + $isNonEmpty, + $isNumeric, + $inOptionalQuantification, + ); + } + } + + private function getLiteralValue(TreeNode $node): ?string + { + if ($node->getId() !== 'token') { + return null; + } + + // token is the token name from grammar without the namespace so literal and class:literal are both called literal here + $token = $node->getValueToken(); + $value = $node->getValueValue(); + + if (in_array($token, ['literal', 'escaped_end_class'], true)) { + if (strlen($node->getValueValue()) > 1 && $value[0] === '\\') { + return substr($value, 1); + } + + return $value; + } + + // literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range + if ($token === 'range') { + return $value; + } + + // literal "[" or "]" inside character classes '[[]' or '[]]' + if (in_array($token, ['class_', '_class_literal'], true)) { + return $value; + } + + // character escape sequences, just return a fixed string + if (in_array($token, ['character', 'dynamic_character', 'character_type'], true)) { + if ($token === 'character_type' && $value === '\d') { + return '0'; + } + + return $value; + } + + // [:digit:] and the like, more support coming later + if ($token === 'posix_class') { + if ($value === '[:digit:]') { + return '0'; + } + if (in_array($value, ['[:alpha:]', '[:alnum:]', '[:upper:]', '[:lower:]', '[:word:]', '[:ascii:]', '[:print:]', '[:xdigit:]', '[:graph:]'], true)) { + return 'a'; + } + if ($value === '[:blank:]') { + return " \t"; + } + if ($value === '[:cntrl:]') { + return "\x00\x1F"; + } + if ($value === '[:space:]') { + return " \t\r\n\v\f"; + } + if ($value === '[:punct:]') { + return '!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~'; + } + } + + if ($token === 'anchor' || $token === 'match_point_reset') { + return ''; + } + + return null; + } + +}