Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
A parse forest is the output of the GLR parser. It is a tree-like DAG that represents all possible parse trees without duplicating shared sub-parse structures. This patch was split from https://reviews.llvm.org/D121150. Differential Revision: https://reviews.llvm.org/D122139
- Loading branch information
Showing
5 changed files
with
436 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// A parse forest represents a set of possible parse trees efficiently, it is | ||
// produced by the GLR parser. | ||
// | ||
// Despite the name, its data structure is a tree-like DAG with a single root. | ||
// Multiple ways to parse the same tokens are presented as an ambiguous node | ||
// with all possible interpretations as children. | ||
// Common sub-parses are shared: if two interpretations both parse "1 + 1" as | ||
// "expr := expr + expr", they will share a Sequence node representing the expr. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "clang-pseudo/Grammar.h" | ||
#include "clang-pseudo/Token.h" | ||
#include "llvm/ADT/ArrayRef.h" | ||
#include "llvm/ADT/STLExtras.h" | ||
#include "llvm/Support/Allocator.h" | ||
#include <cstdint> | ||
|
||
namespace clang { | ||
namespace pseudo { | ||
|
||
// A node represents ways to parse a sequence of tokens, it interprets a fixed | ||
// range of tokens as a fixed grammar symbol. | ||
// | ||
// There are different kinds of nodes, some nodes have "children" (stored in a | ||
// trailing array) and have pointers to them. "Children" has different semantics | ||
// depending on the node kinds. For an Ambiguous node, it means all | ||
// possible interpretations; for a Sequence node, it means each symbol on the | ||
// right hand side of the production rule. | ||
// | ||
// Since this is a node in a DAG, a node may have multiple parents. And a node | ||
// doesn't have parent pointers. | ||
class alignas(class ForestNode *) ForestNode { | ||
public: | ||
enum Kind : uint8_t { | ||
// A Terminal node is a single terminal symbol bound to a token. | ||
Terminal, | ||
// A Sequence node is a nonterminal symbol parsed from a grammar rule, | ||
// elements() are the parses of each symbol on the RHS of the rule. | ||
// If the rule is A := X Y Z, the node is for nonterminal A, and elements() | ||
// are [X, Y, Z]. | ||
Sequence, | ||
// An Ambiguous node exposes multiple ways to interpret the code as the | ||
// same symbol, alternatives() are all possible parses. | ||
Ambiguous, | ||
// An Opaque node is a placeholder. It asserts that tokens match a symbol, | ||
// without saying how. | ||
// It is used for lazy-parsing (not parsed yet), or error-recovery (invalid | ||
// code). | ||
Opaque, | ||
}; | ||
Kind kind() const { return K; } | ||
|
||
SymbolID symbol() const { return Symbol; } | ||
|
||
// The start of the token range, it is a poistion within a token stream. | ||
Token::Index startTokenIndex() const { return StartIndex; } | ||
|
||
// Returns the corresponding grammar rule. | ||
// REQUIRES: this is a Sequence node. | ||
RuleID rule() const { | ||
assert(kind() == Sequence); | ||
return Data & ((1 << RuleBits) - 1); | ||
} | ||
// Returns the parses of each element on the RHS of the rule. | ||
// REQUIRES: this is a Sequence node; | ||
llvm::ArrayRef<const ForestNode *> elements() const { | ||
assert(kind() == Sequence); | ||
return children(Data >> RuleBits); | ||
}; | ||
|
||
// Returns all possible interpretations of the code. | ||
// REQUIRES: this is an Ambiguous node. | ||
llvm::ArrayRef<const ForestNode *> alternatives() const { | ||
assert(kind() == Ambiguous); | ||
return children(Data); | ||
} | ||
|
||
std::string dump(const Grammar &) const; | ||
std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const; | ||
|
||
private: | ||
friend class ForestArena; | ||
|
||
ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data) | ||
: StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {} | ||
|
||
ForestNode(const ForestNode &) = delete; | ||
ForestNode &operator=(const ForestNode &) = delete; | ||
ForestNode(ForestNode &&) = delete; | ||
ForestNode &operator=(ForestNode &&) = delete; | ||
|
||
static uint16_t sequenceData(RuleID Rule, | ||
llvm::ArrayRef<const ForestNode *> Elements) { | ||
assert(Rule < (1 << RuleBits)); | ||
assert(Elements.size() < (1 << (16 - RuleBits))); | ||
return Rule | Elements.size() << RuleBits; | ||
} | ||
static uint16_t | ||
ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) { | ||
return Alternatives.size(); | ||
} | ||
|
||
// Retrieves the trailing array. | ||
llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const { | ||
return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1), | ||
Num); | ||
} | ||
|
||
Token::Index StartIndex; | ||
Kind K : 4; | ||
SymbolID Symbol : SymbolBits; | ||
// Sequence - child count : 4 | RuleID : RuleBits (12) | ||
// Ambiguous - child count : 16 | ||
// Terminal, Opaque - unused | ||
uint16_t Data; | ||
// An array of ForestNode* following the object. | ||
}; | ||
// ForestNode may not be destroyed (for BumpPtrAllocator). | ||
static_assert(std::is_trivially_destructible<ForestNode>(), ""); | ||
|
||
// A memory arena for the parse forest. | ||
class ForestArena { | ||
public: | ||
llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code); | ||
ForestNode &createSequence(SymbolID SID, RuleID RID, | ||
llvm::ArrayRef<const ForestNode *> Elements) { | ||
assert(!Elements.empty()); | ||
return create(ForestNode::Sequence, SID, | ||
Elements.front()->startTokenIndex(), | ||
ForestNode::sequenceData(RID, Elements), Elements); | ||
} | ||
ForestNode &createAmbiguous(SymbolID SID, | ||
llvm::ArrayRef<const ForestNode *> Alternatives) { | ||
assert(!Alternatives.empty()); | ||
assert(llvm::all_of(Alternatives, | ||
[SID](const ForestNode *Alternative) { | ||
return SID == Alternative->symbol(); | ||
}) && | ||
"Ambiguous alternatives must represent the same symbol!"); | ||
return create(ForestNode::Ambiguous, SID, | ||
Alternatives.front()->startTokenIndex(), | ||
ForestNode::ambiguousData(Alternatives), Alternatives); | ||
} | ||
ForestNode &createOpaque(SymbolID SID, Token::Index Start) { | ||
return create(ForestNode::Opaque, SID, Start, 0, {}); | ||
} | ||
|
||
size_t nodeCount() const { return NodeCount; } | ||
size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); } | ||
|
||
private: | ||
ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start, | ||
uint16_t Data, | ||
llvm::ArrayRef<const ForestNode *> Elements) { | ||
++NodeCount; | ||
ForestNode *New = new (Arena.Allocate( | ||
sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *), | ||
alignof(ForestNode))) ForestNode(K, SID, Start, Data); | ||
if (!Elements.empty()) | ||
llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1)); | ||
return *New; | ||
} | ||
|
||
llvm::BumpPtrAllocator Arena; | ||
uint32_t NodeCount = 0; | ||
}; | ||
|
||
} // namespace pseudo | ||
} // namespace clang |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "clang-pseudo/Forest.h" | ||
#include "clang-pseudo/Token.h" | ||
#include "llvm/ADT/ArrayRef.h" | ||
#include "llvm/ADT/None.h" | ||
#include "llvm/ADT/STLExtras.h" | ||
#include "llvm/Support/FormatVariadic.h" | ||
|
||
namespace clang { | ||
namespace pseudo { | ||
|
||
std::string ForestNode::dump(const Grammar &G) const { | ||
switch (kind()) { | ||
case Ambiguous: | ||
return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol())); | ||
case Terminal: | ||
return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()), | ||
startTokenIndex()); | ||
case Sequence: | ||
return G.dumpRule(rule()); | ||
case Opaque: | ||
return llvm::formatv("{0} := <opaque>", G.symbolName(symbol())); | ||
} | ||
llvm_unreachable("Unhandled node kind!"); | ||
} | ||
|
||
std::string ForestNode::dumpRecursive(const Grammar &G, | ||
bool Abbreviated) const { | ||
// Count visits of nodes so we can mark those seen multiple times. | ||
llvm::DenseMap<const ForestNode *, /*VisitCount*/ unsigned> VisitCounts; | ||
std::function<void(const ForestNode *)> CountVisits = | ||
[&](const ForestNode *P) { | ||
if (VisitCounts[P]++ > 0) | ||
return; // Don't count children as multiply visited. | ||
if (P->kind() == Ambiguous) | ||
llvm::for_each(P->alternatives(), CountVisits); | ||
else if (P->kind() == Sequence) | ||
llvm::for_each(P->elements(), CountVisits); | ||
}; | ||
CountVisits(this); | ||
|
||
// We print a "#<id>" for nonterminal forest nodes that are being dumped | ||
// multiple times. | ||
llvm::DenseMap<const ForestNode *, size_t> ReferenceIds; | ||
std::string Result; | ||
constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max(); | ||
std::function<void(const ForestNode *, unsigned, Token::Index, | ||
llvm::Optional<SymbolID>)> | ||
Dump = [&](const ForestNode *P, unsigned Level, Token::Index End, | ||
llvm::Optional<SymbolID> ElidedParent) { | ||
llvm::ArrayRef<const ForestNode *> Children; | ||
auto EndOfElement = [&](size_t ChildIndex) { | ||
return ChildIndex + 1 == Children.size() | ||
? End | ||
: Children[ChildIndex + 1]->startTokenIndex(); | ||
}; | ||
if (P->kind() == Ambiguous) { | ||
Children = P->alternatives(); | ||
} else if (P->kind() == Sequence) { | ||
Children = P->elements(); | ||
if (Abbreviated) { | ||
if (P->startTokenIndex() == End) | ||
return; | ||
for (size_t I = 0; I < Children.size(); ++I) | ||
if (Children[I]->startTokenIndex() == P->startTokenIndex() && | ||
EndOfElement(I) == End) { | ||
return Dump( | ||
Children[I], Level, End, | ||
/*ElidedParent=*/ElidedParent.getValueOr(P->symbol())); | ||
} | ||
} | ||
} | ||
|
||
// FIXME: pretty ascii trees | ||
if (End == KEnd) | ||
Result += llvm::formatv("[{0,3}, end) ", P->startTokenIndex()); | ||
else | ||
Result += llvm::formatv("[{0,3}, {1,3}) ", P->startTokenIndex(), End); | ||
Result.append(2 * Level, ' '); | ||
if (ElidedParent.hasValue()) { | ||
Result += G.symbolName(*ElidedParent); | ||
Result += "~"; | ||
} | ||
Result.append(P->dump(G)); | ||
|
||
if (VisitCounts.find(P)->getSecond() > 1 && | ||
P->kind() != ForestNode::Terminal) { | ||
// The first time, print as #1. Later, =#1. | ||
auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1); | ||
Result += | ||
llvm::formatv(" {0}#{1}", It.second ? "" : "=", It.first->second); | ||
} | ||
Result.push_back('\n'); | ||
|
||
++Level; | ||
for (size_t I = 0; I < Children.size(); ++I) | ||
Dump(Children[I], Level, | ||
P->kind() == Sequence ? EndOfElement(I) : End, llvm::None); | ||
}; | ||
Dump(this, 0, KEnd, llvm::None); | ||
return Result; | ||
} | ||
|
||
// Creates one Terminal node per token in Code, in token order, and returns
// them as a contiguous array. The node at position I has start index I and
// the terminal symbol corresponding to token I's kind.
llvm::ArrayRef<ForestNode>
ForestArena::createTerminals(const TokenStream &Code) {
  const auto &Tokens = Code.tokens();
  // Terminals have no children, so they can be packed into one allocation.
  ForestNode *Terminals = Arena.Allocate<ForestNode>(Tokens.size());
  size_t Index = 0;
  for (const auto &T : Tokens) {
    new (&Terminals[Index])
        ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
                   /*Start=*/Index, /*TerminalData*/ 0);
    ++Index;
  }
  // Add to the running total rather than overwriting it, so nodes created
  // before this call remain counted.
  NodeCount += Index;
  return llvm::makeArrayRef(Terminals, Index);
}
|
||
} // namespace pseudo | ||
} // namespace clang |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.