Skip to content

Commit

Permalink
[pseudo] Introduce parse forest.
Browse files Browse the repository at this point in the history
Parse forest is the output of the GLR parser, it is a tree-like DAG
which presents all possible parse trees without duplicating subparse structures.

This is a patch split from https://reviews.llvm.org/D121150.

Differential Revision: https://reviews.llvm.org/D122139
  • Loading branch information
hokein committed Mar 24, 2022
1 parent 9dbc687 commit 62d5f25
Show file tree
Hide file tree
Showing 5 changed files with 436 additions and 0 deletions.
178 changes: 178 additions & 0 deletions clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -0,0 +1,178 @@
//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A parse forest represents a set of possible parse trees efficiently, it is
// produced by the GLR parser.
//
// Despite the name, its data structure is a tree-like DAG with a single root.
// Multiple ways to parse the same tokens are presented as an ambiguous node
// with all possible interpretations as children.
// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
// "expr := expr + expr", they will share a Sequence node representing the expr.
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Grammar.h"
#include "clang-pseudo/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Allocator.h"
#include <cstdint>

namespace clang {
namespace pseudo {

// A node represents ways to parse a sequence of tokens, it interprets a fixed
// range of tokens as a fixed grammar symbol.
//
// There are different kinds of nodes, some nodes have "children" (stored in a
// trailing array) and have pointers to them. "Children" has different semantics
// depending on the node kinds. For an Ambiguous node, it means all
// possible interpretations; for a Sequence node, it means each symbol on the
// right hand side of the production rule.
//
// Since this is a node in a DAG, a node may have multiple parents. And a node
// doesn't have parent pointers.
class alignas(class ForestNode *) ForestNode {
public:
enum Kind : uint8_t {
// A Terminal node is a single terminal symbol bound to a token.
Terminal,
// A Sequence node is a nonterminal symbol parsed from a grammar rule,
// elements() are the parses of each symbol on the RHS of the rule.
// If the rule is A := X Y Z, the node is for nonterminal A, and elements()
// are [X, Y, Z].
Sequence,
// An Ambiguous node exposes multiple ways to interpret the code as the
// same symbol, alternatives() are all possible parses.
Ambiguous,
// An Opaque node is a placeholder. It asserts that tokens match a symbol,
// without saying how.
// It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
// code).
Opaque,
};
Kind kind() const { return K; }

SymbolID symbol() const { return Symbol; }

// The start of the token range, it is a poistion within a token stream.
Token::Index startTokenIndex() const { return StartIndex; }

// Returns the corresponding grammar rule.
// REQUIRES: this is a Sequence node.
RuleID rule() const {
assert(kind() == Sequence);
return Data & ((1 << RuleBits) - 1);
}
// Returns the parses of each element on the RHS of the rule.
// REQUIRES: this is a Sequence node;
llvm::ArrayRef<const ForestNode *> elements() const {
assert(kind() == Sequence);
return children(Data >> RuleBits);
};

// Returns all possible interpretations of the code.
// REQUIRES: this is an Ambiguous node.
llvm::ArrayRef<const ForestNode *> alternatives() const {
assert(kind() == Ambiguous);
return children(Data);
}

std::string dump(const Grammar &) const;
std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;

private:
friend class ForestArena;

ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
: StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}

ForestNode(const ForestNode &) = delete;
ForestNode &operator=(const ForestNode &) = delete;
ForestNode(ForestNode &&) = delete;
ForestNode &operator=(ForestNode &&) = delete;

static uint16_t sequenceData(RuleID Rule,
llvm::ArrayRef<const ForestNode *> Elements) {
assert(Rule < (1 << RuleBits));
assert(Elements.size() < (1 << (16 - RuleBits)));
return Rule | Elements.size() << RuleBits;
}
static uint16_t
ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
return Alternatives.size();
}

// Retrieves the trailing array.
llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
Num);
}

Token::Index StartIndex;
Kind K : 4;
SymbolID Symbol : SymbolBits;
// Sequence - child count : 4 | RuleID : RuleBits (12)
// Ambiguous - child count : 16
// Terminal, Opaque - unused
uint16_t Data;
// An array of ForestNode* following the object.
};
// ForestNode may not be destroyed (for BumpPtrAllocator).
static_assert(std::is_trivially_destructible<ForestNode>(), "");

// A memory arena for the parse forest.
class ForestArena {
public:
llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
ForestNode &createSequence(SymbolID SID, RuleID RID,
llvm::ArrayRef<const ForestNode *> Elements) {
assert(!Elements.empty());
return create(ForestNode::Sequence, SID,
Elements.front()->startTokenIndex(),
ForestNode::sequenceData(RID, Elements), Elements);
}
ForestNode &createAmbiguous(SymbolID SID,
llvm::ArrayRef<const ForestNode *> Alternatives) {
assert(!Alternatives.empty());
assert(llvm::all_of(Alternatives,
[SID](const ForestNode *Alternative) {
return SID == Alternative->symbol();
}) &&
"Ambiguous alternatives must represent the same symbol!");
return create(ForestNode::Ambiguous, SID,
Alternatives.front()->startTokenIndex(),
ForestNode::ambiguousData(Alternatives), Alternatives);
}
ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
return create(ForestNode::Opaque, SID, Start, 0, {});
}

size_t nodeCount() const { return NodeCount; }
size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }

private:
ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
uint16_t Data,
llvm::ArrayRef<const ForestNode *> Elements) {
++NodeCount;
ForestNode *New = new (Arena.Allocate(
sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
alignof(ForestNode))) ForestNode(K, SID, Start, Data);
if (!Elements.empty())
llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
return *New;
}

llvm::BumpPtrAllocator Arena;
uint32_t NodeCount = 0;
};

} // namespace pseudo
} // namespace clang
1 change: 1 addition & 0 deletions clang-tools-extra/pseudo/lib/CMakeLists.txt
Expand Up @@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS Support)

add_clang_library(clangPseudo
DirectiveMap.cpp
Forest.cpp
Grammar.cpp
GrammarBNF.cpp
Lex.cpp
Expand Down
126 changes: 126 additions & 0 deletions clang-tools-extra/pseudo/lib/Forest.cpp
@@ -0,0 +1,126 @@
//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Forest.h"
#include "clang-pseudo/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/FormatVariadic.h"

namespace clang {
namespace pseudo {

std::string ForestNode::dump(const Grammar &G) const {
switch (kind()) {
case Ambiguous:
return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
case Terminal:
return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
startTokenIndex());
case Sequence:
return G.dumpRule(rule());
case Opaque:
return llvm::formatv("{0} := <opaque>", G.symbolName(symbol()));
}
llvm_unreachable("Unhandled node kind!");
}

std::string ForestNode::dumpRecursive(const Grammar &G,
bool Abbreviated) const {
// Count visits of nodes so we can mark those seen multiple times.
llvm::DenseMap<const ForestNode *, /*VisitCount*/ unsigned> VisitCounts;
std::function<void(const ForestNode *)> CountVisits =
[&](const ForestNode *P) {
if (VisitCounts[P]++ > 0)
return; // Don't count children as multiply visited.
if (P->kind() == Ambiguous)
llvm::for_each(P->alternatives(), CountVisits);
else if (P->kind() == Sequence)
llvm::for_each(P->elements(), CountVisits);
};
CountVisits(this);

// We print a "#<id>" for nonterminal forest nodes that are being dumped
// multiple times.
llvm::DenseMap<const ForestNode *, size_t> ReferenceIds;
std::string Result;
constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
std::function<void(const ForestNode *, unsigned, Token::Index,
llvm::Optional<SymbolID>)>
Dump = [&](const ForestNode *P, unsigned Level, Token::Index End,
llvm::Optional<SymbolID> ElidedParent) {
llvm::ArrayRef<const ForestNode *> Children;
auto EndOfElement = [&](size_t ChildIndex) {
return ChildIndex + 1 == Children.size()
? End
: Children[ChildIndex + 1]->startTokenIndex();
};
if (P->kind() == Ambiguous) {
Children = P->alternatives();
} else if (P->kind() == Sequence) {
Children = P->elements();
if (Abbreviated) {
if (P->startTokenIndex() == End)
return;
for (size_t I = 0; I < Children.size(); ++I)
if (Children[I]->startTokenIndex() == P->startTokenIndex() &&
EndOfElement(I) == End) {
return Dump(
Children[I], Level, End,
/*ElidedParent=*/ElidedParent.getValueOr(P->symbol()));
}
}
}

// FIXME: pretty ascii trees
if (End == KEnd)
Result += llvm::formatv("[{0,3}, end) ", P->startTokenIndex());
else
Result += llvm::formatv("[{0,3}, {1,3}) ", P->startTokenIndex(), End);
Result.append(2 * Level, ' ');
if (ElidedParent.hasValue()) {
Result += G.symbolName(*ElidedParent);
Result += "~";
}
Result.append(P->dump(G));

if (VisitCounts.find(P)->getSecond() > 1 &&
P->kind() != ForestNode::Terminal) {
// The first time, print as #1. Later, =#1.
auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1);
Result +=
llvm::formatv(" {0}#{1}", It.second ? "" : "=", It.first->second);
}
Result.push_back('\n');

++Level;
for (size_t I = 0; I < Children.size(); ++I)
Dump(Children[I], Level,
P->kind() == Sequence ? EndOfElement(I) : End, llvm::None);
};
Dump(this, 0, KEnd, llvm::None);
return Result;
}

llvm::ArrayRef<ForestNode>
ForestArena::createTerminals(const TokenStream &Code) {
ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size());
size_t Index = 0;
for (const auto &T : Code.tokens()) {
new (&Terminals[Index])
ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
/*Start=*/Index, /*TerminalData*/ 0);
++Index;
}
NodeCount = Index;
return llvm::makeArrayRef(Terminals, Index);
}

} // namespace pseudo
} // namespace clang
1 change: 1 addition & 0 deletions clang-tools-extra/pseudo/unittests/CMakeLists.txt
Expand Up @@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
add_custom_target(ClangPseudoUnitTests)
add_unittest(ClangPseudoUnitTests ClangPseudoTests
DirectiveMapTest.cpp
ForestTest.cpp
GrammarTest.cpp
LRTableTest.cpp
TokenTest.cpp
Expand Down

0 comments on commit 62d5f25

Please sign in to comment.