Skip to content

Commit

Permalink
[pseudo] Placeholder disambiguation strategy: always choose second
Browse files Browse the repository at this point in the history
Mostly mechanics here. Interesting decisions:
 - apply disambiguation in-place instead of copying the forest
   debatable, but even the final tree size is significant
 - split decide/apply into different functions - this allows the hard part
   (decide) to be tested non-destructively and combined with HTML forest easily
 - add non-const accessors to forest to enable apply
 - unit tests but no lit tests: my plan is to test actual C++ disambiguation
   heuristics with lit, generic disambiguation mechanics without the C++ grammar

Differential Revision: https://reviews.llvm.org/D132487
  • Loading branch information
sam-mccall committed Aug 26, 2022
1 parent 4a56470 commit 56c54cf
Show file tree
Hide file tree
Showing 10 changed files with 269 additions and 11 deletions.
64 changes: 64 additions & 0 deletions clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
@@ -0,0 +1,64 @@
//===--- Disambiguate.h - Find the best tree in the forest -------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A GLR parse forest represents every possible parse tree for the source code.
//
// Before we can do useful analysis/editing of the code, we need to pick a
// single tree which we think is accurate. We use three main types of clues:
//
// A) Semantic language rules may restrict which parses are allowed.
// For example, `string string string X` is *grammatical* C++, but only a
// single type-name is allowed in a decl-specifier-sequence.
// Where possible, these interpretations are forbidden by guards.
// Sometimes this isn't possible, or we want our parser to be lenient.
//
// B) Some constructs are rarer, while others are common.
// For example `a<b>::c` is often a template specialization, and rarely a
// double comparison between a, b, and c.
//
// C) Identifier text hints whether they name types/values/templates etc.
// "std" is usually a namespace, a project index may also guide us.
// Hints may be within the document: if one occurrence of 'foo' is a variable
// then the others probably are too.
// (Text need not match: similar CaseStyle can be a weak hint, too).
//
//----------------------------------------------------------------------------//
//
// Mechanically, we replace each ambiguous node with its best alternative.
//
// "Best" is determined by assigning bonuses/penalties to nodes, to express
// the clues of type A and B above. A forest node representing an unlikely
// parse would apply a penalty to every subtree is is present in.
// Disambiguation proceeds bottom-up, so that the score of each alternative
// is known when a decision is made.
//
// Identifier-based hints within the document mean some nodes should be
// *correlated*. Rather than resolve these simultaneously, we make the most
// certain decisions first and use these results to update bonuses elsewhere.
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Forest.h"

namespace clang::pseudo {

struct DisambiguateParams {};

// Maps ambiguous nodes onto the index of their preferred alternative.
using Disambiguation = llvm::DenseMap<const ForestNode *, unsigned>;

// Resolve each ambiguous node in the forest.
// Maps each ambiguous node to the index of the chosen alternative.
// FIXME: current implementation is a placeholder and chooses arbitrarily.
Disambiguation disambiguate(const ForestNode *Root,
const DisambiguateParams &Params);

// Remove all ambiguities from the forest, resolving them according to Disambig.
void removeAmbiguities(ForestNode *&Root, const Disambiguation &Disambig);

} // namespace clang::pseudo
12 changes: 12 additions & 0 deletions clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
Expand Up @@ -80,13 +80,21 @@ class alignas(class ForestNode *) ForestNode {
assert(kind() == Sequence);
return children(Data >> RuleBits);
}
llvm::MutableArrayRef<ForestNode *> elements() {
assert(kind() == Sequence);
return children(Data >> RuleBits);
}

// Returns all possible interpretations of the code.
// REQUIRES: this is an Ambiguous node.
llvm::ArrayRef<const ForestNode *> alternatives() const {
assert(kind() == Ambiguous);
return children(Data);
}
llvm::MutableArrayRef<ForestNode *> alternatives() {
assert(kind() == Ambiguous);
return children(Data);
}

llvm::ArrayRef<const ForestNode *> children() const {
switch (kind()) {
Expand Down Expand Up @@ -134,6 +142,10 @@ class alignas(class ForestNode *) ForestNode {
return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
Num);
}
llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) {
return llvm::makeMutableArrayRef(reinterpret_cast<ForestNode **>(this + 1),
Num);
}

Token::Index StartIndex;
Kind K : 4;
Expand Down
4 changes: 2 additions & 2 deletions clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
Expand Up @@ -130,8 +130,8 @@ struct ParseParams {
// A rule `_ := StartSymbol` must exit for the chosen start symbol.
//
// If the parsing fails, we model it as an opaque node in the forest.
const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
const Language &Lang);
ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
const Language &Lang);

// Shift a token onto all OldHeads, placing the results into NewHeads.
//
Expand Down
1 change: 1 addition & 0 deletions clang-tools-extra/pseudo/lib/CMakeLists.txt
Expand Up @@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangPseudo
Bracket.cpp
DirectiveTree.cpp
Disambiguate.cpp
Forest.cpp
GLR.cpp
Lex.cpp
Expand Down
48 changes: 48 additions & 0 deletions clang-tools-extra/pseudo/lib/Disambiguate.cpp
@@ -0,0 +1,48 @@
//===--- Disambiguate.cpp - Find the best tree in the forest --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Disambiguate.h"

namespace clang::pseudo {

Disambiguation disambiguate(const ForestNode *Root,
const DisambiguateParams &Params) {
// FIXME: this is a dummy placeholder strategy, implement a real one!
Disambiguation Result;
for (const ForestNode &N : Root->descendants()) {
if (N.kind() == ForestNode::Ambiguous)
Result.try_emplace(&N, 1);
}
return Result;
}

void removeAmbiguities(ForestNode *&Root, const Disambiguation &D) {
std::vector<ForestNode **> Queue = {&Root};
while (!Queue.empty()) {
ForestNode **Next = Queue.back();
Queue.pop_back();
switch ((*Next)->kind()) {
case ForestNode::Sequence:
for (ForestNode *&Child : (*Next)->elements())
Queue.push_back(&Child);
break;
case ForestNode::Ambiguous: {
assert(D.count(*Next) != 0 && "disambiguation is incomplete!");
ForestNode *ChosenChild = (*Next)->alternatives()[D.lookup(*Next)];
*Next = ChosenChild;
Queue.push_back(Next);
break;
}
case ForestNode::Terminal:
case ForestNode::Opaque:
break;
}
}
}

} // namespace clang::pseudo
6 changes: 3 additions & 3 deletions clang-tools-extra/pseudo/lib/GLR.cpp
Expand Up @@ -598,8 +598,8 @@ class GLRReduce {

} // namespace

const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
const Language &Lang) {
ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
const Language &Lang) {
GLRReduce Reduce(Params, Lang);
assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
Expand Down Expand Up @@ -686,7 +686,7 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
return Result;
};
if (auto *Result = SearchForAccept(Heads))
return *Result;
return *const_cast<ForestNode *>(Result); // Safe: we created all nodes.
// We failed to parse the input, returning an opaque forest node for recovery.
// FIXME: as above, we can add fallback error handling so this is impossible.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
Expand Down
23 changes: 20 additions & 3 deletions clang-tools-extra/pseudo/tool/ClangPseudo.cpp
Expand Up @@ -8,6 +8,7 @@

#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/DirectiveTree.h"
#include "clang-pseudo/Disambiguate.h"
#include "clang-pseudo/Forest.h"
#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Language.h"
Expand Down Expand Up @@ -45,6 +46,8 @@ static opt<bool>
static opt<bool>
StripDirectives("strip-directives",
desc("Strip directives and select conditional sections"));
static opt<bool> Disambiguate("disambiguate",
desc("Choose best tree from parse forest"));
static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
Expand All @@ -70,7 +73,8 @@ namespace clang {
namespace pseudo {
// Defined in HTMLForest.cpp
void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &,
const ForestNode &Root, const TokenStream &);
const ForestNode &Root, const Disambiguation &,
const TokenStream &);
namespace {

struct NodeStats {
Expand Down Expand Up @@ -156,8 +160,12 @@ int main(int argc, char *argv[]) {
auto &Root =
glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
*StartSymID, Lang);
if (PrintForest)
// If we're disambiguating, we'll print at the end instead.
if (PrintForest && !Disambiguate)
llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
clang::pseudo::Disambiguation Disambig;
if (Disambiguate)
Disambig = clang::pseudo::disambiguate(&Root, {});

if (HTMLForest.getNumOccurrences()) {
std::error_code EC;
Expand All @@ -167,7 +175,8 @@ int main(int argc, char *argv[]) {
<< "\n";
return 2;
}
clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, *ParseableStream);
clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig,
*ParseableStream);
}

if (PrintStatistics) {
Expand Down Expand Up @@ -219,6 +228,14 @@ int main(int argc, char *argv[]) {
llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
100.0 * UnparsedTokens / Len);
}

if (Disambiguate && PrintForest) {
ForestNode *DisambigRoot = &Root;
removeAmbiguities(DisambigRoot, Disambig);
llvm::outs() << "Disambiguated tree:\n";
llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
/*Abbreviated=*/ForestAbbrev);
}
}

return 0;
Expand Down
10 changes: 7 additions & 3 deletions clang-tools-extra/pseudo/tool/HTMLForest.cpp
Expand Up @@ -43,6 +43,7 @@
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Disambiguate.h"
#include "clang-pseudo/Forest.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/StringExtras.h"
Expand All @@ -60,6 +61,7 @@ struct Writer {
const Grammar &G;
const ForestNode &Root;
const TokenStream &Stream;
const Disambiguation &Disambig;

void write() {
Out << "<!doctype html>\n";
Expand Down Expand Up @@ -150,7 +152,8 @@ void Writer::writeForestJSON() {
break;
case ForestNode::Ambiguous:
Out.attribute("kind", "ambiguous");
Out.attribute("selected", AssignID(N->children().front(), End));
Out.attribute("selected",
AssignID(N->children()[Disambig.lookup(N)], End));
break;
case ForestNode::Opaque:
Out.attribute("kind", "opaque");
Expand Down Expand Up @@ -180,8 +183,9 @@ void Writer::writeForestJSON() {
// We only accept the derived stream here.
// FIXME: allow the original stream instead?
void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &G,
const ForestNode &Root, const TokenStream &Stream) {
Writer{OS, G, Root, Stream}.write();
const ForestNode &Root, const Disambiguation &Disambig,
const TokenStream &Stream) {
Writer{OS, G, Root, Stream, Disambig}.write();
}

} // namespace pseudo
Expand Down
1 change: 1 addition & 0 deletions clang-tools-extra/pseudo/unittests/CMakeLists.txt
Expand Up @@ -7,6 +7,7 @@ add_unittest(ClangPseudoUnitTests ClangPseudoTests
BracketTest.cpp
CXXTest.cpp
DirectiveTreeTest.cpp
DisambiguateTest.cpp
ForestTest.cpp
GLRTest.cpp
GrammarTest.cpp
Expand Down

0 comments on commit 56c54cf

Please sign in to comment.