213 changes: 195 additions & 18 deletions clang-tools-extra/pseudo/lib/GLR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,166 @@

namespace clang {
namespace pseudo {
namespace {

// Computes where a recovery region should end, according to Strategy.
// For the Braces strategy: RecoveryNode's payload is the `{` terminal, and the
// endpoint is the index of its matching `}` (if bracket pairing found one).
// Returns llvm::None when the strategy cannot determine an endpoint.
llvm::Optional<unsigned>
findRecoveryEndpoint(RecoveryStrategy Strategy,
                     const GSS::Node *RecoveryNode,
                     const TokenStream &Tokens) {
  assert(Strategy == RecoveryStrategy::Braces);
  const ForestNode *Left = RecoveryNode->Payload;
  assert(Left->kind() == ForestNode::Terminal &&
         Left->symbol() == tokenSymbol(tok::l_brace));
  const Token &LBraceTok = Tokens.tokens()[Left->startTokenIndex()];
  if (const Token *Match = LBraceTok.pair())
    return Tokens.index(*Match);
  return llvm::None;
}

} // namespace

// Error recovery: when all heads died on the current token, try to skip a
// token range and interpret it as an expected nonterminal (an Opaque forest
// node), so parsing can continue. On success, NewHeads holds the recovered
// heads and TokenIndex points past the recovered range; on failure, NewHeads
// stays empty and TokenIndex advances by one token.
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
                unsigned &TokenIndex, const TokenStream &Tokens,
                const ParseParams &Params,
                std::vector<const GSS::Node *> &NewHeads) {
  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
  // Describes a possibility to recover by forcibly interpreting a range of
  // tokens around the cursor as a nonterminal that we expected to see.
  struct PlaceholderRecovery {
    // The token prior to the nonterminal which is being recovered.
    // This is the start of the region we're skipping, so a higher Position is
    // better (less code discarded).
    Token::Index Position;
    // The nonterminal which will be created in order to recover.
    SymbolID Symbol;
    // The heuristic used to choose the bounds of the nonterminal to recover.
    RecoveryStrategy Strategy;

    // The GSS head where we are expecting the recovered nonterminal.
    const GSS::Node *RecoveryNode;
    // Payload of nodes on the way back from the OldHead to the recovery node.
    // These represent the partial parse that is being discarded.
    // They should become the children of the opaque recovery node.
    //
    // There may be multiple paths leading to the same recovery node, we choose
    // one arbitrarily.
    std::vector<const ForestNode *> Path;
  };
  std::vector<PlaceholderRecovery> Options;

  // Find recovery options by walking up the stack.
  //
  // This is similar to exception handling: we walk up the "frames" of nested
  // rules being parsed until we find one that has a "handler" which allows us
  // to determine the node bounds without parsing it.
  //
  // Unfortunately there's a significant difference: the stack contains both
  // "upward" nodes (ancestor parses) and "leftward" ones.
  // e.g. when parsing `int(2 + ?)`, the stack contains:
  //   expr := expr + . expr      - which we're currently parsing
  //   expr := type ( . expr )    - (up) we should recover this outer expr
  //   expr := . type ( expr )    - (up+left) we should not recover this type!
  //
  // It's not obvious how to avoid collecting the latter as a recovery option.
  // I think the distinction is ill-defined after merging items into states.
  // For now, we have to take this into account when defining recovery rules.
  // FIXME: find a more satisfying way to avoid such false recovery.
  std::vector<const ForestNode *> Path;
  llvm::DenseSet<const GSS::Node *> Seen;
  auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) {
    // A node may be reachable along several paths; visit it only once.
    if (!Seen.insert(N).second)
      return;
    for (auto Strategy : Params.Table.getRecovery(N->State)) {
      Options.push_back(PlaceholderRecovery{
          NextTok,
          Strategy.Result,
          Strategy.Strategy,
          N,
          Path,
      });
      LLVM_DEBUG(llvm::dbgs()
                 << "Option: recover " << Params.G.symbolName(Strategy.Result)
                 << " at token " << NextTok << "\n");
    }
    Path.push_back(N->Payload);
    for (const GSS::Node *Parent : N->parents())
      DFS(Parent, N->Payload->startTokenIndex(), DFS);
    Path.pop_back();
  };
  for (auto *N : llvm::reverse(OldHeads))
    DFS(N, TokenIndex, DFS);

  // Now we select the option(s) we will use to recover.
  //
  // We prefer options starting further right, as these discard less code
  // (e.g. we prefer to recover inner scopes rather than outer ones).
  // The options also need to agree on an endpoint, so the parser has a
  // consistent position afterwards.
  //
  // So conceptually we're sorting by the tuple (start, end), though we avoid
  // computing `end` for options that can't be winners.

  // Consider options starting further right first.
  // Don't drop the others yet though, we may still use them if preferred fails.
  llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
    return L.Position > R.Position;
  });

  assert(NewHeads.empty()); // We may repeatedly populate and clear it.
  llvm::Optional<Token::Range> RecoveryRange;
  for (const PlaceholderRecovery &Option : Options) {
    // Options are sorted by descending Position, so once an option starts
    // further left than one we've already accepted, nothing later can be
    // better. Stop: skip computing End for the rest.
    if (RecoveryRange && Option.Position < RecoveryRange->Begin)
      break;

    auto End =
        findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens);
    // Only consider recovery that advances the parse.
    if (!End || *End <= TokenIndex)
      continue;
    if (RecoveryRange) {
      // If this is worse than our previous options, ignore it.
      if (RecoveryRange->End < *End)
        continue;
      // If this is an improvement over our previous options, then drop them.
      if (RecoveryRange->End > *End)
        NewHeads.clear();
    }
    // Create recovery nodes and heads for them in the GSS. These may be
    // discarded if a better recovery is later found, but this path isn't hot.
    RecoveryRange = {Option.Position, *End};
    const ForestNode &Placeholder =
        Params.Forest.createOpaque(Option.Symbol, Option.Position);
    const GSS::Node *NewHead = Params.GSStack.addNode(
        Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol),
        &Placeholder, {Option.RecoveryNode});
    NewHeads.push_back(NewHead);
  }

  // Advance the cursor, whether recovery succeeded or not.
  if (RecoveryRange) {
    LLVM_DEBUG({
      llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
      for (const auto *Head : NewHeads)
        llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol());
      llvm::dbgs() << "\n";
    });
    TokenIndex = RecoveryRange->End;
  } else {
    LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
                            << " strategies\n");
    ++TokenIndex;
  }
}

using StateID = LRTable::StateID;

// Debug-prints a GSS node: its LR state, the symbol of its parsed forest node
// (0 when Payload is null, e.g. for a start-of-parse node), and the states of
// its parents.
// Fixes: the block previously contained two back-to-back formatv statements
// (printing the node twice, the first dereferencing a possibly-null Payload),
// and the surviving one used out-of-range index {3} for its third argument.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
  std::vector<std::string> ParentStates;
  for (const auto *Parent : N.parents())
    ParentStates.push_back(llvm::formatv("{0}", Parent->State));
  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
                      N.Payload ? N.Payload->symbol() : 0,
                      llvm::join(ParentStates, ", "));
  return OS;
}

Expand Down Expand Up @@ -427,15 +578,27 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
GSS.gc(std::move(Roots));
};
// Each iteration fully processes a single token.
for (unsigned I = 0; I < Terminals.size(); ++I) {
for (unsigned I = 0; I < Terminals.size();) {
LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
"Next token {0} (id={1})\n",
G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
// Consume the token.
glrShift(Heads, Terminals[I], Params, NextHeads);

// If we weren't able to consume the token, try to skip over some tokens
// so we can keep parsing.
if (NextHeads.empty()) {
glrRecover(Heads, I, Tokens, Params, NextHeads);
if (NextHeads.empty())
// FIXME: Ensure the `_ := start-symbol` rules have a fallback
// error-recovery strategy attached. Then this condition can't happen.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
} else
++I;

// Form nonterminals containing the token we just consumed.
SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
: Terminals[I + 1].symbol();
SymbolID Lookahead =
I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
Reduce(NextHeads, Lookahead);
// Prepare for the next token.
std::swap(Heads, NextHeads);
Expand All @@ -444,22 +607,35 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
}
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));

// The parse was successful if we're in state `_ := start-symbol .`
StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
const ForestNode *Result = nullptr;
for (const auto *Head : Heads) {
if (Head->State == AcceptState) {
assert(Head->Payload->symbol() == StartSymbol);
assert(Result == nullptr && "multiple results!");
Result = Head->Payload;
auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
const ForestNode *Result = nullptr;
for (const auto *Head : Heads) {
if (Head->State == AcceptState) {
assert(Head->Payload->symbol() == StartSymbol);
assert(Result == nullptr && "multiple results!");
Result = Head->Payload;
}
}
}
if (Result)
return Result;
};
if (auto *Result = SearchForAccept(Heads))
return *Result;
// Failed to parse the input, attempt to run recovery.
// FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
// More elegant is to include EOF in the token stream, and make the
// augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
// would not be a special case: it shows up as a failure to shift the EOF
// token.
unsigned I = Terminals.size();
glrRecover(Heads, I, Tokens, Params, NextHeads);
Reduce(NextHeads, tokenSymbol(tok::eof));
if (auto *Result = SearchForAccept(NextHeads))
return *Result;

// We failed to parse the input, returning an opaque forest node for recovery.
//
// FIXME: We will need to invoke our generic error-recovery handlers when we
// reach EOF without reaching accept state, and involving the EOF token in
// the above main for-loop may be the best way to reuse the code.
// FIXME: as above, we can add fallback error handling so this is impossible.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
}

Expand All @@ -470,9 +646,10 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
}

const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,

llvm::ArrayRef<const Node *> Parents) {
Node *Result = new (allocate(Parents.size()))
Node({State, GCParity, static_cast<unsigned>(Parents.size())});
Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
Alive.push_back(Result);
++NodesCreated;
Result->Payload = Symbol;
Expand Down
8 changes: 4 additions & 4 deletions clang-tools-extra/pseudo/lib/cxx/cxx.bnf
Original file line number Diff line number Diff line change
Expand Up @@ -460,10 +460,10 @@ initializer-clause := braced-init-list
# This is standard C, and accepted by clang and others as an extension.
braced-init-list := { initializer-list ,_opt }
braced-init-list := { }
initializer-list := initializer-list-item ..._opt
initializer-list := initializer-list , initializer-list-item ..._opt
initializer-list-item := initializer-clause
initializer-list-item := designator brace-or-equal-initializer
initializer-list := initializer-list-item
initializer-list := initializer-list , initializer-list-item
initializer-list-item := initializer-clause ..._opt
initializer-list-item := designator brace-or-equal-initializer ..._opt
designator := . IDENTIFIER
#! Array designators are legal in C, and a common extension in C++.
designator := [ expression ]
Expand Down
7 changes: 5 additions & 2 deletions clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,11 @@ std::string Grammar::dumpRule(RuleID RID) const {
llvm::raw_string_ostream OS(Result);
const Rule &R = T->Rules[RID];
OS << symbolName(R.Target) << " :=";
for (SymbolID SID : R.seq())
OS << " " << symbolName(SID);
for (unsigned I = 0; I < R.Size; ++I) {
OS << " " << symbolName(R.Sequence[I]);
if (R.RecoveryIndex == I)
OS << " [recover=" << static_cast<unsigned>(R.Recovery) << "]";
}
if (R.Guard)
OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
return Result;
Expand Down
11 changes: 11 additions & 0 deletions clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,17 @@ class GrammarBuilder {

assert(T->Rules.size() < (1 << RuleBits) &&
"Too many rules to fit in RuleID bits!");
// Wherever RHS contains { foo }, mark foo for brace-recovery.
// FIXME: this should be grammar annotations instead.
for (auto &Rule : T->Rules) {
for (unsigned I = 2; I < Rule.Size; ++I)
if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) &&
Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) &&
!isToken(Rule.Sequence[I - 1])) {
Rule.Recovery = RecoveryStrategy::Braces;
Rule.RecoveryIndex = I - 1;
}
}
const auto &SymbolOrder = getTopologicalOrder(T.get());
llvm::stable_sort(
T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
Expand Down
37 changes: 30 additions & 7 deletions clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,20 @@ nextAvailableKernelItems(const State &S, const Grammar &G) {
return Results;
}

// Collects the recovery actions available in state S: one (strategy, symbol)
// pair per item whose dot sits at the rule's designated recovery point.
// The result is sorted and deduplicated.
std::vector<std::pair<RecoveryStrategy, SymbolID>>
availableRecovery(const State &S, const Grammar &G) {
  std::vector<std::pair<RecoveryStrategy, SymbolID>> Out;
  for (const Item &It : S.Items) {
    const auto &R = G.lookupRule(It.rule());
    if (It.dot() == R.RecoveryIndex)
      Out.push_back({R.Recovery, R.seq()[R.RecoveryIndex]});
  }
  llvm::sort(Out);
  Out.erase(std::unique(Out.begin(), Out.end()), Out.end());
  return Out;
}

} // namespace

std::string Item::dump(const Grammar &G) const {
Expand All @@ -130,9 +144,10 @@ std::string Item::dump(const Grammar &G) const {
Results.push_back(G.symbolName(SID));
return Results;
};
return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
.str();
}

Expand Down Expand Up @@ -181,6 +196,11 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
Edges.push_back({Src, Dst, Label});
}

void insertRecovery(StateID Src, RecoveryStrategy Strategy,
SymbolID Result) {
Recoveries.push_back({Src, Strategy, Result});
}

// Returns a state with the given id.
const State &find(StateID ID) const {
assert(ID < States.size());
Expand All @@ -194,9 +214,10 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
LRGraph build() && {
States.shrink_to_fit();
Edges.shrink_to_fit();
Recoveries.shrink_to_fit();
llvm::sort(StartStates);
StartStates.shrink_to_fit();
return LRGraph(std::move(States), std::move(Edges),
return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
std::move(StartStates));
}

Expand All @@ -205,6 +226,7 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
std::vector<State> States;
std::vector<Edge> Edges;
std::vector<Recovery> Recoveries;
const Grammar &G;
std::vector<std::pair<SymbolID, StateID>> StartStates;
} Builder(G);
Expand All @@ -225,15 +247,16 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
}

while (!PendingStates.empty()) {
auto CurrentStateID = PendingStates.back();
auto StateID = PendingStates.back();
PendingStates.pop_back();
for (auto Next :
nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
auto Insert = Builder.insert(Next.second);
if (Insert.second) // new state, insert to the pending queue.
PendingStates.push_back(Insert.first);
Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
Builder.insertEdge(StateID, Insert.first, Next.first);
}
for (auto Recovery : availableRecovery(Builder.find(StateID), G))
Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
}
return std::move(Builder).build();
}
Expand Down
28 changes: 27 additions & 1 deletion clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "clang-pseudo/grammar/LRTable.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

namespace llvm {
Expand Down Expand Up @@ -44,6 +45,7 @@ struct LRTable::Builder {
llvm::DenseSet<Entry> Entries;
llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduces;
std::vector<llvm::DenseSet<SymbolID>> FollowSets;
std::vector<LRGraph::Recovery> Recoveries;

LRTable build(unsigned NumStates) && {
// E.g. given the following parsing table with 3 states and 3 terminals:
Expand Down Expand Up @@ -88,6 +90,26 @@ struct LRTable::Builder {
}
Table.StartStates = std::move(StartStates);

// Error recovery entries: sort (no dups already), and build offset lookup.
llvm::sort(Recoveries,
[&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) {
return std::tie(L.Src, L.Result, L.Strategy) <
std::tie(R.Src, R.Result, R.Strategy);
});
Table.Recoveries.reserve(Recoveries.size());
for (const auto &R : Recoveries)
Table.Recoveries.push_back({R.Strategy, R.Result});
Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
SortedIndex = 0;
for (StateID State = 0; State < NumStates; ++State) {
Table.RecoveryOffset[State] = SortedIndex;
while (SortedIndex < Recoveries.size() &&
Recoveries[SortedIndex].Src == State)
SortedIndex++;
}
Table.RecoveryOffset[NumStates] = SortedIndex;
assert(SortedIndex == Recoveries.size());

// Compile the follow sets into a bitmap.
Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size());
for (SymbolID NT = 0; NT < FollowSets.size(); ++NT)
Expand All @@ -114,7 +136,8 @@ struct LRTable::Builder {
};

LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
llvm::ArrayRef<ReduceEntry> Reduces) {
llvm::ArrayRef<ReduceEntry> Reduces,
llvm::ArrayRef<RecoveryEntry> Recoveries) {
StateID MaxState = 0;
for (const auto &Entry : Entries) {
MaxState = std::max(MaxState, Entry.State);
Expand All @@ -128,13 +151,16 @@ LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
for (const ReduceEntry &E : Reduces)
Build.Reduces[E.State].insert(E.Rule);
Build.FollowSets = followSets(G);
for (const auto &R : Recoveries)
Build.Recoveries.push_back({R.State, R.Strategy, R.Result});
return std::move(Build).build(/*NumStates=*/MaxState + 1);
}

LRTable LRTable::buildSLR(const Grammar &G) {
auto Graph = LRGraph::buildLR0(G);
Builder Build;
Build.StartStates = Graph.startStates();
Build.Recoveries = Graph.recoveries();
for (const auto &T : Graph.edges()) {
Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
Build.Entries.insert({T.Src, T.Label, Act});
Expand Down
2 changes: 1 addition & 1 deletion clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
class Foo {
public:
};
// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification }
// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] }
// CHECK-NEXT: ├─class-head := class-key class-head-name
// CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
// CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]
Expand Down
13 changes: 13 additions & 0 deletions clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
auto x = { complete garbage };
// CHECK: translation-unit~simple-declaration
// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
// CHECK-NEXT: ├─init-declarator-list~init-declarator
// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1]
// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
// CHECK-NEXT: │ ├─= := tok[2]
// CHECK-NEXT: │ └─initializer-clause~braced-init-list
// CHECK-NEXT: │ ├─{ := tok[3]
// CHECK-NEXT: │ ├─initializer-list := <opaque>
// CHECK-NEXT: │ └─} := tok[6]
// CHECK-NEXT: └─; := tok[7]
161 changes: 158 additions & 3 deletions clang-tools-extra/pseudo/unittests/GLRTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//

#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/Token.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/LangOptions.h"
Expand All @@ -32,11 +33,13 @@ namespace {
using Action = LRTable::Action;
using testing::AllOf;
using testing::ElementsAre;
using testing::IsEmpty;
using testing::UnorderedElementsAre;

MATCHER_P(state, StateID, "") { return arg->State == StateID; }
MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }

testing::Matcher<const GSS::Node *>
parents(llvm::ArrayRef<const GSS::Node *> Parents) {
Expand Down Expand Up @@ -238,9 +241,9 @@ TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
/*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
const auto *GSSNode2 = GSStack.addNode(
/*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
const auto *GSSNode3 =
GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
/*Parents=*/{GSSNode1});
const auto *GSSNode3 = GSStack.addNode(
/*State=*/3, /*ForestNode=*/ClassNameNode,
/*Parents=*/{GSSNode1});
const auto *GSSNode4 =
GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
/*Parents=*/{GSSNode2});
Expand Down Expand Up @@ -363,6 +366,124 @@ TEST_F(GLRTest, ReduceLookahead) {
EXPECT_THAT(Heads, ElementsAre(GSSNode1));
}

TEST_F(GLRTest, Recover) {
  // Recovery while parsing "word" inside braces.
  // Before:
  //   0--1({)--2(?)
  // After recovering a `word` at state 1:
  //   0--3(word)   // 3 is goto(1, word)
  buildGrammar({"word"}, {});
  LRTable Table = LRTable::buildForTests(
      G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});

  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
  auto *Question1 = &Arena.createTerminal(tok::question, 1);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});

  // Need a token stream with paired braces so the strategy works.
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
  pairBrackets(Tokens);
  std::vector<const GSS::Node *> NewHeads;

  unsigned TokenIndex = 2;
  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
             NewHeads);
  // The recovered `word` spans tokens [1, 4): everything inside the braces.
  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
  EXPECT_THAT(NewHeads, ElementsAre(
                            AllOf(state(3), parsedSymbolID(id("word")),
                                  parents({OpenedBraces}), start(1u))));
  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);

  // Test recovery failure: omit the closing brace so the strategy fails
  // (findRecoveryEndpoint has no matching `}` to use as an endpoint).
  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
  pairBrackets(NoRBrace);
  NewHeads.clear();
  TokenIndex = 2;
  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
             {G, Table, Arena, GSStack}, NewHeads);
  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
  EXPECT_THAT(NewHeads, IsEmpty());
}

TEST_F(GLRTest, RecoverRightmost) {
  // In a nested block structure, we recover at the innermost possible block.
  // Before:
  //   0--1({)--1({)--1({)
  // After recovering a `body` inside the second braces:
  //   0--1({)--2(body)   // 2 is goto(1, body)
  buildGrammar({"body"}, {});
  LRTable Table = LRTable::buildForTests(
      G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{},
      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});

  clang::LangOptions LOptions;
  // Innermost brace is unmatched, to test fallback to next brace.
  // (Bracket pairs are assigned by hand rather than via pairBrackets.)
  TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions);
  Tokens.tokens()[0].Pair = 5;
  Tokens.tokens()[1].Pair = 4;
  Tokens.tokens()[4].Pair = 1;
  Tokens.tokens()[5].Pair = 0;

  auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
  auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
  auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *In1 = GSStack.addNode(1, Brace1, {Root});
  const auto *In2 = GSStack.addNode(1, Brace2, {In1});
  const auto *In3 = GSStack.addNode(1, Brace3, {In2});

  unsigned TokenIndex = 3;
  std::vector<const GSS::Node *> NewHeads;
  glrRecover({In3}, TokenIndex, Tokens, {G, Table, Arena, GSStack}, NewHeads);
  // The unmatched innermost brace has no endpoint, so recovery falls back to
  // the second brace's region and stops at its pair.
  EXPECT_EQ(TokenIndex, 5u);
  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
                                          parents({In2}), start(2u))));
}

TEST_F(GLRTest, RecoverAlternatives) {
  // Recovery inside braces with multiple equally good options.
  // Before:
  //   0--1({)
  // After recovering either `word` or `number` inside the braces:
  //   0--1({)--2(word)     // 2 is goto(1, word)
  //        └--3(number)    // 3 is goto(1, number)
  buildGrammar({"number", "word"}, {});
  LRTable Table = LRTable::buildForTests(
      G,
      {
          {/*State=*/1, id("number"), Action::goTo(2)},
          {/*State=*/1, id("word"), Action::goTo(3)},
      },
      /*Reduce=*/{},
      /*Recovery=*/
      {
          {/*State=*/1, RecoveryStrategy::Braces, id("number")},
          {/*State=*/1, RecoveryStrategy::Braces, id("word")},
      });
  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});

  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
  pairBrackets(Tokens);
  std::vector<const GSS::Node *> NewHeads;
  unsigned TokenIndex = 1;

  glrRecover({OpenedBraces}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
             NewHeads);
  // Both options share the same (start, end) range, so both heads survive.
  EXPECT_EQ(TokenIndex, 2u);
  EXPECT_THAT(NewHeads,
              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
                                         parents({OpenedBraces}), start(1u)),
                                   AllOf(state(3), parsedSymbolID(id("word")),
                                         parents({OpenedBraces}), start(1u))));
}

TEST_F(GLRTest, PerfectForestNodeSharing) {
// Run the GLR on a simple grammar and test that we build exactly one forest
// node per (SymbolID, token range).
Expand Down Expand Up @@ -431,6 +552,40 @@ TEST_F(GLRTest, GLRReduceOrder) {
"[ 0, end) └─IDENTIFIER := tok[0]\n");
}

TEST_F(GLRTest, RecoveryEndToEnd) {
  // Simple example of brace-based recovery showing:
  //  - recovered region includes tokens both ahead of and behind the cursor
  //  - multiple possible recovery rules
  //  - recovery from outer scopes is rejected
  build(R"bnf(
    _ := block

    block := { block }
    block := { numbers }
    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
  )bnf");
  auto LRTable = LRTable::buildSLR(G);
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
  pairBrackets(Tokens);

  const ForestNode &Parsed =
      glrParse(Tokens, {G, LRTable, Arena, GSStack}, id("block"));
  // The inner `{ 42 ? }` is recovered as either a block or a numbers list
  // (ambiguous); the outer braces parse normally.
  EXPECT_EQ(Parsed.dumpRecursive(G),
            "[ 0, end) block := { block [recover=1] }\n"
            "[ 0, 1) ├─{ := tok[0]\n"
            "[ 1, 5) ├─block := <ambiguous>\n"
            "[ 1, 5) │ ├─block := { block [recover=1] }\n"
            "[ 1, 2) │ │ ├─{ := tok[1]\n"
            "[ 2, 4) │ │ ├─block := <opaque>\n"
            "[ 4, 5) │ │ └─} := tok[4]\n"
            "[ 1, 5) │ └─block := { numbers [recover=1] }\n"
            "[ 1, 2) │ ├─{ := tok[1]\n"
            "[ 2, 4) │ ├─numbers := <opaque>\n"
            "[ 4, 5) │ └─} := tok[4]\n"
            "[ 5, end) └─} := tok[5]\n");
}

TEST_F(GLRTest, NoExplicitAccept) {
build(R"bnf(
_ := test
Expand Down