Skip to content

Commit

Permalink
[pseudo] (trivial) bracket-matching
Browse files Browse the repository at this point in the history
Error-tolerant bracket matching enables our error-tolerant parsing strategies.
The implementation here is *not* yet error tolerant: this patch sets up the APIs
and plumbing, and describes the planned approach.

Differential Revision: https://reviews.llvm.org/D125911
  • Loading branch information
sam-mccall committed May 24, 2022
1 parent f371019 commit 0360b9f
Show file tree
Hide file tree
Showing 8 changed files with 342 additions and 2 deletions.
15 changes: 14 additions & 1 deletion clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
//===----------------------------------------------------------------------===//

#include "benchmark/benchmark.h"
#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/DirectiveTree.h"
#include "clang-pseudo/Forest.h"
#include "clang-pseudo/GLR.h"
Expand Down Expand Up @@ -89,7 +90,9 @@ TokenStream lexAndPreprocess() {
chooseConditionalBranches(DirectiveStructure, RawStream);
TokenStream Cook =
cook(DirectiveStructure.stripDirectives(RawStream), LangOpts);
return stripComments(Cook);
auto Stream = stripComments(Cook);
pairBrackets(Stream);
return Stream;
}

static void lex(benchmark::State &State) {
Expand All @@ -101,6 +104,16 @@ static void lex(benchmark::State &State) {
}
BENCHMARK(lex);

// Benchmarks bracket pairing on the token stream produced by lexing the
// source text. Throughput is reported in bytes of source per iteration.
static void pairBrackets(benchmark::State &State) {
  clang::LangOptions LangOpts = genericLangOpts();
  TokenStream Stream = clang::pseudo::lex(*SourceText, LangOpts);
  // Qualified: this benchmark function shares its name with the library entry
  // point it is measuring.
  for (auto _ : State)
    clang::pseudo::pairBrackets(Stream);
  const uint64_t Iterations = static_cast<uint64_t>(State.iterations());
  State.SetBytesProcessed(Iterations * SourceText->size());
}
BENCHMARK(pairBrackets);

static void preprocess(benchmark::State &State) {
clang::LangOptions LangOpts = genericLangOpts();
TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts);
Expand Down
41 changes: 41 additions & 0 deletions clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//===--- Bracket.h - Analyze bracket structure --------------------*-C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Bracket structure (particularly braces) is key to isolating broken regions
// of code and preventing parsing from going "off the rails".
//
// For correct C++ code, brackets are well-nested and identifying pairs and
// therefore blocks is simple. In broken code, brackets are not properly nested.
// We cannot match them all and must choose which pairs to form.
//
// Rather than have the grammar-based parser make these choices, we pair
// brackets up-front based on textual features like indentation.
// This mirrors the way humans read code, and so is likely to produce the
// "correct" interpretation of broken code.
//
// This interpretation then guides the parse: a rule containing a bracket pair
// must match against paired bracket tokens.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_BRACKET_H
#define CLANG_PSEUDO_BRACKET_H

#include "clang-pseudo/Token.h"

namespace clang {
namespace pseudo {

/// Identifies bracket tokens in the stream which should be paired.
/// Sets Token::Pair accordingly.
void pairBrackets(TokenStream &);

} // namespace pseudo
} // namespace clang

#endif
11 changes: 10 additions & 1 deletion clang-tools-extra/pseudo/include/clang-pseudo/Token.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,15 @@ struct Token {
while (T->Kind == tok::comment);
return *T;
}
/// Returns the bracket token matched with this one, or nullptr if this token
/// has no partner. Pair is a relative offset from this token; zero means
/// unpaired.
const Token *pair() const {
  if (Pair == 0)
    return nullptr;
  return this + Pair;
}

/// The type of token as determined by clang's lexer.
clang::tok::TokenKind Kind = clang::tok::unknown;
/// If this token is a paired bracket, the offset of the pair in the stream.
int32_t Pair = 0;
};
static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
static_assert(sizeof(Token) <= sizeof(char *) + 20, "Careful with layout!");
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);

/// A half-open range of tokens within a stream.
Expand Down Expand Up @@ -155,6 +159,11 @@ class TokenStream {
return tokens().slice(R.Begin, R.End - R.Begin);
}

/// Returns a mutable view of the tokens in this stream, allowing callers to
/// update tokens in place. Asserts that the stream has been finalized.
MutableArrayRef<Token> tokens() {
assert(isFinalized());
return Tokens;
}

/// May return the end sentinel if the stream is empty.
const Token &front() const {
assert(isFinalized());
Expand Down
155 changes: 155 additions & 0 deletions clang-tools-extra/pseudo/lib/Bracket.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
//===--- Bracket.cpp - Analyze bracket structure --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The basic phases of our bracket matching are:
//
// 1) A simple "greedy" match looks for well-nested subsequences.
//
// We can't fully trust the results of this, consider:
// while (1) { // A
// if (true) { // B
// break;
// } // C
// Greedy matching will match B=C, when we should at least consider A=C.
// However for the correct parts of the file, the greedy match gives the
// right answer. It produces useful candidates for phase 2.
//
// simplePairBrackets handles this step.
//
// 2) Try to identify places where formatting indicates that the greedy match
// was correct. This is similar to how a human would scan a large file.
//
// For example:
// int foo() { // X
// // indented
// while (1) {
// // valid code
// }
// return bar(42);
// } // Y
// We can "verify" that X..Y looks like a braced block, and the greedy match
// tells us that substring is perfectly nested.
// We trust the pairings of those brackets and don't examine them further.
// However in the first example above, we do not trust B=C because the brace
// indentation is suspect.
//
// FIXME: implement this step.
//
// 3) Run full best-match optimization on remaining brackets.
//
// Conceptually, this considers all possible matchings and optimizes cost:
// - there is a cost for failing to match a bracket
// - there is a variable cost for matching two brackets.
// (For example if brace indentation doesn't match).
//
// In the first example we have three alternatives, and they are ranked:
// 1) A=C, skip B
// 2) B=C, skip A
// 3) skip A, skip B, skip C
// The cost for skipping a bracket is high, so option 3 is worst.
// B=C costs more than A=C, because the indentation doesn't match.
//
// It would be correct to run this step alone, but it would be too slow.
// The implementation is dynamic programming in N^3 space and N^2 time.
// Having earlier steps filter out most brackets is key to performance.
//
// FIXME: implement this step.
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Bracket.h"

namespace clang {
namespace pseudo {
namespace {

// A bracket token extracted from the stream, together with the textual
// features used when deciding how to pair it.
struct Bracket {
  // Position of a bracket within the vector of brackets (not the token stream).
  using Index = unsigned;
  // Sentinel Index meaning "no paired bracket".
  static constexpr Index None = static_cast<Index>(-1);

  enum BracketKind : char { Paren, Brace, Square };
  enum Direction : bool { Open, Close };

  // Which family of bracket this is: (), {} or [].
  BracketKind Kind;
  // Whether this is the opening or the closing bracket.
  Direction Dir;
  // Line and indentation copied from the underlying token.
  unsigned Line;
  unsigned Indent;
  // Index of the underlying token within the token stream.
  Token::Index Tok;
  // Index of the bracket this one is paired with, or None if unpaired.
  Bracket::Index Pair = None;
};

// Find brackets in the stream and convert to Bracket struct.
std::vector<Bracket> findBrackets(const TokenStream &Stream) {
std::vector<Bracket> Brackets;
auto Add = [&](const pseudo::Token &Tok, Bracket::BracketKind K,
Bracket::Direction D) {
Brackets.push_back(
{K, D, Tok.Line, Tok.Indent, Stream.index(Tok), Bracket::None});
};
for (const auto &Tok : Stream.tokens()) {
switch (Tok.Kind) {
case clang::tok::l_paren:
Add(Tok, Bracket::Paren, Bracket::Open);
break;
case clang::tok::r_paren:
Add(Tok, Bracket::Paren, Bracket::Close);
break;
case clang::tok::l_brace:
Add(Tok, Bracket::Brace, Bracket::Open);
break;
case clang::tok::r_brace:
Add(Tok, Bracket::Brace, Bracket::Close);
break;
case clang::tok::l_square:
Add(Tok, Bracket::Square, Bracket::Open);
break;
case clang::tok::r_square:
Add(Tok, Bracket::Square, Bracket::Close);
break;
default:
break;
}
}
return Brackets;
}

// Copies the pairing decisions recorded in Brackets onto the tokens themselves.
// Each bracket token's Pair becomes the relative offset (in tokens) to its
// partner, or 0 when the bracket was left unpaired.
void applyPairings(ArrayRef<Bracket> Brackets, TokenStream &Tokens) {
  for (const Bracket &Current : Brackets) {
    int32_t Offset = 0;
    if (Current.Pair != Bracket::None)
      Offset = static_cast<int32_t>(Brackets[Current.Pair].Tok) - Current.Tok;
    Tokens.tokens()[Current.Tok].Pair = Offset;
  }
}

// Greedily pairs brackets that form perfectly nested sequences.
// Whitespace (line/indent) is ignored: an opener and a closer are paired only
// when their kinds match and everything between them is itself fully paired,
// with no skipped or crossing brackets.
void simplePairBrackets(MutableArrayRef<Bracket> Brackets) {
  // Indices (into Brackets) of openers still waiting for a closer.
  std::vector<unsigned> Pending;
  for (unsigned Idx = 0; Idx < Brackets.size(); ++Idx) {
    Bracket &Current = Brackets[Idx];
    if (Current.Dir == Bracket::Open) {
      Pending.push_back(Idx);
      continue;
    }
    // A closer that cannot pair with the innermost opener breaks the nesting:
    // none of the pending openers belong to a perfect sequence any more.
    if (Pending.empty() || Brackets[Pending.back()].Kind != Current.Kind) {
      Pending.clear();
      continue;
    }
    const unsigned Opener = Pending.back();
    Pending.pop_back();
    Brackets[Opener].Pair = Idx;
    Current.Pair = Opener;
  }
  // Openers still pending at the end remain unpaired.
}

} // namespace

// Entry point: collects the bracket tokens in Stream, pairs them up with the
// greedy well-nested matcher, and records the result on the tokens.
void pairBrackets(TokenStream &Stream) {
  std::vector<Bracket> Found = findBrackets(Stream);
  simplePairBrackets(Found);
  applyPairings(Found, Stream);
}

} // namespace pseudo
} // namespace clang
1 change: 1 addition & 0 deletions clang-tools-extra/pseudo/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
set(LLVM_LINK_COMPONENTS Support)

add_clang_library(clangPseudo
Bracket.cpp
DirectiveTree.cpp
Forest.cpp
GLR.cpp
Expand Down
2 changes: 2 additions & 0 deletions clang-tools-extra/pseudo/tool/ClangPseudo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/DirectiveTree.h"
#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Grammar.h"
Expand Down Expand Up @@ -89,6 +90,7 @@ int main(int argc, char *argv[]) {
llvm::outs() << DirectiveStructure;

ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
pairBrackets(*ParseableStream);
}

if (Grammar.getNumOccurrences()) {
Expand Down

0 comments on commit 0360b9f

Please sign in to comment.