Skip to content

Commit

Permalink
[analyzer][HTMLRewriter] Cache partial rewrite results. (#80220)
Browse files Browse the repository at this point in the history
This is a follow-up for 721dd3b [analyzer] NFC: Don't regenerate
duplicate HTML reports.

Because HTMLRewriter re-runs the Lexer for syntax highlighting and macro
expansion purposes, it may get fairly expensive when the rewriter is
invoked multiple times on the same file. In the static analyzer (which
uses HTMLRewriter for HTML output mode) we only get away with this
because there are usually very few reports emitted per file. But if loud
checkers are enabled, such as `webkit.*`, this may explode in complexity
and even cause the compiler to run over the 32-bit SourceLocation
addressing limit.

This patch caches intermediate results so that re-lexing only needed to
happen once.

As the clever __COUNTER__ test demonstrates, "once" is still too many.
Ideally we shouldn't re-lex anything at all, which remains a TODO.
  • Loading branch information
haoNoQ committed Feb 1, 2024
1 parent fe408eb commit 243bfed
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 27 deletions.
15 changes: 13 additions & 2 deletions clang/include/clang/Rewrite/Core/HTMLRewrite.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ class RewriteBuffer;
class Preprocessor;

namespace html {
struct RelexRewriteCache;
using RelexRewriteCacheRef = std::shared_ptr<RelexRewriteCache>;

/// If you need to rewrite the same file multiple times, you can instantiate
/// a RelexRewriteCache and refer functions such as SyntaxHighlight()
/// and HighlightMacros() to it so that to avoid re-lexing the file each time.
/// The cache may outlive the rewriter as long as cached FileIDs and source
/// locations continue to make sense for the translation unit as a whole.
RelexRewriteCacheRef instantiateRelexRewriteCache();

/// HighlightRange - Highlight a range in the source code with the specified
/// start/end tags. B/E must be in the same file. This ensures that
Expand Down Expand Up @@ -67,13 +76,15 @@ namespace html {

/// SyntaxHighlight - Relex the specified FileID and annotate the HTML with
/// information about keywords, comments, etc.
void SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP);
void SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP,
RelexRewriteCacheRef Cache = nullptr);

/// HighlightMacros - This uses the macro table state from the end of the
/// file, to reexpand macros and insert (into the HTML) information about the
/// macro expansions. This won't be perfectly perfect, but it will be
/// reasonably close.
void HighlightMacros(Rewriter &R, FileID FID, const Preprocessor &PP);
void HighlightMacros(Rewriter &R, FileID FID, const Preprocessor &PP,
RelexRewriteCacheRef Cache = nullptr);

} // end html namespace
} // end clang namespace
Expand Down
135 changes: 116 additions & 19 deletions clang/lib/Rewrite/HTMLRewrite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
using namespace clang;

using namespace clang;
using namespace llvm;
using namespace html;

/// HighlightRange - Highlight a range in the source code with the specified
/// start/end tags. B/E must be in the same file. This ensures that
Expand Down Expand Up @@ -104,6 +106,32 @@ void html::HighlightRange(RewriteBuffer &RB, unsigned B, unsigned E,
}
}

namespace clang::html {
struct RelexRewriteCache {
// These structs mimic input arguments of HighlightRange().
struct Highlight {
SourceLocation B, E;
std::string StartTag, EndTag;
bool IsTokenRange;
};
struct RawHighlight {
unsigned B, E;
std::string StartTag, EndTag;
};

// SmallVector isn't appropriate because these vectors are almost never small.
using HighlightList = std::vector<Highlight>;
using RawHighlightList = std::vector<RawHighlight>;

DenseMap<FileID, RawHighlightList> SyntaxHighlights;
DenseMap<FileID, HighlightList> MacroHighlights;
};
} // namespace clang::html

html::RelexRewriteCacheRef html::instantiateRelexRewriteCache() {
return std::make_shared<RelexRewriteCache>();
}

void html::EscapeText(Rewriter &R, FileID FID,
bool EscapeSpaces, bool ReplaceTabs) {

Expand Down Expand Up @@ -442,13 +470,18 @@ input.spoilerhider:checked + label + .spoiler{
/// information about keywords, macro expansions etc. This uses the macro
/// table state from the end of the file, so it won't be perfectly perfect,
/// but it will be reasonably close.
void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
RewriteBuffer &RB = R.getEditBuffer(FID);
static void SyntaxHighlightImpl(
Rewriter &R, FileID FID, const Preprocessor &PP,
llvm::function_ref<void(RewriteBuffer &, unsigned, unsigned, const char *,
const char *, const char *)>
HighlightRangeCallback) {

RewriteBuffer &RB = R.getEditBuffer(FID);
const SourceManager &SM = PP.getSourceManager();
llvm::MemoryBufferRef FromFile = SM.getBufferOrFake(FID);
const char *BufferStart = FromFile.getBuffer().data();

Lexer L(FID, FromFile, SM, PP.getLangOpts());
const char *BufferStart = L.getBuffer().data();

// Inform the preprocessor that we want to retain comments as tokens, so we
// can highlight them.
Expand All @@ -475,13 +508,13 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {

// If this is a pp-identifier, for a keyword, highlight it as such.
if (Tok.isNot(tok::identifier))
HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
"<span class='keyword'>", "</span>");
HighlightRangeCallback(RB, TokOffs, TokOffs + TokLen, BufferStart,
"<span class='keyword'>", "</span>");
break;
}
case tok::comment:
HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
"<span class='comment'>", "</span>");
HighlightRangeCallback(RB, TokOffs, TokOffs + TokLen, BufferStart,
"<span class='comment'>", "</span>");
break;
case tok::utf8_string_literal:
// Chop off the u part of u8 prefix
Expand All @@ -498,8 +531,8 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
[[fallthrough]];
case tok::string_literal:
// FIXME: Exclude the optional ud-suffix from the highlighted range.
HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
"<span class='string_literal'>", "</span>");
HighlightRangeCallback(RB, TokOffs, TokOffs + TokLen, BufferStart,
"<span class='string_literal'>", "</span>");
break;
case tok::hash: {
// If this is a preprocessor directive, all tokens to end of line are too.
Expand All @@ -516,8 +549,8 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
}

// Find end of line. This is a hack.
HighlightRange(RB, TokOffs, TokEnd, BufferStart,
"<span class='directive'>", "</span>");
HighlightRangeCallback(RB, TokOffs, TokEnd, BufferStart,
"<span class='directive'>", "</span>");

// Don't skip the next token.
continue;
Expand All @@ -527,12 +560,43 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
L.LexFromRawLexer(Tok);
}
}
void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP,
RelexRewriteCacheRef Cache) {
RewriteBuffer &RB = R.getEditBuffer(FID);
const SourceManager &SM = PP.getSourceManager();
llvm::MemoryBufferRef FromFile = SM.getBufferOrFake(FID);
const char *BufferStart = FromFile.getBuffer().data();

if (Cache) {
auto CacheIt = Cache->SyntaxHighlights.find(FID);
if (CacheIt != Cache->SyntaxHighlights.end()) {
for (const RelexRewriteCache::RawHighlight &H : CacheIt->second) {
HighlightRange(RB, H.B, H.E, BufferStart, H.StartTag.data(),
H.EndTag.data());
}
return;
}
}

// "Every time you would call HighlightRange, cache the inputs as well."
auto HighlightRangeCallback = [&](RewriteBuffer &RB, unsigned B, unsigned E,
const char *BufferStart,
const char *StartTag, const char *EndTag) {
HighlightRange(RB, B, E, BufferStart, StartTag, EndTag);

if (Cache)
Cache->SyntaxHighlights[FID].push_back({B, E, StartTag, EndTag});
};

SyntaxHighlightImpl(R, FID, PP, HighlightRangeCallback);
}

static void HighlightMacrosImpl(
Rewriter &R, FileID FID, const Preprocessor &PP,
llvm::function_ref<void(Rewriter &, SourceLocation, SourceLocation,
const char *, const char *, bool)>
HighlightRangeCallback) {

/// HighlightMacros - This uses the macro table state from the end of the
/// file, to re-expand macros and insert (into the HTML) information about the
/// macro expansions. This won't be perfectly perfect, but it will be
/// reasonably close.
void html::HighlightMacros(Rewriter &R, FileID FID, const Preprocessor& PP) {
// Re-lex the raw token stream into a token buffer.
const SourceManager &SM = PP.getSourceManager();
std::vector<Token> TokenStream;
Expand Down Expand Up @@ -659,11 +723,44 @@ void html::HighlightMacros(Rewriter &R, FileID FID, const Preprocessor& PP) {
// get highlighted.
Expansion = "<span class='macro_popup'>" + Expansion + "</span></span>";

HighlightRange(R, LLoc.getBegin(), LLoc.getEnd(), "<span class='macro'>",
Expansion.c_str(), LLoc.isTokenRange());
HighlightRangeCallback(R, LLoc.getBegin(), LLoc.getEnd(),
"<span class='macro'>", Expansion.c_str(),
LLoc.isTokenRange());
}

// Restore the preprocessor's old state.
TmpPP.setDiagnostics(*OldDiags);
TmpPP.setPragmasEnabled(PragmasPreviouslyEnabled);
}

/// HighlightMacros - This uses the macro table state from the end of the
/// file, to re-expand macros and insert (into the HTML) information about the
/// macro expansions. This won't be perfectly perfect, but it will be
/// reasonably close.
void html::HighlightMacros(Rewriter &R, FileID FID, const Preprocessor &PP,
RelexRewriteCacheRef Cache) {
if (Cache) {
auto CacheIt = Cache->MacroHighlights.find(FID);
if (CacheIt != Cache->MacroHighlights.end()) {
for (const RelexRewriteCache::Highlight &H : CacheIt->second) {
HighlightRange(R, H.B, H.E, H.StartTag.data(), H.EndTag.data(),
H.IsTokenRange);
}
return;
}
}

// "Every time you would call HighlightRange, cache the inputs as well."
auto HighlightRangeCallback = [&](Rewriter &R, SourceLocation B,
SourceLocation E, const char *StartTag,
const char *EndTag, bool isTokenRange) {
HighlightRange(R, B, E, StartTag, EndTag, isTokenRange);

if (Cache) {
Cache->MacroHighlights[FID].push_back(
{B, E, StartTag, EndTag, isTokenRange});
}
};

HighlightMacrosImpl(R, FID, PP, HighlightRangeCallback);
}
10 changes: 4 additions & 6 deletions clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ class HTMLDiagnostics : public PathDiagnosticConsumer {
const Preprocessor &PP;
const bool SupportsCrossFileDiagnostics;
llvm::StringSet<> EmittedHashes;
html::RelexRewriteCacheRef RewriterCache =
html::instantiateRelexRewriteCache();

public:
HTMLDiagnostics(PathDiagnosticConsumerOptions DiagOpts,
Expand Down Expand Up @@ -309,10 +311,6 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D,
return;
}

// FIXME: This causes each file to be re-parsed and syntax-highlighted
// and macro-expanded separately for each report. We could cache such rewrites
// across all reports and only re-do the part that's actually different:
// the warning/note bubbles.
std::string report = GenerateHTML(D, R, SMgr, path, declName.c_str());
if (report.empty()) {
llvm::errs() << "warning: no diagnostics generated for main file.\n";
Expand Down Expand Up @@ -882,8 +880,8 @@ void HTMLDiagnostics::RewriteFile(Rewriter &R, const PathPieces &path,
// If we have a preprocessor, relex the file and syntax highlight.
// We might not have a preprocessor if we come from a deserialized AST file,
// for example.
html::SyntaxHighlight(R, FID, PP);
html::HighlightMacros(R, FID, PP);
html::SyntaxHighlight(R, FID, PP, RewriterCache);
html::HighlightMacros(R, FID, PP, RewriterCache);
}

void HTMLDiagnostics::HandlePiece(Rewriter &R, FileID BugFileID,
Expand Down
29 changes: 29 additions & 0 deletions clang/test/Analysis/html_diagnostics/counter.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// RUN: rm -fR %t
// RUN: mkdir %t
// RUN: %clang_analyze_cc1 -analyzer-checker=core \
// RUN: -analyzer-output=html -o %t -verify %s
// RUN: grep -v CHECK %t/report-*.html | FileCheck %s


void foo() {
int *x = 0;
*x = __COUNTER__; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
}

void bar() {
int *y;
*y = __COUNTER__; // expected-warning{{Dereference of undefined pointer value (loaded from variable 'y')}}
}

// The checks below confirm that both reports have the same values for __COUNTER__.
//
// FIXME: The correct values are (0, 1, 0, 1). Because we re-lex the file in order
// to detect macro expansions for HTML report purposes, they turn into (2, 3, 2, 3)
// by the time we emit HTML. But at least it's better than (2, 3, 4, 5)
// which would have been the case if we re-lexed the file *each* time we
// emitted an HTML report.

// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>2</span></span>
// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>3</span></span>
// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>2</span></span>
// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>3</span></span>

0 comments on commit 243bfed

Please sign in to comment.