diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h index 4f9867262a275..557f0e547ab4a 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h @@ -60,7 +60,10 @@ enum class ScanningOptimizations { /// Remove unused -ivfsoverlay arguments. VFS = 4, - DSS_LAST_BITMASK_ENUM(VFS), + /// Canonicalize -D and -U options. + Macros = 8, + + DSS_LAST_BITMASK_ENUM(Macros), Default = All }; diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 3cf3ad8a4e490..7477b930188b4 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -179,6 +179,78 @@ static void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) { DiagOpts.IgnoreWarnings = true; } +// Clang implements -D and -U by splatting text into a predefines buffer. This +// allows constructs such as `-DFඞ=3 "-D F\u{0D9E} 4 3 2”` to be accepted and +// define the same macro, or adding C++ style comments before the macro name. +// +// This function checks that the first non-space characters in the macro +// obviously form an identifier that can be uniqued on without lexing. Failing +// to do this could lead to changing the final definition of a macro. +// +// We could set up a preprocessor and actually lex the name, but that's very +// heavyweight for a situation that will almost never happen in practice. +static std::optional getSimpleMacroName(StringRef Macro) { + StringRef Name = Macro.split("=").first.ltrim(" \t"); + std::size_t I = 0; + + auto FinishName = [&]() -> std::optional { + StringRef SimpleName = Name.slice(0, I); + if (SimpleName.empty()) + return std::nullopt; + return SimpleName; + }; + + for (; I != Name.size(); ++I) { + switch (Name[I]) { + case '(': // Start of macro parameter list + case ' ': // End of macro name + case '\t': + return FinishName(); + case '_': + continue; + default: + if (llvm::isAlnum(Name[I])) + continue; + return std::nullopt; + } + } + return FinishName(); +} + +static void canonicalizeDefines(PreprocessorOptions &PPOpts) { + using MacroOpt = std::pair; + std::vector SimpleNames; + SimpleNames.reserve(PPOpts.Macros.size()); + std::size_t Index = 0; + for (const auto &M : PPOpts.Macros) { + auto SName = getSimpleMacroName(M.first); + // Skip optimizing if we can't guarantee we can preserve relative order. + if (!SName) + return; + SimpleNames.emplace_back(*SName, Index); + ++Index; + } + + llvm::stable_sort(SimpleNames, [](const MacroOpt &A, const MacroOpt &B) { + return A.first < B.first; + }); + // Keep the last instance of each macro name by going in reverse + auto NewEnd = std::unique( + SimpleNames.rbegin(), SimpleNames.rend(), + [](const MacroOpt &A, const MacroOpt &B) { return A.first == B.first; }); + SimpleNames.erase(SimpleNames.begin(), NewEnd.base()); + + // Apply permutation. + decltype(PPOpts.Macros) NewMacros; + NewMacros.reserve(SimpleNames.size()); + for (std::size_t I = 0, E = SimpleNames.size(); I != E; ++I) { + std::size_t OriginalIndex = SimpleNames[I].second; + // We still emit undefines here as they may be undefining a predefined macro + NewMacros.push_back(std::move(PPOpts.Macros[OriginalIndex])); + } + std::swap(PPOpts.Macros, NewMacros); +} + /// A clang tool that runs the preprocessor in a mode that's optimized for /// dependency scanning for the given compiler invocation. class DependencyScanningAction : public tooling::ToolAction { @@ -203,6 +275,8 @@ class DependencyScanningAction : public tooling::ToolAction { CompilerInvocation OriginalInvocation(*Invocation); // Restore the value of DisableFree, which may be modified by Tooling. OriginalInvocation.getFrontendOpts().DisableFree = DisableFree; + if (any(OptimizeArgs & ScanningOptimizations::Macros)) + canonicalizeDefines(OriginalInvocation.getPreprocessorOpts()); if (Scanned) { // Scanning runs once for the first -cc1 invocation in a chain of driver diff --git a/clang/test/ClangScanDeps/optimize-canonicalize-macros.m b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m new file mode 100644 index 0000000000000..2c9b06be39210 --- /dev/null +++ b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m @@ -0,0 +1,87 @@ +// This test verifies that command lines with equivalent -D and -U arguments +// are canonicalized to the same module variant. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands.json.in > %t/build/compile-commands.json +// RUN: clang-scan-deps -compilation-database %t/build/compile-commands.json \ +// RUN: -j 1 -format experimental-full -optimize-args=canonicalize-macros > %t/deps.db +// RUN: cat %t/deps.db | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t + +// Verify that there are only two variants and that the expected merges have +// happened. + +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [], +// CHECK-NEXT: "clang-modulemap-file": +// CHECK-NEXT: "command-line": [ +// CHECK-NOT: "J=1" +// CHECK-NOT: "J" +// CHECK-NOT: "K" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK: ], +// CHECK-NEXT: "name": "A" +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [], +// CHECK-NEXT: "clang-modulemap-file": +// CHECK-NEXT: "command-line": [ +// CHECK: "Fඞ" +// CHECK: "F\\u{0D9E}" +// CHECK: "K" +// CHECK: "K" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK: ], +// CHECK-NEXT: "name": "A" +// CHECK-NEXT: } +// CHECK-NEXT: ], +// CHECK-NEXT: "translation-units": [ +// CHECK: ] +// CHECK: } + + +//--- build/compile-commands.json.in + +[ +{ + "directory": "DIR", + "command": "clang -c DIR/tu0.m -DJ=1 -UJ -DJ=2 -DI -DK(x)=x -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu0.m" +}, +{ + "directory": "DIR", + "command": "clang -c DIR/tu1.m -DK -DK(x)=x -DI -D \"J=2\" -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu1.m" +}, +{ + "directory": "DIR", + "command": "clang -c DIR/tu2.m -I modules/A -DFඞ '-DF\\u{0D9E}' -DK -DK -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu2.m" +} +] + +//--- modules/A/module.modulemap + +module A { + umbrella header "A.h" +} + +//--- modules/A/A.h + +//--- tu0.m + +#include + +//--- tu1.m + +#include + +//--- tu2.m + +#include diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 0458a4b3ecec3..9811d2a875335 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -157,6 +157,7 @@ static void ParseArgs(int argc, char **argv) { .Case("header-search", ScanningOptimizations::HeaderSearch) .Case("system-warnings", ScanningOptimizations::SystemWarnings) .Case("vfs", ScanningOptimizations::VFS) + .Case("canonicalize-macros", ScanningOptimizations::Macros) .Case("all", ScanningOptimizations::All) .Default(std::nullopt); if (!Optimization) {