141 changes: 141 additions & 0 deletions clang/lib/AST/StmtDataCollectors.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// The functions below collect the class specific data of each Stmt subclass.

// Data recorded for a Stmt: its statement class and the macro stacks of its
// start and end locations.
DEF_ADD_DATA(Stmt, {
addData(S->getStmtClass());
// This ensures that non-macro-generated code isn't identical to
// macro-generated code.
addData(data_collection::getMacroStack(S->getLocStart(), Context));
addData(data_collection::getMacroStack(S->getLocEnd(), Context));
})
// Data recorded for an Expr: the type of the expression.
DEF_ADD_DATA(Expr, { addData(S->getType()); })

//--- Builtin functionality ----------------------------------------------//
// Array and expression trait expressions record the trait returned by
// getTrait().
DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); })
DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); })
// Predefined expressions record the value returned by getIdentType().
DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); })
DEF_ADD_DATA(TypeTraitExpr, {
  // Record the trait first, then the type of every argument in index order.
  addData(S->getTrait());
  for (unsigned ArgIdx = 0; ArgIdx < S->getNumArgs(); ++ArgIdx) {
    addData(S->getArg(ArgIdx)->getType());
  }
})

//--- Calls --------------------------------------------------------------//
DEF_ADD_DATA(CallExpr, {
  // Calls without a direct callee (such as calls through function pointers)
  // contribute no additional data here.
  const FunctionDecl *Callee = S->getDirectCallee();
  if (Callee) {
    // The qualified name does not include template arguments, so for template
    // specializations they are printed and recorded separately.
    if (const auto *TemplateArgs = Callee->getTemplateSpecializationArgs()) {
      std::string PrintedArgs;
      llvm::raw_string_ostream ArgStream(PrintedArgs);

      for (unsigned Idx = 0; Idx < TemplateArgs->size(); ++Idx) {
        TemplateArgs->get(Idx).print(Context.getLangOpts(), ArgStream);
        // Separate the arguments so that 'foo<X, XX>()' and 'foo<XX, X>()'
        // do not produce the same string.
        ArgStream << '\n';
      }
      ArgStream.flush();

      addData(PrintedArgs);
    }
    addData(Callee->getQualifiedNameAsString());
  }
})

//--- Value references ---------------------------------------------------//
// A referenced declaration is identified by its fully qualified name.
DEF_ADD_DATA(DeclRefExpr,
             { addData(S->getDecl()->getQualifiedNameAsString()); })
// A member access is identified by the (unqualified) name of the member.
// getNameAsString() is used instead of getName() because getName() requires
// the declaration's name to be a simple identifier, which does not hold for
// members such as overloaded operators, conversion functions or destructors.
DEF_ADD_DATA(MemberExpr,
             { addData(S->getMemberDecl()->getNameAsString()); })

//--- Literals -----------------------------------------------------------//
// Integer and floating-point literals record llvm::hash_value() of their
// value.
DEF_ADD_DATA(IntegerLiteral, { addData(llvm::hash_value(S->getValue())); })
DEF_ADD_DATA(FloatingLiteral, { addData(llvm::hash_value(S->getValue())); })
// String literals record the result of getString().
DEF_ADD_DATA(StringLiteral, { addData(S->getString()); })
// Boolean and character literals record the result of getValue() directly.
DEF_ADD_DATA(CXXBoolLiteralExpr, { addData(S->getValue()); })
DEF_ADD_DATA(CharacterLiteral, { addData(S->getValue()); })

//--- Exceptions ---------------------------------------------------------//
// A catch statement records its caught type.
DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); })

//--- C++ OOP Stmts ------------------------------------------------------//
// A delete expression records whether it was written in array form and
// whether it is a global delete.
DEF_ADD_DATA(CXXDeleteExpr, {
addData(S->isArrayFormAsWritten());
addData(S->isGlobalDelete());
})

//--- Casts --------------------------------------------------------------//
// A bridged cast records its bridge kind.
DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); })

//--- Miscellaneous Exprs ------------------------------------------------//
// Binary and unary operators record their opcode.
DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); })
DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); })

//--- Control flow -------------------------------------------------------//
DEF_ADD_DATA(GotoStmt, {
  // Record the name of the label that is jumped to.
  addData(S->getLabel()->getName());
})
DEF_ADD_DATA(IndirectGotoStmt, {
  // A label name is only recorded when getConstantTarget() yields a label.
  if (const auto *Target = S->getConstantTarget())
    addData(Target->getName());
})
DEF_ADD_DATA(LabelStmt, {
  // Record the name of the declared label.
  addData(S->getDecl()->getName());
})
DEF_ADD_DATA(MSDependentExistsStmt, {
  // Record the result of isIfExists().
  addData(S->isIfExists());
})
DEF_ADD_DATA(AddrLabelExpr, {
  // Record the name of the label whose address is taken.
  addData(S->getLabel()->getName());
})

//--- Objective-C --------------------------------------------------------//
// Records the result of shouldCopy().
DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); })
// Records whether the receiver is 'super' and whether the property is
// implicit.
DEF_ADD_DATA(ObjCPropertyRefExpr, {
addData(S->isSuperReceiver());
addData(S->isImplicitProperty());
})
// Records the result of hasEllipsis().
DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); })

//--- Miscellaneous Stmts ------------------------------------------------//
// A fold expression records whether it is a right fold and which operator it
// uses.
DEF_ADD_DATA(CXXFoldExpr, {
addData(S->isRightFold());
addData(S->getOperator());
})
DEF_ADD_DATA(GenericSelectionExpr, {
  // Record the type of every association in index order.
  for (unsigned AssocIdx = 0; AssocIdx < S->getNumAssocs(); ++AssocIdx)
    addData(S->getAssocType(AssocIdx));
})
DEF_ADD_DATA(LambdaExpr, {
  // Per capture: pack-expansion flag, capture kind and, for captures of a
  // variable, the type of that variable.
  for (const LambdaCapture &Capture : S->captures()) {
    addData(Capture.isPackExpansion());
    addData(Capture.getCaptureKind());
    if (!Capture.capturesVariable())
      continue;
    addData(Capture.getCapturedVar()->getType());
  }
  // Properties of the lambda as a whole.
  addData(S->isGenericLambda());
  addData(S->isMutable());
})
DEF_ADD_DATA(DeclStmt, {
  // Record how many declarations the statement contains ...
  addData(static_cast<unsigned>(
      std::distance(S->decl_begin(), S->decl_end())));
  // ... followed by the type of each declaration that is a VarDecl.
  for (const Decl *Declared : S->decls()) {
    const VarDecl *Var = dyn_cast<VarDecl>(Declared);
    if (Var)
      addData(Var->getType());
  }
})
DEF_ADD_DATA(AsmStmt, {
  // Flags and the generated assembly string.
  addData(S->isSimple());
  addData(S->isVolatile());
  addData(S->generateAsmString(Context));

  // Input constraints, then output constraints, then clobbers, each in index
  // order.
  for (unsigned In = 0; In < S->getNumInputs(); ++In)
    addData(S->getInputConstraint(In));
  for (unsigned Out = 0; Out < S->getNumOutputs(); ++Out)
    addData(S->getOutputConstraint(Out));
  for (unsigned Clob = 0; Clob < S->getNumClobbers(); ++Clob)
    addData(S->getClobber(Clob));
})
DEF_ADD_DATA(AttributedStmt, {
  // Record the spelling of every attribute attached to the statement.
  for (const Attr *Attribute : S->getAttrs())
    addData(std::string(Attribute->getSpelling()));
})
#undef DEF_ADD_DATA
112 changes: 68 additions & 44 deletions clang/lib/Analysis/CloneDetection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,12 @@

#include "clang/Analysis/CloneDetection.h"

#include "clang/AST/ASTContext.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/Stmt.h"
#include "clang/Lex/Lexer.h"
#include "clang/AST/DataCollection.h"
#include "clang/AST/DeclTemplate.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Path.h"

using namespace clang;
using namespace clang::clone_detection;

StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D,
unsigned StartIndex, unsigned EndIndex)
Expand Down Expand Up @@ -91,34 +87,6 @@ SourceRange StmtSequence::getSourceRange() const {
return SourceRange(getStartLoc(), getEndLoc());
}

/// Writes the name of the macro containing \p Loc to \p MacroStack, followed
/// by a single space.
static void printMacroName(llvm::raw_string_ostream &MacroStack,
                           ASTContext &Context, SourceLocation Loc) {
  const auto MacroName = Lexer::getImmediateMacroName(
      Loc, Context.getSourceManager(), Context.getLangOpts());

  // The trailing space is padding that keeps consecutive macro names from
  // running together.
  MacroStack << MacroName << " ";
}

std::string clone_detection::getMacroStack(SourceLocation Loc,
                                           ASTContext &Context) {
  SourceManager &SM = Context.getSourceManager();
  std::string Names;
  llvm::raw_string_ostream NamesStream(Names);

  // Walk outwards through every macro expansion that produced Loc, appending
  // each macro's name, until a location outside of any macro is reached.
  for (; Loc.isMacroID(); Loc = SM.getImmediateMacroCallerLoc(Loc))
    printMacroName(NamesStream, Context, Loc);

  NamesStream.flush();
  return Names;
}

void CloneDetector::analyzeCodeBody(const Decl *D) {
assert(D);
assert(D->hasBody());
Expand Down Expand Up @@ -184,23 +152,77 @@ void OnlyLargestCloneConstraint::constrain(
}
}

/// Returns true if at least one sequence in \p Group is contained in a
/// declaration whose file name (the last path component of the file that
/// holds the declaration's location) matches IgnoredFilesRegex.
///
/// Returns false without inspecting the group if IgnoredFilesPattern is empty,
/// if \p Group is empty, or if IgnoredFilesRegex is not a valid regex.
bool FilenamePatternConstraint::isAutoGenerated(
    const CloneDetector::CloneGroup &Group) {
  std::string Error;
  if (IgnoredFilesPattern.empty() || Group.empty() ||
      !IgnoredFilesRegex->isValid(Error))
    return false;

  for (const StmtSequence &S : Group) {
    const SourceManager &SM = S.getASTContext().getSourceManager();
    // Only the file name itself is matched, not the directories leading to it.
    StringRef Filename = llvm::sys::path::filename(
        SM.getFilename(S.getContainingDecl()->getLocation()));
    if (IgnoredFilesRegex->match(Filename))
      return true;
  }

  return false;
}

namespace {
/// This class defines what a type II code clone is: if it collects the same
/// data for two statements, then those two statements are considered to be
/// clones of each other.
///
/// All collected data is forwarded to the given data consumer of the type T.
/// The data consumer class needs to provide a member method with the signature:
///   update(StringRef Str)
template <class T>
class CloneTypeIIStmtDataCollector
: public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> {
ASTContext &Context;
/// The data sink to which all data is forwarded.
T &DataConsumer;

/// Forwards one piece of data to the consumer via
/// data_collection::addDataToConsumer.
template <class Ty> void addData(const Ty &Data) {
data_collection::addDataToConsumer(DataConsumer, Data);
}

public:
/// Visits \p S as part of construction, so creating an object of this class
/// is what feeds the data of \p S into \p DataConsumer.
CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context,
T &DataConsumer)
: Context(Context), DataConsumer(DataConsumer) {
this->Visit(S);
}

// Define a visit method for each class to collect data and subsequently visit
// all parent classes. This uses a template so that custom visit methods by us
// take precedence.
#define DEF_ADD_DATA(CLASS, CODE) \
template <class = void> void Visit##CLASS(const CLASS *S) { \
CODE; \
ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \
}

// Expands DEF_ADD_DATA once for every statement class listed in the included
// file.
#include "../AST/StmtDataCollectors.inc"

// Type II clones ignore variable names and literals, so let's skip them.
// Each SKIP defines a non-template visit method that collects nothing for the
// class itself and only forwards to the base visitor.
#define SKIP(CLASS) \
void Visit##CLASS(const CLASS *S) { \
ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \
}
SKIP(DeclRefExpr)
SKIP(MemberExpr)
SKIP(IntegerLiteral)
SKIP(FloatingLiteral)
SKIP(StringLiteral)
SKIP(CXXBoolLiteralExpr)
SKIP(CharacterLiteral)
#undef SKIP
};
} // end anonymous namespace

static size_t createHash(llvm::MD5 &Hash) {
size_t HashCode;

Expand All @@ -222,7 +244,7 @@ size_t RecursiveCloneTypeIIConstraint::saveHash(
llvm::MD5 Hash;
ASTContext &Context = D->getASTContext();

StmtDataCollector<llvm::MD5>(S, Context, Hash);
CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash);

auto CS = dyn_cast<CompoundStmt>(S);
SmallVector<size_t, 8> ChildHashes;
Expand Down Expand Up @@ -288,8 +310,8 @@ class FoldingSetNodeIDWrapper {
static void CollectStmtSequenceData(const StmtSequence &Sequence,
FoldingSetNodeIDWrapper &OutputData) {
for (const Stmt *S : Sequence) {
StmtDataCollector<FoldingSetNodeIDWrapper>(S, Sequence.getASTContext(),
OutputData);
CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>(
S, Sequence.getASTContext(), OutputData);

for (const Stmt *Child : S->children()) {
if (!Child)
Expand Down Expand Up @@ -339,7 +361,7 @@ void RecursiveCloneTypeIIConstraint::constrain(
// Sort hash_codes in StmtsByHash.
std::stable_sort(StmtsByHash.begin(), StmtsByHash.end(),
[](std::pair<size_t, StmtSequence> LHS,
std::pair<size_t, StmtSequence> RHS) {
std::pair<size_t, StmtSequence> RHS) {
return LHS.first < RHS.first;
});

Expand Down Expand Up @@ -393,8 +415,10 @@ size_t MinComplexityConstraint::calculateStmtComplexity(
ASTContext &Context = Seq.getASTContext();

// Look up what macros expanded into the current statement.
std::string StartMacroStack = getMacroStack(Seq.getStartLoc(), Context);
std::string EndMacroStack = getMacroStack(Seq.getEndLoc(), Context);
std::string StartMacroStack =
data_collection::getMacroStack(Seq.getStartLoc(), Context);
std::string EndMacroStack =
data_collection::getMacroStack(Seq.getEndLoc(), Context);

// First, check if ParentMacroStack is not empty which means we are currently
// dealing with a parent statement which was expanded from a macro.
Expand Down
1 change: 1 addition & 0 deletions clang/unittests/AST/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ add_clang_unittest(ASTTests
ASTVectorTest.cpp
CommentLexer.cpp
CommentParser.cpp
DataCollectionTest.cpp
DeclPrinterTest.cpp
DeclTest.cpp
EvaluateAsRValueTest.cpp
Expand Down
173 changes: 173 additions & 0 deletions clang/unittests/AST/DataCollectionTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
//===- unittests/AST/DataCollectionTest.cpp -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains tests for the DataCollection module.
//
// They work by hashing the collected data of two nodes and asserting that the
// hash values are equal iff the nodes are considered equal.
//
//===----------------------------------------------------------------------===//

#include "clang/AST/DataCollection.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/Tooling/Tooling.h"
#include "gtest/gtest.h"

using namespace clang;
using namespace tooling;
using namespace ast_matchers;

namespace {
/// Test-only data collector: visits a statement and forwards everything the
/// collectors in StmtDataCollectors.inc record for it to an llvm::MD5.
class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> {
ASTContext &Context;
/// The MD5 object that receives all collected data.
llvm::MD5 &DataConsumer;

/// Forwards one piece of data to the consumer via
/// data_collection::addDataToConsumer.
template <class T> void addData(const T &Data) {
data_collection::addDataToConsumer(DataConsumer, Data);
}

public:
/// Visits \p S as part of construction, so creating an object of this class
/// is what feeds the data of \p S into \p DataConsumer.
StmtDataCollector(const Stmt *S, ASTContext &Context, llvm::MD5 &DataConsumer)
: Context(Context), DataConsumer(DataConsumer) {
this->Visit(S);
}

// Defines one visit method per statement class: it runs the class-specific
// collection code and then forwards to the base visitor's method for the
// same class.
#define DEF_ADD_DATA(CLASS, CODE) \
template <class Dummy = void> Dummy Visit##CLASS(const CLASS *S) { \
CODE; \
ConstStmtVisitor<StmtDataCollector>::Visit##CLASS(S); \
}

// Expands DEF_ADD_DATA once for every statement class listed in the included
// file.
#include "../../lib/AST/StmtDataCollectors.inc"
};
} // end anonymous namespace

namespace {
/// Match callback that counts every statement bound to "id" and writes the
/// MD5 result of the data collected for the first such statement into the
/// referenced hash.
struct StmtHashMatch : public MatchFinder::MatchCallback {
  /// Number of matched statements seen so far.
  unsigned NumFound;
  /// Destination for the hash of the first matched statement.
  llvm::MD5::MD5Result &Hash;

  StmtHashMatch(llvm::MD5::MD5Result &Hash) : NumFound(0), Hash(Hash) {}

  void run(const MatchFinder::MatchResult &Result) override {
    const auto *Matched = Result.Nodes.getNodeAs<Stmt>("id");
    if (Matched == nullptr)
      return;

    // Every match is counted, but only the first one is hashed.
    if (++NumFound > 1)
      return;

    llvm::MD5 Digest;
    StmtDataCollector(Matched, *Result.Context, Digest);
    Digest.final(Hash);
  }
};
} // end anonymous namespace

/// Parses \p Code, runs \p StmtMatch over it and stores the hash of the
/// matched statement in \p Hash.
///
/// Succeeds only if the code parses and exactly one statement is matched;
/// otherwise an assertion failure describing the problem is returned.
static testing::AssertionResult hashStmt(llvm::MD5::MD5Result &Hash,
                                         const StatementMatcher &StmtMatch,
                                         StringRef Code) {
  StmtHashMatch Callback(Hash);
  MatchFinder Finder;
  Finder.addMatcher(StmtMatch, &Callback);

  std::unique_ptr<FrontendActionFactory> ActionFactory(
      newFrontendActionFactory(&Finder));
  const bool Parsed = runToolOnCode(ActionFactory->create(), Code);
  if (!Parsed)
    return testing::AssertionFailure()
           << "Parsing error in \"" << Code.str() << "\"";

  switch (Callback.NumFound) {
  case 0:
    return testing::AssertionFailure() << "Matcher didn't find any statements";
  case 1:
    return testing::AssertionSuccess();
  default:
    return testing::AssertionFailure()
           << "Matcher should match only one statement "
              "(found "
           << Callback.NumFound << ")";
  }
}

/// Hashes the single statement matched by \p StmtMatch in \p Code1 and in
/// \p Code2 and reports whether both hashes are equal.
///
/// If hashing either snippet fails, that failure is returned instead.
static testing::AssertionResult
isStmtHashEqual(const StatementMatcher &StmtMatch, StringRef Code1,
                StringRef Code2) {
  llvm::MD5::MD5Result FirstHash, SecondHash;

  testing::AssertionResult Status = hashStmt(FirstHash, StmtMatch, Code1);
  if (Status)
    Status = hashStmt(SecondHash, StmtMatch, Code2);
  if (!Status)
    return Status;

  return testing::AssertionResult(FirstHash == SecondHash);
}

TEST(StmtDataCollector, TestDeclRefExpr) {
  const StatementMatcher Matcher = declRefExpr().bind("id");

  // Identical code yields identical hashes.
  ASSERT_TRUE(isStmtHashEqual(Matcher, "int x, r = x;", "int x, r = x;"));
  // Referring to a differently named variable changes the hash.
  ASSERT_FALSE(isStmtHashEqual(Matcher, "int x, r = x;", "int y, r = y;"));
  // The same variable name inside a namespace changes the hash as well.
  ASSERT_FALSE(isStmtHashEqual(Matcher, "int x, r = x;",
                               "namespace n { int x, r = x; };"));
}

TEST(StmtDataCollector, TestMemberExpr) {
  const StatementMatcher Matcher = memberExpr().bind("id");
  const char *const Baseline = "struct { int x; } X; int r = X.x;";

  // Access through '.' and through '->' hash the same.
  ASSERT_TRUE(isStmtHashEqual(Matcher, Baseline,
                              "struct { int x; } X; int r = (&X)->x;"));
  // The name of the object being accessed does not matter.
  ASSERT_TRUE(isStmtHashEqual(Matcher, Baseline,
                              "struct { int x; } Y; int r = Y.x;"));
  // Qualifying the member name does not matter either.
  ASSERT_TRUE(isStmtHashEqual(Matcher, Baseline,
                              "struct C { int x; } X; int r = X.C::x;"));
  // A differently named member changes the hash.
  ASSERT_FALSE(isStmtHashEqual(Matcher, Baseline,
                               "struct { int y; } X; int r = X.y;"));
}

TEST(StmtDataCollector, TestIntegerLiteral) {
  const StatementMatcher Matcher = integerLiteral().bind("id");

  // Identical literals hash the same.
  ASSERT_TRUE(isStmtHashEqual(Matcher, "int x = 0;", "int x = 0;"));
  // A different spelling of the same value hashes the same.
  ASSERT_TRUE(isStmtHashEqual(Matcher, "int x = 0;", "int x =00;"));
  // A different value changes the hash.
  ASSERT_FALSE(isStmtHashEqual(Matcher, "int x = 0;", "int x = 1;"));
}

TEST(StmtDataCollector, TestFloatingLiteral) {
  const StatementMatcher Matcher = floatLiteral().bind("id");

  // Identical literals hash the same.
  ASSERT_TRUE(isStmtHashEqual(Matcher, "double x = .0;", "double x = .0;"));
  // Different spellings of the same value hash the same.
  ASSERT_TRUE(isStmtHashEqual(Matcher, "double x = .10;", "double x = .1;"));
  ASSERT_TRUE(isStmtHashEqual(Matcher, "double x = .1;", "double x = 1e-1;"));
  // A different value changes the hash.
  ASSERT_FALSE(isStmtHashEqual(Matcher, "double x = .0;", "double x = .1;"));
}

TEST(StmtDataCollector, TestStringLiteral) {
  const StatementMatcher Matcher = stringLiteral().bind("id");

  // Identical string contents hash the same.
  ASSERT_TRUE(
      isStmtHashEqual(Matcher, R"(char x[] = "0";)", R"(char x[] = "0";)"));
  // Different string contents change the hash.
  ASSERT_FALSE(
      isStmtHashEqual(Matcher, R"(char x[] = "0";)", R"(char x[] = "1";)"));
}

TEST(StmtDataCollector, TestCXXBoolLiteral) {
  const StatementMatcher Matcher = cxxBoolLiteral().bind("id");

  // The same boolean value hashes the same.
  ASSERT_TRUE(isStmtHashEqual(Matcher, "bool x = false;", "bool x = false;"));
  // The opposite boolean value changes the hash.
  ASSERT_FALSE(isStmtHashEqual(Matcher, "bool x = false;", "bool x = true;"));
}

TEST(StmtDataCollector, TestCharacterLiteral) {
  const StatementMatcher Matcher = characterLiteral().bind("id");

  // Identical literals hash the same.
  ASSERT_TRUE(isStmtHashEqual(Matcher, "char x = '0';", "char x = '0';"));
  // Different escape spellings of the same character hash the same.
  ASSERT_TRUE(
      isStmtHashEqual(Matcher, R"(char x = '\0';)", R"(char x = '\x00';)"));
  // A different character changes the hash.
  ASSERT_FALSE(isStmtHashEqual(Matcher, "char x = '0';", "char x = '1';"));
}