277 changes: 277 additions & 0 deletions clang/lib/Analysis/CloneDetection.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements classes for searching and analyzing source code clones.
///
//===----------------------------------------------------------------------===//

#include "clang/Analysis/CloneDetection.h"

#include "clang/AST/ASTContext.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/Stmt.h"
#include "llvm/ADT/StringRef.h"
#include <utility>

using namespace clang;

/// Creates a sequence that covers the children [StartIndex, EndIndex) of the
/// given CompoundStmt.
///
/// \param Stmt The CompoundStmt whose children form the sequence.
/// \param Context The ASTContext that is stored alongside the statement.
/// \param StartIndex Index of the first child that belongs to the sequence.
/// \param EndIndex Index one past the last child that belongs to the sequence.
StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
                           unsigned StartIndex, unsigned EndIndex)
    : S(Stmt),
      Context(&Context),
      StartIndex(StartIndex),
      EndIndex(EndIndex) {
  // Reject a missing statement first; the range checks below dereference it.
  assert(Stmt && "Stmt must not be a nullptr");
  // The selected range has to hold at least one child ...
  assert(StartIndex < EndIndex && "Given array should not be empty");
  // ... and must not reach past the last child of the CompoundStmt.
  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
}

/// Creates a sequence that stores the single statement \p Stmt together with
/// its ASTContext. Both indexes are set to zero in this case.
StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
    : S(Stmt),
      Context(&Context),
      StartIndex(0),
      EndIndex(0) {
}

/// Creates a sequence without a statement and without an ASTContext; both
/// pointers are null and both indexes are zero.
StmtSequence::StmtSequence()
    : S(nullptr),
      Context(nullptr),
      StartIndex(0),
      EndIndex(0) {
}

bool StmtSequence::contains(const StmtSequence &Other) const {
// If both sequences reside in different translation units, they can never
// contain each other.
if (Context != Other.Context)
return false;

const SourceManager &SM = Context->getSourceManager();

// Otherwise check if the start and end locations of the current sequence
// surround the other sequence.
bool StartIsInBounds =
SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
getStartLoc() == Other.getStartLoc();
if (!StartIsInBounds)
return false;

bool EndIsInBounds =
SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
Other.getEndLoc() == getEndLoc();
return EndIsInBounds;
}

/// Returns an iterator to the first statement of this sequence.
StmtSequence::iterator StmtSequence::begin() const {
  // A real sequence points into the body of its CompoundStmt.
  if (holdsSequence())
    return cast<CompoundStmt>(S)->body_begin() + StartIndex;

  // Otherwise the stored statement pointer itself acts as a one-element
  // array.
  return &S;
}

/// Returns an iterator one past the last statement of this sequence.
StmtSequence::iterator StmtSequence::end() const {
  // A real sequence points into the body of its CompoundStmt.
  if (holdsSequence())
    return cast<CompoundStmt>(S)->body_begin() + EndIndex;

  // Otherwise the end is one element past the stored statement pointer.
  return &S + 1;
}

/// Returns the start location of the first statement in this sequence.
SourceLocation StmtSequence::getStartLoc() const {
  auto First = front();
  return First->getLocStart();
}

/// Returns the end location of the last statement in this sequence.
SourceLocation StmtSequence::getEndLoc() const {
  auto Last = back();
  return Last->getLocEnd();
}

namespace {
/// Generates CloneSignatures for a set of statements and stores the results in
/// a CloneDetector object.
class CloneSignatureGenerator {

  /// The detector that receives every generated sequence/signature pair.
  CloneDetector &CD;
  /// The ASTContext that is attached to every StmtSequence passed to CD.
  ASTContext &Context;

  /// \brief Generates CloneSignatures for all statements in the given statement
  /// tree and stores them in the CloneDetector.
  ///
  /// \param S The root of the given statement tree.
  /// \return The CloneSignature of the root statement.
  CloneDetector::CloneSignature generateSignatures(const Stmt *S) {
    // Create an empty signature that will be filled in this method.
    CloneDetector::CloneSignature Signature;

    // The only relevant data for now is the class of the statement.
    // TODO: Collect statement class specific data.
    Signature.Data.push_back(S->getStmtClass());

    // Storage for the signatures of the direct child statements. This is only
    // needed if the current statement is a CompoundStmt.
    std::vector<CloneDetector::CloneSignature> ChildSignatures;
    const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);

    // The signature of a statement includes the signatures of its children.
    // Therefore we create the signatures for every child and add them to the
    // current signature.
    for (const Stmt *Child : S->children()) {
      // Some statements like 'if' can have nullptr children that we will skip.
      if (!Child)
        continue;

      // Recursive call to create the signature of the child statement. This
      // will also create and store all clone groups in this child statement.
      auto ChildSignature = generateSignatures(Child);

      // Add the collected data to the signature of the current statement.
      Signature.add(ChildSignature);

      // If the current statement is a CompoundStmt, we need to store the
      // signature for the generation of the sub-sequences.
      if (CS)
        ChildSignatures.push_back(ChildSignature);
    }

    // If the current statement is a CompoundStmt, we also need to create the
    // clone groups from the sub-sequences inside the children.
    if (CS)
      handleSubSequences(CS, ChildSignatures);

    // Save the signature for the current statement in the CloneDetector object.
    CD.add(StmtSequence(S, Context), Signature);

    return Signature;
  }

  /// \brief Adds all possible sub-sequences in the child array of the given
  /// CompoundStmt to the CloneDetector.
  /// \param CS The given CompoundStmt.
  /// \param ChildSignatures A list of calculated signatures for each child in
  ///                        the given CompoundStmt.
  void handleSubSequences(
      const CompoundStmt *CS,
      const std::vector<CloneDetector::CloneSignature> &ChildSignatures) {

    // FIXME: This function has quadratic runtime right now. Check if skipping
    // this function for too long CompoundStmts is an option.

    const unsigned NumChildren = CS->size();

    // The loops below index ChildSignatures with positions in the body of the
    // CompoundStmt. generateSignatures() skips nullptr children, so a skipped
    // child would leave fewer signatures than statements and the indexing
    // would run out of bounds.
    assert(ChildSignatures.size() == NumChildren &&
           "Expected exactly one signature per child of the CompoundStmt");

    // The length of the sub-sequence. We don't need to handle sequences with
    // the length 1 as they are already handled in generateSignatures().
    for (unsigned Length = 2; Length <= NumChildren; ++Length) {
      // The start index in the body of the CompoundStmt. We increase the
      // position until the end of the sub-sequence reaches the end of the
      // CompoundStmt body. Length never exceeds NumChildren here, so the
      // unsigned subtraction cannot wrap around.
      for (unsigned Pos = 0; Pos <= NumChildren - Length; ++Pos) {
        // Create an empty signature and add the signatures of all selected
        // child statements to it.
        CloneDetector::CloneSignature SubSignature;

        for (unsigned i = Pos; i < Pos + Length; ++i)
          SubSignature.add(ChildSignatures[i]);

        // Save the signature together with the information about what children
        // sequence we selected.
        CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
      }
    }
  }

public:
  /// \param CD The detector that receives all generated signatures.
  /// \param Context The ASTContext of the statements that will be consumed.
  explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context)
      : CD(CD), Context(Context) {}

  /// \brief Generates signatures for all statements in the given function body.
  void consumeCodeBody(const Stmt *S) { generateSignatures(S); }
};
} // end anonymous namespace

void CloneDetector::analyzeCodeBody(const Decl *D) {
assert(D);
assert(D->hasBody());
CloneSignatureGenerator Generator(*this, D->getASTContext());
Generator.consumeCodeBody(D->getBody());
}

void CloneDetector::add(const StmtSequence &S,
const CloneSignature &Signature) {
// StringMap only works with StringRefs, so we create one for our data vector.
auto &Data = Signature.Data;
StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
Data.size() * sizeof(unsigned));

// Search with the help of the signature if we already have encountered a
// clone of the given StmtSequence.
auto I = CloneGroupIndexes.find(DataRef);
if (I == CloneGroupIndexes.end()) {
// We haven't found an existing clone group, so we create a new clone group
// for this StmtSequence and store the index of it in our search map.
CloneGroupIndexes[DataRef] = CloneGroups.size();
CloneGroups.emplace_back(S, Signature.Complexity);
return;
}

// We have found an existing clone group and can expand it with the given
// StmtSequence.
CloneGroups[I->getValue()].Sequences.push_back(S);
}

namespace {
/// \brief Tells whether \p Stmt contains one or more of the sequences stored
/// in \p Group.
bool containsAnyInGroup(StmtSequence &Stmt,
                        CloneDetector::CloneGroup &Group) {
  auto Candidate = Group.Sequences.begin();
  const auto Last = Group.Sequences.end();

  // Advance to the first sequence that is contained by Stmt, if there is one.
  while (Candidate != Last && !Stmt.contains(*Candidate))
    ++Candidate;

  return Candidate != Last;
}

/// \brief Tells whether \p Group covers \p OtherGroup.
///
/// This is the case when \p Group has at least as many sequences as
/// \p OtherGroup and every sequence in \p Group contains at least one sequence
/// of \p OtherGroup.
bool containsGroup(CloneDetector::CloneGroup &Group,
                   CloneDetector::CloneGroup &OtherGroup) {
  // With fewer sequences in the current group than in the other one, the
  // requirement for returning true can never be fulfilled. This shortcut is
  // only possible because we know that a sequence in Group can contain at
  // most one sequence in OtherGroup.
  const bool HasEnoughSequences =
      Group.Sequences.size() >= OtherGroup.Sequences.size();
  if (!HasEnoughSequences)
    return false;

  // A single sequence that contains nothing from OtherGroup is enough to
  // reject the whole group.
  for (auto I = Group.Sequences.begin(), E = Group.Sequences.end(); I != E;
       ++I) {
    if (!containsAnyInGroup(*I, OtherGroup))
      return false;
  }
  return true;
}
} // end anonymous namespace

/// Appends all clone groups that are worth reporting to \p Result.
///
/// A group is appended if it is valid and its complexity is at least
/// \p MinGroupComplexity. Afterwards every group in \p Result that is
/// contained by another group in \p Result is removed again, so that only the
/// biggest groups remain.
///
/// \param Result The vector that receives the clone groups.
/// \param MinGroupComplexity The minimum complexity a group needs to have.
void CloneDetector::findClones(std::vector<CloneGroup> &Result,
                               unsigned MinGroupComplexity) {
  // Add every valid clone group that fulfills the complexity requirement.
  for (const CloneGroup &Group : CloneGroups) {
    if (Group.isValid() && Group.Complexity >= MinGroupComplexity)
      Result.push_back(Group);
  }

  std::vector<unsigned> IndexesToRemove;

  // Compare every group in the result with the rest. If one group contains
  // another group, we only need to return the bigger group.
  // Note: This doesn't scale well, so if possible avoid calling any heavy
  // function from this loop to minimize the performance impact.
  for (unsigned i = 0; i < Result.size(); ++i) {
    for (unsigned j = 0; j < Result.size(); ++j) {
      // Don't compare a group with itself.
      if (i == j)
        continue;

      if (!containsGroup(Result[j], Result[i]))
        continue;

      // Two groups can contain each other, for example when their sequences
      // cover identical source ranges. Removing both of them would drop the
      // clone from the result entirely, so the group with the lower index
      // survives such a pair. The second check only runs after a containment
      // has already been found.
      if (i < j && containsGroup(Result[i], Result[j]))
        continue;

      IndexesToRemove.push_back(i);
      break;
    }
  }

  // Erasing a list of indexes from the vector should be done with decreasing
  // indexes. As IndexesToRemove is constructed with increasing values, we just
  // reverse iterate over it to get the desired order.
  for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I)
    Result.erase(Result.begin() + *I);
}
1 change: 1 addition & 0 deletions clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ add_clang_library(clangStaticAnalyzerCheckers
CheckerDocumentation.cpp
ChrootChecker.cpp
ClangCheckers.cpp
CloneChecker.cpp
CXXSelfAssignmentChecker.cpp
DeadStoresChecker.cpp
DebugCheckers.cpp
Expand Down
96 changes: 96 additions & 0 deletions clang/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//===--- CloneChecker.cpp - Clone detection checker -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// CloneChecker is a checker that reports clones in the current translation
/// unit.
///
//===----------------------------------------------------------------------===//

#include "ClangSACheckers.h"
#include "clang/Analysis/CloneDetection.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"

using namespace clang;
using namespace ento;

namespace {
/// Checker that passes every analyzed code body to a CloneDetector and reports
/// the clone groups found by it at the end of the translation unit.
class CloneChecker
    : public Checker<check::ASTCodeBody, check::EndOfTranslationUnit> {
  // Deliberately not named 'CloneDetector': a member that reuses the name of
  // its own type changes the meaning of that name inside the class, which is
  // rejected by GCC. The member is mutable because the checker callbacks are
  // const but need to add data to the detector.
  mutable CloneDetector Detector;

public:
  /// Collects the statements in the body of \p D for the clone search.
  void checkASTCodeBody(const Decl *D, AnalysisManager &Mgr,
                        BugReporter &BR) const;

  /// Reports all clone groups that were found in the translation unit.
  void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
                                 AnalysisManager &Mgr, BugReporter &BR) const;
};
} // end anonymous namespace

void CloneChecker::checkASTCodeBody(const Decl *D, AnalysisManager &Mgr,
                                    BugReporter &BR) const {
  // Every statement that should be included in the search for clones needs to
  // be passed to the CloneDetector.
  Detector.analyzeCodeBody(D);
}

void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
                                             AnalysisManager &Mgr,
                                             BugReporter &BR) const {
  // At this point, every statement in the translation unit has been analyzed by
  // the CloneDetector. The only thing left to do is to report the found clones.

  // Groups below this complexity are not reported. The value is read from the
  // 'MinimumCloneComplexity' checker option and defaults to 10.
  int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
      "MinimumCloneComplexity", 10, this);

  assert(MinComplexity >= 0);

  SourceManager &SM = BR.getSourceManager();

  std::vector<CloneDetector::CloneGroup> CloneGroups;
  Detector.findClones(CloneGroups, MinComplexity);

  DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();

  unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning,
                                               "Detected code clone.");

  unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
                                               "Related code clone is here.");

  for (CloneDetector::CloneGroup &Group : CloneGroups) {
    // For readability reasons we sort the clones by their position in the
    // translation unit: first by start location and, for equal start
    // locations, by end location. Requiring both the start and the end to be
    // earlier at the same time would not be a strict weak ordering (a sequence
    // that surrounds two ordered sequences would compare equivalent to both),
    // which std::sort does not allow.
    std::sort(Group.Sequences.begin(), Group.Sequences.end(),
              [&SM](const StmtSequence &LHS, const StmtSequence &RHS) {
                const SourceLocation LHSStart = LHS.getStartLoc();
                const SourceLocation RHSStart = RHS.getStartLoc();
                if (LHSStart != RHSStart)
                  return SM.isBeforeInTranslationUnit(LHSStart, RHSStart);
                return SM.isBeforeInTranslationUnit(LHS.getEndLoc(),
                                                    RHS.getEndLoc());
              });

    // We group the clones by printing the first as a warning and all others
    // as a note.
    DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID);
    for (unsigned i = 1; i < Group.Sequences.size(); ++i)
      DiagEngine.Report(Group.Sequences[i].getStartLoc(), NoteID);
  }
}

//===----------------------------------------------------------------------===//
// Register CloneChecker
//===----------------------------------------------------------------------===//

/// Registers the CloneChecker with the given CheckerManager.
void ento::registerCloneChecker(CheckerManager &Mgr) {
  // The created checker instance is not needed here.
  Mgr.registerChecker<CloneChecker>();
}
19 changes: 19 additions & 0 deletions clang/test/Analysis/copypaste/blocks.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// RUN: %clang_cc1 -analyze -fblocks -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s

// This tests if we search for clones in blocks.

// Declared only so that the block bodies below can contain a call.
void log();

// BlockA and BlockB have identical bodies. The first one in the file is
// reported with the warning, the second one with the note.
auto BlockA = ^(int a, int b){ // expected-warning{{Detected code clone.}}
log();
if (a > b)
return a;
return b;
};

auto BlockB = ^(int a, int b){ // expected-note{{Related code clone is here.}}
log();
if (a > b)
return a;
return b;
};
29 changes: 29 additions & 0 deletions clang/test/Analysis/copypaste/false-positives.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s

// This test contains false-positive reports from the CloneChecker that need to
// be fixed.

// Declared only so that the function bodies below can contain a call.
void log();

// Reference function. The two functions below compute something different but
// are currently still reported as clones of this one.
int max(int a, int b) { // expected-warning{{Detected code clone.}}
log();
if (a > b)
return a;
return b;
}

// FIXME: Detect different binary operator kinds.
// Differs from max() only in the comparison operator ('<' instead of '>').
int min1(int a, int b) { // expected-note{{Related code clone is here.}}
log();
if (a < b)
return a;
return b;
}

// FIXME: Detect different variable patterns.
// Differs from max() only in the order of the compared variables.
int min2(int a, int b) { // expected-note{{Related code clone is here.}}
log();
if (b > a)
return a;
return b;
}
25 changes: 25 additions & 0 deletions clang/test/Analysis/copypaste/functions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s

// This tests if we search for clones in functions.

// Declared only so that the function bodies below can contain a call.
void log();

// max() and maxClone() differ only in the names of their parameters, so they
// are reported as clones of each other.
int max(int a, int b) { // expected-warning{{Detected code clone.}}
log();
if (a > b)
return a;
return b;
}

int maxClone(int x, int y) { // expected-note{{Related code clone is here.}}
log();
if (x > y)
return x;
return y;
}

// Functions below are not clones and should not be reported.

int foo(int a, int b) { // no-warning
return a + b;
}
27 changes: 27 additions & 0 deletions clang/test/Analysis/copypaste/objc-methods.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// RUN: %clang_cc1 -analyze -Wno-objc-root-class -analyzer-checker=alpha.clone.CloneChecker -verify %s

// This tests if we search for clones in Objective-C methods.

// A and B are unrelated root classes that declare the same method.
@interface A
- (int) setOk : (int) a : (int) b;
@end

// The method bodies in A and B are identical. The first one in the file is
// reported with the warning, the second one with the note.
@implementation A
- (int) setOk : (int) a : (int) b { // expected-warning{{Detected code clone.}}
if (a > b)
return a;
return b;
}
@end

@interface B
- (int) setOk : (int) a : (int) b;
@end

@implementation B
- (int) setOk : (int) a : (int) b { // expected-note{{Related code clone is here.}}
if (a > b)
return a;
return b;
}
@end
27 changes: 27 additions & 0 deletions clang/test/Analysis/copypaste/sub-sequences.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s

// This tests if sub-sequences can match with normal sequences.

// Declared only so that the function bodies below can contain calls.
void log2(int a);
void log();

// The first statement of max() has no counterpart in maxClone(). The
// remaining three statements match the statements of maxClone(), which is why
// the diagnostics are expected on the log() calls and not on the function
// headers.
int max(int a, int b) {
log2(a);
log(); // expected-warning{{Detected code clone.}}
if (a > b)
return a;
return b;
}

int maxClone(int a, int b) {
log(); // expected-note{{Related code clone is here.}}
if (a > b)
return a;
return b;
}

// Functions below are not clones and should not be reported.

int foo(int a, int b) { // no-warning
return a + b;
}