/
IRSimilarityIdentifier.h
367 lines (336 loc) · 15 KB
/
IRSimilarityIdentifier.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// Interface file for the IRSimilarityIdentifier for identifying similarities in
// IR including the IRInstructionMapper, which maps an Instruction to unsigned
// integers.
//
// Two sequences of instructions are called "similar" if they perform the same
// series of operations for all inputs.
//
// \code
// %1 = add i32 %a, 10
// %2 = add i32 %a, %1
// %3 = icmp slt icmp %1, %2
// \endcode
//
// and
//
// \code
// %1 = add i32 11, %a
// %2 = sub i32 %a, %1
// %3 = icmp sgt icmp %2, %1
// \endcode
//
// ultimately have the same result, even if the inputs, and structure are
// slightly different.
//
// For instructions, we do not worry about operands that do not have fixed
// semantic meaning to the program. We consider the opcode that the instruction
// has, the types, parameters, and extra information such as the function name,
// or comparison predicate. These are used to create a hash to map instructions
// to integers to be used in similarity matching in sequences of instructions
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Allocator.h"
namespace llvm {
namespace IRSimilarity {
/// This represents what is and is not supported when finding similarity in
/// Instructions.
///
/// Legal Instructions are considered when looking at similarity between
/// Instructions.
///
/// Illegal Instructions cannot be considered when looking for similarity
/// between Instructions. They act as boundaries between similarity regions.
///
/// Invisible Instructions are skipped over during analysis.
// TODO: Shared with MachineOutliner
enum InstrType { Legal, Illegal, Invisible };
/// This provides the utilities for hashing an Instruction to an unsigned
/// integer. Two IRInstructionDatas produce the same hash value when their
/// underlying Instructions perform the same operation (even if they don't have
/// the same input operands.)
/// As a more concrete example, consider the following:
///
/// \code
/// %add1 = add i32 %a, %b
/// %add2 = add i32 %c, %d
/// %add3 = add i64 %e, %f
/// \endcode
///
// Then the IRInstructionData wrappers for these Instructions may be hashed like
/// so:
///
/// \code
/// ; These two adds have the same types and operand types, so they hash to the
/// ; same number.
/// %add1 = add i32 %a, %b ; Hash: 1
/// %add2 = add i32 %c, %d ; Hash: 1
/// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
/// ; it hashes to a different number.
/// %add3 = add i64 %e, %f; Hash: 2
/// \endcode
///
///
/// This hashing scheme will be used to represent the program as a very long
/// string. This string can then be placed in a data structure which can be used
/// for similarity queries.
///
/// TODO: Handle types of Instructions which can be equal even with different
/// operands. (E.g. comparisons with swapped predicates.)
/// TODO: Handle CallInsts, which are only checked for function type
/// by \ref isSameOperationAs.
/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
/// exact same, and some do not.
struct IRInstructionData : ilist_node<IRInstructionData> {
/// The source Instruction that is being wrapped.
Instruction *Inst = nullptr;
/// The values of the operands in the Instruction.
SmallVector<Value *, 4> OperVals;
/// The legality of the wrapped instruction. This is informed by InstrType,
/// and is used when checking when two instructions are considered similar.
/// If either instruction is not legal, the instructions are automatically not
/// considered similar.
bool Legal;
/// Gather the information that is difficult to gather for an Instruction, or
/// is changed. i.e. the operands of an Instruction and the Types of those
/// operands. This extra information allows for similarity matching to make
/// assertions that allow for more flexibility when checking for whether an
/// Instruction performs the same operation.
IRInstructionData(Instruction &I, bool Legality);
/// Hashes \p Value based on its opcode, types, and operand types.
/// Two IRInstructionData instances produce the same hash when they perform
/// the same operation.
///
/// As a simple example, consider the following instructions.
///
/// \code
/// %add1 = add i32 %x1, %y1
/// %add2 = add i32 %x2, %y2
///
/// %sub = sub i32 %x1, %y1
///
/// %add_i64 = add i64 %x2, %y2
/// \endcode
///
/// Because the first two adds operate the same types, and are performing the
/// same action, they will be hashed to the same value.
///
/// However, the subtraction instruction is not the same as an addition, and
/// will be hashed to a different value.
///
/// Finally, the last add has a different type compared to the first two add
/// instructions, so it will also be hashed to a different value that any of
/// the previous instructions.
///
/// \param [in] Value - The IRInstructionData instance to be hashed.
/// \returns A hash_value of the IRInstructionData.
friend hash_code hash_value(const IRInstructionData &ID) {
SmallVector<Type *, 4> OperTypes;
for (Value *V : ID.OperVals)
OperTypes.push_back(V->getType());
return llvm::hash_combine(
llvm::hash_value(ID.Inst->getOpcode()),
llvm::hash_value(ID.Inst->getType()),
llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
}
};
/// Compare one IRInstructionData class to another IRInstructionData class for
/// whether they are performing a the same operation, and can mapped to the
/// same value. For regular instructions if the hash value is the same, then
/// they will also be close.
///
/// \param A - The first IRInstructionData class to compare
/// \param B - The second IRInstructionData class to compare
/// \returns true if \p A and \p B are similar enough to be mapped to the same
/// value.
bool isClose(const IRInstructionData &A, const IRInstructionData &B);
struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
static inline IRInstructionData *getEmptyKey() { return nullptr; }
static inline IRInstructionData *getTombstoneKey() {
return reinterpret_cast<IRInstructionData *>(-1);
}
static unsigned getHashValue(const IRInstructionData *E) {
using llvm::hash_value;
assert(E && "IRInstructionData is a nullptr?");
return hash_value(*E);
}
static bool isEqual(const IRInstructionData *LHS,
const IRInstructionData *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
LHS == getEmptyKey() || LHS == getTombstoneKey())
return LHS == RHS;
assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
return isClose(*LHS, *RHS);
}
};
/// Helper struct for converting the Instructions in a Module into a vector of
/// unsigned integers. This vector of unsigned integers can be thought of as a
/// "numeric string". This numeric string can then be queried by, for example,
/// data structures that find repeated substrings.
///
/// This hashing is done per BasicBlock in the module. To hash Instructions
/// based off of their operations, each Instruction is wrapped in an
/// IRInstructionData struct. The unsigned integer for an IRInstructionData
/// depends on:
/// - The hash provided by the IRInstructionData.
/// - Which member of InstrType the IRInstructionData is classified as.
// See InstrType for more details on the possible classifications, and how they
// manifest in the numeric string.
///
/// The numeric string for an individual BasicBlock is terminated by an unique
/// unsigned integer. This prevents data structures which rely on repetition
/// from matching across BasicBlocks. (For example, the SuffixTree.)
/// As a concrete example, if we have the following two BasicBlocks:
/// \code
/// bb0:
/// %add1 = add i32 %a, %b
/// %add2 = add i32 %c, %d
/// %add3 = add i64 %e, %f
/// bb1:
/// %sub = sub i32 %c, %d
/// \endcode
/// We may hash the Instructions like this (via IRInstructionData):
/// \code
/// bb0:
/// %add1 = add i32 %a, %b ; Hash: 1
/// %add2 = add i32 %c, %d; Hash: 1
/// %add3 = add i64 %e, %f; Hash: 2
/// bb1:
/// %sub = sub i32 %c, %d; Hash: 3
/// %add4 = add i32 %c, %d ; Hash: 1
/// \endcode
/// And produce a "numeric string representation" like so:
/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
///
/// TODO: This is very similar to the MachineOutliner, and should be
/// consolidated into the same interface.
struct IRInstructionMapper {
/// The starting illegal instruction number to map to.
///
/// Set to -3 for compatibility with DenseMapInfo<unsigned>.
unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
/// The next available integer to assign to a legal Instruction to.
unsigned LegalInstrNumber = 0;
/// Correspondence from IRInstructionData to unsigned integers.
DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
InstructionIntegerMap;
/// Set if we added an illegal number in the previous step.
/// Since each illegal number is unique, we only need one of them between
/// each range of legal numbers. This lets us make sure we don't add more
/// than one illegal number per range.
bool AddedIllegalLastTime = false;
/// Marks whether we found a illegal instruction in the previous step.
bool CanCombineWithPrevInstr = false;
/// Marks whether we have found a set of instructions that is long enough
/// to be considered for similarity.
bool HaveLegalRange = false;
/// This allocator pointer is in charge of holding on to the IRInstructionData
/// so it is not deallocated until whatever external tool is using it is done
/// with the information.
SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
/// Get an allocated IRInstructionData struct using the InstDataAllocator.
///
/// \param I - The Instruction to wrap with IRInstructionData.
/// \param Legality - A boolean value that is true if the instruction is to
/// be considered for similarity, and false if not.
/// \returns An allocated IRInstructionData struct.
IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality);
/// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
/// determined by \p InstrType. Two Instructions are mapped to the same value
/// if they are close as defined by the InstructionData class above.
///
/// \param [in] BB - The BasicBlock to be mapped to integers.
/// \param [in,out] InstrList - Vector of IRInstructionData to append to.
/// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
void convertToUnsignedVec(BasicBlock &BB,
std::vector<IRInstructionData *> &InstrList,
std::vector<unsigned> &IntegerMapping);
/// Maps an Instruction to a legal integer.
///
/// \param [in] It - The Instruction to be mapped to an integer.
/// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
/// append to.
/// \param [in,out] InstrList - Vector of InstructionData to append
/// to. \returns The integer \p It was mapped to.
unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
std::vector<unsigned> &IntegerMappingForBB,
std::vector<IRInstructionData *> &InstrListForBB);
/// Maps an Instruction to an illegal integer.
///
/// \param [in] It - The \p Instruction to be mapped to an integer.
/// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
/// append to.
/// \param [in,out] InstrList - Vector of IRInstructionData to append to.
/// \param End - true if creating a dummy IRInstructionData at the end of a
/// basic block.
/// \returns The integer \p It was mapped to.
unsigned mapToIllegalUnsigned(
BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA)
: InstDataAllocator(IDA) {
// Make sure that the implementation of DenseMapInfo<unsigned> hasn't
// changed.
assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
"DenseMapInfo<unsigned>'s empty key isn't -1!");
assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
static_cast<unsigned>(-2) &&
"DenseMapInfo<unsigned>'s tombstone key isn't -2!");
}
/// Custom InstVisitor to classify different instructions for whether it can
/// be analyzed for similarity.
struct InstructionClassification
: public InstVisitor<InstructionClassification, InstrType> {
InstructionClassification() {}
// TODO: Determine a scheme to resolve when the label is similar enough.
InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
// TODO: Determine a scheme to resolve when the labels are similar enough.
InstrType visitPHINode(PHINode &PN) { return Illegal; }
// TODO: Handle allocas.
InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
// We exclude variable argument instructions since variable arguments
// requires extra checking of the argument list.
InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
// We exclude all exception handling cases since they are so context
// dependent.
InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
// DebugInfo should be included in the regions, but should not be
// analyzed for similarity as it has no bearing on the outcome of the
// program.
InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
// TODO: Handle GetElementPtrInsts
InstrType visitGetElementPtrInst(GetElementPtrInst &GEPI) {
return Illegal;
}
// TODO: Handle specific intrinsics.
InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
// TODO: Handle CallInsts.
InstrType visitCallInst(CallInst &CI) { return Illegal; }
// TODO: We do not current handle similarity that changes the control flow.
InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
// TODO: We do not current handle similarity that changes the control flow.
InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
// TODO: Handle interblock similarity.
InstrType visitTerminator(Instruction &I) { return Illegal; }
InstrType visitInstruction(Instruction &I) { return Legal; }
};
/// Maps an Instruction to a member of InstrType.
InstructionClassification InstClassifier;
};
} // end namespace IRSimilarity
} // end namespace llvm
#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H