Skip to content

Commit

Permalink
[LV] Don't emit unused scalars for uniform instructions
Browse files Browse the repository at this point in the history
If we identify an instruction as uniform after vectorization, we know that we
should only use the value corresponding to the first vector lane of each unroll
iteration. However, when scalarizing such instructions, we still produce values
for the other vector lanes. This patch prevents us from generating the unused
scalars.

Differential Revision: https://reviews.llvm.org/D24275

llvm-svn: 282087
  • Loading branch information
mssimpso committed Sep 21, 2016
1 parent 2e217b8 commit 15869f8
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 52 deletions.
72 changes: 58 additions & 14 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Expand Up @@ -2281,11 +2281,28 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
"Val and Step should have the same integer type");

auto scalarUserIsUniform = [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return !OrigLoop->contains(I) || !Legal->isScalarAfterVectorization(I) ||
Legal->isUniformAfterVectorization(I);
};

// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform or all its scalar users are uniform, we
// only need to generate the first lane. Otherwise, we generate all VF
// values. We are essentially determining if the induction variable has no
// "multi-scalar" (non-uniform scalar) users.
unsigned Lanes =
Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ||
all_of(EntryVal->users(), scalarUserIsUniform)
? 1
: VF;

// Compute the scalar steps and save the results in VectorLoopValueMap.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < VF; ++Lane) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
auto *Mul = Builder.CreateMul(StartIdx, Step);
auto *Add = Builder.CreateAdd(ScalarIV, Mul);
Expand Down Expand Up @@ -2332,6 +2349,9 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// Initialize a new vector map entry.
VectorParts Entry(UF);

// If we've scalarized a value, that value should be an instruction.
auto *I = cast<Instruction>(V);

// If we aren't vectorizing, we can just copy the scalar map values over to
// the vector map.
if (VF == 1) {
Expand All @@ -2340,25 +2360,37 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
return VectorLoopValueMap.initVector(V, Entry);
}

// Get the last scalarized instruction. This corresponds to the instruction
// we created for the last vector lane on the last unroll iteration.
auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, VF - 1));
// Get the last scalar instruction we generated for V. If the value is
// known to be uniform after vectorization, this corresponds to lane zero
// of the last unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the last unroll iteration.
unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));

// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
auto OldIP = Builder.saveIP();
auto NewIP = std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);

// However, if we are vectorizing, we need to construct the vector values
// using insertelement instructions. Since the resulting vectors are stored
// in VectorLoopValueMap, we will only generate the insertelements once.
// However, if we are vectorizing, we need to construct the vector values.
// If the value is known to be uniform after vectorization, we can just
// broadcast the scalar value corresponding to lane zero for each unroll
// iteration. Otherwise, we construct the vector values using insertelement
// instructions. Since the resulting vectors are stored in
// VectorLoopValueMap, we will only generate the insertelements once.
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Insert = UndefValue::get(VectorType::get(V->getType(), VF));
for (unsigned Lane = 0; Lane < VF; ++Lane)
Insert = Builder.CreateInsertElement(
Insert, getScalarValue(V, Part, Lane), Builder.getInt32(Lane));
Entry[Part] = Insert;
Value *VectorValue = nullptr;
if (Legal->isUniformAfterVectorization(I)) {
VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
} else {
VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
for (unsigned Lane = 0; Lane < VF; ++Lane)
VectorValue = Builder.CreateInsertElement(
VectorValue, getScalarValue(V, Part, Lane),
Builder.getInt32(Lane));
}
Entry[Part] = VectorValue;
}
Builder.restoreIP(OldIP);
return VectorLoopValueMap.initVector(V, Entry);
Expand All @@ -2378,6 +2410,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
if (OrigLoop->isLoopInvariant(V))
return V;

assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
: true && "Uniform values only have lane zero");

// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
Expand Down Expand Up @@ -2884,11 +2919,16 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
if (IfPredicateInstr)
Cond = createBlockInMask(Instr->getParent());

// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;

// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
// For each scalar that we create:
for (unsigned Lane = 0; Lane < VF; ++Lane) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {

// Start if-block.
Value *Cmp = nullptr;
Expand Down Expand Up @@ -4398,12 +4438,16 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// This is the normalized GEP that starts counting at zero.
Value *PtrInd = Induction;
PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < VF; ++Lane) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
Expand Down
8 changes: 0 additions & 8 deletions llvm/test/Transforms/LoopVectorize/induction.ll
Expand Up @@ -78,21 +78,15 @@ loopexit:
; CHECK: vector.body:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %[[i0:.+]] = add i64 %index, 0
; CHECK: %[[i1:.+]] = add i64 %index, 1
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
;
; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; UNROLL-NO-IC: %[[i0:.+]] = add i64 %index, 0
; UNROLL-NO-IC: %[[i1:.+]] = add i64 %index, 1
; UNROLL-NO-IC: %[[i2:.+]] = add i64 %index, 2
; UNROLL-NO-IC: %[[i3:.+]] = add i64 %index, 3
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i2]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i3]]
;
; IND-LABEL: @scalarize_induction_variable_01(
; IND: vector.body:
Expand Down Expand Up @@ -611,9 +605,7 @@ exit:
; CHECK: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
; CHECK: %offset.idx = add i32 %i, %index
; CHECK: %[[A1:.*]] = add i32 %offset.idx, 0
; CHECK: %[[A2:.*]] = add i32 %offset.idx, 1
; CHECK: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
; CHECK: %[[G2:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A2]]
; CHECK: %[[G3:.*]] = getelementptr i32, i32* %[[G1]], i32 0
; CHECK: %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
; CHECK: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
Expand Down
30 changes: 0 additions & 30 deletions llvm/test/Transforms/LoopVectorize/reverse_induction.ll
Expand Up @@ -8,13 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 %startval, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7

define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
entry:
Expand All @@ -40,13 +34,7 @@ loopend:
; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i128 %startval, %index
; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7

define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
entry:
Expand All @@ -72,13 +60,7 @@ loopend:
; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i16 %startval, {{.*}}
; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7

define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
entry:
Expand Down Expand Up @@ -121,13 +103,7 @@ loopend:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 1023, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7

define void @reverse_forward_induction_i64_i8() {
entry:
Expand All @@ -153,13 +129,7 @@ while.end:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 1023, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7

define void @reverse_forward_induction_i64_i8_signed() {
entry:
Expand Down

0 comments on commit 15869f8

Please sign in to comment.