Skip to content

Commit

Permalink
[LV] For some IVs, use vector phis instead of widening in the loop body
Browse files Browse the repository at this point in the history
Previously, whenever we needed a vector IV, we would create it on the fly,
by splatting the scalar IV and adding a step vector. Instead, we can create a
real vector IV. This tends to save a couple of instructions per iteration.

This only changes the behavior for the most basic case - integer primary
IVs with a constant step.

Differential Revision: http://reviews.llvm.org/D20315

llvm-svn: 271410
  • Loading branch information
mkuperst committed Jun 1, 2016
1 parent 2c3933f commit 3a3c64d
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 40 deletions.
96 changes: 76 additions & 20 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,14 @@ class InnerLoopVectorizer {
/// from SCEV or creates a new using SCEVExpander.
virtual Value *getStepVector(Value *Val, int StartIdx, const SCEV *Step);

/// Create a vector induction variable based on an existing scalar one.
/// Currently only works for integer primary induction variables with
/// a constant step.
///
/// \param II Descriptor of the scalar induction; its start value and its
/// constant integer step are read from it.
/// \param Entry Receives one vector value per unroll part.
/// \param TruncType If provided, instead of widening the original IV, we
/// widen a version of the IV truncated to TruncType.
void widenInductionVariable(const InductionDescriptor &II, VectorParts &Entry,
IntegerType *TruncType = nullptr);

/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
/// When we widen (vectorize) values we place them in the map. If the values
Expand Down Expand Up @@ -2099,6 +2107,40 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
return getStepVector(Val, StartIdx, StepValue);
}

/// Materialize a real vector induction variable for the scalar induction
/// described by \p II, instead of re-deriving it from the scalar IV on every
/// iteration.
///
/// The initial vector (Start, Start + Step, ...) is built in the vector loop
/// preheader. A "vec.ind" PHI is created at the first insertion point of the
/// vector loop body, and \p Entry is filled with one value per unroll part.
/// When \p TruncType is non-null, the start value and the step are first
/// narrowed to that type, so the resulting vector IV has TruncType elements.
///
/// The step must be a compile-time constant integer (asserted).
void InnerLoopVectorizer::widenInductionVariable(const InductionDescriptor &II,
                                                 VectorParts &Entry,
                                                 IntegerType *TruncType) {
  Value *Start = II.getStartValue();
  ConstantInt *Step = II.getConstIntStepValue();
  assert(Step && "Can not widen an IV with a non-constant step");

  // Construct the initial value of the vector IV in the vector loop preheader.
  // The builder's insertion point is saved and restored so that the caller
  // keeps emitting code where it left off.
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (TruncType) {
    Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
  Builder.restoreIP(CurrIP);

  // Every unroll part covers VF consecutive scalar iterations, so each lane
  // has to advance by VF * Step between parts (and not merely by VF, which is
  // only right for a unit step).
  Value *SplatVF = ConstantVector::getSplat(
      VF, ConstantInt::getSigned(Start->getType(), VF * Step->getSExtValue()));

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  Value *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Part 0 is the PHI itself; each later part is the previous one advanced.
    Entry[Part] = LastInduction;
    LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add");
  }

  // Incoming values: the stepped start from the preheader, and the value
  // computed after the final unroll part along the loop back-edge.
  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorBody);
}

Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Value *Step) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
Expand Down Expand Up @@ -4056,19 +4098,25 @@ void InnerLoopVectorizer::widenPHIInstruction(
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction: {
assert(P->getType() == II.getStartValue()->getType() && "Types must match");
// Handle other induction variables that are now based on the
// canonical one.
Value *V = Induction;
if (P != OldInduction) {
V = Builder.CreateSExtOrTrunc(Induction, P->getType());
V = II.transform(Builder, V, PSE.getSE(), DL);
V->setName("offset.idx");
if (P != OldInduction || VF == 1) {
Value *V = Induction;
// Handle other induction variables that are now based on the
// canonical one.
if (P != OldInduction) {
V = Builder.CreateSExtOrTrunc(Induction, P->getType());
V = II.transform(Builder, V, PSE.getSE(), DL);
V->setName("offset.idx");
}
Value *Broadcasted = getBroadcastInstrs(V);
// After broadcasting the induction variable we need to make the vector
// consecutive by adding 0, 1, 2, etc.
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
} else {
// Instead of re-creating the vector IV by splatting the scalar IV
// in each iteration, we can make a new independent vector IV.
widenInductionVariable(II, Entry);
}
Value *Broadcasted = getBroadcastInstrs(V);
// After broadcasting the induction variable we need to make the vector
// consecutive by adding 0, 1, 2, etc.
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
return;
}
case InductionDescriptor::IK_PtrInduction:
Expand Down Expand Up @@ -4239,15 +4287,23 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
if (CI->getOperand(0) == OldInduction &&
it->getOpcode() == Instruction::Trunc) {
InductionDescriptor II =
Legal->getInductionVars()->lookup(OldInduction);
Legal->getInductionVars()->lookup(OldInduction);
if (auto StepValue = II.getConstIntStepValue()) {
StepValue = ConstantInt::getSigned(cast<IntegerType>(CI->getType()),
StepValue->getSExtValue());
Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
CI->getType());
Value *Broadcasted = getBroadcastInstrs(ScalarCast);
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
IntegerType *TruncType = cast<IntegerType>(CI->getType());
if (VF == 1) {
StepValue =
ConstantInt::getSigned(TruncType, StepValue->getSExtValue());
Value *ScalarCast =
Builder.CreateCast(CI->getOpcode(), Induction, CI->getType());
Value *Broadcasted = getBroadcastInstrs(ScalarCast);
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
} else {
// Truncating a vector induction variable on each iteration
// may be expensive. Instead, truncate the initial value, and create
// a new, truncated, vector IV based on that.
widenInductionVariable(II, Entry, TruncType);
}
addMetadata(Entry, &*it);
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ for.end12: ; preds = %for.end, %entry

; CHECK-LABEL: @s173
; CHECK: load <4 x float>, <4 x float>*
; CHECK: add i64 %index, 16000
; CHECK: add nsw i64 %.lhs, 16000
; CHECK: ret i32 0
}

Expand Down
6 changes: 3 additions & 3 deletions llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ for.end: ; preds = %for.cond
%struct.In = type { float, float }

;AVX512-LABEL: @foo2
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void
Expand Down Expand Up @@ -170,10 +170,10 @@ for.end: ; preds = %for.cond
;}

;AVX512-LABEL: @foo3
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopVectorize/cast-induction.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.8.0"
@a = common global [2048 x i32] zeroinitializer, align 16

;CHECK-LABEL: @example12(
;CHECK: trunc i64
;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/LoopVectorize/gcc-examples.ll
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ define void @example11() nounwind uwtable ssp {
}

;CHECK-LABEL: @example12(
;CHECK: trunc i64
;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
Expand Down
11 changes: 6 additions & 5 deletions llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; CHECK-LABEL: @foo
; CHECK: vector.body
; CHECK: %0 = getelementptr inbounds double*, double** %in, i64 %index
; CHECK: %1 = bitcast double** %0 to <4 x i64>*
; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %1, align 8
; CHECK: %2 = icmp eq <4 x i64> %wide.load, zeroinitializer
; CHECK: %0 = phi
; CHECK: %2 = getelementptr inbounds double*, double** %in, i64 %0
; CHECK: %3 = bitcast double** %2 to <4 x i64>*
; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %3, align 8
; CHECK: %4 = icmp eq <4 x i64> %wide.load, zeroinitializer
; CHECK: br i1

define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {
Expand All @@ -37,4 +38,4 @@ for.body:

for.end:
ret void
}
}
8 changes: 4 additions & 4 deletions llvm/test/Transforms/LoopVectorize/global_alias.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
@PA = external global i32*


;; === First, the tests that should always vectorize, wither statically or by adding run-time checks ===
;; === First, the tests that should always vectorize, whether statically or by adding run-time checks ===


; /// Different objects, positive induction, constant distance
Expand Down Expand Up @@ -387,7 +387,7 @@ for.end: ; preds = %for.cond
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias08(
; CHECK: sub <4 x i32>
; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret

define i32 @noAlias08(i32 %a) #0 {
Expand Down Expand Up @@ -439,7 +439,7 @@ for.end: ; preds = %for.cond
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias09(
; CHECK: sub <4 x i32>
; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret

define i32 @noAlias09(i32 %a) #0 {
Expand Down Expand Up @@ -721,7 +721,7 @@ for.end: ; preds = %for.cond
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias14(
; CHECK: sub <4 x i32>
; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret

define i32 @noAlias14(i32 %a) #0 {
Expand Down
65 changes: 63 additions & 2 deletions llvm/test/Transforms/LoopVectorize/induction.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

Expand Down Expand Up @@ -27,8 +29,6 @@ for.end:
ret void
}

; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND

; Make sure we remove unneeded vectorization of induction variables.
; In order for instcombine to cleanup the vectorized induction variables that we
; create in the loop vectorizer we need to perform some form of redundancy
Expand Down Expand Up @@ -241,3 +241,64 @@ entry:
exit:
ret void
}

; Check that we generate vectorized IVs in the pre-header
; instead of widening the scalar IV inside the loop, when
; we know how to do that.
; IND-LABEL: veciv
; IND: vector.body:
; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add, %vector.body ]
; IND: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
; IND: %index.next = add i32 %index, 2
; IND: %[[CMP:.*]] = icmp eq i32 %index.next
; IND: br i1 %[[CMP]]
; UNROLL-LABEL: veciv
; UNROLL: vector.body:
; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add1, %vector.body ]
; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
; UNROLL: %step.add1 = add <2 x i32> %vec.ind, <i32 4, i32 4>
; UNROLL: %index.next = add i32 %index, 4
; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
; UNROLL: br i1 %[[CMP]]
; Counted loop over an i32 induction variable that starts at 0 and is
; incremented by 1 until it equals %k. Each iteration stores the induction
; value itself into a[iv], so the vectorizer needs the IV as a vector value.
; NOTE(review) - %start is not referenced anywhere in the body.
define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
for.body.preheader:
br label %for.body

for.body:
%indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
; The stored value is the induction variable itself.
store i32 %indvars.iv, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
%exitcond = icmp eq i32 %indvars.iv.next, %k
br i1 %exitcond, label %exit, label %for.body

exit:
ret void
}

; IND-LABEL: trunciv
; IND: vector.body:
; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ]
; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2>
; IND: %index.next = add i64 %index, 2
; IND: %[[CMP:.*]] = icmp eq i64 %index.next
; IND: br i1 %[[CMP]]
; Counted loop over an i64 induction variable that starts at 0 and is
; incremented by 1 until it equals %k. The IV is truncated to i32, and the
; truncated value is used both as the array index and as the stored value.
; NOTE(review) - %start is not referenced anywhere in the body.
define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
for.body.preheader:
br label %for.body

for.body:
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
; Narrow the 64-bit induction variable to 32 bits.
%trunc.iv = trunc i64 %indvars.iv to i32
%arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
store i32 %trunc.iv, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %k
br i1 %exitcond, label %exit, label %for.body

exit:
ret void
}
9 changes: 6 additions & 3 deletions llvm/test/Transforms/LoopVectorize/induction_plus.ll
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

@array = common global [1024 x i32] zeroinitializer, align 16

;CHECK-LABEL: @array_at_plus_one(
;CHECK: add i64 %index, 12
;CHECK: trunc i64
;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %step.add, %vector.body ]
;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %step.add2, %vector.body ]
;CHECK: add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
;CHECK: add nsw <4 x i64> %vec.ind, <i64 12, i64 12, i64 12, i64 12>
;CHECK: ret i32
define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
Expand Down

0 comments on commit 3a3c64d

Please sign in to comment.