[LV] For some IVs, use vector phis instead of widening in the loop body

Previously, whenever we needed a vector IV, we would create it on the fly, by splatting the scalar IV and adding a step vector. Instead, we can create a real vector IV. This tends to save a couple of instructions per iteration. This only changes the behavior for the most basic case - integer primary IVs with a constant step. Differential Revision: http://reviews.llvm.org/D20315 llvm-svn: 271410
llvm · Jun 1, 2016 · 3a3c64d · 3a3c64d
1 parent 2c3933f
commit 3a3c64d
Show file tree

Hide file tree

Showing 9 changed files with 161 additions and 40 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -422,6 +422,14 @@ class InnerLoopVectorizer {
   /// from SCEV or creates a new using SCEVExpander.
   virtual Value *getStepVector(Value *Val, int StartIdx, const SCEV *Step);
 
+  /// Create a vector induction variable based on an existing scalar one.
+  /// Currently only works for integer primary induction variables with
+  /// a constant step.
+  /// If TruncType is provided, instead of widening the original IV, we
+  /// widen a version of the IV truncated to TruncType.
+  void widenInductionVariable(const InductionDescriptor &II, VectorParts &Entry,
+                              IntegerType *TruncType = nullptr);
+
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
   /// When we widen (vectorize) values we place them in the map. If the values
@@ -2099,6 +2107,40 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
   return getStepVector(Val, StartIdx, StepValue);
 }
 
+void InnerLoopVectorizer::widenInductionVariable(const InductionDescriptor &II,
+                                                 VectorParts &Entry,
+                                                 IntegerType *TruncType) {
+  Value *Start = II.getStartValue();
+  ConstantInt *Step = II.getConstIntStepValue();
+  assert(Step && "Can not widen an IV with a non-constant step");
+
+  // Construct the initial value of the vector IV in the vector loop preheader
+  auto CurrIP = Builder.saveIP();
+  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+  if (TruncType) {
+    Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+  }
+  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+  Builder.restoreIP(CurrIP);
+
+  Value *SplatVF =
+      ConstantVector::getSplat(VF, ConstantInt::get(Start->getType(), VF));
+  // We may need to add the step a number of times, depending on the unroll
+  // factor. The last of those goes into the PHI.
+  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+                                    &*LoopVectorBody->getFirstInsertionPt());
+  Value *LastInduction = VecInd;
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Entry[Part] = LastInduction;
+    LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add");
+  }
+
+  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+  VecInd->addIncoming(LastInduction, LoopVectorBody);
+}
+
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
                                           Value *Step) {
   assert(Val->getType()->isVectorTy() && "Must be a vector");
@@ -4056,19 +4098,25 @@ void InnerLoopVectorizer::widenPHIInstruction(
     llvm_unreachable("Unknown induction");
   case InductionDescriptor::IK_IntInduction: {
     assert(P->getType() == II.getStartValue()->getType() && "Types must match");
-    // Handle other induction variables that are now based on the
-    // canonical one.
-    Value *V = Induction;
-    if (P != OldInduction) {
-      V = Builder.CreateSExtOrTrunc(Induction, P->getType());
-      V = II.transform(Builder, V, PSE.getSE(), DL);
-      V->setName("offset.idx");
+    if (P != OldInduction || VF == 1) {
+      Value *V = Induction;
+      // Handle other induction variables that are now based on the
+      // canonical one.
+      if (P != OldInduction) {
+        V = Builder.CreateSExtOrTrunc(Induction, P->getType());
+        V = II.transform(Builder, V, PSE.getSE(), DL);
+        V->setName("offset.idx");
+      }
+      Value *Broadcasted = getBroadcastInstrs(V);
+      // After broadcasting the induction variable we need to make the vector
+      // consecutive by adding 0, 1, 2, etc.
+      for (unsigned part = 0; part < UF; ++part)
+        Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
+    } else {
+      // Instead of re-creating the vector IV by splatting the scalar IV
+      // in each iteration, we can make a new independent vector IV.
+      widenInductionVariable(II, Entry);
     }
-    Value *Broadcasted = getBroadcastInstrs(V);
-    // After broadcasting the induction variable we need to make the vector
-    // consecutive by adding 0, 1, 2, etc.
-    for (unsigned part = 0; part < UF; ++part)
-      Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
     return;
   }
   case InductionDescriptor::IK_PtrInduction:
@@ -4239,15 +4287,23 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       if (CI->getOperand(0) == OldInduction &&
           it->getOpcode() == Instruction::Trunc) {
         InductionDescriptor II =
-          Legal->getInductionVars()->lookup(OldInduction);
+            Legal->getInductionVars()->lookup(OldInduction);
         if (auto StepValue = II.getConstIntStepValue()) {
-          StepValue = ConstantInt::getSigned(cast<IntegerType>(CI->getType()),
-                                             StepValue->getSExtValue());
-          Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
-                                                 CI->getType());
-          Value *Broadcasted = getBroadcastInstrs(ScalarCast);
-          for (unsigned Part = 0; Part < UF; ++Part)
-            Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
+          IntegerType *TruncType = cast<IntegerType>(CI->getType());
+          if (VF == 1) {
+            StepValue =
+                ConstantInt::getSigned(TruncType, StepValue->getSExtValue());
+            Value *ScalarCast =
+                Builder.CreateCast(CI->getOpcode(), Induction, CI->getType());
+            Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+            for (unsigned Part = 0; Part < UF; ++Part)
+              Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
+          } else {
+            // Truncating a vector induction variable on each iteration
+            // may be expensive. Instead, truncate the initial value, and create
+            // a new, truncated, vector IV based on that.
+            widenInductionVariable(II, Entry, TruncType);
+          }
           addMetadata(Entry, &*it);
           break;
         }

diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
@@ -43,7 +43,7 @@ for.end12:                                        ; preds = %for.end, %entry
 
 ; CHECK-LABEL: @s173
 ; CHECK: load <4 x float>, <4 x float>*
-; CHECK: add i64 %index, 16000
+; CHECK: add nsw i64 %.lhs, 16000
 ; CHECK: ret i32 0
 }
 

diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -95,7 +95,7 @@ for.end:                                          ; preds = %for.cond
 %struct.In = type { float, float }
 
 ;AVX512-LABEL: @foo2
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
 ;AVX512: llvm.masked.gather.v16f32
 ;AVX512: llvm.masked.store.v16f32
 ;AVX512: ret void
@@ -170,10 +170,10 @@ for.end:                                          ; preds = %for.cond
 ;}
 
 ;AVX512-LABEL: @foo3
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
 ;AVX512: llvm.masked.gather.v16f32
 ;AVX512: fadd <16 x float>
-;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
 ;AVX512: llvm.masked.scatter.v16f32
 ;AVX512: ret void
 

diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 @a = common global [2048 x i32] zeroinitializer, align 16
 
 ;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
 define void @example12() nounwind uwtable ssp {

diff --git a/llvm/test/Transforms/LoopVectorize/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -368,7 +368,7 @@ define void @example11() nounwind uwtable ssp {
 }
 
 ;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
 define void @example12() nounwind uwtable ssp {

diff --git a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
@@ -12,10 +12,11 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
 ; CHECK-LABEL: @foo
 ; CHECK: vector.body
-; CHECK:  %0 = getelementptr inbounds double*, double** %in, i64 %index
-; CHECK:  %1 = bitcast double** %0 to <4 x i64>*
-; CHECK:  %wide.load = load <4 x i64>, <4 x i64>* %1, align 8
-; CHECK:  %2 = icmp eq <4 x i64> %wide.load, zeroinitializer
+; CHECK:  %0 = phi
+; CHECK:  %2 = getelementptr inbounds double*, double** %in, i64 %0
+; CHECK:  %3 = bitcast double** %2 to <4 x i64>*
+; CHECK:  %wide.load = load <4 x i64>, <4 x i64>* %3, align 8
+; CHECK:  %4 = icmp eq <4 x i64> %wide.load, zeroinitializer
 ; CHECK:  br i1
 
 define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {
@@ -37,4 +38,4 @@ for.body:
 
 for.end:
   ret void
-}
+}
diff --git a/llvm/test/Transforms/LoopVectorize/global_alias.ll b/llvm/test/Transforms/LoopVectorize/global_alias.ll
@@ -12,7 +12,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 @PA = external global i32*
 
 
-;; === First, the tests that should always vectorize, wither statically or by adding run-time checks ===
+;; === First, the tests that should always vectorize, whether statically or by adding run-time checks ===
 
 
 ; /// Different objects, positive induction, constant distance
@@ -387,7 +387,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Foo.A[a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias08(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias08(i32 %a) #0 {
@@ -439,7 +439,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Foo.A[a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias09(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias09(i32 %a) #0 {
@@ -721,7 +721,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Foo.A[a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias14(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias14(i32 %a) #0 {

diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -27,8 +29,6 @@ for.end:
   ret void
 }
 
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
-
 ; Make sure we remove unneeded vectorization of induction variables.
 ; In order for instcombine to cleanup the vectorized induction variables that we
 ; create in the loop vectorizer we need to perform some form of redundancy
@@ -241,3 +241,64 @@ entry:
  exit:
   ret void
 }
+
+; Check that we generate vectorized IVs in the pre-header
+; instead of widening the scalar IV inside the loop, when
+; we know how to do that.
+; IND-LABEL: veciv
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add, %vector.body ]
+; IND: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND: %index.next = add i32 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i32 %index.next
+; IND: br i1 %[[CMP]]
+; UNROLL-LABEL: veciv
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add1, %vector.body ]
+; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL: %step.add1 = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL: %index.next = add i32 %index, 4
+; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
+; UNROLL: br i1 %[[CMP]]
+define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+  store i32 %indvars.iv, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %k
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; IND-LABEL: trunciv
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ]
+; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2>
+; IND: %index.next = add i64 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i64 %index.next
+; IND: br i1 %[[CMP]]
+define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %trunc.iv = trunc i64 %indvars.iv to i32
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
+  store i32 %trunc.iv, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %k
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/induction_plus.ll b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
@@ -1,13 +1,16 @@
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
 @array = common global [1024 x i32] zeroinitializer, align 16
 
 ;CHECK-LABEL: @array_at_plus_one(
-;CHECK: add i64 %index, 12
-;CHECK: trunc i64
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %step.add, %vector.body ]
+;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %step.add2, %vector.body ]
+;CHECK: add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+;CHECK: add nsw <4 x i64> %vec.ind, <i64 12, i64 12, i64 12, i64 12>
 ;CHECK: ret i32
 define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0