LoopVectorize/reduction: add negative test for vectorizable example #68912

Closed
artagnon wants to merge 1 commit from lv-multiuser-confusion-test

Conversation

artagnon (Contributor)

The following snippet has been extracted from SPEC:

  int ix86_rtx_costs(unsigned long value)
  {
    int nbits;
    for (nbits = 0; value != 0; value &= value - 1)
      nbits++;
    return nbits;
  }

Although gcc-aarch64 vectorizes it just fine, LLVM does not. The reason is that the LoopVectorizer gets confused when the phi corresponding to the 'and' instruction has another (unrelated) user: in this case, an 'add'. As a preliminary step toward vectorizing the example, add a negative test.
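
For reference, here is a minimal IR sketch of that loop (hand-written with hypothetical names, not taken from the SPEC-derived IR): %val is the phi feeding the 'and' recurrence, and the 'add' computing value - 1 is presumably the extra user in question.

  define i32 @ix86_rtx_costs(i64 %value) {
  entry:
    %nonzero = icmp ne i64 %value, 0
    br i1 %nonzero, label %loop, label %exit

  loop:
    %val = phi i64 [ %value, %entry ], [ %val.next, %loop ]   ; phi for the 'and' recurrence
    %nbits = phi i32 [ 0, %entry ], [ %nbits.next, %loop ]    ; the nbits counter
    %nbits.next = add i32 %nbits, 1                           ; nbits++
    %val.minus.1 = add i64 %val, -1                           ; value - 1: second user of %val
    %val.next = and i64 %val, %val.minus.1                    ; value &= value - 1
    %cmp = icmp ne i64 %val.next, 0
    br i1 %cmp, label %loop, label %exit

  exit:
    %res = phi i32 [ 0, %entry ], [ %nbits.next, %loop ]
    ret i32 %res
  }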

llvmbot (Collaborator) commented Oct 12, 2023

@llvm/pr-subscribers-llvm-transforms

Author: Ramkumar Ramachandra (artagnon)


Patch is 34.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68912.diff

1 file affected:

  • (modified) llvm/test/Transforms/LoopVectorize/reduction.ll (+336-291)
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
index aba2532706de5c6..bf4447d7617e3b4 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -629,52 +629,6 @@ for.end:                                          ; preds = %for.body, %entry
   ret i32 %result.0.lcssa
 }
 
-; In this code the subtracted variable is on the RHS and this is not an induction variable.
-define i32 @reduction_sub_rhs(i32 %n, ptr %A) {
-; CHECK-LABEL: define i32 @reduction_sub_rhs(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[SUB]] = sub nsw i32 [[TMP0]], [[X_05]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
-; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    br label [[FOR_END]]
-; CHECK:       for.end:
-; CHECK-NEXT:    [[X_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUB]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
-;
-entry:
-  %cmp4 = icmp sgt i32 %n, 0
-  br i1 %cmp4, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4
-  %sub = sub nsw i32 %0, %x.05
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
-  ret i32 %x.0.lcssa
-}
-
-
 ; In this test the reduction variable is on the LHS and we can vectorize it.
 define i32 @reduction_sub_lhs(i32 %n, ptr %A) {
 ; CHECK-LABEL: define i32 @reduction_sub_lhs(
@@ -831,9 +785,305 @@ for.inc:
   %exitcond = icmp ne i32 %lftr.wideiv, 128
   br i1 %exitcond, label %for.body, label %for.end
 
-for.end:
-  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
-  ret float %sum.1.lcssa
+for.end:
+  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
+  ret float %sum.1.lcssa
+}
+
+define i32 @reduction_sum_multiuse(i32 %n, ptr %A, ptr %B) {
+; CHECK-LABEL: define i32 @reduction_sum_multiuse(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[END:%.*]]
+; CHECK:       .lr.ph.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 4294967292
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP7]] = add <4 x i32> [[TMP6]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP17:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[SUM_02]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], [[TMP11]]
+; CHECK-NEXT:    [[TMP17]] = add i32 [[TMP16]], [[TMP13]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    [[SUM_COPY:%.*]] = phi i32 [ [[TMP17]], [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = shl i32 [[SUM_COPY]], 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[F2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP18]], [[DOT_CRIT_EDGE]] ]
+; CHECK-NEXT:    ret i32 [[F2]]
+;
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph.preheader, label %end
+.lr.ph.preheader:                                 ; preds = %0
+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %3 = load i32, ptr %2, align 4
+  %4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %5 = load i32, ptr %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = add i32 %sum.02, %6
+  %8 = add i32 %7, %3
+  %9 = add i32 %8, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.lcssa = phi i32 [ %9, %.lr.ph ]
+  %sum.copy = phi i32 [ %9, %.lr.ph ]
+  br label %end
+
+end:
+  %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ]
+  %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ]
+  %final = add i32 %f1, %f2
+  ret i32 %final
+}
+
+; Can vectorize reduction with redundant single-operand phi input.
+define i64 @reduction_with_phi_with_one_incoming_on_backedge(i16 %n, ptr %A) {
+; CHECK-LABEL: define i64 @reduction_with_phi_with_one_incoming_on_backedge(
+; CHECK-SAME: i16 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i16 @llvm.smax.i16(i16 [[N]], i16 2)
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i16 [[SMAX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i16 [[SMAX]], 5
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP1]], 65532
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16
+; CHECK-NEXT:    [[IND_END:%.*]] = or i16 [[DOTCAST]], 1
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i16 [[DOTCAST1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[OFFSET_IDX]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4]] = add <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i16 [[IV]] to i64
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LV_A:%.*]] = load i64, ptr [[GEP_A]], align 4
+; CHECK-NEXT:    [[SUM_NEXT]] = add nsw i64 [[SUM]], [[LV_A]]
+; CHECK-NEXT:    br label [[LOOP_BB:%.*]]
+; CHECK:       loop.bb:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i16 [[IV]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i16 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[LCSSA_EXIT:%.*]] = phi i64 [ [[SUM_NEXT]], [[LOOP_LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[LCSSA_EXIT]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i16 [ 1, %entry ], [ %iv.next, %loop.latch ]
+  %sum = phi i64 [ 0, %entry ], [ %phi.sum.next, %loop.latch ]
+  %gep.A = getelementptr i64, ptr %A, i16 %iv
+  %lv.A = load i64, ptr %gep.A
+  %sum.next = add nsw i64 %sum, %lv.A
+  br label %loop.bb
+
+loop.bb:
+  %phi.sum.next = phi i64 [ %sum.next, %loop.header ]
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add nsw i16 %iv, 1
+  %cond = icmp slt i16 %iv.next, %n
+  br i1 %cond, label %loop.header, label %exit
+
+exit:
+  %lcssa.exit = phi i64 [ %phi.sum.next, %loop.latch ]
+  ret i64 %lcssa.exit
+}
+
+; Can vectorize reduction with redundant two-operand phi input.
+define i64 @reduction_with_phi_with_two_incoming_on_backedge(i16 %n, ptr %A) {
+; CHECK-LABEL: define i64 @reduction_with_phi_with_two_incoming_on_backedge(
+; CHECK-SAME: i16 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i16 @llvm.smax.i16(i16 [[N]], i16 2)
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i16 [[SMAX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i16 [[SMAX]], 5
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP1]], 65532
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16
+; CHECK-NEXT:    [[IND_END:%.*]] = or i16 [[DOTCAST]], 1
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTCAST1:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i16 [[DOTCAST1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[OFFSET_IDX]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4]] = add <4 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i16 [[IV]] to i64
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LV_A:%.*]] = load i64, ptr [[GEP_A]], align 4
+; CHECK-NEXT:    [[SUM_NEXT]] = add nsw i64 [[SUM]], [[LV_A]]
+; CHECK-NEXT:    [[CMP_0:%.*]] = icmp eq i64 [[LV_A]], 29
+; CHECK-NEXT:    br i1 [[CMP_0]], label [[LOOP_BB:%.*]], label [[LOOP_LATCH]]
+; CHECK:       loop.bb:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i16 [[IV]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i16 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[LCSSA_EXIT:%.*]] = phi i64 [ [[SUM_NEXT]], [[LOOP_LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[LCSSA_EXIT]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i16 [ 1, %entry ], [ %iv.next, %loop.latch ]
+  %sum = phi i64 [ 0, %entry ], [ %phi.sum.next, %loop.latch ]
+  %gep.A = getelementptr i64, ptr %A, i16 %iv
+  %lv.A = load i64, ptr %gep.A
+  %sum.next = add nsw i64 %sum, %lv.A
+  %cmp.0 = icmp eq i64 %lv.A, 29
+  br i1 %cmp.0, label %loop.bb, label %loop.latch
+
+loop.bb:
+  br label %loop.latch
+
+loop.latch:
+  %phi.sum.next = phi i64 [ %sum.next, %loop.bb ], [ %sum.next, %loop.header ]
+  %iv.next = add nsw i16 %iv, 1
+  %cond = icmp slt i16 %iv.next, %n
+  br i1 %cond, label %loop.header, label %exit
+
+exit:
+  %lcssa.exit = phi i64 [ %phi.sum.next, %loop.latch ]
+  ret i64 %lcssa.exit
+}
+
+; Negative tests
+
+; In this code the subtracted variable is on the RHS and this is not an induction variable.
+define i32 @reduction_sub_rhs(i32 %n, ptr %A) {
+; CHECK-LABEL: define i32 @reduction_sub_rhs(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[SUB]] = sub nsw i32 [[TMP0]], [[X_05]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[X_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUB]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
+;
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %sub = sub nsw i32 %0, %x.05
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+  ret i32 %x.0.lcssa
 }
 
 ; We can't vectorize reductions with phi inputs from outside the reduction.
@@ -1035,97 +1285,6 @@ exit:
   ret i32 %inc.2
 }
 
-define i32 @reduction_sum_multiuse(i32 %n, ptr %A, ptr %B) {
-; CHECK-LABEL: define i32 @reduction_sum_multiuse(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[END:%.*]]
-; CHECK:       .lr.ph.preheader:
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 4294967292
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr...
[truncated]

artagnon (Contributor, Author)

The diff might look a little suspicious, but all I've done is move the existing negative tests to the end of the file.

Mel-Chen (Contributor) left a comment

Nice case, thanks! This is popcount. In my opinion, the loop idiom pass could convert it to a ctpop.
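
If that conversion fired, the loop would collapse to a single intrinsic call, roughly as follows (a sketch, reusing the hypothetical i64 %value from the description's IR):

  %ctpop = call i64 @llvm.ctpop.i64(i64 %value)  ; population count of %value
  %nbits = trunc i64 %ctpop to i32               ; at most 64 bits set, so lossless
  ret i32 %nbits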

nikic (Contributor) commented Oct 12, 2023

LIR actually already has support for recognizing popcounts:

bool LoopIdiomRecognize::recognizePopcount() {

So you'd want to find out why that doesn't trigger and maybe add a PhaseOrdering test.
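
Such a PhaseOrdering test might look roughly like the sketch below. The function body is the hypothetical IR form of the snippet from the description, and the RUN/CHECK lines are assumptions, not taken from the patch; as the discussion later shows, the CHECK only holds on a target whose TTI reports popcount support.

  ; RUN: opt -passes='default<O3>' -S %s | FileCheck %s

  ; CHECK-LABEL: @ix86_rtx_costs(
  ; CHECK: call i64 @llvm.ctpop.i64(
  define i32 @ix86_rtx_costs(i64 %value) {
  entry:
    %nonzero = icmp ne i64 %value, 0
    br i1 %nonzero, label %loop, label %exit

  loop:
    %val = phi i64 [ %value, %entry ], [ %val.next, %loop ]
    %nbits = phi i32 [ 0, %entry ], [ %nbits.next, %loop ]
    %nbits.next = add i32 %nbits, 1
    %val.minus.1 = add i64 %val, -1
    %val.next = and i64 %val, %val.minus.1
    %cmp = icmp ne i64 %val.next, 0
    br i1 %cmp, label %loop, label %exit

  exit:
    %res = phi i32 [ 0, %entry ], [ %nbits.next, %loop ]
    ret i32 %res
  }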

Mel-Chen (Contributor)

> LIR actually already has support for recognizing popcounts:
>
> bool LoopIdiomRecognize::recognizePopcount() {
>
> So you'd want to find out why that doesn't trigger and maybe add a PhaseOrdering test.

I tried it, and it works well.
@artagnon, I guess the target you tried doesn't support a popcount instruction. Please enable it and try again.

artagnon (Contributor, Author)

Thanks a lot @nikic and @Mel-Chen for the guidance! Indeed, turning on zbb enables LLVM to use ctpop in the example.
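
Concretely, a RUN line along these lines makes the idiom fire (a sketch; the exact triple and attribute spelling are my assumptions):

  ; RUN: opt -passes='default<O3>' -mtriple=riscv64 -mattr=+zbb -S %s | FileCheck %s
  ; CHECK: call i64 @llvm.ctpop.i64(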

artagnon closed this Oct 13, 2023
artagnon deleted the lv-multiuser-confusion-test branch on Oct 13, 2023 at 09:34