116 changes: 52 additions & 64 deletions llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll

Large diffs are not rendered by default.
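The collapsed diff above, like every file in this patch, applies the same mechanical rewrite: typed pointee types (i32*, float*, <4 x i32>*) become plain ptr, and the bitcast feeding each wide load or store drops out, which is why later %TMP values renumber in the CHECK lines. A minimal sketch of the pattern, in hypothetical IR not taken from any of these tests:

; Before: typed pointers force a bitcast to the vector pointer type.
%gep = getelementptr inbounds i32, i32* %a, i64 %i
%vptr = bitcast i32* %gep to <4 x i32>*
%wide = load <4 x i32>, <4 x i32>* %vptr, align 4

; After: with opaque pointers the bitcast disappears and the load takes
; the ptr directly, so one fewer temporary is numbered.
%gep = getelementptr inbounds i32, ptr %a, i64 %i
%wide = load <4 x i32>, ptr %gep, align 4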

102 changes: 48 additions & 54 deletions llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
@@ -5,7 +5,7 @@
; RUN: opt < %s -passes=loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -scalable-vectorization=off -riscv-v-register-bit-width-lmul=2 -S | FileCheck %s --check-prefix=LMUL2

; Function Attrs: nounwind
define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr %c, i32 %size) {
; LMUL1-LABEL: @array_add(
; LMUL1-NEXT: entry:
; LMUL1-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
@@ -23,22 +23,19 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL1: vector.body:
; LMUL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; LMUL1-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
; LMUL1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
; LMUL1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; LMUL1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; LMUL1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
; LMUL1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; LMUL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; LMUL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
; LMUL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
; LMUL1-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4
; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; LMUL1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL1: middle.block:
; LMUL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; LMUL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -47,21 +44,21 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL1-NEXT: br label [[FOR_BODY:%.*]]
; LMUL1: for.body:
; LMUL1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; LMUL1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; LMUL1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; LMUL1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; LMUL1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; LMUL1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
; LMUL1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
; LMUL1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
; LMUL1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; LMUL1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; LMUL1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; LMUL1-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; LMUL1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
; LMUL1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
; LMUL1-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; LMUL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; LMUL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; LMUL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; LMUL1: for.end.loopexit:
; LMUL1-NEXT: br label [[FOR_END]]
; LMUL1: for.end:
; LMUL1-NEXT: ret i32* [[C]]
; LMUL1-NEXT: ret ptr [[C]]
;
; LMUL2-LABEL: @array_add(
; LMUL2-NEXT: entry:
@@ -80,22 +77,19 @@
; LMUL2: vector.body:
; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; LMUL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; LMUL2-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
; LMUL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
; LMUL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
; LMUL2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
; LMUL2-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; LMUL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
; LMUL2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; LMUL2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; LMUL2-NEXT: store <8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], align 4
; LMUL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
; LMUL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
; LMUL2-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; LMUL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
; LMUL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
; LMUL2-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP10]], align 4
; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; LMUL2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL2: middle.block:
; LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -104,21 +98,21 @@ define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocaptur
; LMUL2-NEXT: br label [[FOR_BODY:%.*]]
; LMUL2: for.body:
; LMUL2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; LMUL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; LMUL2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; LMUL2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; LMUL2-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; LMUL2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
; LMUL2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
; LMUL2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
; LMUL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; LMUL2-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; LMUL2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; LMUL2-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; LMUL2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
; LMUL2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
; LMUL2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; LMUL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; LMUL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; LMUL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
; LMUL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; LMUL2: for.end.loopexit:
; LMUL2-NEXT: br label [[FOR_END]]
; LMUL2: for.end:
; LMUL2-NEXT: ret i32* [[C]]
; LMUL2-NEXT: ret ptr [[C]]
;
entry:
%cmp10 = icmp sgt i32 %size, 0
@@ -129,13 +123,13 @@ for.body.preheader:                               ; preds = %entry

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
store i32 %add, i32* %arrayidx4, align 4
%arrayidx4 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
store i32 %add, ptr %arrayidx4, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %size
@@ -145,5 +139,5 @@ for.end.loopexit:                                 ; preds = %for.body
br label %for.end

for.end: ; preds = %for.end.loopexit, %entry
ret i32* %c
ret ptr %c
}
62 changes: 29 additions & 33 deletions llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=riscv64 -mattr=+v -passes=loop-vectorize < %s | FileCheck %s

define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_range(4,1024) {
define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_range(4,1024) {
; CHECK-LABEL: @small_trip_count_min_vlen_128(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -10,13 +10,11 @@ define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_ra
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -27,10 +25,10 @@ define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_ra
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[GEP]], align 4
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1
; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP]], align 4
; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3
; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -42,10 +40,10 @@ entry:

loop:
%iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
%gep = getelementptr inbounds i32, i32* %a, i32 %iv
%v = load i32, i32* %gep, align 4
%gep = getelementptr inbounds i32, ptr %a, i32 %iv
%v = load i32, ptr %gep, align 4
%add = add nsw i32 %v, 1
store i32 %add, i32* %gep, align 4
store i32 %add, ptr %gep, align 4
%iv.next = add i32 %iv, 1
%cond = icmp eq i32 %iv, 3
br i1 %cond, label %exit, label %loop
@@ -56,7 +54,7 @@ exit:

; Note: This test uses a vscale_range starting at 1, which is technically incompatible
; with +v. If we expose a target hook for minimum vlen, this example will need to be reworked.
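; (Sketch of the arithmetic, assuming RVV maps vscale to VLEN/64: +v guarantees
; VLEN >= 128, hence vscale >= 2, so a vscale_range lower bound of 1 is unreachable.)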
define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_range(1,1024) {
define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_range(1,1024) {
; CHECK-LABEL: @small_trip_count_min_vlen_32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
@@ -77,29 +75,27 @@ define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_ran
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP8]], i32 4)
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 2 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0nxv2i32(<vscale x 2 x i32>* [[TMP11]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <vscale x 2 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0nxv2i32(<vscale x 2 x i32> [[TMP12]], <vscale x 2 x i32>* [[TMP13]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP10]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
; CHECK-NEXT: [[TMP11:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP11]], ptr [[TMP10]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[GEP]], align 4
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1
; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP]], align 4
; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV]], 3
; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -111,10 +107,10 @@ entry:

loop:
%iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
%gep = getelementptr inbounds i32, i32* %a, i32 %iv
%v = load i32, i32* %gep, align 4
%gep = getelementptr inbounds i32, ptr %a, i32 %iv
%v = load i32, ptr %gep, align 4
%add = add nsw i32 %v, 1
store i32 %add, i32* %gep, align 4
store i32 %add, ptr %gep, align 4
%iv.next = add i32 %iv, 1
%cond = icmp eq i32 %iv, 3
br i1 %cond, label %exit, label %loop
27 changes: 12 additions & 15 deletions llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -7,7 +7,7 @@
%rec8 = type { i16 }

@a = global [1 x %rec8] zeroinitializer
@b = global [2 x i16*] zeroinitializer
@b = global [2 x ptr] zeroinitializer


define void @f1() {
@@ -21,12 +21,11 @@ define void @f1() {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0), i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
; CHECK-NEXT: store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
@@ -36,14 +35,13 @@ define void @f1() {
; CHECK: bb2:
; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ]
; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64
; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]]
; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16*
; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], ptr @a, i16 0, i64 [[_TMP1]]
; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64
; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]]
; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]], align 8
; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[_TMP6]]
; CHECK-NEXT: store ptr [[_TMP2]], ptr [[_TMP7]], align 8
; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], [[LOOP2:!llvm.loop !.*]]
; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
@@ -54,11 +52,10 @@ bb1:
bb2:
%c.1.0 = phi i16 [ 0, %bb1 ], [ %_tmp9, %bb2 ]
%_tmp1 = zext i16 0 to i64
%_tmp2 = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 %_tmp1
%_tmp4 = bitcast %rec8* %_tmp2 to i16*
%_tmp2 = getelementptr [1 x %rec8], ptr @a, i16 0, i64 %_tmp1
%_tmp6 = sext i16 %c.1.0 to i64
%_tmp7 = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 %_tmp6
store i16* %_tmp4, i16** %_tmp7
%_tmp7 = getelementptr [2 x ptr], ptr @b, i16 0, i64 %_tmp6
store ptr %_tmp2, ptr %_tmp7
%_tmp9 = add nsw i16 %c.1.0, 1
%_tmp11 = icmp slt i16 %_tmp9, 2
br i1 %_tmp11, label %bb2, label %bb3
91 changes: 43 additions & 48 deletions llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost1(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 3
; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -21,17 +21,16 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <32 x i8>*
; CHECK-NEXT: store <32 x i8> [[VEC_IND1]], <32 x i8>* [[TMP8]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: store <32 x i8> [[VEC_IND]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <32 x i8> [[VEC_IND1]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i8> [[VEC_IND]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -40,9 +39,9 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV]] to i8
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[TMP10]], i8* [[TMP11]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV]] to i8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -58,8 +57,8 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
.lr.ph: ; preds = %0, %.lr.ph
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
%2 = trunc i64 %indvars.iv to i8
%3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
store i8 %2, i8* %3, align 1
%3 = getelementptr inbounds i8, ptr %A, i64 %indvars.iv
store i8 %2, ptr %3, align 1
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
@@ -69,7 +68,7 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
ret i32 undef
}

define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwind uwtable ssp {
; CHECK-LABEL: @conversion_cost2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 9
; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
@@ -92,37 +91,33 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <2 x i64> [[STEP_ADD1]], <i64 2, i64 2>
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 9, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP16:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP17:%.*]] = sitofp <2 x i64> [[TMP13]] to <2 x float>
; CHECK-NEXT: [[TMP18:%.*]] = sitofp <2 x i64> [[TMP14]] to <2 x float>
; CHECK-NEXT: [[TMP19:%.*]] = sitofp <2 x i64> [[TMP15]] to <2 x float>
; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i64> [[TMP16]] to <2 x float>
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0
; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
; CHECK-NEXT: store <2 x float> [[TMP17]], <2 x float>* [[TMP26]], align 4
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 2
; CHECK-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <2 x float>*
; CHECK-NEXT: store <2 x float> [[TMP18]], <2 x float>* [[TMP28]], align 4
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 4
; CHECK-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
; CHECK-NEXT: store <2 x float> [[TMP19]], <2 x float>* [[TMP30]], align 4
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 6
; CHECK-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <2 x float>*
; CHECK-NEXT: store <2 x float> [[TMP20]], <2 x float>* [[TMP32]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
; CHECK-NEXT: [[TMP13:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float>
; CHECK-NEXT: [[TMP14:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float>
; CHECK-NEXT: [[TMP15:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float>
; CHECK-NEXT: [[TMP16:%.*]] = sitofp <2 x i64> [[TMP12]] to <2 x float>
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 2
; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[TMP22]], align 4
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 4
; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP23]], align 4
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 6
; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[TMP24]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]], <i64 2, i64 2>
; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -133,8 +128,8 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[INDVARS_IV]], 3
; CHECK-NEXT: [[TOFP:%.*]] = sitofp i64 [[ADD]] to float
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[TOFP]], float* [[GEP]], align 4
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[TOFP]], ptr [[GEP]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -151,8 +146,8 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
%add = add nsw i64 %indvars.iv, 3
%tofp = sitofp i64 %add to float
%gep = getelementptr inbounds float, float* %B, i64 %indvars.iv
store float %tofp, float* %gep, align 4
%gep = getelementptr inbounds float, ptr %B, i64 %indvars.iv
store float %tofp, ptr %gep, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
84 changes: 17 additions & 67 deletions llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
@@ -11,79 +11,29 @@
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-w64-windows-gnu"

define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 {
; CHECK-LABEL: @cff_index_load_offsets(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 undef, i64 4)
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = shl nuw <4 x i32> [[TMP4]], <i32 24, i32 24, i32 24, i32 24>
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw <4 x i32> [[TMP5]], <i32 24, i32 24, i32 24, i32 24>
; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT5]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw <4 x i32> [[TMP9]], <i32 16, i32 16, i32 16, i32 16>
; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], <i32 16, i32 16, i32 16, i32 16>
; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i32> [[TMP11]], [[TMP6]]
; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP12]], [[TMP7]]
; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP15]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer
; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]]
; CHECK-NEXT: [[TMP21:%.*]] = or <4 x i32> [[TMP17]], [[TMP19]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP21]], i32 3
; CHECK-NEXT: store i32 [[TMP22]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ null, [[IF_THEN]] ]
; CHECK-NEXT: br label [[FOR_BODY68:%.*]]
; CHECK: for.body68:
; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32
; CHECK-NEXT: [[P_359:%.*]] = phi ptr [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ null, [[IF_THEN]] ]
; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP24]] to i32
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32
; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP25]] to i32
; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32
; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]]
; CHECK-NEXT: store i32 [[OR83]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, i8* [[P_359]], i64 4
; CHECK-NEXT: [[CMP66:%.*]] = icmp ult i8* [[ADD_PTR86]], undef
; CHECK-NEXT: br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[TBAA4:![0-9]+]]
; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4
; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef
; CHECK-NEXT: br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG:%.*]]
; CHECK: sw.epilog:
; CHECK-NEXT: unreachable
; CHECK: Exit:
@@ -96,21 +46,21 @@ if.then:                                          ; preds = %entry
br label %for.body68

for.body68: ; preds = %for.body68, %if.then
%p.359 = phi i8* [ %add.ptr86, %for.body68 ], [ null, %if.then ]
%p.359 = phi ptr [ %add.ptr86, %for.body68 ], [ null, %if.then ]
%conv70 = zext i8 %x to i32
%shl71 = shl nuw i32 %conv70, 24
%0 = load i8, i8* %p, align 1, !tbaa !1
%0 = load i8, ptr %p, align 1, !tbaa !1
%conv73 = zext i8 %0 to i32
%shl74 = shl nuw nsw i32 %conv73, 16
%or75 = or i32 %shl74, %shl71
%1 = load i8, i8* undef, align 1, !tbaa !1
%1 = load i8, ptr undef, align 1, !tbaa !1
%shl78 = shl nuw nsw i32 undef, 8
%or79 = or i32 %or75, %shl78
%conv81 = zext i8 %1 to i32
%or83 = or i32 %or79, %conv81
store i32 %or83, i32* undef, align 4, !tbaa !4
%add.ptr86 = getelementptr inbounds i8, i8* %p.359, i64 4
%cmp66 = icmp ult i8* %add.ptr86, undef
store i32 %or83, ptr undef, align 4, !tbaa !4
%add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4
%cmp66 = icmp ult ptr %add.ptr86, undef
br i1 %cmp66, label %for.body68, label %sw.epilog

sw.epilog: ; preds = %for.body68
@@ -19,7 +19,7 @@ target triple = "x86_64-unknown-linux-gnu"
; }

; Function Attrs: nounwind uwtable
define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
@@ -31,10 +31,10 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]]
; CHECK: for.end.us:
; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3
; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: store i32 [[ADD10_US]], ptr [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
@@ -44,10 +44,10 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]]
; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1
; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: store i32 [[ADD5_US]], ptr [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
@@ -58,7 +58,7 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]]
; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV33]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
@@ -70,21 +70,20 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
; CHECK-NEXT: store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
; CHECK-NEXT: store i32 [[TMP19]], ptr [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
@@ -101,10 +100,10 @@ entry:
br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15

for.end.us: ; preds = %for.body3.us
%arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
%0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
%arrayidx9.us = getelementptr inbounds i32, ptr %b, i64 %indvars.iv33
%0 = load i32, ptr %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
%add10.us = add nsw i32 %0, 3
store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
store i32 %add10.us, ptr %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
%indvars.iv.next34 = add i64 %indvars.iv33, 1
%lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
%exitcond36 = icmp eq i32 %lftr.wideiv35, %m
@@ -115,10 +114,10 @@ for.body3.us:                                     ; preds = %for.body3.us, %for.
%1 = trunc i64 %indvars.iv29 to i32
%add4.us = add i32 %add.us, %1
%idxprom.us = sext i32 %add4.us to i64
%arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
%2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
%arrayidx.us = getelementptr inbounds i32, ptr %a, i64 %idxprom.us
%2 = load i32, ptr %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
%add5.us = add nsw i32 %2, 1
store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
store i32 %add5.us, ptr %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
%indvars.iv.next30 = add i64 %indvars.iv29, 1
%lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
%exitcond32 = icmp eq i32 %lftr.wideiv31, %m
@@ -128,7 +127,7 @@ for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
%indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
%3 = trunc i64 %indvars.iv33 to i32
%add.us = add i32 %3, %k
%arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
%arrayidx7.us = getelementptr inbounds i32, ptr %a, i64 %indvars.iv33
br label %for.body3.us

for.end15: ; preds = %for.end.us, %entry
@@ -139,18 +138,18 @@

; Here we can see that the vectorizer performs the memory dependence checks and
; decides it is unsafe to vectorize.
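; (Contrast with @foo above: the IR is identical except that these loads and
; stores carry no llvm.mem.parallel_loop_access metadata, so the dependence
; between the accesses through %arrayidx.us and %arrayidx7.us cannot be ruled
; out, and the CHECK lines below contain no vector.body.)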
define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
define void @no-par-mem-metadata(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-LABEL: @no-par-mem-metadata(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
; CHECK: for.body3.lr.ph.us.preheader:
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]]
; CHECK: for.end.us:
; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4
; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX9_US]], align 4
; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3
; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4
; CHECK-NEXT: store i32 [[ADD10_US]], ptr [[ARRAYIDX9_US]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
@@ -160,10 +159,10 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]]
; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]]
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1
; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4
; CHECK-NEXT: store i32 [[ADD5_US]], ptr [[ARRAYIDX7_US:%.*]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
@@ -172,7 +171,7 @@ define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i
; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]]
; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV33]]
; CHECK-NEXT: br label [[FOR_BODY3_US]]
; CHECK: for.end15.loopexit:
; CHECK-NEXT: br label [[FOR_END15]]
@@ -184,10 +183,10 @@ entry:
br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15

for.end.us: ; preds = %for.body3.us
%arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
%0 = load i32, i32* %arrayidx9.us, align 4
%arrayidx9.us = getelementptr inbounds i32, ptr %b, i64 %indvars.iv33
%0 = load i32, ptr %arrayidx9.us, align 4
%add10.us = add nsw i32 %0, 3
store i32 %add10.us, i32* %arrayidx9.us, align 4
store i32 %add10.us, ptr %arrayidx9.us, align 4
%indvars.iv.next34 = add i64 %indvars.iv33, 1
%lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
%exitcond36 = icmp eq i32 %lftr.wideiv35, %m
@@ -198,10 +197,10 @@ for.body3.us:                                     ; preds = %for.body3.us, %for.
%1 = trunc i64 %indvars.iv29 to i32
%add4.us = add i32 %add.us, %1
%idxprom.us = sext i32 %add4.us to i64
%arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
%2 = load i32, i32* %arrayidx.us, align 4
%arrayidx.us = getelementptr inbounds i32, ptr %a, i64 %idxprom.us
%2 = load i32, ptr %arrayidx.us, align 4
%add5.us = add nsw i32 %2, 1
store i32 %add5.us, i32* %arrayidx7.us, align 4
store i32 %add5.us, ptr %arrayidx7.us, align 4
%indvars.iv.next30 = add i64 %indvars.iv29, 1
%lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
%exitcond32 = icmp eq i32 %lftr.wideiv31, %m
@@ -211,7 +210,7 @@ for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
%indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
%3 = trunc i64 %indvars.iv33 to i32
%add.us = add i32 %3, %k
%arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
%arrayidx7.us = getelementptr inbounds i32, ptr %a, i64 %indvars.iv33
br label %for.body3.us

for.end15: ; preds = %for.end.us, %entry
74 changes: 36 additions & 38 deletions llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -9,15 +9,15 @@
; if (arr[i] != 42)
; tot += arr[i];

define double @sumIfScalar(double* nocapture readonly %arr) {
define double @sumIfScalar(ptr nocapture readonly %arr) {
; CHECK-LABEL: @sumIfScalar(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; CHECK-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[I]]
; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; CHECK-NEXT: [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
; CHECK-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; CHECK: do.add:
@@ -41,8 +41,8 @@ loop:
%i = phi i32 [0, %entry], [%i.next, %next.iter]
%tot = phi double [0.0, %entry], [%tot.next, %next.iter]

%addr = getelementptr double, double* %arr, i32 %i
%nextval = load double, double* %addr
%addr = getelementptr double, ptr %arr, i32 %i
%nextval = load double, ptr %addr

%tst = fcmp une double %nextval, 42.0
br i1 %tst, label %do.add, label %no.add
@@ -64,7 +64,7 @@ done:
ret double %tot.next
}

define double @sumIfVector(double* nocapture readonly %arr) {
define double @sumIfVector(ptr nocapture readonly %arr) {
; SSE-LABEL: @sumIfVector(
; SSE-NEXT: entry:
; SSE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -74,30 +74,29 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; SSE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; SSE-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
; SSE-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
; SSE-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
; SSE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
; SSE-NEXT: [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
; SSE-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; SSE-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
; SSE-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
; SSE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0
; SSE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; SSE-NEXT: [[TMP3:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
; SSE-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; SSE-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
; SSE-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP4]], <2 x double> [[VEC_PHI]]
; SSE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; SSE-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; SSE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; SSE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE: middle.block:
; SSE-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]])
; SSE-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]])
; SSE-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
; SSE-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
; SSE: scalar.ph:
; SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT: br label [[LOOP:%.*]]
; SSE: loop:
; SSE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; SSE-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
; SSE-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
; SSE-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; SSE-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; SSE-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; SSE: do.add:
@@ -111,7 +110,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
; SSE: done:
; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]]
;
; AVX-LABEL: @sumIfVector(
@@ -123,30 +122,29 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; AVX-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>*
; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
; AVX-NEXT: [[TMP4:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01>
; AVX-NEXT: [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[TMP0]]
; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i32 0
; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
; AVX-NEXT: [[TMP3:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01>
; AVX-NEXT: [[TMP4:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; AVX-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]]
; AVX-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; AVX-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; AVX-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX: middle.block:
; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
; AVX-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
; AVX: scalar.ph:
; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT: br label [[LOOP:%.*]]
; AVX: loop:
; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; AVX-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
; AVX-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]]
; AVX-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8
; AVX-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; AVX-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; AVX: do.add:
@@ -160,7 +158,7 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
; AVX: done:
; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]]
;
entry:
@@ -170,8 +168,8 @@ loop:
%i = phi i32 [0, %entry], [%i.next, %next.iter]
%tot = phi double [0.0, %entry], [%tot.next, %next.iter]

%addr = getelementptr double, double* %arr, i32 %i
%nextval = load double, double* %addr
%addr = getelementptr double, ptr %arr, i32 %i
%nextval = load double, ptr %addr

%tst = fcmp fast une double %nextval, 42.0
br i1 %tst, label %do.add, label %no.add
99 changes: 49 additions & 50 deletions llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -4,92 +4,91 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) {
; CHECK-LABEL: @inv_load_conditional(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 8
; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX2]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SMAX]], 2
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[B]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[SMAX6]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[SMAX2]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775792
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32*> poison, i32* [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT]], <16 x i32*> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT8]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775792
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <16 x i32> poison, i32 [[NTRUNC]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT4]], <16 x i32> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP1]], align 4, !alias.scope !0, !noalias !3
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP1]], align 4, !alias.scope !0, !noalias !3
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope !3
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x ptr> [[BROADCAST_SPLAT]], zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison), !alias.scope !3
; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX6]], 8
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[SMAX2]], 8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[SMAX6]], 9223372036854775800
; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT15]], <8 x i32*> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT17]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[N_VEC7:%.*]] = and i64 [[SMAX2]], 9223372036854775800
; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT11]], <8 x ptr> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT13]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT18]], <8 x i32>* [[TMP6]], align 4, !alias.scope !7, !noalias !10
; CHECK-NEXT: [[INDEX_NEXT21]] = add nuw i64 [[OFFSET_IDX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX9]]
; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT14]], ptr [[TMP5]], align 4, !alias.scope !7, !noalias !10
; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX9]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC7]]
; CHECK-NEXT: br i1 [[TMP6]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT16]], zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER19:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT16]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison), !alias.scope !10
; CHECK-NEXT: [[PREDPHI20:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER19]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI20]], i64 7
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[CMP_N12]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x ptr> [[BROADCAST_SPLAT12]], zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[BROADCAST_SPLAT12]], i32 4, <8 x i1> [[TMP7]], <8 x i32> poison), !alias.scope !10
; CHECK-NEXT: [[PREDPHI16:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[WIDE_MASKED_GATHER15]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[PREDPHI16]], i64 7
; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC7]]
; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC7]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32* [[A]], null
; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4
; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]]
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[A]], null
; CHECK-NEXT: store i32 [[NTRUNC]], ptr [[I1]], align 4
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[LATCH]], label [[COND_LOAD:%.*]]
; CHECK: cond_load:
; CHECK-NEXT: [[ALOAD:%.*]] = load i32, i32* [[A]], align 4
; CHECK-NEXT: [[ALOAD:%.*]] = load i32, ptr [[A]], align 4
; CHECK-NEXT: br label [[LATCH]]
; CHECK: latch:
; CHECK-NEXT: [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[A_LCSSA_LCSSA]]
;
entry:
@@ -98,14 +97,14 @@ entry:

for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
%i1 = getelementptr inbounds i32, i32* %b, i64 %i
%i2 = load i32, i32* %i1, align 8
%cmp = icmp ne i32* %a, null
store i32 %ntrunc, i32* %i1
%i1 = getelementptr inbounds i32, ptr %b, i64 %i
%i2 = load i32, ptr %i1, align 8
%cmp = icmp ne ptr %a, null
store i32 %ntrunc, ptr %i1
br i1 %cmp, label %cond_load, label %latch

cond_load:
%aload = load i32, i32* %a, align 4
%aload = load i32, ptr %a, align 4
br label %latch

latch:
370 changes: 179 additions & 191 deletions llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll

Large diffs are not rendered by default.

60 changes: 28 additions & 32 deletions llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
target triple = "x86_64-unknown-linux-gnu"

; TODO: Make sure selected VF for the epilog loop doesn't exceed remaining TC.
define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
define void @test1(ptr noalias %src, ptr noalias %dst) #0 {
; CHECK-LABEL: @test1(
; CHECK-NEXT: iter.check:
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
@@ -16,17 +16,15 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 64
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 16
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -36,19 +34,17 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>*
; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD3]], <8 x i8>* [[TMP14]], align 64
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX2]], 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP8]], align 64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 8
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 17, 16
; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -57,10 +53,10 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 {
; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]]
; CHECK: loop-memcpy-expansion:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I]]
; CHECK-NEXT: [[VAL:%.*]] = load i8, i8* [[LDADDR]], align 64
; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I]]
; CHECK-NEXT: store i8 [[VAL]], i8* [[STADDR]], align 64
; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]]
; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64
; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]]
; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64
; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1
; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17
; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP_MEMCPY_EXPANSION]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -72,10 +68,10 @@ entry:

loop-memcpy-expansion:
%i = phi i64 [ 0, %entry ], [ %i.next, %loop-memcpy-expansion ]
%ldaddr = getelementptr inbounds i8, i8 * %src, i64 %i
%val = load i8, i8 * %ldaddr, align 64
%staddr = getelementptr inbounds i8, i8 * %dst, i64 %i
store i8 %val, i8 * %staddr, align 64
%ldaddr = getelementptr inbounds i8, ptr %src, i64 %i
%val = load i8, ptr %ldaddr, align 64
%staddr = getelementptr inbounds i8, ptr %dst, i64 %i
store i8 %val, ptr %staddr, align 64
%i.next = add i64 %i, 1
%is.next = icmp ult i64 %i.next, 17
br i1 %is.next, label %loop-memcpy-expansion, label %exit
2,318 changes: 1,104 additions & 1,214 deletions llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll

Large diffs are not rendered by default.

2,870 changes: 1,315 additions & 1,555 deletions llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll

Large diffs are not rendered by default.

2,858 changes: 1,207 additions & 1,651 deletions llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll

Large diffs are not rendered by default.

172 changes: 81 additions & 91 deletions llvm/test/Transforms/LoopVectorize/X86/optsize.ll

Large diffs are not rendered by default.

47 changes: 22 additions & 25 deletions llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -9,7 +9,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

define void @small_tc(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @small_tc(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -18,19 +18,16 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -39,15 +36,15 @@ define void @small_tc(float* noalias nocapture %A, float* noalias nocapture read
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP6]], [[TMP7]]
; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -56,12 +53,12 @@ entry:

for.body:
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !llvm.access.group !5
%arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
%1 = load float, float* %arrayidx2, align 4, !llvm.access.group !5
%arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4, !llvm.access.group !5
%arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
%1 = load float, ptr %arrayidx2, align 4, !llvm.access.group !5
%add = fadd fast float %0, %1
store float %add, float* %arrayidx2, align 4, !llvm.access.group !5
store float %add, ptr %arrayidx2, align 4, !llvm.access.group !5
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 8
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4