94 changes: 47 additions & 47 deletions llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -7,7 +7,7 @@
; ADD

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
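; A rough C-level sketch of the reduction below (variable names are
; illustrative, not part of the test):
;   int sum = 2;
;   for (i = 0; i < n; i++) sum += a[i];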
define i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
define i32 @add(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @add
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
@@ -23,8 +23,8 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi i32 [ 2, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%add = add nsw i32 %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -37,7 +37,7 @@ for.end: ; preds = %for.body, %entry
; OR

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
define i32 @or(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @or
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
@@ -53,8 +53,8 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi i32 [ 2, %entry ], [ %or, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%or = or i32 %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -67,7 +67,7 @@ for.end: ; preds = %for.body, %entry
; AND

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
define i32 @and(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @and
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
@@ -83,8 +83,8 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi i32 [ 2, %entry ], [ %and, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%and = and i32 %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -97,7 +97,7 @@ for.end: ; preds = %for.body, %entry
; XOR

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
define i32 @xor(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @xor
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
@@ -113,8 +113,8 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi i32 [ 2, %entry ], [ %xor, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%xor = xor i32 %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -127,7 +127,7 @@ for.end: ; preds = %for.body, %entry
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
; SMIN

define i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
define i32 @smin(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @smin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
@@ -146,8 +146,8 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.010 = phi i32 [ 2, %entry ], [ %.sroa.speculated, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%cmp.i = icmp slt i32 %0, %sum.010
%.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
%iv.next = add nuw nsw i64 %iv, 1
@@ -161,7 +161,7 @@ for.end:
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
; UMAX

define i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
define i32 @umax(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @umax
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
@@ -180,8 +180,8 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.010 = phi i32 [ 2, %entry ], [ %.sroa.speculated, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%cmp.i = icmp ugt i32 %0, %sum.010
%.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
%iv.next = add nuw nsw i64 %iv, 1
@@ -195,7 +195,7 @@ for.end:
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
; FADD (FAST)

define float @fadd_fast(float* noalias nocapture readonly %a, i64 %n) {
define float @fadd_fast(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
@@ -211,8 +211,8 @@ entry:
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%add = fadd fast float %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -224,7 +224,7 @@ for.end:

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast_bfloat
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <8 x bfloat>
@@ -240,8 +240,8 @@ entry:
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds bfloat, bfloat* %a, i64 %iv
%0 = load bfloat, bfloat* %arrayidx, align 4
%arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
%0 = load bfloat, ptr %arrayidx, align 4
%add = fadd fast bfloat %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -254,7 +254,7 @@ for.end:
; FMIN (FAST)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmin_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
define float @fmin_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-LABEL: @fmin_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
@@ -273,8 +273,8 @@ entry:
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%cmp.i = fcmp fast olt float %0, %sum.07
%.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
%iv.next = add nuw nsw i64 %iv, 1
@@ -288,7 +288,7 @@ for.end:
; FMAX (FAST)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmax_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
define float @fmax_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-LABEL: @fmax_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
@@ -307,8 +307,8 @@ entry:
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%cmp.i = fcmp fast ogt float %0, %sum.07
%.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
%iv.next = add nuw nsw i64 %iv, 1
@@ -322,7 +322,7 @@ for.end:
; ADD (with reduction stored in invariant address)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2)
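; Roughly, in C terms (a sketch; the running sum is re-stored to the same
; loop-invariant address on every iteration):
;   dst[42] = 0;
;   for (i = 0; i < 1000; i++) { sum += src[i]; dst[42] = sum; }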
define void @invariant_store(i32* %dst, i32* readonly %src) {
define void @invariant_store(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: @invariant_store
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>
@@ -332,18 +332,18 @@ define void @invariant_store(i32* %dst, i32* readonly %src) {
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])
; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
; CHECK-NEXT: store i32 %[[SUM]], ptr %gep.dst, align 4
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
store i32 0, i32* %gep.dst, align 4
%gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
store i32 0, ptr %gep.dst, align 4
br label %for.body
for.body:
%sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
%0 = load i32, i32* %gep.src, align 4
%gep.src = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
%0 = load i32, ptr %gep.src, align 4
%add = add nsw i32 %sum, %0
store i32 %add, i32* %gep.dst, align 4
store i32 %add, ptr %gep.dst, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -358,7 +358,7 @@ for.cond.cleanup:

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
define i32 @mul(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mul
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
@@ -374,8 +374,8 @@ entry:
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi i32 [ 2, %entry ], [ %mul, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%mul = mul nsw i32 %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -388,7 +388,7 @@ for.end: ; preds = %for.body, %entry
; Note: This test was added to ensure we always check the legality of reductions (and emit a warning if necessary) before checking for memory dependencies.
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
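; A hedged C-level sketch of the loop below:
;   for (i = 0; i < n; i++) {
;     a[i + 32] = a[i] + b[i];  // forward memory dependence, distance 32
;     sum *= b[i];              // integer mul reduction (no scalable VF)
;   }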
define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
@@ -408,14 +408,14 @@ entry:
for.body:
%i = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%sum = phi i32 [ %mul, %for.body ], [ 2, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
%0 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %b, i64 %i
%1 = load i32, i32* %arrayidx1, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %i
%0 = load i32, ptr %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, ptr %b, i64 %i
%1 = load i32, ptr %arrayidx1, align 4
%add = add nsw i32 %1, %0
%add2 = add nuw nsw i64 %i, 32
%arrayidx3 = getelementptr inbounds i32, i32* %a, i64 %add2
store i32 %add, i32* %arrayidx3, align 4
%arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %add2
store i32 %add, ptr %arrayidx3, align 4
%mul = mul nsw i32 %1, %sum
%inc = add nuw nsw i64 %i, 1
%exitcond.not = icmp eq i64 %inc, %n
@@ -30,20 +30,20 @@

; VF-4: <4 x i32>
; VF-VSCALE4: <16 x i32>
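; In rough C terms (a sketch): a[i] = (i32)b[i] + c[i]; for 1024 elements,
; where b is an i8 array whose loads are zero-extended before the add.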
define void @test0(i32* %a, i8* %b, i32* %c) #0 {
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
%1 = load i8, ptr %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop
@@ -6,7 +6,7 @@
; Test that the MaxVF for the following loop, which has no dependence distances,
; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16
; (maximized bandwidth for i8 in the loop).
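; As a hedged aside, both candidate VFs follow from SVE's 128-bit vscale
; granule: 128 / 32 = 4 lanes of i32 per granule (vscale x 4), and
; 128 / 8 = 16 lanes of i8 (vscale x 16) when maximizing bandwidth.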
define void @test0(i32* %a, i8* %b, i32* %c) #0 {
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
; CHECK: LV: Checking a loop in 'test0'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON: LV: Selecting VF: 16
@@ -19,14 +19,14 @@ entry:

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
%1 = load i8, ptr %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop
@@ -37,7 +37,7 @@ exit:

; Test that the MaxVF for the following loop, with a dependence distance
; of 64 elements, is calculated as (maxvscale = 16) * 4.
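; Worked through (a sketch of the bound): with maxvscale = 16, a safe
; distance of 64 i32 elements permits at most 64 / 16 = 4 lanes per vscale
; unit, i.e. VF = vscale x 4.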
define void @test1(i32* %a, i8* %b) #0 {
define void @test1(ptr %a, ptr %b) #0 {
; CHECK: LV: Checking a loop in 'test1'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON: LV: Selecting VF: 16
@@ -50,15 +50,15 @@ entry:

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
%1 = load i8, ptr %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%2 = add nuw nsw i64 %iv, 64
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop
@@ -69,7 +69,7 @@ exit:

; Test that the MaxVF for the following loop, with a dependence distance
; of 32 elements, is calculated as (maxvscale = 16) * 2.
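; Same arithmetic as above (a sketch): 32 / 16 = 2 lanes per vscale unit,
; i.e. VF = vscale x 2.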
define void @test2(i32* %a, i8* %b) #0 {
define void @test2(ptr %a, ptr %b) #0 {
; CHECK: LV: Checking a loop in 'test2'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
; CHECK_SCALABLE_ON: LV: Selecting VF: 16
@@ -82,15 +82,15 @@ entry:

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
%1 = load i8, ptr %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%2 = add nuw nsw i64 %iv, 32
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop
@@ -101,7 +101,7 @@ exit:

; Test that the MaxVF for the following loop, with a dependence distance
; of 16 elements, is calculated as (maxvscale = 16) * 1.
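; Same arithmetic again (a sketch): 16 / 16 = 1 lane per vscale unit,
; i.e. VF = vscale x 1.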
define void @test3(i32* %a, i8* %b) #0 {
define void @test3(ptr %a, ptr %b) #0 {
; CHECK: LV: Checking a loop in 'test3'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
; CHECK_SCALABLE_ON: LV: Selecting VF: 16
@@ -114,15 +114,15 @@ entry:

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
%1 = load i8, i8* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
%1 = load i8, ptr %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%2 = add nuw nsw i64 %iv, 16
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop
@@ -133,7 +133,7 @@ exit:

; Test the fallback mechanism when scalable vectors are not feasible due
; to e.g. dependence distance.
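; Here the dependence distance is only 8 elements (the a[i+8] store below),
; which is smaller than maxvscale = 16, so no scalable VF is feasible and
; the vectorizer falls back to a fixed-width VF.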
define void @test4(i32* %a, i32* %b) #0 {
define void @test4(ptr %a, ptr %b) #0 {
; CHECK: LV: Checking a loop in 'test4'
; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
@@ -147,14 +147,14 @@ entry:

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 8
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop
110 changes: 55 additions & 55 deletions llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -45,20 +45,20 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test1
; CHECK: <4 x i32>
define void @test1(i32* %a, i32* %b) #0 {
define void @test1(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 8
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
@@ -90,20 +90,20 @@ exit:
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test2
; CHECK: <4 x i32>
define void @test2(i32* %a, i32* %b) #0 {
define void @test2(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 4
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !3
@@ -138,20 +138,20 @@ exit:
; CHECK-DBG: LV: Using user VF vscale x 2.
; CHECK-LABEL: @test3
; CHECK: <vscale x 2 x i32>
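; For reference, a user hint requesting VF vscale x 2 is normally spelled
; with loop metadata like the sketch below (the actual nodes, !6 etc., live
; at the end of this file and are not shown in this hunk, so the exact
; composition here is an assumption):
;   !6 = distinct !{!6, !7, !8}
;   !7 = !{!"llvm.loop.vectorize.width", i32 2}
;   !8 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}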
define void @test3(i32* %a, i32* %b) #0 {
define void @test3(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 32
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !6
@@ -190,20 +190,20 @@ exit:
; CHECK-DBG: LV: Selecting VF: vscale x 2.
; CHECK-LABEL: @test4
; CHECK: <vscale x 2 x i32>
define void @test4(i32* %a, i32* %b) #0 {
define void @test4(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 32
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !9
@@ -238,20 +238,20 @@ exit:
; CHECK-DBG: LV: Using user VF vscale x 4
; CHECK-LABEL: @test5
; CHECK: <vscale x 4 x i32>
define void @test5(i32* %a, i32* %b) #0 {
define void @test5(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 128
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12
@@ -289,20 +289,20 @@ exit:
; CHECK-DBG: Selecting VF: vscale x 4.
; CHECK-LABEL: @test6
; CHECK: <vscale x 4 x i32>
define void @test6(i32* %a, i32* %b) #0 {
define void @test6(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 128
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !15
@@ -322,18 +322,18 @@ exit:
; CHECK-NO-SVE-LABEL: @test_no_sve
; CHECK-NO-SVE: <4 x i32>
; CHECK-NO-SVE-NOT: <vscale x 4 x i32>
define void @test_no_sve(i32* %a, i32* %b) #0 {
define void @test_no_sve(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
store i32 %add, i32* %arrayidx, align 4
store i32 %add, ptr %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !18
@@ -356,20 +356,20 @@ exit:
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test_no_max_vscale
; CHECK: <4 x i32>
define void @test_no_max_vscale(i32* %a, i32* %b) #0 {
define void @test_no_max_vscale(ptr %a, ptr %b) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
%add = add nsw i32 %1, %0
%2 = add nuw nsw i64 %iv, 4
%arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
store i32 %add, i32* %arrayidx5, align 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %2
store i32 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !21
@@ -16,17 +16,17 @@ target triple = "aarch64-unknown-linux-gnu"
; architecture with masked loads/stores, but we use SVE for testing purposes
; here.

define void @foo(i32* %data1, i32* %data2) {
define void @foo(ptr %data1, ptr %data2) {
; CHECK-LABEL: @foo(
; CHECK: vector.body:
; CHECK: br i1 {{%.*}}, label %pred.store.if, label %pred.store.continue
; CHECK: pred.store.if:
; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}}
; CHECK-NEXT: store i32 {{%.*}}, ptr {{%.*}}
; CHECK-NEXT: br label %pred.store.continue
; CHECK: pred.store.continue:
; CHECK-NEXT: br i1 {{%.*}}, label %pred.store.if1, label %pred.store.continue2
; CHECK: pred.store.if1:
; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}}
; CHECK-NEXT: store i32 {{%.*}}, ptr {{%.*}}
; CHECK-NEXT: br label %pred.store.continue2
; CHECK: pred.store.continue2:

@@ -35,13 +35,13 @@ entry:

while.body:
%i = phi i64 [ 1023, %entry ], [ %i.next, %if.end ]
%arrayidx = getelementptr inbounds i32, i32* %data1, i64 %i
%ld = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %data1, i64 %i
%ld = load i32, ptr %arrayidx, align 4
%cmp = icmp sgt i32 %ld, %ld
br i1 %cmp, label %if.then, label %if.end

if.then:
store i32 %ld, i32* %arrayidx, align 4
store i32 %ld, ptr %arrayidx, align 4
br label %if.end

if.end:
10 changes: 5 additions & 5 deletions llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll
@@ -7,7 +7,7 @@ target triple = "aarch64--linux-gnu"
@Foo = common global %struct.anon zeroinitializer, align 4

; CHECK-LABEL: @foo(
; CHECK: load <4 x i32>, <4 x i32>*
; CHECK: load <4 x i32>, ptr
; CHECK: sdiv <4 x i32>
; CHECK: store <4 x i32>
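; Roughly (a sketch; the struct fields are unnamed in the IR):
;   Foo.<field0>[i] = Foo.<field2>[i] / 2; -- a signed divide by a
; power-of-two constant, which is cheap enough to vectorize.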

@@ -17,11 +17,11 @@ entry:

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds %struct.anon, ptr @Foo, i64 0, i32 2, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%div = sdiv i32 %0, 2
%arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
store i32 %div, i32* %arrayidx2, align 4
%arrayidx2 = getelementptr inbounds %struct.anon, ptr @Foo, i64 0, i32 0, i64 %indvars.iv
store i32 %div, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 100
br i1 %exitcond, label %for.end, label %for.body
8 changes: 4 additions & 4 deletions llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -4,7 +4,7 @@
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios5.0.0"

define void @selects_1(i32* nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
@@ -27,16 +27,16 @@ for.body.preheader: ; preds = %entry

for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%and = and i32 %0, 2047
%cmp1 = icmp eq i32 %and, %A
%cond = select i1 %cmp1, i32 10, i32 %and
%cmp2 = icmp eq i32 %and, %B
%cond6 = select i1 %cmp2, i32 30, i32 %and
%cmp7 = icmp ugt i32 %cond, %C
%cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
store i32 %cond11, i32* %arrayidx, align 4
store i32 %cond11, ptr %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
@@ -7,23 +7,23 @@ target triple = "aarch64--linux-gnu"
; CHECK-LABEL: Checking a loop in 'interleaved_access'
; CHECK: The Smallest and Widest types: 64 / 64 bits
;
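; In rough C terms (a sketch): each iteration stores NULL to A[i], A[i+1],
; A[i+2] and A[i+3] with i advancing by 4 -- pointer-sized (64-bit) stores
; only, hence 64 / 64 bits above.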
define void @interleaved_access(i8** %A, i64 %N) {
define void @interleaved_access(ptr %A, i64 %N) {
for.ph:
br label %for.body

for.body:
%i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ]
%tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i
store i8* null, i8** %tmp0, align 8
%tmp0 = getelementptr inbounds ptr, ptr %A, i64 %i
store ptr null, ptr %tmp0, align 8
%i.next.0 = add nuw nsw i64 %i, 1
%tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0
store i8* null, i8** %tmp1, align 8
%tmp1 = getelementptr inbounds ptr, ptr %A, i64 %i.next.0
store ptr null, ptr %tmp1, align 8
%i.next.1 = add nsw i64 %i, 2
%tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1
store i8* null, i8** %tmp2, align 8
%tmp2 = getelementptr inbounds ptr, ptr %A, i64 %i.next.1
store ptr null, ptr %tmp2, align 8
%i.next.2 = add nsw i64 %i, 3
%tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2
store i8* null, i8** %tmp3, align 8
%tmp3 = getelementptr inbounds ptr, ptr %A, i64 %i.next.2
store ptr null, ptr %tmp3, align 8
%i.next.3 = add nsw i64 %i, 4
%cond = icmp slt i64 %i.next.3, %N
br i1 %cond, label %for.body, label %for.end
32 changes: 16 additions & 16 deletions llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -9,15 +9,15 @@ target triple="aarch64-unknown-linux-gnu"
; CHECK-VF4: Found an estimated cost of 21 for VF 4 For instruction: %add = fadd float %0, %sum.07
; CHECK-VF8: Found an estimated cost of 42 for VF 8 For instruction: %add = fadd float %0, %sum.07
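; As a hedged observation, the VF 8 cost is exactly twice the VF 4 cost
; (42 = 2 x 21): an ordered (strict) fadd reduction cannot be reassociated
; into a tree, so its cost scales linearly with the number of lanes.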

define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%add = fadd float %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -31,15 +31,15 @@ for.end:
; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction: %add = fadd double %0, %sum.07
; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction: %add = fadd double %0, %sum.07

define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds double, double* %a, i64 %iv
%0 = load double, double* %arrayidx, align 4
%arrayidx = getelementptr inbounds double, ptr %a, i64 %iv
%0 = load double, ptr %arrayidx, align 4
%add = fadd double %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -52,17 +52,17 @@ for.end:
; CHECK-VF4: Found an estimated cost of 23 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
; CHECK-VF8: Found an estimated cost of 46 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)

define float @fmuladd_strict32(float* %a, float* %b, i64 %n) {
define float @fmuladd_strict32(ptr %a, ptr %b, i64 %n) {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
%1 = load float, float* %arrayidx2, align 4
%arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %b, i64 %iv
%1 = load float, ptr %arrayidx2, align 4
%muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -77,17 +77,17 @@ declare float @llvm.fmuladd.f32(float, float, float)
; CHECK-VF4: Found an estimated cost of 22 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
; CHECK-VF8: Found an estimated cost of 44 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)

define double @fmuladd_strict64(double* %a, double* %b, i64 %n) {
define double @fmuladd_strict64(ptr %a, ptr %b, i64 %n) {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi double [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
%arrayidx = getelementptr inbounds double, double* %a, i64 %iv
%0 = load double, double* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds double, double* %b, i64 %iv
%1 = load double, double* %arrayidx2, align 4
%arrayidx = getelementptr inbounds double, ptr %a, i64 %iv
%0 = load double, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds double, ptr %b, i64 %iv
%1 = load double, ptr %arrayidx2, align 4
%muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
10 changes: 5 additions & 5 deletions llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
@@ -6,7 +6,7 @@ target triple = "aarch64-unknown-linux-gnu"

; CHECK-DEBUG: LV: Not interleaving scalar ordered reductions.
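; A rough C-level sketch of the nested loop below (names illustrative):
;   for (i = 0; i < M; i++) {
;     float s = 0.f;
;     for (j = 0; j < N; j++)
;       s += src[i*N + j];   // strict (in-order) fadd reduction
;     dst[i] = s;
;   }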

define void @foo(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %M, i64 %N) {
define void @foo(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %M, i64 %N) {
; CHECK-LABEL: @foo(
; CHECK-NOT: vector.body

@@ -15,24 +15,24 @@ entry:

for.body.us: ; preds = %entry, %for.cond3
%i.023.us = phi i64 [ %inc8.us, %for.cond3 ], [ 0, %entry ]
%arrayidx.us = getelementptr inbounds float, float* %dst, i64 %i.023.us
%arrayidx.us = getelementptr inbounds float, ptr %dst, i64 %i.023.us
%mul.us = mul nsw i64 %i.023.us, %N
br label %for.body3.us

for.body3.us: ; preds = %for.body.us, %for.body3.us
%0 = phi float [ 0.000000e+00, %for.body.us ], [ %add6.us, %for.body3.us ]
%j.021.us = phi i64 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ]
%add.us = add nsw i64 %j.021.us, %mul.us
%arrayidx4.us = getelementptr inbounds float, float* %src, i64 %add.us
%1 = load float, float* %arrayidx4.us, align 4
%arrayidx4.us = getelementptr inbounds float, ptr %src, i64 %add.us
%1 = load float, ptr %arrayidx4.us, align 4
%add6.us = fadd float %1, %0
%inc.us = add nuw nsw i64 %j.021.us, 1
%exitcond.not = icmp eq i64 %inc.us, %N
br i1 %exitcond.not, label %for.cond3, label %for.body3.us

for.cond3: ; preds = %for.body3.us
%add6.us.lcssa = phi float [ %add6.us, %for.body3.us ]
store float %add6.us.lcssa, float* %arrayidx.us, align 4
store float %add6.us.lcssa, ptr %arrayidx.us, align 4
%inc8.us = add nuw nsw i64 %i.023.us, 1
%exitcond26.not = icmp eq i64 %inc8.us, %M
br i1 %exitcond26.not, label %exit, label %for.body.us
42 changes: 21 additions & 21 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
@@ -5,26 +5,26 @@
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"

define void @cmpsel_i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
define void @cmpsel_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @cmpsel_i32(
; CHECK-NEXT: entry:
; CHECK: vector.body:
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* {{.*}}, align 4
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr {{.*}}, align 4
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = select <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 10, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK: store <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32>* {{.*}}, align 4
; CHECK: store <vscale x 4 x i32> [[TMP2]], ptr {{.*}}, align 4
;
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%tobool.not = icmp eq i32 %0, 0
%cond = select i1 %tobool.not, i32 2, i32 10
%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
store i32 %cond, i32* %arrayidx2, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
store i32 %cond, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !0
@@ -36,26 +36,26 @@ for.end: ; preds = %for.end.loopexit, %
ret void
}

define void @cmpsel_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
define void @cmpsel_f32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @cmpsel_f32(
; CHECK-NEXT: entry:
; CHECK: vector.body:
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* {{.*}}, align 4
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr {{.*}}, align 4
; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = select <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK: store <vscale x 4 x float> [[TMP2]], <vscale x 4 x float>* {{.*}}, align 4
; CHECK: store <vscale x 4 x float> [[TMP2]], ptr {{.*}}, align 4

entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4
%cmp1 = fcmp ogt float %0, 3.000000e+00
%conv = select i1 %cmp1, float 1.000000e+01, float 2.000000e+00
%arrayidx3 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %conv, float* %arrayidx3, align 4
%arrayidx3 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
store float %conv, ptr %arrayidx3, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
@@ -64,24 +64,24 @@ for.end: ; preds = %for.body, %entry
ret void
}

define void @fneg_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
define void @fneg_f32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @fneg_f32(
; CHECK-NEXT: entry:
; CHECK: vector.body:
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* {{.*}}, align 4
; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr {{.*}}, align 4
; CHECK-NEXT: [[TMP1:%.*]] = fneg <vscale x 4 x float> [[WIDE_LOAD]]
; CHECK: store <vscale x 4 x float> [[TMP1]], <vscale x 4 x float>* {{.*}}, align 4
; CHECK: store <vscale x 4 x float> [[TMP1]], ptr {{.*}}, align 4

entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4
%fneg = fneg float %0
%arrayidx3 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %fneg, float* %arrayidx3, align 4
%arrayidx3 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
store float %fneg, ptr %arrayidx3, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
133 changes: 63 additions & 70 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll


104 changes: 52 additions & 52 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
@@ -5,19 +5,19 @@
target triple="aarch64--linux-gnu"

; CHECK: LV: Checking a loop in 'gather_nxv4i32_loaded_index'
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %1 = load float, float* %arrayidx3, align 4
define void @gather_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx3, align 4
define void @gather_nxv4i32_loaded_index(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64, i64* %b, i64 %indvars.iv
%0 = load i64, i64* %arrayidx, align 8
%arrayidx3 = getelementptr inbounds float, float* %a, i64 %0
%1 = load float, float* %arrayidx3, align 4
%arrayidx5 = getelementptr inbounds float, float* %c, i64 %indvars.iv
store float %1, float* %arrayidx5, align 4
%arrayidx = getelementptr inbounds i64, ptr %b, i64 %indvars.iv
%0 = load i64, ptr %arrayidx, align 8
%arrayidx3 = getelementptr inbounds float, ptr %a, i64 %0
%1 = load float, ptr %arrayidx3, align 4
%arrayidx5 = getelementptr inbounds float, ptr %c, i64 %indvars.iv
store float %1, ptr %arrayidx5, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
@@ -27,19 +27,19 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
}

; CHECK: LV: Checking a loop in 'scatter_nxv4i32_loaded_index'
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %1, float* %arrayidx5, align 4
define void @scatter_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %1, ptr %arrayidx5, align 4
define void @scatter_nxv4i32_loaded_index(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64, i64* %b, i64 %indvars.iv
%0 = load i64, i64* %arrayidx, align 8
%arrayidx3 = getelementptr inbounds float, float* %a, i64 %indvars.iv
%1 = load float, float* %arrayidx3, align 4
%arrayidx5 = getelementptr inbounds float, float* %c, i64 %0
store float %1, float* %arrayidx5, align 4
%arrayidx = getelementptr inbounds i64, ptr %b, i64 %indvars.iv
%0 = load i64, ptr %arrayidx, align 8
%arrayidx3 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
%1 = load float, ptr %arrayidx3, align 4
%arrayidx5 = getelementptr inbounds float, ptr %c, i64 %0
store float %1, ptr %arrayidx5, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
@@ -51,18 +51,18 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
; NOTE: For runtime-determined strides the vectoriser versions the loop and adds SCEV checks
; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and a cost of 1.
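; Conceptually, the versioned code looks like this sketch (not the exact
; emitted IR):
;   if (stride == 1) {
;     /* vector body: contiguous, unit-stride loads and stores */
;   } else {
;     /* scalar fallback loop */
;   }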
; CHECK: LV: Checking a loop in 'gather_nxv4i32_unknown_stride'
; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4
define void @gather_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %0 = load float, ptr %arrayidx, align 4
define void @gather_nxv4i32_unknown_stride(ptr noalias nocapture readonly %a, ptr noalias nocapture %b, i64 %stride, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%indvars.iv.stride2 = mul i64 %indvars.iv, %stride
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv.stride2
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %0, float* %arrayidx2, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv.stride2
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
store float %0, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
@@ -74,18 +74,18 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
; NOTE: For runtime-determined strides the vectoriser versions the loop and adds SCEV checks
; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and cost is 1.
; CHECK: LV: Checking a loop in 'scatter_nxv4i32_unknown_stride'
; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4
define void @scatter_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: store float %0, ptr %arrayidx2, align 4
define void @scatter_nxv4i32_unknown_stride(ptr noalias nocapture readonly %a, ptr noalias nocapture %b, i64 %stride, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%indvars.iv.stride2 = mul i64 %indvars.iv, %stride
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv.stride2
store float %0, float* %arrayidx2, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv.stride2
store float %0, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
@@ -95,18 +95,18 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
}

; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride2'
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4
define void @gather_nxv4i32_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, ptr %arrayidx, align 4
define void @gather_nxv4i32_stride2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%indvars.iv.stride2 = mul i64 %indvars.iv, 2
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv.stride2
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %0, float* %arrayidx2, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv.stride2
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
store float %0, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
@@ -116,18 +116,18 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
}

; CHECK: LV: Checking a loop in 'scatter_nxv4i32_stride2'
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4
define void @scatter_nxv4i32_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, ptr %arrayidx2, align 4
define void @scatter_nxv4i32_stride2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%indvars.iv.stride2 = mul i64 %indvars.iv, 2
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv.stride2
store float %0, float* %arrayidx2, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv.stride2
store float %0, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
@@ -138,18 +138,18 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
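; NOTE: a sketch of the scatter counterpart under the same assumptions; the
; value vector and the pointer vector swap positions relative to the gather:
;   call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %val, <vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask)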


; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride64'
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4
define void @gather_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, ptr %arrayidx, align 4
define void @gather_nxv4i32_stride64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%indvars.iv.stride2 = mul i64 %indvars.iv, 64
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv.stride2
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %0, float* %arrayidx2, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv.stride2
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
store float %0, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
@@ -159,18 +159,18 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
}

; CHECK: LV: Checking a loop in 'scatter_nxv4i32_stride64'
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4
define void @scatter_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, ptr %arrayidx2, align 4
define void @scatter_nxv4i32_stride64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%indvars.iv.stride2 = mul i64 %indvars.iv, 64
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv.stride2
store float %0, float* %arrayidx2, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
%0 = load float, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv.stride2
store float %0, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
173 changes: 82 additions & 91 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll

Large diffs are not rendered by default.

28 changes: 13 additions & 15 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -11,7 +11,7 @@ target triple = "aarch64-linux-gnu"
; a[i] = b[i];
; }

define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
; CHECK-LABEL: @cond_ind64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -33,12 +33,10 @@ define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readon
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 4 x i64> [[VEC_IND]] to <vscale x 4 x i1>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP7]])
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP7]])
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
@@ -57,10 +55,10 @@ define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readon
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[AND]], 0
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_08]]
; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]]
; CHECK-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_08]]
; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_08]]
; CHECK-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
@@ -79,10 +77,10 @@ for.body: ; preds = %entry, %for.inc
br i1 %tobool.not, label %for.inc, label %if.then

if.then: ; preds = %for.body
%arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.08
%0 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.08
store i32 %0, i32* %arrayidx1, align 4
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.08
%0 = load i32, ptr %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, ptr %a, i64 %i.08
store i32 %0, ptr %arrayidx1, align 4
br label %for.inc

for.inc: ; preds = %for.body, %if.then
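; NOTE: sketch of how the guard `i & 1` above is vectorized, per the CHECK
; lines: the induction vector is truncated to i1 so each lane's low bit
; becomes its mask bit (names here are hypothetical). The intrinsic suffix
; also loses the pointee type with opaque pointers: p0nxv4i32 becomes p0.
;   %mask = trunc <vscale x 4 x i64> %vec.ind to <vscale x 4 x i1>
;   %v = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %gep.b, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
;   call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %v, ptr %gep.a, i32 4, <vscale x 4 x i1> %mask)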
22 changes: 11 additions & 11 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
@@ -1,28 +1,28 @@
; RUN: opt -S -passes=loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s

define void @invariant_load(i64 %n, i32* noalias nocapture %a, i32* nocapture readonly %b) {
define void @invariant_load(i64 %n, ptr noalias nocapture %a, ptr nocapture readonly %b) {
; CHECK-LABEL: @invariant_load
; CHECK: vector.body:
; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
; CHECK-NEXT: %[[INVLOAD:.*]] = load i32, i32* %[[GEP]]
; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, ptr %b, i64 42
; CHECK-NEXT: %[[INVLOAD:.*]] = load i32, ptr %[[GEP]]
; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INVLOAD]], i32 0
; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[SPLATINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, ptr
; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[SPLAT]], %[[LOAD]]
; CHECK: store <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>*
; CHECK: store <vscale x 4 x i32> %[[ADD]], ptr
entry:
br label %for.body

for.body: ; preds = %for.body.lr.ph, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %b, i64 42
%0 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %b, i64 %iv
%1 = load i32, i32* %arrayidx1, align 4
%arrayidx = getelementptr inbounds i32, ptr %b, i64 42
%0 = load i32, ptr %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, ptr %b, i64 %iv
%1 = load i32, ptr %arrayidx1, align 4
%add = add nsw i32 %0, %1
%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %iv
store i32 %add, i32* %arrayidx2, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %iv
store i32 %add, ptr %arrayidx2, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
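; NOTE: sketch of the splat idiom the CHECK lines above match; the
; loop-invariant scalar is broadcast from lane 0 (hypothetical names):
;   %ins = insertelement <vscale x 4 x i32> poison, i32 %inv, i32 0
;   %splat = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer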
46 changes: 23 additions & 23 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
@@ -1,25 +1,25 @@
; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize,dce,instcombine -S \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue <%s | FileCheck %s

define void @stride7_i32(i32* noalias nocapture %dst, i64 %n) #0 {
define void @stride7_i32(ptr noalias nocapture %dst, i64 %n) #0 {
; CHECK-LABEL: @stride7_i32(
; CHECK: vector.body
; CHECK: %[[VEC_IND:.*]] = phi <vscale x 4 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 4 x i64> %[[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 7, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, i32* %dst, <vscale x 4 x i64> %[[PTR_INDICES]]
; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[PTRS]]
; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, ptr %dst, <vscale x 4 x i64> %[[PTR_INDICES]]
; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %[[PTRS]]
; CHECK-NEXT: %[[VALS:.*]] = add nsw <vscale x 4 x i32> %[[GLOAD]],
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[VALS]], <vscale x 4 x i32*> %[[PTRS]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %[[VALS]], <vscale x 4 x ptr> %[[PTRS]]
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%mul = mul nuw nsw i64 %i.05, 7
%arrayidx = getelementptr inbounds i32, i32* %dst, i64 %mul
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %dst, i64 %mul
%0 = load i32, ptr %arrayidx, align 4
%add = add nsw i32 %0, 3
store i32 %add, i32* %arrayidx, align 4
store i32 %add, ptr %arrayidx, align 4
%inc = add nuw nsw i64 %i.05, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
@@ -28,25 +28,25 @@ for.end: ; preds = %for.end.loopexit, %
ret void
}
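; NOTE: sketch, under the CHECK assumptions above, of why one pointer vector
; suffices: the update is in place, so the same %[[PTRS]] feeds both the
; gather and the scatter (names hypothetical; the splat is written as the
; constant expression used elsewhere in this file):
;   %old = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %m, <vscale x 4 x i32> poison)
;   %new = add nsw <vscale x 4 x i32> %old, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
;   call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %new, <vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %m)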

define void @stride7_f64(double* noalias nocapture %dst, i64 %n) #0 {
define void @stride7_f64(ptr noalias nocapture %dst, i64 %n) #0 {
; CHECK-LABEL: @stride7_f64(
; CHECK: vector.body
; CHECK: %[[VEC_IND:.*]] = phi <vscale x 2 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 2 x i64> %[[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 7, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, <vscale x 2 x i64> %[[PTR_INDICES]]
; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> %[[PTRS]],
; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, ptr %dst, <vscale x 2 x i64> %[[PTR_INDICES]]
; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> %[[PTRS]],
; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]],
; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> %[[VALS]], <vscale x 2 x double*> %[[PTRS]],
; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> %[[VALS]], <vscale x 2 x ptr> %[[PTRS]],
entry:
br label %for.body

for.body: ; preds = %entry, %for.body
%i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%mul = mul nuw nsw i64 %i.05, 7
%arrayidx = getelementptr inbounds double, double* %dst, i64 %mul
%0 = load double, double* %arrayidx, align 8
%arrayidx = getelementptr inbounds double, ptr %dst, i64 %mul
%0 = load double, ptr %arrayidx, align 8
%add = fadd double %0, 1.000000e+00
store double %add, double* %arrayidx, align 8
store double %add, ptr %arrayidx, align 8
%inc = add nuw nsw i64 %i.05, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !6
@@ -56,30 +56,30 @@ for.end: ; preds = %for.end.loopexit, %
}


define void @cond_stride7_f64(double* noalias nocapture %dst, i64* noalias nocapture readonly %cond, i64 %n) #0 {
define void @cond_stride7_f64(ptr noalias nocapture %dst, ptr noalias nocapture readonly %cond, i64 %n) #0 {
; CHECK-LABEL: @cond_stride7_f64(
; CHECK: vector.body
; CHECK: %[[MASK:.*]] = icmp ne <vscale x 2 x i64>
; CHECK: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, <vscale x 2 x i64> %{{.*}}
; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0f64(<vscale x 2 x double*> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]]
; CHECK: %[[PTRS:.*]] = getelementptr inbounds double, ptr %dst, <vscale x 2 x i64> %{{.*}}
; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]]
; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]],
; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64(<vscale x 2 x double> %[[VALS]], <vscale x 2 x double*> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]])
; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> %[[VALS]], <vscale x 2 x ptr> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]])
entry:
br label %for.body

for.body: ; preds = %entry, %for.inc
%i.07 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i64, i64* %cond, i64 %i.07
%0 = load i64, i64* %arrayidx, align 8
%arrayidx = getelementptr inbounds i64, ptr %cond, i64 %i.07
%0 = load i64, ptr %arrayidx, align 8
%tobool.not = icmp eq i64 %0, 0
br i1 %tobool.not, label %for.inc, label %if.then

if.then: ; preds = %for.body
%mul = mul nsw i64 %i.07, 7
%arrayidx1 = getelementptr inbounds double, double* %dst, i64 %mul
%1 = load double, double* %arrayidx1, align 8
%arrayidx1 = getelementptr inbounds double, ptr %dst, i64 %mul
%1 = load double, ptr %arrayidx1, align 8
%add = fadd double %1, 1.000000e+00
store double %add, double* %arrayidx1, align 8
store double %add, ptr %arrayidx1, align 8
br label %for.inc

for.inc: ; preds = %for.body, %if.then
40 changes: 20 additions & 20 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -2,14 +2,14 @@

target triple = "aarch64-unknown-linux-gnu"

define void @trip7_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 {
define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip7_i64(
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ]
; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> {{%.*}}, <vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> {{%.*}}, ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
@@ -23,13 +23,13 @@ entry:

for.body: ; preds = %entry, %for.body
%i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds i64, i64* %src, i64 %i.06
%0 = load i64, i64* %arrayidx, align 8
%arrayidx = getelementptr inbounds i64, ptr %src, i64 %i.06
%0 = load i64, ptr %arrayidx, align 8
%mul = shl nsw i64 %0, 1
%arrayidx1 = getelementptr inbounds i64, i64* %dst, i64 %i.06
%1 = load i64, i64* %arrayidx1, align 8
%arrayidx1 = getelementptr inbounds i64, ptr %dst, i64 %i.06
%1 = load i64, ptr %arrayidx1, align 8
%add = add nsw i64 %1, %mul
store i64 %add, i64* %arrayidx1, align 8
store i64 %add, ptr %arrayidx1, align 8
%inc = add nuw nsw i64 %i.06, 1
%exitcond.not = icmp eq i64 %inc, 7
br i1 %exitcond.not, label %for.end, label %for.body
@@ -38,19 +38,19 @@ for.end: ; preds = %for.body
ret void
}
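; NOTE: sketch of how the ACTIVE_LANE_MASK phi above is advanced for the
; constant trip count of 7 (hypothetical names):
;   %alm.next = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 7)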

define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip5_i8(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]]
; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
@@ -62,13 +62,13 @@ entry:

for.body: ; preds = %entry, %for.body
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds i8, i8* %src, i64 %i.08
%0 = load i8, i8* %arrayidx, align 1
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
%0 = load i8, ptr %arrayidx, align 1
%mul = shl i8 %0, 1
%arrayidx1 = getelementptr inbounds i8, i8* %dst, i64 %i.08
%1 = load i8, i8* %arrayidx1, align 1
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
%1 = load i8, ptr %arrayidx1, align 1
%add = add i8 %mul, %1
store i8 %add, i8* %arrayidx1, align 1
store i8 %add, ptr %arrayidx1, align 1
%inc = add nuw nsw i64 %i.08, 1
%exitcond.not = icmp eq i64 %inc, 5
br i1 %exitcond.not, label %for.end, label %for.body
44 changes: 20 additions & 24 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
@@ -1,32 +1,30 @@
; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple aarch64-linux-gnu -mattr=+sve \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -S %s -o - | FileCheck %s

define void @mloadstore_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
define void @mloadstore_f32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mloadstore_f32
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x float>, ptr
; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt <vscale x 4 x float> %[[LOAD1]],
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, float* %a,
; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, ptr %a,
; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]]
; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]]
; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> %[[FADD]], <vscale x 4 x float>* %[[MSTORE_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %[[FADD]], ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]])
entry:
br label %for.body

for.body: ; preds = %entry, %for.inc
%i.011 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
%arrayidx = getelementptr inbounds float, float* %b, i64 %i.011
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %b, i64 %i.011
%0 = load float, ptr %arrayidx, align 4
%cmp1 = fcmp ogt float %0, 0.000000e+00
br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds float, float* %a, i64 %i.011
%1 = load float, float* %arrayidx3, align 4
%arrayidx3 = getelementptr inbounds float, ptr %a, i64 %i.011
%1 = load float, ptr %arrayidx3, align 4
%add = fadd float %0, %1
store float %add, float* %arrayidx3, align 4
store float %add, ptr %arrayidx3, align 4
br label %for.inc

for.inc: ; preds = %for.body, %if.then
@@ -38,32 +36,30 @@ exit: ; preds = %for.inc
ret void
}
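; NOTE: sketch of the opaque-pointer declarations for the intrinsics matched
; above; the pointee type drops out of the mangling (p0nxv4f32 becomes p0):
;   declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x float>)
;   declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr, i32 immarg, <vscale x 4 x i1>)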

define void @mloadstore_i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
define void @mloadstore_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mloadstore_i32
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>, ptr
; CHECK-NEXT: %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD1]],
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, i32* %a,
; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, ptr %a,
; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]]
; CHECK-NEXT: %[[FADD:.*]] = add <vscale x 4 x i32> %[[LOAD1]], %[[LOAD2]]
; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[FADD]], <vscale x 4 x i32>* %[[MSTORE_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[FADD]], ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]])
entry:
br label %for.body

for.body: ; preds = %entry, %for.inc
%i.011 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.011
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.011
%0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp ne i32 %0, 0
br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds i32, i32* %a, i64 %i.011
%1 = load i32, i32* %arrayidx3, align 4
%arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %i.011
%1 = load i32, ptr %arrayidx3, align 4
%add = add i32 %0, %1
store i32 %add, i32* %arrayidx3, align 4
store i32 %add, ptr %arrayidx3, align 4
br label %for.inc

for.inc: ; preds = %for.body, %if.then
@@ -20,7 +20,7 @@

; CHECK-LABEL: @scalable_load_in_loop
; CHECK-NOT: vector.body
define void @scalable_load_in_loop(i64 %n, <vscale x 4 x i32>* %x, <vscale x 4 x i32>* %y) {
define void @scalable_load_in_loop(i64 %n, ptr %x, ptr %y) {
entry:
br label %for.body

@@ -31,8 +31,8 @@ for.body:
br i1 %cmp, label %for.inc, label %if.end

if.end:
%0 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %y
store <vscale x 4 x i32> %0, <vscale x 4 x i32>* %x
%0 = load <vscale x 4 x i32>, ptr %y
store <vscale x 4 x i32> %0, ptr %x
br label %for.inc

for.inc:
36 changes: 18 additions & 18 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -5,7 +5,7 @@

target triple = "aarch64-linux-gnu"

define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp
; CHECK-VF4IC1: vector.body:
; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
@@ -47,8 +47,8 @@ entry:
for.body: ; preds = %entry, %for.body
%0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
%1 = phi i32 [ 3, %entry ], [ %5, %for.body ]
%2 = getelementptr inbounds i32, i32* %v, i64 %0
%3 = load i32, i32* %2, align 4
%2 = getelementptr inbounds i32, ptr %v, i64 %0
%3 = load i32, ptr %2, align 4
%4 = icmp eq i32 %3, 3
%5 = select i1 %4, i32 %1, i32 7
%6 = add nuw nsw i64 %0, 1
@@ -59,7 +59,7 @@ exit: ; preds = %for.body
ret i32 %5
}
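; NOTE: sketch of the loop-carried select recurrence the CHECKs above match,
; with the splat constant expressions abbreviated and names hypothetical:
;   %vec.icmp = icmp eq <vscale x 4 x i32> %wide.load, [splat of i32 3]
;   %vec.sel = select <vscale x 4 x i1> %vec.icmp, <vscale x 4 x i32> %vec.phi, <vscale x 4 x i32> [splat of i32 7]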

define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
; CHECK-VF4IC1-LABEL: @select_i32_from_icmp
; CHECK-VF4IC1: vector.ph:
; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
@@ -86,8 +86,8 @@ entry:
for.body: ; preds = %entry, %for.body
%0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
%1 = phi i32 [ %a, %entry ], [ %5, %for.body ]
%2 = getelementptr inbounds i32, i32* %v, i64 %0
%3 = load i32, i32* %2, align 4
%2 = getelementptr inbounds i32, ptr %v, i64 %0
%3 = load i32, ptr %2, align 4
%4 = icmp eq i32 %3, 3
%5 = select i1 %4, i32 %1, i32 %b
%6 = add nuw nsw i64 %0, 1
@@ -98,7 +98,7 @@ exit: ; preds = %for.body
ret i32 %5
}

define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) #0 {
define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp
; CHECK-VF4IC1: vector.body:
; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
@@ -118,8 +118,8 @@ entry:
for.body: ; preds = %entry, %for.body
%0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
%1 = phi i32 [ 2, %entry ], [ %5, %for.body ]
%2 = getelementptr inbounds float, float* %v, i64 %0
%3 = load float, float* %2, align 4
%2 = getelementptr inbounds float, ptr %v, i64 %0
%3 = load float, ptr %2, align 4
%4 = fcmp fast ueq float %3, 3.0
%5 = select i1 %4, i32 %1, i32 1
%6 = add nuw nsw i64 %0, 1
@@ -130,7 +130,7 @@ exit: ; preds = %for.body
ret i32 %5
}

define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp
; CHECK-VF4IC1-NOT: vector.body
; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp
@@ -141,8 +141,8 @@ entry:
for.body: ; preds = %entry, %for.body
%0 = phi i64 [ 0, %entry ], [ %6, %for.body ]
%1 = phi fast float [ 3.0, %entry ], [ %5, %for.body ]
%2 = getelementptr inbounds i32, i32* %v, i64 %0
%3 = load i32, i32* %2, align 4
%2 = getelementptr inbounds i32, ptr %v, i64 %0
%3 = load i32, ptr %2, align 4
%4 = icmp eq i32 %3, 3
%5 = select fast i1 %4, float %1, float 7.0
%6 = add nuw nsw i64 %0, 1
@@ -153,13 +153,13 @@ exit: ; preds = %for.body
ret float %5
}

define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) #0 {
define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 {
; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp
; CHECK-VF4IC1: vector.body:
; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
; CHECK-VF4IC1: [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> [[VEC_SEL_TMP]], <vscale x 4 x i32> [[VEC_PHI]]
@@ -176,14 +176,14 @@ entry:
for.body: ; preds = %entry, %for.inc
%i.013 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
%r.012 = phi i32 [ %r.1, %for.inc ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %src1, i64 %i.013
%0 = load i32, i32* %arrayidx, align 4
%arrayidx = getelementptr inbounds i32, ptr %src1, i64 %i.013
%0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp sgt i32 %0, 35
br i1 %cmp1, label %if.then, label %for.inc

if.then: ; preds = %for.body
%arrayidx2 = getelementptr inbounds i32, i32* %src2, i64 %i.013
%1 = load i32, i32* %arrayidx2, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %src2, i64 %i.013
%1 = load i32, ptr %arrayidx2, align 4
%cmp3 = icmp eq i32 %1, 2
%spec.select = select i1 %cmp3, i32 1, i32 %r.012
br label %for.inc
@@ -12,15 +12,15 @@ target triple="aarch64-unknown-linux-gnu"
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07

define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
%0 = load float, float* %arrayidx, align 4
%arrayidx = getelementptr inbounds float, ptr %a, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%add = fadd float %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n
@@ -34,15 +34,15 @@ for.end:
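; NOTE: sketch of why strict FP reductions cost more: without reassociation
; the vector lanes must be folded in order into a scalar accumulator each
; iteration (hypothetical names, assuming the VF vscale x 4 case above):
;   %sum.next = call float @llvm.vector.reduce.fadd.nxv4f32(float %sum, <vscale x 4 x float> %wide.load)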
; CHECK: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07

define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
br label %for.body

for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds double, double* %a, i64 %iv
%0 = load double, double* %arrayidx, align 4
%arrayidx = getelementptr inbounds double, ptr %a, i64 %iv
%0 = load double, ptr %arrayidx, align 4
%add = fadd double %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n