diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 540c2e7476dc1..08d62f24466e4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1998,6 +1998,10 @@ bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const { case Intrinsic::vp_sdiv: case Intrinsic::vp_urem: case Intrinsic::vp_srem: + case Intrinsic::ssub_sat: + case Intrinsic::vp_ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::vp_usub_sat: return Operand == 1; // These intrinsics are commutative. case Intrinsic::vp_add: @@ -2009,6 +2013,18 @@ bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const { case Intrinsic::vp_fmul: case Intrinsic::vp_icmp: case Intrinsic::vp_fcmp: + case Intrinsic::smin: + case Intrinsic::vp_smin: + case Intrinsic::umin: + case Intrinsic::vp_umin: + case Intrinsic::smax: + case Intrinsic::vp_smax: + case Intrinsic::umax: + case Intrinsic::vp_umax: + case Intrinsic::sadd_sat: + case Intrinsic::vp_sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::vp_uadd_sat: // These intrinsics have 'vr' versions. case Intrinsic::vp_sub: case Intrinsic::vp_fsub: diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 191f047131fb1..5d09c39dfd6e6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -2849,23 +2849,21 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp.not, label %for.cond.cleanup, label %for.body } -declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_mul(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_mul: +define void @sink_splat_min(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_min: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB46_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB46_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB46_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2877,9 +2875,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2887,23 +2885,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_add: +define void @sink_splat_min_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_min_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB47_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB47_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB47_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2915,9 +2909,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2925,21 +2919,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_vp_add_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_add_commute: +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) + +define void @sink_splat_max(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_max: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB48_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB48_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB48_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2951,9 +2945,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2961,23 +2955,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_sub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_sub: +define void @sink_splat_max_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_max_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB49_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB49_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB49_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2989,9 +2979,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -2999,21 +2989,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_vp_rsub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_rsub: +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) + +define void @sink_splat_umin(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_umin: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB50_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB50_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB50_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3025,9 +3015,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3035,23 +3025,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_shl(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_shl: +define void @sink_splat_umin_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_umin_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB51_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB51_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB51_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3063,9 +3049,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3073,23 +3059,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_lshr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_lshr: +define void @sink_splat_umax(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_umax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB52_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB52_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB52_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3101,9 +3085,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3111,23 +3095,19 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_ashr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_ashr: +define void @sink_splat_umax_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_umax_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB53_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB53_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB53_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3139,9 +3119,9 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) store <4 x i32> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3149,10 +3129,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fmul.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fmul: +define void @sink_splat_sadd_sat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sadd_sat: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3160,25 +3140,23 @@ define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB54_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfmul.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB54_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fmul.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3187,10 +3165,8 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fdiv.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fdiv: +define void @sink_splat_sadd_sat_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sadd_sat_commute: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3198,25 +3174,23 @@ define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB55_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfdiv.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB55_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3225,35 +3199,35 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_vp_frdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_frdiv: +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) + +define void @sink_splat_ssub_sat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_ssub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB56_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfrdiv.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB56_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB56_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3261,10 +3235,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fadd.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fadd: +define void @sink_splat_uadd_sat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_uadd_sat: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3272,25 +3246,23 @@ define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB57_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfadd.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB57_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fadd.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3299,10 +3271,8 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fsub: +define void @sink_splat_uadd_sat_commute(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_uadd_sat_commute: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 @@ -3310,25 +3280,23 @@ define void @sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer ; CHECK-NEXT: .LBB58_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfsub.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB58_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3337,37 +3305,35 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.frsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) -define void @sink_splat_vp_frsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_frsub: +define void @sink_splat_usub_sat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_usub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a2, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB59_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vfrsub.vf v8, v8, fa0, v0.t -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB59_1 +; CHECK-NEXT: addi a2, a2, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a2, .LBB59_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) - store <4 x float> %1, ptr %0, align 4 - %index.next = add nuw i64 %index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3375,10 +3341,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_udiv: +define void @sink_splat_vp_mul(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_mul: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 @@ -3387,7 +3353,7 @@ define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vmul.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3403,7 +3369,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3413,10 +3379,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_sdiv: +define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_add: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 @@ -3425,7 +3391,7 @@ define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3441,7 +3407,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3451,10 +3417,8 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) - -define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_urem: +define void @sink_splat_vp_add_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_add_commute: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 @@ -3463,7 +3427,7 @@ define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3479,7 +3443,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3489,10 +3453,10 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_srem: +define void @sink_splat_vp_sub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sub: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 @@ -3501,7 +3465,7 @@ define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 @@ -3517,7 +3481,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3527,24 +3491,21 @@ for.cond.cleanup: ; preds = %vector.body ret void } -; Check that we don't sink a splat operand that has no chance of being folded. - -define void @sink_splat_vp_srem_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_srem_commute: +define void @sink_splat_vp_rsub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_rsub: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: lui a1, 1 -; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: .LBB64_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vrem.vv v9, v8, v9, v0.t +; CHECK-NEXT: vrsub.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a1, .LBB64_1 +; CHECK-NEXT: bne a0, a3, .LBB64_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -3556,7 +3517,7 @@ vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 @@ -3566,115 +3527,112 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fma: +define void @sink_splat_vp_shl(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_shl: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB65_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle32.v v9, (a1) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsll.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a1, a3, .LBB65_1 +; CHECK-NEXT: bne a0, a3, .LBB65_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = getelementptr inbounds float, ptr %b, i64 %index - %wide.load12 = load <4 x float>, ptr %1, align 4 - %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) - store <4 x float> %2, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 - %3 = icmp eq i64 %index.next, 1024 - br i1 %3, label %for.cond.cleanup, label %vector.body + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fma_commute: +declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_lshr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_lshr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a3, 1 -; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB66_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle32.v v9, (a1) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsrl.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a1, a3, .LBB66_1 +; CHECK-NEXT: bne a0, a3, .LBB66_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %a, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = getelementptr inbounds float, ptr %b, i64 %index - %wide.load12 = load <4 x float>, ptr %1, align 4 - %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) - store <4 x float> %2, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 - %3 = icmp eq i64 %index.next, 1024 - br i1 %3, label %for.cond.cleanup, label %vector.body + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body for.cond.cleanup: ; preds = %vector.body ret void } +declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_mul_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_mul_lmul2: +define void @sink_splat_vp_ashr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_ashr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB67_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB67_1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsra.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB67_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = mul <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3683,32 +3641,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_add_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_add_lmul2: +declare <4 x float> @llvm.vp.fmul.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fmul: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB68_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB68_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = add <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fmul.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3717,32 +3679,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_sub_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_sub_lmul2: +declare <4 x float> @llvm.vp.fdiv.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fdiv: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB69_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfdiv.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a2, .LBB69_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = sub <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3751,32 +3717,1007 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_rsub_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_rsub_lmul2: +define void @sink_splat_vp_frdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_frdiv: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: .LBB70_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfrdiv.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB70_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fadd.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB71_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB71_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fadd.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB72_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB72_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.frsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_frsub(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_frsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB73_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vfrsub.vf v8, v8, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB73_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x float> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_udiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_udiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB74_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB74_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_sdiv(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sdiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB75_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB75_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_urem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_urem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB76_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB76_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_srem(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_srem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB77_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vrem.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB77_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; Check that we don't sink a splat operand that has no chance of being folded. + +define void @sink_splat_vp_srem_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_srem_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: lui a1, 1 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: .LBB78_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vrem.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a1, .LBB78_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fma: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB79_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a3, .LBB79_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture readonly %b, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fma_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a1, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB80_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a1, a3, .LBB80_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, ptr %a, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = getelementptr inbounds float, ptr %b, i64 %index + %wide.load12 = load <4 x float>, ptr %1, align 4 + %2 = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12, <4 x i1> %m, i32 %vl) + store <4 x float> %2, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + + +define void @sink_splat_mul_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_mul_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB81_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB81_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = mul <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_add_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_add_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB82_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB82_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = add <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_sub_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_sub_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB83_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB83_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = sub <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_rsub_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_rsub_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB84_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB84_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = sub <4 x i64> %broadcast.splat, %wide.load + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_and_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_and_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB85_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB85_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = and <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_or_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_or_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB86_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB86_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = or <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_xor_lmul2(ptr nocapture %a, i64 signext %x) { +; CHECK-LABEL: sink_splat_xor_lmul2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: .LBB87_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB87_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 + %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <4 x i64>, ptr %0, align 8 + %1 = xor <4 x i64> %wide.load, %broadcast.splat + store <4 x i64> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_mul_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_mul_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB88_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB88_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = mul <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_add_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_add_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB89_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB89_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = add <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_sub_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sub_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB90_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB90_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = sub <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_rsub_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_rsub_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB91_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB91_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = sub <32 x i32> %broadcast.splat, %wide.load + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_and_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_and_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB92_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB92_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = and <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_or_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_or_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB93_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB93_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = or <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_xor_lmul8(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_xor_lmul8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: .LBB94_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB94_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <32 x i32>, ptr %0, align 4 + %1 = xor <32 x i32> %wide.load, %broadcast.splat + store <32 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_mul_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_mul_lmulmf2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB95_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB95_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = mul <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_add_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_add_lmulmf2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB96_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB96_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = add <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_sub_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_sub_lmulmf2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB97_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB70_1 +; CHECK-NEXT: bne a0, a2, .LBB97_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = sub <4 x i64> %broadcast.splat, %wide.load - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = sub <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3785,32 +4726,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_and_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_and_lmul2: +define void @sink_splat_rsub_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_rsub_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: .LBB71_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB98_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB71_1 +; CHECK-NEXT: bne a0, a2, .LBB98_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = and <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = sub <2 x i32> %broadcast.splat, %wide.load + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3819,32 +4760,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_or_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_or_lmul2: +define void @sink_splat_and_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_and_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: .LBB72_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB99_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB72_1 +; CHECK-NEXT: bne a0, a2, .LBB99_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = or <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = and <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3853,32 +4794,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_xor_lmul2(ptr nocapture %a, i64 signext %x) { -; CHECK-LABEL: sink_splat_xor_lmul2: +define void @sink_splat_or_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_or_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: .LBB73_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB100_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB73_1 +; CHECK-NEXT: bne a0, a2, .LBB100_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %x, i64 0 - %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <4 x i64>, ptr %0, align 8 - %1 = xor <4 x i64> %wide.load, %broadcast.splat - store <4 x i64> %1, ptr %0, align 8 + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = or <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3887,33 +4828,32 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_mul_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_mul_lmul8: +define void @sink_splat_xor_lmulmf2(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_splat_xor_lmulmf2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: lui a2, 2 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB74_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: .LBB101_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vxor.vx v8, v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB74_1 +; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: bne a0, a2, .LBB101_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = mul <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %0 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.load = load <2 x i32>, ptr %0, align 8 + %1 = xor <2 x i32> %wide.load, %broadcast.splat + store <2 x i32> %1, ptr %0, align 8 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3922,33 +4862,39 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_add_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_add_lmul8: +declare <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32>, <4 x i32>, metadata, <4 x i1>, i32) + +define void @sink_splat_vp_icmp(ptr nocapture %x, i32 signext %y, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_icmp: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB75_1: # %vector.body +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: .LBB102_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmseq.vx v0, v10, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v9, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB75_1 +; CHECK-NEXT: bne a0, a3, .LBB102_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %y, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = add <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %0 = getelementptr inbounds i32, ptr %x, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, metadata !"eq", <4 x i1> %m, i32 %vl) + call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3957,33 +4903,39 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_sub_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_sub_lmul8: +declare <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float>, <4 x float>, metadata, <4 x i1>, i32) + +define void @sink_splat_vp_fcmp(ptr nocapture %x, float %y, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fcmp: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: lui a2, 1 ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB76_1: # %vector.body +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 +; CHECK-NEXT: .LBB103_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmfeq.vf v0, v10, fa0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v9, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB76_1 +; CHECK-NEXT: bne a0, a2, .LBB103_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x float> poison, float %y, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = sub <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %0 = getelementptr inbounds float, ptr %x, i64 %index + %wide.load = load <4 x float>, ptr %0, align 4 + %1 = call <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, metadata !"oeq", <4 x i1> %m, i32 %vl) + call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -3992,33 +4944,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_rsub_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_rsub_lmul8: +declare <4 x i32> @llvm.vp.smin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_min(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_min: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB77_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB104_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB77_1 +; CHECK-NEXT: bne a0, a3, .LBB104_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = sub <32 x i32> %broadcast.splat, %wide.load - store <32 x i32> %1, ptr %0, align 4 + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4027,33 +4982,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_and_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_and_lmul8: +define void @sink_splat_vp_min_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_min_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB78_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB105_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmin.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB78_1 +; CHECK-NEXT: bne a0, a3, .LBB105_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = and <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4062,33 +5018,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_or_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_or_lmul8: +declare <4 x i32> @llvm.vp.smax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_max(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_max: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB79_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB106_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB79_1 +; CHECK-NEXT: bne a0, a3, .LBB106_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = or <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4097,33 +5056,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_xor_lmul8(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_xor_lmul8: +define void @sink_splat_vp_max_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_max_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-NEXT: .LBB80_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB107_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmax.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB80_1 +; CHECK-NEXT: bne a0, a3, .LBB107_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <32 x i32> poison, i32 %x, i32 0 - %broadcast.splat = shufflevector <32 x i32> %broadcast.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] %0 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.load = load <32 x i32>, ptr %0, align 4 - %1 = xor <32 x i32> %wide.load, %broadcast.splat - store <32 x i32> %1, ptr %0, align 4 + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.smax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4132,32 +5092,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_mul_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_mul_lmulmf2: +define void @sink_splat_vp_umin_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_umin_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB81_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB108_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vminu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB81_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB108_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = mul <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.umin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4166,32 +5128,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_add_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_add_lmulmf2: +declare <4 x i32> @llvm.vp.umax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_umax(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_umax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB82_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB109_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB82_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB109_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = add <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4200,32 +5166,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_sub_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_sub_lmulmf2: +define void @sink_splat_vp_umax_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_umax_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB83_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB110_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vmaxu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB83_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB110_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = sub <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.umax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4234,32 +5202,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_rsub_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_rsub_lmulmf2: +declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_sadd_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sadd_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB84_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB111_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB84_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB111_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = sub <2 x i32> %broadcast.splat, %wide.load - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4268,32 +5240,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_and_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_and_lmulmf2: +define void @sink_splat_vp_sadd_sat_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sadd_sat_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB85_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB112_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB85_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB112_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = and <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4302,33 +5276,37 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_or_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_or_lmulmf2: +declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_ssub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_ssub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB86_1: # %vector.body +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB113_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB86_1 +; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a3, .LBB113_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = or <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 - %index.next = add nuw i64 %index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4336,32 +5314,36 @@ for.cond.cleanup: ; preds = %vector.body ret void } -define void @sink_splat_xor_lmulmf2(ptr nocapture %a, i32 signext %x) { -; CHECK-LABEL: sink_splat_xor_lmulmf2: +declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_uadd_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_uadd_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: .LBB87_1: # %vector.body +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB114_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: bne a0, a2, .LBB87_1 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a3, .LBB114_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <2 x i32> poison, i32 %x, i64 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> poison, <2 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.load = load <2 x i32>, ptr %0, align 8 - %1 = xor <2 x i32> %wide.load, %broadcast.splat - store <2 x i32> %1, ptr %0, align 8 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4370,39 +5352,34 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32>, <4 x i32>, metadata, <4 x i1>, i32) - -define void @sink_splat_vp_icmp(ptr nocapture %x, i32 signext %y, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_icmp: +define void @sink_splat_vp_uadd_sat_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_uadd_sat_commute: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmv1r.v v8, v0 ; CHECK-NEXT: lui a3, 1 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: .LBB88_1: # %vector.body +; CHECK-NEXT: .LBB115_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmseq.vx v0, v10, a1, v0.t +; CHECK-NEXT: vsaddu.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0), v0.t +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a3, .LBB88_1 +; CHECK-NEXT: bne a0, a3, .LBB115_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %y, i32 0 + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds i32, ptr %x, i64 %index + %0 = getelementptr inbounds i32, ptr %a, i64 %index %wide.load = load <4 x i32>, ptr %0, align 4 - %1 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, metadata !"eq", <4 x i1> %m, i32 %vl) - call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) + %1 = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 %index.next = add nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body @@ -4411,40 +5388,37 @@ for.cond.cleanup: ; preds = %vector.body ret void } -declare <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float>, <4 x float>, metadata, <4 x i1>, i32) +declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) -define void @sink_splat_vp_fcmp(ptr nocapture %x, float %y, <4 x i1> %m, i32 zeroext %vl) { -; CHECK-LABEL: sink_splat_vp_fcmp: +define void @sink_splat_vp_usub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_usub_sat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmv1r.v v8, v0 -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: li a3, 1024 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: .LBB89_1: # %vector.body +; CHECK-NEXT: .LBB116_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmfeq.vf v0, v10, fa0, v0.t +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0), v0.t -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB89_1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: bnez a3, .LBB116_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: - %broadcast.splatinsert = insertelement <4 x float> poison, float %y, i32 0 - %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %entry %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] - %0 = getelementptr inbounds float, ptr %x, i64 %index - %wide.load = load <4 x float>, ptr %0, align 4 - %1 = call <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, metadata !"oeq", <4 x i1> %m, i32 %vl) - call void @llvm.masked.store.v4f32.p0(<4 x float> zeroinitializer, ptr %0, i32 4, <4 x i1> %1) - %index.next = add nuw i64 %index, 4 + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + store <4 x i32> %1, ptr %0, align 4 + %index.next = sub nuw i64 %index, 4 %2 = icmp eq i64 %index.next, 1024 br i1 %2, label %for.cond.cleanup, label %vector.body