-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Add more intrinsics into canSplatOperand. #83106
Conversation
This patch adds smin/smax/umin/umax/sadd_sat/ssub_sat/uadd_sat/usub_sat into canSplatOperand. It can help llvm fold vv instructions with one splat operand to vx instructions.
@llvm/pr-subscribers-backend-risc-v Author: Yeting Kuo (yetingk) ChangesThis patch adds smin/smax/umin/umax/sadd_sat/ssub_sat/uadd_sat/usub_sat Patch is 145.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83106.diff 2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 540c2e7476dc18..9a873c4e199b05 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2009,6 +2009,22 @@ bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const {
case Intrinsic::vp_fmul:
case Intrinsic::vp_icmp:
case Intrinsic::vp_fcmp:
+ case Intrinsic::smin:
+ case Intrinsic::vp_smin:
+ case Intrinsic::umin:
+ case Intrinsic::vp_umin:
+ case Intrinsic::smax:
+ case Intrinsic::vp_smax:
+ case Intrinsic::umax:
+ case Intrinsic::vp_umax:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::vp_sadd_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::vp_uadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::vp_ssub_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::vp_usub_sat:
// These intrinsics have 'vr' versions.
case Intrinsic::vp_sub:
case Intrinsic::vp_fsub:
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 191f047131fb16..81f29bb8b7a9a3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -2849,23 +2849,21 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp.not, label %for.cond.cleanup, label %for.body
}
-declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
-define void @sink_splat_vp_mul(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_mul:
+define void @sink_splat_min(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_min:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB46_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vmul.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmin.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB46_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB46_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2877,9 +2875,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -2887,23 +2885,19 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
-
-define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_add:
+define void @sink_splat_min_commute(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_min_commute:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB47_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmin.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB47_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB47_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2915,9 +2909,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -2925,21 +2919,21 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-define void @sink_splat_vp_add_commute(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_add_commute:
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+
+define void @sink_splat_max(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_max:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB48_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB48_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB48_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2951,9 +2945,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -2961,23 +2955,19 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
-
-define void @sink_splat_vp_sub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_sub:
+define void @sink_splat_max_commute(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_max_commute:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB49_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmax.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB49_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB49_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -2989,9 +2979,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -2999,21 +2989,21 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-define void @sink_splat_vp_rsub(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_rsub:
+declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
+
+define void @sink_splat_umin(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_umin:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB50_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vrsub.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vminu.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB50_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB50_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3025,9 +3015,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -3035,23 +3025,19 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
-
-define void @sink_splat_vp_shl(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_shl:
+define void @sink_splat_umin_commute(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_umin_commute:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB51_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vsll.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vminu.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB51_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB51_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3063,9 +3049,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -3073,23 +3059,21 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
-define void @sink_splat_vp_lshr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_lshr:
+define void @sink_splat_umax(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_umax:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB52_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vsrl.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmaxu.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB52_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB52_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3101,9 +3085,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -3111,23 +3095,19 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
-
-define void @sink_splat_vp_ashr(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_ashr:
+define void @sink_splat_umax_commute(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_umax_commute:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a3, 1
-; CHECK-NEXT: add a3, a0, a3
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: .LBB53_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vsra.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmaxu.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: bne a0, a3, .LBB53_1
+; CHECK-NEXT: addi a2, a2, 4
+; CHECK-NEXT: addi a0, a0, -16
+; CHECK-NEXT: bnez a2, .LBB53_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
@@ -3139,9 +3119,9 @@ vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds i32, ptr %a, i64 %index
%wide.load = load <4 x i32>, ptr %0, align 4
- %1 = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
+ %1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load)
store <4 x i32> %1, ptr %0, align 4
- %index.next = add nuw i64 %index, 4
+ %index.next = sub nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -3149,10 +3129,10 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-declare <4 x float> @llvm.vp.fmul.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32)
+declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
-define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_fmul:
+define void @sink_splat_sadd_sat(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_sadd_sat:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: add a2, a0, a2
@@ -3160,25 +3140,23 @@ define void @sink_splat_vp_fmul(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer
; CHECK-NEXT: .LBB54_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vfmul.vf v8, v8, fa0, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: bne a0, a2, .LBB54_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
- %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0
- %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+ %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
- %0 = getelementptr inbounds float, ptr %a, i64 %index
- %wide.load = load <4 x float>, ptr %0, align 4
- %1 = call <4 x float> @llvm.vp.fmul.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl)
- store <4 x float> %1, ptr %0, align 4
+ %0 = getelementptr inbounds i32, ptr %a, i64 %index
+ %wide.load = load <4 x i32>, ptr %0, align 4
+ %1 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat)
+ store <4 x i32> %1, ptr %0, align 4
%index.next = add nuw i64 %index, 4
%2 = icmp eq i64 %index.next, 1024
br i1 %2, label %for.cond.cleanup, label %vector.body
@@ -3187,10 +3165,8 @@ for.cond.cleanup: ; preds = %vector.body
ret void
}
-declare <4 x float> @llvm.vp.fdiv.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32)
-
-define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
-; CHECK-LABEL: sink_splat_vp_fdiv:
+define void @sink_splat_sadd_sat_commute(ptr nocapture %a, i32 signext %x) {
+; CHECK-LABEL: sink_splat_sadd_sat_commute:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a2, 1
; CHECK-NEXT: add a2, a0, a2
@@ -3198,25 +3174,23 @@ define void @sink_splat_vp_fdiv(ptr nocapture %a, float %x, <4 x i1> %m, i32 zer
; CHECK-NEXT: .LBB55_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vfdiv.vf v8, v8, fa0, v0.t
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a1
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: bne a0, a2, .LBB55_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: ret
entry:
- %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0
- %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+ %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.sp...
[truncated]
|
case Intrinsic::vp_sadd_sat: | ||
case Intrinsic::uadd_sat: | ||
case Intrinsic::vp_uadd_sat: | ||
case Intrinsic::ssub_sat: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ssub_sat/usub_sat isn't commutable and we only have one .vx form for those instructions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This patch adds smin/smax/umin/umax/sadd_sat/ssub_sat/uadd_sat/usub_sat
into canSplatOperand. It can help llvm fold vv instructions with one splat
operand to vx instructions.