-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Reorder the vector register allocation sequence. #69290
Conversation
In the past, we used to prioritize allocating vector registers starting from V8, which, in some cases, would result in V1 to V7 being idle, and `VRM8NoV0` having no available registers. Now, we prioritize allocating registers starting from V1 to V7, which optimizes the utilization of register resources.
@llvm/pr-subscribers-backend-risc-v Author: ming (yanming123456) Changes: In the past, we used to prioritize allocating vector registers starting from V8, which, in some cases, would result in V1 to V7 being idle, and `VRM8NoV0` having no available registers. Patch is 13.76 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69290.diff 601 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index ab0d354967b34c7..01a5b3999e6adce 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -506,29 +506,21 @@ defvar VM8VTs = [vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t,
vfloat16m8_t, vbfloat16m8_t,
vfloat32m8_t, vfloat64m8_t];
-def VR : VReg<!listconcat(VM1VTs, VMaskVTs),
- (add (sequence "V%u", 8, 31),
- (sequence "V%u", 0, 7)), 1>;
+def VRNoV0 : VReg<!listconcat(VM1VTs, VMaskVTs), (sequence "V%u", 1, 31), 1>;
-def VRNoV0 : VReg<!listconcat(VM1VTs, VMaskVTs),
- (add (sequence "V%u", 8, 31),
- (sequence "V%u", 1, 7)), 1>;
+def VR : VReg<!listconcat(VM1VTs, VMaskVTs), (add VRNoV0, V0), 1>;
-def VRM2 : VReg<VM2VTs, (add (sequence "V%uM2", 8, 31, 2),
- (sequence "V%uM2", 0, 7, 2)), 2>;
+def VRM2NoV0 : VReg<VM2VTs, (sequence "V%uM2", 2, 31, 2), 2>;
-def VRM2NoV0 : VReg<VM2VTs, (add (sequence "V%uM2", 8, 31, 2),
- (sequence "V%uM2", 2, 7, 2)), 2>;
+def VRM2 : VReg<VM2VTs, (add VRM2NoV0, V0M2), 2>;
-def VRM4 : VReg<VM4VTs,
- (add V8M4, V12M4, V16M4, V20M4, V24M4, V28M4, V0M4, V4M4), 4>;
+def VRM4NoV0 : VReg<VM4VTs, (sequence "V%uM4", 4, 31, 4), 4>;
-def VRM4NoV0 : VReg<VM4VTs,
- (add V8M4, V12M4, V16M4, V20M4, V24M4, V28M4, V4M4), 4>;
+def VRM4 : VReg<VM4VTs, (add VRM4NoV0, V0M4), 4>;
-def VRM8 : VReg<VM8VTs, (add V8M8, V16M8, V24M8, V0M8), 8>;
+def VRM8NoV0 : VReg<VM8VTs, (sequence "V%uM8", 8, 31, 8), 8>;
-def VRM8NoV0 : VReg<VM8VTs, (add V8M8, V16M8, V24M8), 8>;
+def VRM8 : VReg<VM8VTs, (add VRM8NoV0, V0M8), 8>;
def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> {
let Size = 64;
diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll
index cecaa9d24f8bccf..e5e91989e97ec5c 100644
--- a/llvm/test/CodeGen/RISCV/double_reduct.ll
+++ b/llvm/test/CodeGen/RISCV/double_reduct.ll
@@ -8,10 +8,10 @@ define float @add_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vfadd.vv v8, v8, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vfredusum.vs v8, v8, v9
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfadd.vv v1, v8, v9
+; CHECK-NEXT: vmv.s.x v2, zero
+; CHECK-NEXT: vfredusum.vs v1, v1, v2
+; CHECK-NEXT: vfmv.f.s fa0, v1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
@@ -23,16 +23,16 @@ define float @fmul_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmul_f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 2
-; CHECK-NEXT: vfmul.vv v8, v8, v10
-; CHECK-NEXT: vrgather.vi v10, v8, 1
-; CHECK-NEXT: vfmul.vv v8, v8, v10
-; CHECK-NEXT: vfmv.f.s fa5, v8
-; CHECK-NEXT: vslidedown.vi v8, v9, 2
-; CHECK-NEXT: vfmul.vv v8, v9, v8
-; CHECK-NEXT: vrgather.vi v9, v8, 1
-; CHECK-NEXT: vfmul.vv v8, v8, v9
-; CHECK-NEXT: vfmv.f.s fa4, v8
+; CHECK-NEXT: vslidedown.vi v1, v8, 2
+; CHECK-NEXT: vfmul.vv v1, v8, v1
+; CHECK-NEXT: vrgather.vi v2, v1, 1
+; CHECK-NEXT: vfmul.vv v1, v1, v2
+; CHECK-NEXT: vfmv.f.s fa5, v1
+; CHECK-NEXT: vslidedown.vi v1, v9, 2
+; CHECK-NEXT: vfmul.vv v1, v9, v1
+; CHECK-NEXT: vrgather.vi v2, v1, 1
+; CHECK-NEXT: vfmul.vv v1, v1, v2
+; CHECK-NEXT: vfmv.f.s fa4, v1
; CHECK-NEXT: fmul.s fa0, fa5, fa4
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -45,9 +45,9 @@ define float @fmin_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmin_f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vfmin.vv v8, v8, v9
-; CHECK-NEXT: vfredmin.vs v8, v8, v8
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfmin.vv v1, v8, v9
+; CHECK-NEXT: vfredmin.vs v1, v1, v1
+; CHECK-NEXT: vfmv.f.s fa0, v1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
@@ -59,9 +59,9 @@ define float @fmax_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmax_f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vfmax.vv v8, v8, v9
-; CHECK-NEXT: vfredmax.vs v8, v8, v8
-; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: vfmax.vv v1, v8, v9
+; CHECK-NEXT: vfredmax.vs v1, v1, v1
+; CHECK-NEXT: vfmv.f.s fa0, v1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
@@ -74,10 +74,10 @@ define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredsum.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vadd.vv v1, v8, v9
+; CHECK-NEXT: vmv.s.x v2, zero
+; CHECK-NEXT: vredsum.vs v1, v1, v2
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
@@ -89,11 +89,11 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vwaddu.vv v10, v8, v9
+; CHECK-NEXT: vwaddu.vv v2, v8, v9
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v8, zero
-; CHECK-NEXT: vredsum.vs v8, v10, v8
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmv.s.x v1, zero
+; CHECK-NEXT: vredsum.vs v1, v2, v1
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%ae = zext <16 x i8> %a to <16 x i16>
%be = zext <16 x i8> %b to <16 x i16>
@@ -107,14 +107,14 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_v32i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v11, zero
+; CHECK-NEXT: vmv.s.x v1, zero
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vwredsumu.vs v10, v10, v11
+; CHECK-NEXT: vwredsumu.vs v1, v10, v1
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vwredsumu.vs v8, v8, v10
+; CHECK-NEXT: vwredsumu.vs v1, v8, v1
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%ae = zext <32 x i8> %a to <32 x i16>
%be = zext <16 x i8> %b to <16 x i16>
@@ -128,32 +128,32 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
; RV32-LABEL: mul_i32:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 2
-; RV32-NEXT: vmul.vv v8, v8, v10
-; RV32-NEXT: vrgather.vi v10, v8, 1
-; RV32-NEXT: vmul.vv v8, v8, v10
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vslidedown.vi v8, v9, 2
-; RV32-NEXT: vmul.vv v8, v9, v8
-; RV32-NEXT: vrgather.vi v9, v8, 1
-; RV32-NEXT: vmul.vv v8, v8, v9
-; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: vslidedown.vi v1, v8, 2
+; RV32-NEXT: vmul.vv v1, v8, v1
+; RV32-NEXT: vrgather.vi v2, v1, 1
+; RV32-NEXT: vmul.vv v1, v1, v2
+; RV32-NEXT: vmv.x.s a0, v1
+; RV32-NEXT: vslidedown.vi v1, v9, 2
+; RV32-NEXT: vmul.vv v1, v9, v1
+; RV32-NEXT: vrgather.vi v2, v1, 1
+; RV32-NEXT: vmul.vv v1, v1, v2
+; RV32-NEXT: vmv.x.s a1, v1
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: mul_i32:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vmul.vv v8, v8, v10
-; RV64-NEXT: vrgather.vi v10, v8, 1
-; RV64-NEXT: vmul.vv v8, v8, v10
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vslidedown.vi v8, v9, 2
-; RV64-NEXT: vmul.vv v8, v9, v8
-; RV64-NEXT: vrgather.vi v9, v8, 1
-; RV64-NEXT: vmul.vv v8, v8, v9
-; RV64-NEXT: vmv.x.s a1, v8
+; RV64-NEXT: vslidedown.vi v1, v8, 2
+; RV64-NEXT: vmul.vv v1, v8, v1
+; RV64-NEXT: vrgather.vi v2, v1, 1
+; RV64-NEXT: vmul.vv v1, v1, v2
+; RV64-NEXT: vmv.x.s a0, v1
+; RV64-NEXT: vslidedown.vi v1, v9, 2
+; RV64-NEXT: vmul.vv v1, v9, v1
+; RV64-NEXT: vrgather.vi v2, v1, 1
+; RV64-NEXT: vmul.vv v1, v1, v2
+; RV64-NEXT: vmv.x.s a1, v1
; RV64-NEXT: mulw a0, a0, a1
; RV64-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
@@ -166,9 +166,9 @@ define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vand.vv v8, v8, v9
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vand.vv v1, v8, v9
+; CHECK-NEXT: vredand.vs v1, v1, v1
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
@@ -180,9 +180,9 @@ define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: or_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vor.vv v8, v8, v9
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vor.vv v1, v8, v9
+; CHECK-NEXT: vredor.vs v1, v1, v1
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
@@ -194,10 +194,10 @@ define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: xor_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vxor.vv v8, v8, v9
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vxor.vv v1, v8, v9
+; CHECK-NEXT: vmv.s.x v2, zero
+; CHECK-NEXT: vredxor.vs v1, v1, v2
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
@@ -209,9 +209,9 @@ define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umin_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vminu.vv v8, v8, v9
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vminu.vv v1, v8, v9
+; CHECK-NEXT: vredminu.vs v1, v1, v1
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
@@ -223,9 +223,9 @@ define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umax_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmaxu.vv v8, v8, v9
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmaxu.vv v1, v8, v9
+; CHECK-NEXT: vredmaxu.vs v1, v1, v1
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
@@ -237,9 +237,9 @@ define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smin_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmin.vv v8, v8, v9
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmin.vv v1, v8, v9
+; CHECK-NEXT: vredmin.vs v1, v1, v1
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
@@ -251,9 +251,9 @@ define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smax_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmax.vv v8, v8, v9
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: vmax.vv v1, v8, v9
+; CHECK-NEXT: vredmax.vs v1, v1, v1
+; CHECK-NEXT: vmv.x.s a0, v1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index 83a4f63add337fe..104bfa4c8937339 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -27,12 +27,12 @@ define void @_Z3foov() {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48)
-; CHECK-NEXT: vle8.v v10, (a0)
+; CHECK-NEXT: vle8.v v1, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46)
; CHECK-NEXT: vle16.v v10, (a0)
@@ -54,33 +54,33 @@ define void @_Z3foov() {
; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_40)
-; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v2, (a0)
; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_44)
; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_44)
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: vl2r.v v12, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: add a1, a1, a2
; CHECK-NEXT: vl2r.v v14, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: add a1, a1, a2
-; CHECK-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v14, (a0)
; CHECK-NEXT: lui a0, 1048572
; CHECK-NEXT: addi a0, a0, 928
-; CHECK-NEXT: vmsbc.vx v0, v8, a0
+; CHECK-NEXT: vmsbc.vx v0, v2, a0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vsext.vf2 v10, v8, v0.t
+; CHECK-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsext.vf2 v8, v1, v0.t
; CHECK-NEXT: lui a0, %hi(var_47)
; CHECK-NEXT: addi a0, a0, %lo(var_47)
-; CHECK-NEXT: vsseg4e16.v v10, (a0)
+; CHECK-NEXT: vsseg4e16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 10
; CHECK-NEXT: mul a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
index c15321057aeb86b..f03329e4fece0b9 100644
--- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
+++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
@@ -40,16 +40,16 @@ define void @last_chance_recoloring_failure() {
; CHECK-NEXT: vmclr.m v0
; CHECK-NEXT: li s0, 36
; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, ma
-; CHECK-NEXT: vfwadd.vv v16, v8, v8, v0.t
+; CHECK-NEXT: vfwadd.vv v8, v4, v4, v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: call func@plt
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vrgather.vv v4, v8, v8, v0.t
+; CHECK-NEXT: vrgather.vv v4, v8, v4, v0.t
; CHECK-NEXT: vsetvli zero, s0, e16, m4, ta, ma
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
@@ -69,7 +69,7 @@ define void @last_chance_recoloring_failure() {
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu
-; CHECK-NEXT: vssubu.vv v4, v4, v8, v0.t
+; CHECK-NEXT: vssubu.vv v4, v4, v4, v0.t
; CHECK-NEXT: vsetvli zero, s0, e32, m8, tu, mu
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
@@ -117,13 +117,13 @@ define void @last_chance_recoloring_failure() {
; SUBREGLIVENESS-NEXT: vmclr.m v0
; SUBREGLIVENESS-NEXT: li s0, 36
; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, ma
-; SUBREGLIVENESS-NEXT: vfwadd.vv v16, v8, v8, v0.t
+; SUBREGLIVENESS-NEXT: vfwadd.vv v8, v4, v4, v0.t
; SUBREGLIVENESS-NEXT: addi a0, sp, 16
-; SUBREGLIVENESS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; SUBREGLIVENESS-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; SUBREGLIVENESS-NEXT: call func@plt
; SUBREGLIVENESS-NEXT: li a0, 32
; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; SUBREGLIVENESS-NEXT: vrgather.vv v16, v8, v8, v0.t
+; SUBREGLIVENESS-NEXT: vrgather.vv v16, v4, v4, v0.t
; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e16, m4, ta, ma
; SUBREGLIVENESS-NEXT: csrr a1, vlenb
; SUBREGLIVENESS-NEXT: slli a1, a1, 3
@@ -138,7 +138,7 @@ define void @last_chance_recoloring_failure() {
; SUBREGLIVENESS-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; SUBREGLIVENESS-NEXT: vfwsub.wv v8, v24, v20
; SUBREGLIVENESS-NEXT: vsetvli zero, a0, e16, m4, tu, mu
-; SUBREGLIVENESS-NEXT: vssubu.vv v16, v16, v8, v0.t
+; SUBREGLIVENESS-NEXT: vssubu.vv v16, v16, v4, v0.t
; SUBREGLIVENESS-NEXT: vsetvli zero, s0, e32, m8, tu, mu
; SUBREGLIVENESS-NEXT: vfdiv.vv v8, v24, v8, v0.t
; SUBREGLIVENESS-NEXT: vse32.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/abs-sdnode.ll
index 589b9994651d24a..99f9b3c744097c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-sdnode.ll
@@ -8,8 +8,8 @@ define <vscale x 1 x i16> @vabs_nxv1i16(<vscale x 1 x i16> %v) {
; CHECK-LABEL: vabs_nxv1i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vrsub.vi v9, v8, 0
-; CHECK-NEXT: vmax.vv v8, v8, v9
+; CHECK-NEXT: vrsub.vi v1, v8, 0
+; CHECK-NEXT: vmax.vv v8, v8, v1
; CHECK-NEXT: ret
%r = call <vscale x 1 x i16> @llvm.abs.nxv1i16(<vscale x 1 x i16> %v, i1 false)
ret <vscale x 1 x i16> %r
@@ -21,8 +21,8 @@ define <vscale x 2 x i16> @vabs_nxv2i16(<vscale x 2 x i16> %v) {
; CHECK-LABEL: vabs_nxv2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vrsub.vi v9, v8, 0
-; CHECK-NEXT: vmax.vv v8, v8, v9
+; CHECK-NEXT: vrsub.vi v1, v8, 0
+; CHECK-NEXT: vmax.vv v8, v8, v1
; CHECK-NEXT: ret
%r = call <vscale x 2 x i16> @llvm.abs.nxv2i16(<vscale x 2 x i16> %v, i1 false)
ret <vscale x 2 x i16> %r
@@ -34,8 +34,8 @@ define <vscale x 4 x i16> @vabs_nxv4i16(<vscale x 4 x i16> %v) {
; CHECK-LABEL: vabs_nxv4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vrsub.vi v9, v8, 0
-; CHECK-NEXT: vmax.vv v8, v8, v9
+; CHECK-NEXT: vrsub.vi v1, v8, 0
+; CHECK-NEXT: vmax.vv v8, v8, v1
; CHECK-NEXT: ret
%r = call <vscale x 4 x i16> @llvm.abs.nxv4i16(<vscale x 4 x i16> %v, i1 false)
ret <vscale x 4 x i16> %r
@@ -47,8 +47,8 @@ define <vscale x 8 x i16> ...
[truncated]
|
An example:

void vec(float *restrict out,
float *in1,
float *in2,
float *in3,
size_t n) {
size_t vl;
for (size_t i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e32m8(n);
vfloat32m8_t v1 = __riscv_vle32_v_f32m8(&in1[i], vl);
vfloat32m8_t v2 = __riscv_vle32_v_f32m8(&in2[i], vl);
vbool4_t true_mask = __riscv_vmfgt(v1, v2, vl);
vbool4_t false_mask = __riscv_vmnot(true_mask, vl);
vfloat32m8_t v3 = __riscv_vle32_v_f32m8_m(false_mask, &in3[i], vl);
vfloat32m8_t val = __riscv_vfsub(true_mask, v1, v2, vl);
val = __riscv_vfsub_mu(false_mask, val, v2, v3, vl);
__riscv_vse32(&out[i], val, vl);
}
}

Before:

vec: # @vec
# %bb.0: # %entry
beqz a4, .LBB0_4
# %bb.1: # %for.body.lr.ph
addi sp, sp, -16
csrr a5, vlenb
slli a5, a5, 4
sub sp, sp, a5
li a5, 0
vsetvli a6, a4, e32, m8, ta, ma
slli a7, a6, 2
.LBB0_2: # %for.body
# =>This Inner Loop Header: Depth=1
vsetvli zero, a6, e32, m8, ta, ma
vle32.v v24, (a1)
addi t0, sp, 16
vs8r.v v24, (t0) # Unknown-size Folded Spill
vle32.v v16, (a2)
vmflt.vv v8, v16, v24
vmnot.m v9, v8
vmv1r.v v0, v9
vle32.v v24, (a3), v0.t
csrr t0, vlenb
slli t0, t0, 3
add t0, t0, sp
addi t0, t0, 16
vs8r.v v24, (t0) # Unknown-size Folded Spill
vmv1r.v v0, v8
addi t0, sp, 16
vl8r.v v24, (t0) # Unknown-size Folded Reload
vfsub.vv v24, v24, v16, v0.t
vsetvli zero, zero, e32, m8, ta, mu
vmv1r.v v0, v9
csrr t0, vlenb
slli t0, t0, 3
add t0, t0, sp
addi t0, t0, 16
vl8r.v v8, (t0) # Unknown-size Folded Reload
vfsub.vv v24, v16, v8, v0.t
vse32.v v24, (a0)
add a5, a5, a6
add a0, a0, a7
add a3, a3, a7
add a2, a2, a7
add a1, a1, a7
bltu a5, a4, .LBB0_2
# %bb.3:
csrr a0, vlenb
slli a0, a0, 4
add sp, sp, a0
addi sp, sp, 16
.LBB0_4: # %for.cond.cleanup
ret

After:

vec: # @vec
# %bb.0: # %entry
beqz a4, .LBB0_3
# %bb.1: # %for.body.lr.ph
li a5, 0
vsetvli a6, a4, e32, m8, ta, ma
slli a7, a6, 2
.LBB0_2: # %for.body
# =>This Inner Loop Header: Depth=1
vsetvli zero, a6, e32, m8, ta, mu
vle32.v v16, (a1)
vle32.v v8, (a2)
vmflt.vv v1, v8, v16
vmnot.m v2, v1
vmv1r.v v0, v2
vle32.v v24, (a3), v0.t
vmv1r.v v0, v1
vfsub.vv v16, v16, v8, v0.t
vmv1r.v v0, v2
vfsub.vv v16, v8, v24, v0.t
vse32.v v16, (a0)
add a5, a5, a6
add a0, a0, a7
add a3, a3, a7
add a2, a2, a7
add a1, a1, a7
bltu a5, a4, .LBB0_2
.LBB0_3: # %for.cond.cleanup
ret |
I think this can be added to the test cases. |
There are many optimized outcomes in existing examples, such as:
There are many optimized outcomes in existing examples, such as: llvm/test/CodeGen/RISCV/rvv/abs-vp.ll:607 |
What is the impact on future Vector Calling convention? Do they conflict with each other? |
Currently the vector calling convention proposal reserves |
I think we're probably going to need to have the allocation order depend on the calling convention. For the proposed vector calling convention, having our allocation order prefer callee saved registers is the last thing we want. For the default scalar convention (which has no vector callee saves), preferring the fragmented registers in the first LMUL8 group (i.e. this change) is probably reasonable. Do you have any performance data with this change? On the surface, this seems reasonable, but I'm curious about empirical evaluation. p.s. There are some unrelated style changes rolled into the change which can probably be separated. Specifically, definition order and use of sequence vs hard coded lists. |
Refer to #82967. |
In the past, we used to prioritize allocating vector registers starting from V8, which, in some cases, would result in V1 to V7 being idle, and
VRM8NoV0
having no available registers. Now, we prioritize allocating registers starting from V1 to V7, which optimizes the utilization of register resources.