From 44a430354d63727d0317f00e2c1210e79131babe Mon Sep 17 00:00:00 2001
From: jacquesguan
Date: Mon, 21 Feb 2022 16:13:44 +0800
Subject: [PATCH] [RISCV] Fold store of vmv.f.s to a vse with VL=1.

This patch supports the FP part of D109482.

Differential Revision: https://reviews.llvm.org/D120235
---
 .../Target/RISCV/RISCVInstrInfoVSDPatterns.td |  7 +++
 .../CodeGen/RISCV/rvv/extractelt-fp-rv64.ll   | 29 ++++++++++
 .../RISCV/rvv/fixed-vectors-extract.ll        | 15 +++++
 .../RISCV/rvv/fixed-vectors-fp-buildvec.ll    | 19 +++----
 .../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll | 57 ++++++++++---------
 5 files changed, 90 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 6f1f5574fbae05..46e6b37f4033a1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -954,6 +954,13 @@ foreach fvti = AllFloatVectors in {
 //===----------------------------------------------------------------------===//
 let Predicates = [HasVInstructionsAnyF] in
 foreach vti = AllFloatVectors in {
+  // Fold store of vmv.f.s to a vse with VL=1.
+  defvar store_instr = !cast<Instruction>("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX);
+  def : Pat<(store (vti.Scalar (int_riscv_vfmv_f_s (vti.Vector vti.RegClass:$rs2))), BaseAddr:$rs1),
+            (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>;
+  def : Pat<(store (extractelt (vti.Vector vti.RegClass:$rs2), 0), BaseAddr:$rs1),
+            (store_instr vti.RegClass:$rs2, BaseAddr:$rs1, 1, vti.Log2SEW)>;
+
   defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
                                                       vti.ScalarSuffix,
                                                       "_S_", vti.LMul.MX));
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll
index b5525d23370471..ed77e4cf252f5a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp-rv64.ll
@@ -481,3 +481,32 @@ define double @extractelt_nxv8f64_idx(<vscale x 8 x double> %v, i32 signext %idx
   %r = extractelement <vscale x 8 x double> %v, i32 %idx
   ret double %r
 }
+
+define void @store_extractelt_nxv8f64(<vscale x 8 x double>* %x, double* %p) {
+; CHECK-LABEL: store_extractelt_nxv8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl8re64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, mu
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vse64.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <vscale x 8 x double>, <vscale x 8 x double>* %x
+  %b = extractelement <vscale x 8 x double> %a, i64 1
+  store double %b, double* %p
+  ret void
+}
+
+define void @store_vfmv_f_s_nxv8f64(<vscale x 8 x double>* %x, double* %p) {
+; CHECK-LABEL: store_vfmv_f_s_nxv8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl8re64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, mu
+; CHECK-NEXT:    vse64.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <vscale x 8 x double>, <vscale x 8 x double>* %x
+  %b = call double @llvm.riscv.vfmv.f.s.nxv8f64(<vscale x 8 x double> %a)
+  store double %b, double* %p
+  ret void
+}
+
+declare double @llvm.riscv.vfmv.f.s.nxv8f64(<vscale x 8 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index 124552d37a126c..36e0314112773e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -596,3 +596,18 @@ define void @store_extractelt_v4i64(<2 x i64>* %x, i64* %p) nounwind {
   store i64 %b, i64* %p
   ret void
 }
+
+define void @store_extractelt_v4f64(<2 x double>* %x, double* %p) nounwind {
+; CHECK-LABEL: store_extractelt_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vse64.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <2 x double>, <2 x double>* %x
+  %b = extractelement <2 x double> %a, i64 1
+  store double %b, double* %p
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index af74ea9f50543e..24d3008efae92e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -58,18 +58,17 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ; LMULMAX2:       # %bb.0:
 ; LMULMAX2-NEXT:    addi sp, sp, -32
 ; LMULMAX2-NEXT:    .cfi_def_cfa_offset 32
-; LMULMAX2-NEXT:    vsetivli zero, 0, e32, m2, ta, mu
-; LMULMAX2-NEXT:    vfmv.f.s ft0, v10
-; LMULMAX2-NEXT:    fsw ft0, 24(sp)
-; LMULMAX2-NEXT:    vfmv.f.s ft0, v8
-; LMULMAX2-NEXT:    fsw ft0, 16(sp)
+; LMULMAX2-NEXT:    addi a0, sp, 24
 ; LMULMAX2-NEXT:    vsetivli zero, 1, e32, m2, ta, mu
+; LMULMAX2-NEXT:    vse32.v v10, (a0)
 ; LMULMAX2-NEXT:    vslidedown.vi v10, v10, 7
-; LMULMAX2-NEXT:    vfmv.f.s ft0, v10
-; LMULMAX2-NEXT:    fsw ft0, 28(sp)
-; LMULMAX2-NEXT:    vslidedown.vi v8, v8, 7
-; LMULMAX2-NEXT:    vfmv.f.s ft0, v8
-; LMULMAX2-NEXT:    fsw ft0, 20(sp)
+; LMULMAX2-NEXT:    addi a0, sp, 28
+; LMULMAX2-NEXT:    vse32.v v10, (a0)
+; LMULMAX2-NEXT:    vslidedown.vi v10, v8, 7
+; LMULMAX2-NEXT:    addi a0, sp, 20
+; LMULMAX2-NEXT:    vse32.v v10, (a0)
+; LMULMAX2-NEXT:    addi a0, sp, 16
+; LMULMAX2-NEXT:    vse32.v v8, (a0)
 ; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; LMULMAX2-NEXT:    addi a0, sp, 16
 ; LMULMAX2-NEXT:    vle32.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index 8386b33e0fb506..9e51c64307e929 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -413,52 +413,55 @@ declare <33 x double> @llvm.vp.load.v33f64.p0v33f64(<33 x double>*, <33 x i1>, i
 define <33 x double> @vpload_v33f64(<33 x double>* %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpload_v33f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a4, a2, -32
+; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    vmv1r.v v8, v0
-; CHECK-NEXT:    li a3, 0
-; CHECK-NEXT:    li a5, 0
+; CHECK-NEXT:    mv a3, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:  .LBB32_2:
-; CHECK-NEXT:    li a4, 16
-; CHECK-NEXT:    bltu a5, a4, .LBB32_4
+; CHECK-NEXT:    addi a5, a3, -16
+; CHECK-NEXT:    li a4, 0
+; CHECK-NEXT:    bltu a3, a5, .LBB32_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    li a5, 16
+; CHECK-NEXT:    mv a4, a5
 ; CHECK-NEXT:  .LBB32_4:
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, mu
-; CHECK-NEXT:    vslidedown.vi v0, v8, 4
-; CHECK-NEXT:    addi a6, a1, 256
-; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, mu
-; CHECK-NEXT:    vle64.v v16, (a6), v0.t
-; CHECK-NEXT:    li a5, 32
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; CHECK-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-NEXT:    addi a5, a1, 128
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, mu
+; CHECK-NEXT:    vle64.v v16, (a5), v0.t
+; CHECK-NEXT:    addi a5, a2, -32
+; CHECK-NEXT:    li a4, 0
 ; CHECK-NEXT:    bltu a2, a5, .LBB32_6
 ; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    li a2, 32
+; CHECK-NEXT:    mv a4, a5
 ; CHECK-NEXT:  .LBB32_6:
-; CHECK-NEXT:    addi a5, a2, -16
-; CHECK-NEXT:    bltu a2, a5, .LBB32_8
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    bltu a4, a2, .LBB32_8
 ; CHECK-NEXT:  # %bb.7:
-; CHECK-NEXT:    mv a3, a5
+; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:  .LBB32_8:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
-; CHECK-NEXT:    addi a5, a1, 128
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, mu
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v0, v8, 4
+; CHECK-NEXT:    addi a5, a1, 256
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, mu
 ; CHECK-NEXT:    vle64.v v24, (a5), v0.t
-; CHECK-NEXT:    bltu a2, a4, .LBB32_10
+; CHECK-NEXT:    bltu a3, a2, .LBB32_10
 ; CHECK-NEXT:  # %bb.9:
-; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:  .LBB32_10:
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, mu
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vle64.v v8, (a1), v0.t
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
 ; CHECK-NEXT:    vse64.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, mu
 ; CHECK-NEXT:    vse64.v v24, (a1)
-; CHECK-NEXT:    vfmv.f.s ft0, v16
-; CHECK-NEXT:    fsd ft0, 256(a0)
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; CHECK-NEXT:    vse64.v v16, (a0)
 ; CHECK-NEXT:    ret
   %load = call <33 x double> @llvm.vp.load.v33f64.p0v33f64(<33 x double>* %ptr, <33 x i1> %m, i32 %evl)
   ret <33 x double> %load
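
Note: the net effect of the two new patterns is easiest to see on a minimal case. The sketch below follows the style of the tests above; the function name @sketch_store_elt0 is hypothetical, and the assembly comments assume nxv1f32 maps to LMUL=1/2, so they are illustrative rather than output produced by this patch.

  ; Store of element 0 of a scalable FP vector: the shape both new patterns
  ; target (extractelt at index 0, and int_riscv_vfmv_f_s fed into a store).
  define void @sketch_store_elt0(<vscale x 1 x float> %v, float* %p) {
    %e = extractelement <vscale x 1 x float> %v, i64 0
    store float %e, float* %p
    ret void
  }

  ; Before (roughly): the element is moved to an FPR, then stored.
  ;   vfmv.f.s ft0, v8
  ;   fsw ft0, 0(a0)
  ; After (roughly): a single vector store with VL=1.
  ;   vsetivli zero, 1, e32, mf2, ta, mu
  ;   vse32.v v8, (a0)

Folding to a unit-VL vse keeps the value in the vector register file and drops the vector-to-FPR round trip, which is the same improvement visible in the fixed-vectors-fp-buildvec.ll and fixed-vectors-vpload.ll diffs above.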