From 95ce3c23c2597c882aa01fa31c70e3d5413a9c6e Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Mon, 25 Sep 2023 08:09:46 -0700
Subject: [PATCH] [RISCV] Be more aggressive about shrinking constant
 build_vector etype (#67175)

If LMUL is more than m1, we can be more aggressive about narrowing the
build_vector via a vsext if legal. If the narrow build_vector gets lowered
as a load, then while both forms are linear in LMUL, load uops are generally
more expensive than extend uops. If the narrow build_vector gets lowered via
dominant values, that work is linear in both the number of unique elements
and LMUL. So provided the number of unique values is greater than 2, this is
a net win in the work performed.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  16 +-
 .../CodeGen/RISCV/rvv/active_lane_mask.ll     |  55 ++-
 .../RISCV/rvv/fixed-vectors-extract.ll        |  32 +-
 .../RISCV/rvv/fixed-vectors-int-buildvec.ll   |   6 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-int.ll    | 384 +++++++++++-------
 .../rvv/fixed-vectors-reduction-int-vp.ll     |  26 +-
 .../rvv/fixed-vectors-shuffle-reverse.ll      |   6 +-
 .../RISCV/rvv/fixed-vectors-shuffle-rotate.ll |  30 +-
 .../RISCV/rvv/fixed-vectors-stepvector.ll     |  59 ++-
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |  27 +-
 .../CodeGen/RISCV/urem-seteq-illegal-types.ll |  48 +--
 11 files changed, 406 insertions(+), 283 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8e6644821031c1..08e37ea2a363d4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3511,17 +3511,14 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
     }
   }
 
-  if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
-    return Res;
-  // If the number of signbits allows, see if we can lower as a <N x i8>.
-  // We restrict this to N <= 4 to ensure the resulting narrow vector is
-  // 32 bits of smaller and can thus be materialized cheaply from scalar.
-  // The main motivation for this is the constant index vector required
-  // by vrgather.vv. This covers all indice vectors up to size 4.
+  // Our main goal here is to reduce LMUL (and thus work) required to
+  // build the constant, but we will also narrow if the resulting
+  // narrow vector is known to materialize cheaply.
   // TODO: We really should be costing the smaller vector. There are
   // profitable cases this misses.
-  if (EltBitSize > 8 && NumElts <= 4) {
+  if (EltBitSize > 8 &&
+      (NumElts <= 4 || VT.getSizeInBits() > Subtarget.getRealMinVLen())) {
     unsigned SignBits = DAG.ComputeNumSignBits(Op);
     if (EltBitSize - SignBits < 8) {
       SDValue Source =
@@ -3533,6 +3530,9 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
     }
   }
 
+  if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
+    return Res;
+
   // For constant vectors, use generic constant pool lowering. Otherwise,
   // we'd have to materialize constants in GPRs just to move them into the
   // vector.
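
As a minimal illustrative sketch of the case this enables (not part of the
diff; the function name is made up), consider a constant build_vector of i64
values that all fit in i8:

; All elements fit in i8, the vector (512 bits) is wider than the 128-bit
; minimum VLEN implied by +v, and there are more than two unique values, so
; with this change the constant can be built from a narrow e8 constant-pool
; load plus a sign extend instead of a full e64 load.
define <8 x i64> @narrow_etype_example() {
  ret <8 x i64> <i64 3, i64 -7, i64 42, i64 100, i64 -1, i64 17, i64 0, i64 55>
}

Compiled with something like llc -mtriple=riscv64 -mattr=+v, this is expected
to lower to a vle8.v of an <8 x i8> constant followed by vsext.vf8 rather than
a vle64.v of the wide vector, which is the LMUL reduction described above and
the same pattern the test updates below check for.
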
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll index 2437c293644c1d..87d95d7596d4fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -106,11 +106,12 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI8_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vi v0, v16, 2 @@ -125,27 +126,30 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 4 ; CHECK-NEXT: lui a0, %hi(.LCPI9_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vi v0, v16, 6 @@ -160,59 +164,66 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI10_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: vsaddu.vx v16, v16, a1 ; CHECK-NEXT: vmsltu.vx v0, v16, a2 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_1) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 4 ; CHECK-NEXT: lui a0, %hi(.LCPI10_2) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; 
CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 6 ; CHECK-NEXT: lui a0, %hi(.LCPI10_3) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 8 ; CHECK-NEXT: lui a0, %hi(.LCPI10_4) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 10 ; CHECK-NEXT: lui a0, %hi(.LCPI10_5) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 12 ; CHECK-NEXT: lui a0, %hi(.LCPI10_6) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsext.vf8 v16, v8 +; CHECK-NEXT: vsaddu.vx v8, v16, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; CHECK-NEXT: vslideup.vi v0, v16, 14 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll index 892db75ee67101..5605437443d76b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -950,16 +950,16 @@ define i32 @extractelt_mul_v4i32(<4 x i32> %x) { define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { ; RV32NOM-LABEL: extractelt_sdiv_v4i32: ; RV32NOM: # %bb.0: -; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32NOM-NEXT: vmv.v.i v9, 0 ; RV32NOM-NEXT: lui a0, %hi(.LCPI42_0) ; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI42_0) -; RV32NOM-NEXT: vle32.v v10, (a0) -; RV32NOM-NEXT: li a0, -1 -; RV32NOM-NEXT: vslide1down.vx v9, v9, a0 -; RV32NOM-NEXT: vand.vv v9, v8, v9 -; RV32NOM-NEXT: vmulh.vv v8, v8, v10 -; RV32NOM-NEXT: vadd.vv v8, v8, v9 +; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32NOM-NEXT: vle32.v v9, (a0) +; RV32NOM-NEXT: vmulh.vv v9, v8, v9 +; RV32NOM-NEXT: lui a0, 1044480 +; RV32NOM-NEXT: vmv.s.x v10, a0 +; RV32NOM-NEXT: vsext.vf4 v11, v10 +; RV32NOM-NEXT: vand.vv v8, v8, v11 +; RV32NOM-NEXT: vadd.vv v8, v9, v8 ; RV32NOM-NEXT: lui a0, 12320 ; RV32NOM-NEXT: addi a0, a0, 257 ; RV32NOM-NEXT: vmv.s.x v9, a0 @@ -986,16 +986,16 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { ; ; RV64NOM-LABEL: extractelt_sdiv_v4i32: ; RV64NOM: # %bb.0: -; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64NOM-NEXT: vmv.v.i v9, 0 ; RV64NOM-NEXT: lui a0, %hi(.LCPI42_0) ; RV64NOM-NEXT: addi a0, a0, %lo(.LCPI42_0) -; RV64NOM-NEXT: vle32.v v10, (a0) -; RV64NOM-NEXT: li a0, -1 -; RV64NOM-NEXT: vslide1down.vx v9, v9, a0 -; RV64NOM-NEXT: vand.vv v9, v8, v9 -; RV64NOM-NEXT: vmulh.vv v8, v8, v10 -; 
RV64NOM-NEXT: vadd.vv v8, v8, v9 +; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64NOM-NEXT: vle32.v v9, (a0) +; RV64NOM-NEXT: vmulh.vv v9, v8, v9 +; RV64NOM-NEXT: lui a0, 1044480 +; RV64NOM-NEXT: vmv.s.x v10, a0 +; RV64NOM-NEXT: vsext.vf4 v11, v10 +; RV64NOM-NEXT: vand.vv v8, v8, v11 +; RV64NOM-NEXT: vadd.vv v8, v9, v8 ; RV64NOM-NEXT: lui a0, 12320 ; RV64NOM-NEXT: addiw a0, a0, 257 ; RV64NOM-NEXT: vmv.s.x v9, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 79947ca4cdf069..e95978744c408e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -292,7 +292,8 @@ define <4 x i64> @buildvec_vid_step1_add0_v4i64() { ; RV32-NEXT: lui a0, %hi(.LCPI25_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI25_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vle8.v v10, (a0) +; RV32-NEXT: vsext.vf4 v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: buildvec_vid_step1_add0_v4i64: @@ -309,7 +310,8 @@ define <4 x i64> @buildvec_vid_step2_add0_v4i64() { ; RV32-NEXT: lui a0, %hi(.LCPI26_0) ; RV32-NEXT: addi a0, a0, %lo(.LCPI26_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vle8.v v10, (a0) +; RV32-NEXT: vsext.vf4 v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: buildvec_vid_step2_add0_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index dbf7dfbcab49cb..b2a9813e50a186 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1292,29 +1292,55 @@ define void @mulhu_v6i16(ptr %x) { } define void @mulhu_v4i32(ptr %x) { -; CHECK-LABEL: mulhu_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: vmv.s.x v9, a1 -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v10, v9, 2 -; CHECK-NEXT: lui a1, %hi(.LCPI68_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vmulhu.vv v9, v8, v9 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: vmulhu.vv v8, v8, v10 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vmv.v.i v9, 2 -; CHECK-NEXT: li a1, 1 -; CHECK-NEXT: vslide1down.vx v9, v9, a1 -; CHECK-NEXT: vsrl.vv v8, v8, v9 -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: mulhu_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: lui a1, 524288 +; RV32-NEXT: vmv.s.x v9, a1 +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV32-NEXT: vslideup.vi v10, v9, 2 +; RV32-NEXT: lui a1, %hi(.LCPI68_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI68_0) +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vle32.v v9, (a1) +; RV32-NEXT: vmulhu.vv v9, v8, v9 +; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vmulhu.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: lui a1, 4128 +; RV32-NEXT: addi a1, a1, 514 +; RV32-NEXT: vmv.s.x v9, a1 +; RV32-NEXT: vsext.vf4 v10, v9 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhu_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: lui a1, 524288 +; RV64-NEXT: 
vmv.s.x v9, a1 +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64-NEXT: vslideup.vi v10, v9, 2 +; RV64-NEXT: lui a1, %hi(.LCPI68_0) +; RV64-NEXT: addi a1, a1, %lo(.LCPI68_0) +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vle32.v v9, (a1) +; RV64-NEXT: vmulhu.vv v9, v8, v9 +; RV64-NEXT: vsub.vv v8, v8, v9 +; RV64-NEXT: vmulhu.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: lui a1, 4128 +; RV64-NEXT: addiw a1, a1, 514 +; RV64-NEXT: vmv.s.x v9, a1 +; RV64-NEXT: vsext.vf4 v10, v9 +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: ret %a = load <4 x i32>, ptr %x %b = udiv <4 x i32> %a, store <4 x i32> %b, ptr %x @@ -1461,29 +1487,57 @@ define void @mulhs_v8i16(ptr %x) { } define void @mulhs_v6i16(ptr %x) { -; CHECK-LABEL: mulhs_v6i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, 7 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: li a1, -14 -; CHECK-NEXT: vmadd.vx v10, a1, v9 -; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 4 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vdiv.vv v9, v9, v10 -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v10, -7 -; CHECK-NEXT: vmerge.vim v10, v10, 7, v0 -; CHECK-NEXT: vdiv.vv v8, v8, v10 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 4 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: mulhs_v6i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vmv.v.i v9, 7 +; RV32-NEXT: vid.v v10 +; RV32-NEXT: li a1, -14 +; RV32-NEXT: vmadd.vx v10, a1, v9 +; RV32-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v8, 4 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vdiv.vv v9, v9, v10 +; RV32-NEXT: lui a1, 1020016 +; RV32-NEXT: addi a1, a1, 2041 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.s.x v10, a1 +; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV32-NEXT: vsext.vf2 v11, v10 +; RV32-NEXT: vdiv.vv v8, v8, v11 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vslideup.vi v8, v9, 4 +; RV32-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV32-NEXT: vse16.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_v6i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vmv.v.i v9, 7 +; RV64-NEXT: vid.v v10 +; RV64-NEXT: li a1, -14 +; RV64-NEXT: vmadd.vx v10, a1, v9 +; RV64-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v8, 4 +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vdiv.vv v9, v9, v10 +; RV64-NEXT: lui a1, 1020016 +; RV64-NEXT: addiw a1, a1, 2041 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a1 +; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64-NEXT: vsext.vf2 v11, v10 +; RV64-NEXT: vdiv.vv v8, v8, v11 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vslideup.vi v8, v9, 4 +; RV64-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; RV64-NEXT: vse16.v v8, (a0) +; RV64-NEXT: ret %a = load <6 x i16>, ptr %x %b = sdiv <6 x i16> %a, store <6 x i16> %b, ptr %x @@ -1550,16 +1604,15 @@ define void 
@mulhs_v2i64(ptr %x) { ; RV32-NEXT: vrsub.vi v10, v10, 0 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmadd.vv v10, v8, v9 +; RV32-NEXT: li a1, 63 +; RV32-NEXT: vsrl.vx v8, v10, a1 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: vmv.s.x v9, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v8, 1 -; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32-NEXT: vslideup.vi v9, v8, 2 +; RV32-NEXT: vsext.vf4 v11, v9 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vsra.vv v8, v10, v9 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsrl.vx v9, v10, a1 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vsra.vv v9, v10, v11 +; RV32-NEXT: vadd.vv v8, v9, v8 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; @@ -5101,63 +5154,70 @@ define void @mulhu_v16i16(ptr %x) { ; LMULMAX2-RV32-NEXT: vle16.v v10, (a0) ; LMULMAX2-RV32-NEXT: li a1, 257 ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 -; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0 +; LMULMAX2-RV32-NEXT: vmv.v.i v8, 0 +; LMULMAX2-RV32-NEXT: lui a1, 1048568 +; LMULMAX2-RV32-NEXT: vmerge.vxm v12, v8, a1, v0 ; LMULMAX2-RV32-NEXT: lui a1, 4 ; LMULMAX2-RV32-NEXT: addi a1, a1, 64 ; LMULMAX2-RV32-NEXT: vmv.s.x v8, a1 +; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; LMULMAX2-RV32-NEXT: vmv.v.i v9, 0 +; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8 +; LMULMAX2-RV32-NEXT: vmerge.vim v9, v9, 1, v0 +; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI182_0) ; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI182_0) ; LMULMAX2-RV32-NEXT: vle16.v v14, (a1) -; LMULMAX2-RV32-NEXT: lui a1, 1048568 -; LMULMAX2-RV32-NEXT: vmerge.vxm v16, v12, a1, v0 -; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8 -; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 1, v0 -; LMULMAX2-RV32-NEXT: vsrl.vv v12, v10, v12 -; LMULMAX2-RV32-NEXT: vmulhu.vv v12, v12, v14 -; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v12 -; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v16 -; LMULMAX2-RV32-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV32-NEXT: vsext.vf2 v16, v9 +; LMULMAX2-RV32-NEXT: vsrl.vv v16, v10, v16 +; LMULMAX2-RV32-NEXT: vmulhu.vv v14, v16, v14 +; LMULMAX2-RV32-NEXT: vsub.vv v10, v10, v14 +; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v10, v12 +; LMULMAX2-RV32-NEXT: vadd.vv v10, v10, v14 ; LMULMAX2-RV32-NEXT: lui a1, 2 ; LMULMAX2-RV32-NEXT: addi a1, a1, 289 ; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 -; LMULMAX2-RV32-NEXT: vmv.v.i v12, 3 -; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 2, v0 +; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; LMULMAX2-RV32-NEXT: vmv.v.i v9, 3 +; LMULMAX2-RV32-NEXT: vmerge.vim v9, v9, 2, v0 ; LMULMAX2-RV32-NEXT: vmv1r.v v0, v8 -; LMULMAX2-RV32-NEXT: vmerge.vim v8, v12, 1, v0 -; LMULMAX2-RV32-NEXT: vsrl.vv v8, v10, v8 +; LMULMAX2-RV32-NEXT: vmerge.vim v8, v9, 1, v0 +; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; LMULMAX2-RV32-NEXT: vsext.vf2 v12, v8 +; LMULMAX2-RV32-NEXT: vsrl.vv v8, v10, v12 ; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: mulhu_v16i16: ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX2-RV64-NEXT: vle16.v v10, (a0) +; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) ; LMULMAX2-RV64-NEXT: li a1, 257 ; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 -; LMULMAX2-RV64-NEXT: vmv.v.i v12, 0 -; LMULMAX2-RV64-NEXT: lui a1, 4 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 64 -; LMULMAX2-RV64-NEXT: vmv.s.x v8, a1 +; LMULMAX2-RV64-NEXT: vmv.v.i v10, 0 +; LMULMAX2-RV64-NEXT: lui a1, 1048568 +; LMULMAX2-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; LMULMAX2-RV64-NEXT: li 
a1, 1 +; LMULMAX2-RV64-NEXT: slli a1, a1, 48 +; LMULMAX2-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV64-NEXT: vmv.v.x v12, a1 +; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI182_0) ; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI182_0) ; LMULMAX2-RV64-NEXT: vle16.v v14, (a1) -; LMULMAX2-RV64-NEXT: lui a1, 1048568 -; LMULMAX2-RV64-NEXT: vmerge.vxm v16, v12, a1, v0 -; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8 -; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 1, v0 -; LMULMAX2-RV64-NEXT: vsrl.vv v12, v10, v12 +; LMULMAX2-RV64-NEXT: vsext.vf2 v16, v12 +; LMULMAX2-RV64-NEXT: vsrl.vv v12, v8, v16 ; LMULMAX2-RV64-NEXT: vmulhu.vv v12, v12, v14 -; LMULMAX2-RV64-NEXT: vsub.vv v10, v10, v12 -; LMULMAX2-RV64-NEXT: vmulhu.vv v10, v10, v16 -; LMULMAX2-RV64-NEXT: vadd.vv v10, v10, v12 -; LMULMAX2-RV64-NEXT: lui a1, 2 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 289 -; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 -; LMULMAX2-RV64-NEXT: vmv.v.i v12, 3 -; LMULMAX2-RV64-NEXT: vmerge.vim v12, v12, 2, v0 -; LMULMAX2-RV64-NEXT: vmv1r.v v0, v8 -; LMULMAX2-RV64-NEXT: vmerge.vim v8, v12, 1, v0 -; LMULMAX2-RV64-NEXT: vsrl.vv v8, v10, v8 +; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v12 +; LMULMAX2-RV64-NEXT: vmulhu.vv v8, v8, v10 +; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v12 +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI182_1) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI182_1) +; LMULMAX2-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX2-RV64-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; LMULMAX2-RV64-NEXT: vsext.vf2 v12, v10 +; LMULMAX2-RV64-NEXT: vsrl.vv v8, v8, v12 ; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -5182,29 +5242,57 @@ define void @mulhu_v16i16(ptr %x) { } define void @mulhu_v8i32(ptr %x) { -; LMULMAX2-LABEL: mulhu_v8i32: -; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-NEXT: vle32.v v8, (a0) -; LMULMAX2-NEXT: li a1, 68 -; LMULMAX2-NEXT: vmv.s.x v0, a1 -; LMULMAX2-NEXT: lui a1, %hi(.LCPI183_0) -; LMULMAX2-NEXT: addi a1, a1, %lo(.LCPI183_0) -; LMULMAX2-NEXT: vle32.v v10, (a1) -; LMULMAX2-NEXT: vmv.v.i v12, 0 -; LMULMAX2-NEXT: lui a1, 524288 -; LMULMAX2-NEXT: vmerge.vxm v12, v12, a1, v0 -; LMULMAX2-NEXT: vmulhu.vv v10, v8, v10 -; LMULMAX2-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-NEXT: vmulhu.vv v8, v8, v12 -; LMULMAX2-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-NEXT: li a1, 136 -; LMULMAX2-NEXT: vmv.s.x v0, a1 -; LMULMAX2-NEXT: vmv.v.i v10, 2 -; LMULMAX2-NEXT: vmerge.vim v10, v10, 1, v0 -; LMULMAX2-NEXT: vsrl.vv v8, v8, v10 -; LMULMAX2-NEXT: vse32.v v8, (a0) -; LMULMAX2-NEXT: ret +; LMULMAX2-RV32-LABEL: mulhu_v8i32: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32-NEXT: li a1, 68 +; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI183_0) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI183_0) +; LMULMAX2-RV32-NEXT: vle32.v v10, (a1) +; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0 +; LMULMAX2-RV32-NEXT: lui a1, 524288 +; LMULMAX2-RV32-NEXT: vmerge.vxm v12, v12, a1, v0 +; LMULMAX2-RV32-NEXT: vmulhu.vv v10, v8, v10 +; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32-NEXT: vmulhu.vv v8, v8, v12 +; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32-NEXT: lui a1, 4128 +; LMULMAX2-RV32-NEXT: addi a1, a1, 514 +; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; 
LMULMAX2-RV32-NEXT: vsext.vf4 v12, v10 +; LMULMAX2-RV32-NEXT: vsrl.vv v8, v8, v12 +; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: mulhu_v8i32: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64-NEXT: li a1, 68 +; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI183_0) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI183_0) +; LMULMAX2-RV64-NEXT: vle32.v v10, (a1) +; LMULMAX2-RV64-NEXT: vmv.v.i v12, 0 +; LMULMAX2-RV64-NEXT: lui a1, 524288 +; LMULMAX2-RV64-NEXT: vmerge.vxm v12, v12, a1, v0 +; LMULMAX2-RV64-NEXT: vmulhu.vv v10, v8, v10 +; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64-NEXT: vmulhu.vv v8, v8, v12 +; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64-NEXT: lui a1, 4128 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 514 +; LMULMAX2-RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX2-RV64-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-RV64-NEXT: vsext.vf4 v12, v10 +; LMULMAX2-RV64-NEXT: vsrl.vv v8, v8, v12 +; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: mulhu_v8i32: ; LMULMAX1-RV32: # %bb.0: @@ -5225,15 +5313,16 @@ define void @mulhu_v8i32(ptr %x) { ; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v12 ; LMULMAX1-RV32-NEXT: vmulhu.vv v9, v9, v11 ; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v12 -; LMULMAX1-RV32-NEXT: vmv.v.i v12, 2 -; LMULMAX1-RV32-NEXT: li a2, 1 -; LMULMAX1-RV32-NEXT: vslide1down.vx v12, v12, a2 -; LMULMAX1-RV32-NEXT: vsrl.vv v9, v9, v12 +; LMULMAX1-RV32-NEXT: lui a2, 4128 +; LMULMAX1-RV32-NEXT: addi a2, a2, 514 +; LMULMAX1-RV32-NEXT: vmv.s.x v12, a2 +; LMULMAX1-RV32-NEXT: vsext.vf4 v13, v12 +; LMULMAX1-RV32-NEXT: vsrl.vv v9, v9, v13 ; LMULMAX1-RV32-NEXT: vmulhu.vv v10, v8, v10 ; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 ; LMULMAX1-RV32-NEXT: vmulhu.vv v8, v8, v11 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vv v8, v8, v12 +; LMULMAX1-RV32-NEXT: vsrl.vv v8, v8, v13 ; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) ; LMULMAX1-RV32-NEXT: vse32.v v9, (a1) ; LMULMAX1-RV32-NEXT: ret @@ -5283,9 +5372,10 @@ define void @mulhu_v4i64(ptr %x) { ; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI184_1) ; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI184_1) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vle32.v v10, (a1) +; LMULMAX2-RV32-NEXT: vle8.v v10, (a1) +; LMULMAX2-RV32-NEXT: vsext.vf4 v12, v10 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vsrl.vv v8, v8, v10 +; LMULMAX2-RV32-NEXT: vsrl.vv v8, v8, v12 ; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32-NEXT: ret ; @@ -5637,22 +5727,22 @@ define void @mulhs_v4i64(ptr %x) { ; LMULMAX2-RV32-NEXT: vmerge.vxm v10, v10, a1, v0 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vmulh.vv v10, v8, v10 -; LMULMAX2-RV32-NEXT: li a1, 51 -; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 +; LMULMAX2-RV32-NEXT: lui a1, 1048560 +; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v12, -1 -; LMULMAX2-RV32-NEXT: vmerge.vim v12, v12, 0, v0 +; LMULMAX2-RV32-NEXT: vsext.vf4 v14, v12 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmadd.vv v12, v8, v10 +; LMULMAX2-RV32-NEXT: vmadd.vv v14, v8, v10 ; LMULMAX2-RV32-NEXT: li a1, 63 -; LMULMAX2-RV32-NEXT: vsrl.vx v8, v12, 
a1 -; LMULMAX2-RV32-NEXT: li a1, 68 -; LMULMAX2-RV32-NEXT: vmv.s.x v0, a1 +; LMULMAX2-RV32-NEXT: vsrl.vx v8, v14, a1 +; LMULMAX2-RV32-NEXT: lui a1, 16 +; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1 ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v10, 0 -; LMULMAX2-RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; LMULMAX2-RV32-NEXT: vsext.vf4 v12, v10 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vsra.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsra.vv v10, v14, v12 ; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 ; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32-NEXT: ret @@ -5661,26 +5751,30 @@ define void @mulhs_v4i64(ptr %x) { ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; LMULMAX2-RV64-NEXT: vmv.v.i v0, 5 -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV64-NEXT: lui a1, 349525 ; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 ; LMULMAX2-RV64-NEXT: slli a2, a1, 32 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI188_0) -; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI188_0)(a2) -; LMULMAX2-RV64-NEXT: vmv.v.i v10, -1 -; LMULMAX2-RV64-NEXT: vmerge.vim v10, v10, 0, v0 -; LMULMAX2-RV64-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV64-NEXT: vmerge.vxm v12, v12, a2, v0 -; LMULMAX2-RV64-NEXT: vmulh.vv v12, v8, v12 -; LMULMAX2-RV64-NEXT: vmacc.vv v12, v8, v10 +; LMULMAX2-RV64-NEXT: vmv.v.x v10, a1 +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI188_0) +; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI188_0)(a1) +; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; LMULMAX2-RV64-NEXT: vmv.v.i v0, 5 +; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV64-NEXT: vmerge.vxm v10, v10, a1, v0 +; LMULMAX2-RV64-NEXT: vmulh.vv v10, v8, v10 +; LMULMAX2-RV64-NEXT: lui a1, 1044496 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -256 +; LMULMAX2-RV64-NEXT: vmv.s.x v12, a1 +; LMULMAX2-RV64-NEXT: vsext.vf8 v14, v12 +; LMULMAX2-RV64-NEXT: vmadd.vv v14, v8, v10 ; LMULMAX2-RV64-NEXT: li a1, 63 -; LMULMAX2-RV64-NEXT: vsrl.vx v8, v12, a1 -; LMULMAX2-RV64-NEXT: vmv.v.i v10, 1 -; LMULMAX2-RV64-NEXT: vmerge.vim v10, v10, 0, v0 -; LMULMAX2-RV64-NEXT: vsra.vv v10, v12, v10 +; LMULMAX2-RV64-NEXT: vsrl.vx v8, v14, a1 +; LMULMAX2-RV64-NEXT: lui a1, 4096 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 256 +; LMULMAX2-RV64-NEXT: vmv.s.x v10, a1 +; LMULMAX2-RV64-NEXT: vsext.vf8 v12, v10 +; LMULMAX2-RV64-NEXT: vsra.vv v10, v14, v12 ; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 ; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll index f0a359c13ce5d3..894e96d682871a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -1797,16 +1797,17 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: lui a2, %hi(.LCPI72_0) ; RV32-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV32-NEXT: vle32.v v16, (a2) +; RV32-NEXT: vle8.v v12, (a2) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: vid.v v24 -; RV32-NEXT: vmsltu.vx v12, v24, a1 -; RV32-NEXT: vmsltu.vx v13, v16, a1 +; RV32-NEXT: vid.v v16 +; RV32-NEXT: vmsltu.vx v14, v16, a1 +; RV32-NEXT: vsext.vf4 v16, v12 +; RV32-NEXT: vmsltu.vx v12, v16, a1 ; 
RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslideup.vi v12, v13, 4 +; RV32-NEXT: vslideup.vi v14, v12, 4 ; RV32-NEXT: li a0, 64 ; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV32-NEXT: vmand.mm v0, v12, v0 +; RV32-NEXT: vmand.mm v0, v14, v0 ; RV32-NEXT: vmv.v.i v12, 1 ; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV32-NEXT: vslidedown.vx v12, v8, a3 @@ -1840,16 +1841,17 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: lui a2, %hi(.LCPI72_0) ; RV64-NEXT: addi a2, a2, %lo(.LCPI72_0) -; RV64-NEXT: vle32.v v16, (a2) +; RV64-NEXT: vle8.v v12, (a2) ; RV64-NEXT: mv a2, a0 -; RV64-NEXT: vid.v v24 -; RV64-NEXT: vmsltu.vx v12, v24, a1 -; RV64-NEXT: vmsltu.vx v13, v16, a1 +; RV64-NEXT: vid.v v16 +; RV64-NEXT: vmsltu.vx v14, v16, a1 +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vmsltu.vx v12, v16, a1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslideup.vi v12, v13, 4 +; RV64-NEXT: vslideup.vi v14, v12, 4 ; RV64-NEXT: li a0, 64 ; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; RV64-NEXT: vmand.mm v0, v12, v0 +; RV64-NEXT: vmand.mm v0, v14, v0 ; RV64-NEXT: vmv.v.i v12, 1 ; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV64-NEXT: vslidedown.vx v12, v8, a3 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index 45f59d3a28a7c1..70b19856fc536c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -327,7 +327,8 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) { ; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -523,7 +524,8 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) { ; CHECK-NEXT: addi a0, a0, %lo(.LCPI34_0) ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vsext.vf2 v16, v12 ; CHECK-NEXT: vrgather.vv v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index 05121b4a18200f..4b7bfba06377da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -511,7 +511,8 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI19_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI19_0) ; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v16, v12 ; ZVBB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVBB-ZVE32X-NEXT: ret @@ -554,7 +555,8 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI20_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI20_0) ; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v16, v12 ; ZVBB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVBB-ZVE32X-NEXT: ret @@ -597,7 
+599,8 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI21_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI21_0) ; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v16, v12 ; ZVBB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVBB-ZVE32X-NEXT: ret @@ -639,8 +642,10 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { ; ZVBB-ZVE32X: # %bb.0: ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI22_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI22_0) -; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e32, m8, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v24, (a0) +; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB-ZVE32X-NEXT: vle8.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v24, v16 +; ZVBB-ZVE32X-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVBB-ZVE32X-NEXT: vrgatherei16.vv v16, v8, v24 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v16 ; ZVBB-ZVE32X-NEXT: ret @@ -707,7 +712,8 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI24_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI24_0) ; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v16, v12 ; ZVBB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVBB-ZVE32X-NEXT: ret @@ -750,7 +756,8 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI25_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI25_0) ; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v16, v12 ; ZVBB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVBB-ZVE32X-NEXT: ret @@ -793,7 +800,8 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI26_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI26_0) ; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v16, v12 ; ZVBB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVBB-ZVE32X-NEXT: ret @@ -835,8 +843,10 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { ; ZVBB-ZVE32X: # %bb.0: ; ZVBB-ZVE32X-NEXT: lui a0, %hi(.LCPI27_0) ; ZVBB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI27_0) -; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e32, m8, ta, ma -; ZVBB-ZVE32X-NEXT: vle16.v v24, (a0) +; ZVBB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB-ZVE32X-NEXT: vle8.v v16, (a0) +; ZVBB-ZVE32X-NEXT: vsext.vf2 v24, v16 +; ZVBB-ZVE32X-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVBB-ZVE32X-NEXT: vrgatherei16.vv v16, v8, v24 ; ZVBB-ZVE32X-NEXT: vmv.v.v v8, v16 ; ZVBB-ZVE32X-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll index 13e81d30d66a33..5574d12d2d5dd8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll @@ -189,11 +189,10 @@ declare <2 x i64> @llvm.experimental.stepvector.v2i64() define <2 x i64> @stepvector_v2i64() { ; RV32LMULMAX1-LABEL: stepvector_v2i64: ; RV32LMULMAX1: # %bb.0: +; RV32LMULMAX1-NEXT: lui a0, 16 ; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32LMULMAX1-NEXT: vmv.v.i v9, 1 -; 
RV32LMULMAX1-NEXT: vmv.v.i v8, 0 -; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2 +; RV32LMULMAX1-NEXT: vmv.s.x v9, a0 +; RV32LMULMAX1-NEXT: vsext.vf4 v8, v9 ; RV32LMULMAX1-NEXT: ret ; ; RV64LMULMAX1-LABEL: stepvector_v2i64: @@ -204,11 +203,10 @@ define <2 x i64> @stepvector_v2i64() { ; ; RV32LMULMAX2-LABEL: stepvector_v2i64: ; RV32LMULMAX2: # %bb.0: +; RV32LMULMAX2-NEXT: lui a0, 16 ; RV32LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32LMULMAX2-NEXT: vmv.v.i v9, 1 -; RV32LMULMAX2-NEXT: vmv.v.i v8, 0 -; RV32LMULMAX2-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32LMULMAX2-NEXT: vslideup.vi v8, v9, 2 +; RV32LMULMAX2-NEXT: vmv.s.x v9, a0 +; RV32LMULMAX2-NEXT: vsext.vf4 v8, v9 ; RV32LMULMAX2-NEXT: ret ; ; RV64LMULMAX2-LABEL: stepvector_v2i64: @@ -225,15 +223,13 @@ declare <4 x i64> @llvm.experimental.stepvector.v4i64() define <4 x i64> @stepvector_v4i64() { ; RV32LMULMAX1-LABEL: stepvector_v4i64: ; RV32LMULMAX1: # %bb.0: +; RV32LMULMAX1-NEXT: lui a0, 16 ; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32LMULMAX1-NEXT: vmv.v.i v9, 1 -; RV32LMULMAX1-NEXT: vmv.v.i v8, 0 -; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2 +; RV32LMULMAX1-NEXT: vmv.s.x v9, a0 +; RV32LMULMAX1-NEXT: vsext.vf4 v8, v9 ; RV32LMULMAX1-NEXT: lui a0, 48 ; RV32LMULMAX1-NEXT: addi a0, a0, 2 ; RV32LMULMAX1-NEXT: vmv.s.x v10, a0 -; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32LMULMAX1-NEXT: vsext.vf4 v9, v10 ; RV32LMULMAX1-NEXT: ret ; @@ -249,7 +245,8 @@ define <4 x i64> @stepvector_v4i64() { ; RV32LMULMAX2-NEXT: lui a0, %hi(.LCPI14_0) ; RV32LMULMAX2-NEXT: addi a0, a0, %lo(.LCPI14_0) ; RV32LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32LMULMAX2-NEXT: vle32.v v8, (a0) +; RV32LMULMAX2-NEXT: vle8.v v10, (a0) +; RV32LMULMAX2-NEXT: vsext.vf4 v8, v10 ; RV32LMULMAX2-NEXT: ret ; ; RV64LMULMAX2-LABEL: stepvector_v4i64: @@ -266,15 +263,13 @@ declare <8 x i64> @llvm.experimental.stepvector.v8i64() define <8 x i64> @stepvector_v8i64() { ; RV32LMULMAX1-LABEL: stepvector_v8i64: ; RV32LMULMAX1: # %bb.0: +; RV32LMULMAX1-NEXT: lui a0, 16 ; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32LMULMAX1-NEXT: vmv.v.i v9, 1 -; RV32LMULMAX1-NEXT: vmv.v.i v8, 0 -; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2 +; RV32LMULMAX1-NEXT: vmv.s.x v9, a0 +; RV32LMULMAX1-NEXT: vsext.vf4 v8, v9 ; RV32LMULMAX1-NEXT: lui a0, 48 ; RV32LMULMAX1-NEXT: addi a0, a0, 2 ; RV32LMULMAX1-NEXT: vmv.s.x v10, a0 -; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32LMULMAX1-NEXT: vsext.vf4 v9, v10 ; RV32LMULMAX1-NEXT: lui a0, 80 ; RV32LMULMAX1-NEXT: addi a0, a0, 4 @@ -300,10 +295,12 @@ define <8 x i64> @stepvector_v8i64() { ; RV32LMULMAX2-NEXT: lui a0, %hi(.LCPI15_0) ; RV32LMULMAX2-NEXT: addi a0, a0, %lo(.LCPI15_0) ; RV32LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32LMULMAX2-NEXT: vle32.v v8, (a0) +; RV32LMULMAX2-NEXT: vle8.v v10, (a0) ; RV32LMULMAX2-NEXT: lui a0, %hi(.LCPI15_1) ; RV32LMULMAX2-NEXT: addi a0, a0, %lo(.LCPI15_1) -; RV32LMULMAX2-NEXT: vle32.v v10, (a0) +; RV32LMULMAX2-NEXT: vle8.v v12, (a0) +; RV32LMULMAX2-NEXT: vsext.vf4 v8, v10 +; RV32LMULMAX2-NEXT: vsext.vf4 v10, v12 ; RV32LMULMAX2-NEXT: ret ; ; RV64LMULMAX2-LABEL: stepvector_v8i64: @@ -321,15 +318,13 @@ declare <16 x i64> @llvm.experimental.stepvector.v16i64() define <16 x i64> @stepvector_v16i64() { ; RV32LMULMAX1-LABEL: stepvector_v16i64: ; RV32LMULMAX1: # %bb.0: +; RV32LMULMAX1-NEXT: lui a0, 
16 ; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32LMULMAX1-NEXT: vmv.v.i v9, 1 -; RV32LMULMAX1-NEXT: vmv.v.i v8, 0 -; RV32LMULMAX1-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV32LMULMAX1-NEXT: vslideup.vi v8, v9, 2 +; RV32LMULMAX1-NEXT: vmv.s.x v9, a0 +; RV32LMULMAX1-NEXT: vsext.vf4 v8, v9 ; RV32LMULMAX1-NEXT: lui a0, 48 ; RV32LMULMAX1-NEXT: addi a0, a0, 2 ; RV32LMULMAX1-NEXT: vmv.s.x v10, a0 -; RV32LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32LMULMAX1-NEXT: vsext.vf4 v9, v10 ; RV32LMULMAX1-NEXT: lui a0, 80 ; RV32LMULMAX1-NEXT: addi a0, a0, 4 @@ -375,16 +370,20 @@ define <16 x i64> @stepvector_v16i64() { ; RV32LMULMAX2-NEXT: lui a0, %hi(.LCPI16_0) ; RV32LMULMAX2-NEXT: addi a0, a0, %lo(.LCPI16_0) ; RV32LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32LMULMAX2-NEXT: vle32.v v8, (a0) +; RV32LMULMAX2-NEXT: vle8.v v10, (a0) ; RV32LMULMAX2-NEXT: lui a0, %hi(.LCPI16_1) ; RV32LMULMAX2-NEXT: addi a0, a0, %lo(.LCPI16_1) -; RV32LMULMAX2-NEXT: vle32.v v10, (a0) +; RV32LMULMAX2-NEXT: vle8.v v12, (a0) ; RV32LMULMAX2-NEXT: lui a0, %hi(.LCPI16_2) ; RV32LMULMAX2-NEXT: addi a0, a0, %lo(.LCPI16_2) -; RV32LMULMAX2-NEXT: vle32.v v12, (a0) +; RV32LMULMAX2-NEXT: vle8.v v14, (a0) ; RV32LMULMAX2-NEXT: lui a0, %hi(.LCPI16_3) ; RV32LMULMAX2-NEXT: addi a0, a0, %lo(.LCPI16_3) -; RV32LMULMAX2-NEXT: vle32.v v14, (a0) +; RV32LMULMAX2-NEXT: vle8.v v16, (a0) +; RV32LMULMAX2-NEXT: vsext.vf4 v8, v10 +; RV32LMULMAX2-NEXT: vsext.vf4 v10, v12 +; RV32LMULMAX2-NEXT: vsext.vf4 v12, v14 +; RV32LMULMAX2-NEXT: vsext.vf4 v14, v16 ; RV32LMULMAX2-NEXT: ret ; ; RV64LMULMAX2-LABEL: stepvector_v16i64: diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index 4ece90dac18ac0..ee91bae6b6e027 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -661,22 +661,25 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV32MV-NEXT: vslide1down.vx v8, v8, a0 ; RV32MV-NEXT: vslide1down.vx v8, v8, a1 ; RV32MV-NEXT: vslidedown.vi v8, v8, 2 -; RV32MV-NEXT: li a0, 85 -; RV32MV-NEXT: vmv.s.x v0, a0 +; RV32MV-NEXT: li a0, 511 +; RV32MV-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32MV-NEXT: vmv.v.x v10, a0 +; RV32MV-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32MV-NEXT: vsext.vf4 v12, v10 +; RV32MV-NEXT: vand.vv v8, v8, v12 +; RV32MV-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32MV-NEXT: vmv.v.i v10, 1 -; RV32MV-NEXT: vmerge.vim v10, v10, -1, v0 -; RV32MV-NEXT: vand.vv v8, v8, v10 -; RV32MV-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; RV32MV-NEXT: vmv.v.i v11, 0 +; RV32MV-NEXT: vsetivli zero, 3, e8, mf2, tu, ma +; RV32MV-NEXT: vslideup.vi v11, v10, 2 +; RV32MV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32MV-NEXT: vmv.v.i v10, 2 -; RV32MV-NEXT: vmv.v.i v12, 1 +; RV32MV-NEXT: vsetivli zero, 5, e8, mf2, tu, ma +; RV32MV-NEXT: vslideup.vi v11, v10, 4 ; RV32MV-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32MV-NEXT: vmv.v.i v14, 0 -; RV32MV-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV32MV-NEXT: vslideup.vi v14, v12, 2 -; RV32MV-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV32MV-NEXT: vslideup.vi v14, v10, 4 +; RV32MV-NEXT: vsext.vf4 v12, v11 ; RV32MV-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32MV-NEXT: vmsne.vv v0, v8, v14 +; RV32MV-NEXT: vmsne.vv v0, v8, v12 ; RV32MV-NEXT: vmv.v.i v8, 0 ; RV32MV-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32MV-NEXT: vsetivli zero, 1, e32, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll 
b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index dceae00d2e517d..f24b6107f15ada 100644 --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -542,28 +542,28 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV32MV-NEXT: vsub.vv v8, v8, v10 ; RV32MV-NEXT: vmul.vv v8, v8, v9 ; RV32MV-NEXT: vadd.vv v9, v8, v8 -; RV32MV-NEXT: vmv.v.i v10, 10 -; RV32MV-NEXT: li a1, 9 -; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; RV32MV-NEXT: lui a1, 41121 +; RV32MV-NEXT: addi a1, a1, -1527 +; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32MV-NEXT: vmv.s.x v10, a1 ; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV32MV-NEXT: vsll.vv v9, v9, v10 +; RV32MV-NEXT: vsext.vf2 v11, v10 +; RV32MV-NEXT: vsll.vv v9, v9, v11 ; RV32MV-NEXT: li a1, 2047 ; RV32MV-NEXT: vand.vx v8, v8, a1 -; RV32MV-NEXT: vmv.v.i v10, 0 -; RV32MV-NEXT: li a2, 1 -; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, tu, ma -; RV32MV-NEXT: vmv1r.v v11, v10 -; RV32MV-NEXT: vmv.s.x v11, a2 +; RV32MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV32MV-NEXT: vmv.v.i v10, 1 ; RV32MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV32MV-NEXT: vsext.vf2 v11, v10 ; RV32MV-NEXT: lui a2, %hi(.LCPI4_1) ; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_1) -; RV32MV-NEXT: vle16.v v12, (a2) +; RV32MV-NEXT: vle16.v v10, (a2) ; RV32MV-NEXT: vsrl.vv v8, v8, v11 ; RV32MV-NEXT: vor.vv v8, v8, v9 ; RV32MV-NEXT: vand.vx v8, v8, a1 -; RV32MV-NEXT: vmsltu.vv v0, v12, v8 -; RV32MV-NEXT: vmerge.vim v8, v10, -1, v0 +; RV32MV-NEXT: vmsltu.vv v0, v10, v8 +; RV32MV-NEXT: vmv.v.i v8, 0 +; RV32MV-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32MV-NEXT: vslidedown.vi v9, v8, 2 ; RV32MV-NEXT: vmv.x.s a1, v9 ; RV32MV-NEXT: slli a2, a1, 21 @@ -603,28 +603,28 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64MV-NEXT: vsub.vv v8, v8, v10 ; RV64MV-NEXT: vmul.vv v8, v8, v9 ; RV64MV-NEXT: vadd.vv v9, v8, v8 -; RV64MV-NEXT: vmv.v.i v10, 10 -; RV64MV-NEXT: li a1, 9 -; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; RV64MV-NEXT: lui a1, 41121 +; RV64MV-NEXT: addiw a1, a1, -1527 +; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64MV-NEXT: vmv.s.x v10, a1 ; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64MV-NEXT: vsll.vv v9, v9, v10 +; RV64MV-NEXT: vsext.vf2 v11, v10 +; RV64MV-NEXT: vsll.vv v9, v9, v11 ; RV64MV-NEXT: li a1, 2047 ; RV64MV-NEXT: vand.vx v8, v8, a1 -; RV64MV-NEXT: vmv.v.i v10, 0 -; RV64MV-NEXT: li a2, 1 -; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, tu, ma -; RV64MV-NEXT: vmv1r.v v11, v10 -; RV64MV-NEXT: vmv.s.x v11, a2 +; RV64MV-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64MV-NEXT: vmv.v.i v10, 1 ; RV64MV-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64MV-NEXT: vsext.vf2 v11, v10 ; RV64MV-NEXT: lui a2, %hi(.LCPI4_1) ; RV64MV-NEXT: addi a2, a2, %lo(.LCPI4_1) -; RV64MV-NEXT: vle16.v v12, (a2) +; RV64MV-NEXT: vle16.v v10, (a2) ; RV64MV-NEXT: vsrl.vv v8, v8, v11 ; RV64MV-NEXT: vor.vv v8, v8, v9 ; RV64MV-NEXT: vand.vx v8, v8, a1 -; RV64MV-NEXT: vmsltu.vv v0, v12, v8 -; RV64MV-NEXT: vmerge.vim v8, v10, -1, v0 +; RV64MV-NEXT: vmsltu.vv v0, v10, v8 +; RV64MV-NEXT: vmv.v.i v8, 0 +; RV64MV-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64MV-NEXT: vmv.x.s a1, v8 ; RV64MV-NEXT: andi a1, a1, 2047 ; RV64MV-NEXT: vslidedown.vi v9, v8, 1