From ef7900d18bcf1ffaa68336d741a4f38672b482cc Mon Sep 17 00:00:00 2001
From: Brandon Wu
Date: Wed, 8 Oct 2025 21:11:23 -0700
Subject: [PATCH 1/4] [RISCV][llvm] Preliminary P extension codegen support

This is the initial support for P extension codegen; it only includes a
small subset of the instructions: PADD_H, PADD_B, PSADD_H, PSADD_B,
PAADD_H, PAADD_B, PSADDU_H, PSADDU_B, PAADDU_H, PAADDU_B, PSUB_H, PSUB_B,
PDIF_H, PDIF_B, PSSUB_H, PSSUB_B, PASUB_H, PASUB_B, PDIFU_H, PDIFU_B,
PSSUBU_H, PSSUBU_B, PASUBU_H, PASUBU_B
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  29 ++
 llvm/lib/Target/RISCV/RISCVInstrInfoP.td      | 124 +++++
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td    |   6 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  21 +
 llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll       | 426 +++++++++++++++++
 llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll       | 430 ++++++++++++++++++
 6 files changed, 1035 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7123a2d706787..1eb8c9457ee6a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -279,6 +279,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::riscv_nxv32i8x2, &RISCV::VRN2M4RegClass);
   }
 
+  // fixed vector is stored in GPRs for P extension packed operations
+  if (Subtarget.hasStdExtP()) {
+    addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass);
+    addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass);
+    if (Subtarget.is64Bit()) {
+      addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass);
+    }
+  }
+
   // Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo()); @@ -479,6 +490,24 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FTRUNC, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FCANONICALIZE}; + if (Subtarget.hasStdExtP()) { + // load/store are already handled by pattern matching + SmallVector VTs = {MVT::v2i16, MVT::v4i8}; + if (Subtarget.is64Bit()) + VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8}); + for (auto VT : VTs) { + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::SSHLSAT, VT, Legal); + setOperationAction(ISD::USHLSAT, VT, Legal); + setOperationAction(ISD::BITCAST, VT, Custom); + setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VT, Legal); + setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Legal); + } + } + if (Subtarget.hasStdExtZfbfmin()) { setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::ConstantFP, MVT::bf16, Expand); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 7d8a9192d9847..c5e2f12aafb1e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -1455,3 +1455,127 @@ let Predicates = [HasStdExtP, IsRV32] in { def PMAXU_DW : RVPPairBinaryExchanged_rr<0b1111, 0b01, "pmaxu.dw">; def PMAXU_DB : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">; } // Predicates = [HasStdExtP, IsRV32] + +let Predicates = [HasStdExtP, IsRV64] in { + // Basic arithmetic patterns for v4i16 (16-bit elements in 64-bit GPR) + def: Pat<(v4i16 (add v4i16:$rs1, v4i16:$rs2)), (!cast("PADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (sub v4i16:$rs1, v4i16:$rs2)), (!cast("PSUB_H") GPR:$rs1, GPR:$rs2)>; + + // Saturating add/sub patterns for v4i16 + def: Pat<(v4i16 (saddsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (uaddsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSADDU_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (ssubsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSSUB_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (usubsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSSUBU_H") GPR:$rs1, GPR:$rs2)>; + + // Averaging patterns for v4i16 + def: Pat<(v4i16 (avgfloors v4i16:$rs1, v4i16:$rs2)), (!cast("PAADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (avgflooru v4i16:$rs1, v4i16:$rs2)), (!cast("PAADDU_H") GPR:$rs1, GPR:$rs2)>; + + // Averaging subtraction patterns for v4i16 + // PASUB_H: signed (a - b) >> 1 + def: Pat<(v4i16 (sra (sub v4i16:$rs1, v4i16:$rs2), (v4i16 (build_vector (XLenVT 1))))), + (!cast("PASUB_H") GPR:$rs1, GPR:$rs2)>; + // PASUBU_H: unsigned (a - b) >> 1 + def: Pat<(v4i16 (srl (sub v4i16:$rs1, v4i16:$rs2), (v4i16 (build_vector (XLenVT 1))))), + (!cast("PASUBU_H") GPR:$rs1, GPR:$rs2)>; + + // Absolute difference patterns for v4i16 + def: Pat<(v4i16 (abds v4i16:$rs1, v4i16:$rs2)), (!cast("PDIF_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (abdu v4i16:$rs1, v4i16:$rs2)), (!cast("PDIFU_H") GPR:$rs1, GPR:$rs2)>; + + // Basic arithmetic patterns for v8i8 (8-bit elements in 64-bit GPR) + def: Pat<(v8i8 (add v8i8:$rs1, v8i8:$rs2)), (!cast("PADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (sub v8i8:$rs1, v8i8:$rs2)), (!cast("PSUB_B") GPR:$rs1, GPR:$rs2)>; + + // Saturating add/sub patterns for v8i8 + def: Pat<(v8i8 (saddsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (uaddsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSADDU_B") GPR:$rs1, GPR:$rs2)>; + 
def: Pat<(v8i8 (ssubsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSSUB_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (usubsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSSUBU_B") GPR:$rs1, GPR:$rs2)>; + + // Averaging patterns for v8i8 + def: Pat<(v8i8 (avgfloors v8i8:$rs1, v8i8:$rs2)), (!cast("PAADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (avgflooru v8i8:$rs1, v8i8:$rs2)), (!cast("PAADDU_B") GPR:$rs1, GPR:$rs2)>; + + // Averaging subtraction patterns for v8i8 + // PASUB_B: signed (a - b) >> 1 + def: Pat<(v8i8 (sra (sub v8i8:$rs1, v8i8:$rs2), (v8i8 (build_vector (XLenVT 1))))), + (!cast("PASUB_B") GPR:$rs1, GPR:$rs2)>; + // PASUBU_B: unsigned (a - b) >> 1 + def: Pat<(v8i8 (srl (sub v8i8:$rs1, v8i8:$rs2), (v8i8 (build_vector (XLenVT 1))))), + (!cast("PASUBU_B") GPR:$rs1, GPR:$rs2)>; + + // Absolute difference patterns for v8i8 + def: Pat<(v8i8 (abds v8i8:$rs1, v8i8:$rs2)), (!cast("PDIF_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (abdu v8i8:$rs1, v8i8:$rs2)), (!cast("PDIFU_B") GPR:$rs1, GPR:$rs2)>; + + // Load/Store patterns for v4i16 and v8i8 (use regular GPR load/store since they're in GPRs) + def : StPat; + def : LdPat; + def : StPat; + def : LdPat; + + // Load/Store patterns for v2i32 (32-bit elements in 64-bit GPR) + def : StPat; + def : LdPat; +} // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP, IsRV32] in { + // Basic arithmetic patterns for v2i16 (16-bit elements in 32-bit GPR) + def: Pat<(v2i16 (add v2i16:$rs1, v2i16:$rs2)), (!cast("PADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (sub v2i16:$rs1, v2i16:$rs2)), (!cast("PSUB_H") GPR:$rs1, GPR:$rs2)>; + + // Saturating add/sub patterns for v2i16 + def: Pat<(v2i16 (saddsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (uaddsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSADDU_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (ssubsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSSUB_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (usubsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSSUBU_H") GPR:$rs1, GPR:$rs2)>; + + // Averaging patterns for v2i16 + def: Pat<(v2i16 (avgfloors v2i16:$rs1, v2i16:$rs2)), (!cast("PAADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (avgflooru v2i16:$rs1, v2i16:$rs2)), (!cast("PAADDU_H") GPR:$rs1, GPR:$rs2)>; + + // Averaging subtraction patterns for v2i16 + // PASUB_H: signed (a - b) >> 1 + def: Pat<(v2i16 (sra (sub v2i16:$rs1, v2i16:$rs2), (v2i16 (build_vector (XLenVT 1))))), + (!cast("PASUB_H") GPR:$rs1, GPR:$rs2)>; + // PASUBU_H: unsigned (a - b) >> 1 + def: Pat<(v2i16 (srl (sub v2i16:$rs1, v2i16:$rs2), (v2i16 (build_vector (XLenVT 1))))), + (!cast("PASUBU_H") GPR:$rs1, GPR:$rs2)>; + + // Absolute difference patterns for v2i16 + def: Pat<(v2i16 (abds v2i16:$rs1, v2i16:$rs2)), (!cast("PDIF_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (abdu v2i16:$rs1, v2i16:$rs2)), (!cast("PDIFU_H") GPR:$rs1, GPR:$rs2)>; + + // Basic arithmetic patterns for v4i8 (8-bit elements in 32-bit GPR) + def: Pat<(v4i8 (add v4i8:$rs1, v4i8:$rs2)), (!cast("PADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (sub v4i8:$rs1, v4i8:$rs2)), (!cast("PSUB_B") GPR:$rs1, GPR:$rs2)>; + + // Saturating add/sub patterns for v4i8 + def: Pat<(v4i8 (saddsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (uaddsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSADDU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (ssubsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSSUB_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (usubsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSSUBU_B") GPR:$rs1, GPR:$rs2)>; + + // Averaging patterns for v4i8 + def: Pat<(v4i8 (avgfloors 
v4i8:$rs1, v4i8:$rs2)), (!cast("PAADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (avgflooru v4i8:$rs1, v4i8:$rs2)), (!cast("PAADDU_B") GPR:$rs1, GPR:$rs2)>; + + // Averaging subtraction patterns for v4i8 + // PASUB_B: signed (a - b) >> 1 + def: Pat<(v4i8 (sra (sub v4i8:$rs1, v4i8:$rs2), (v4i8 (build_vector (XLenVT 1))))), + (!cast("PASUB_B") GPR:$rs1, GPR:$rs2)>; + // PASUBU_B: unsigned (a - b) >> 1 + def: Pat<(v4i8 (srl (sub v4i8:$rs1, v4i8:$rs2), (v4i8 (build_vector (XLenVT 1))))), + (!cast("PASUBU_B") GPR:$rs1, GPR:$rs2)>; + + // Absolute difference patterns for v4i8 + def: Pat<(v4i8 (abds v4i8:$rs1, v4i8:$rs2)), (!cast("PDIF_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (abdu v4i8:$rs1, v4i8:$rs2)), (!cast("PDIFU_B") GPR:$rs1, GPR:$rs2)>; + + // Load/Store patterns for v2i16 and v4i8 (use regular GPR load/store since they're in GPRs) + def : StPat; + def : LdPat; + def : StPat; + def : LdPat; +} // Predicates = [HasStdExtP, IsRV32] diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 6605a5ccdfde2..fcbb93a55375b 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -238,7 +238,11 @@ class RISCVRegisterClass regTypes, int align, dag regList> } class GPRRegisterClass - : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> { + : RISCVRegisterClass<[XLenVT, XLenFVT, + // P extension packed vector types: + // RV32: v2i16, v4i8 + // RV64: v2i32, v4i16, v8i8 + v2i16, v4i8, v2i32, v4i16, v8i8], 32, regList> { let RegInfos = XLenRI; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 7bc0b5b394828..e669175a3d8e1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -969,6 +969,13 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( if (isa(Ty)) return InstructionCost::getInvalid(); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->hasStdExtP() && isa(Ty)) { + return 1; // Treat as single instruction cost for now + } + // A build_vector (which is m1 sized or smaller) can be done in no // worse than one vslide1down.vx per element in the type. We could // in theory do an explode_vector in the inverse manner, but our @@ -1625,6 +1632,13 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!IsVectorType) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->hasStdExtP() && (isa(Dst) || isa(Src))) { + return 1; // Treat as single instruction cost for now + } + // FIXME: Need to compute legalizing cost for illegal types. The current // code handles only legal types and those which can be trivially // promoted to legal. 
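As a rough illustration of what these stubbed-out cost hooks are asked about (function name hypothetical, assuming a build with +experimental-p and no V extension): types such as <4 x i16> are now legal in GPRs, but the RVV-based costing paths still expect a configured vector register size, which is why the hunks above and the getVectorInstrCost hunk below return a flat cost of 1 for fixed vectors.

    ; Illustrative only: the kind of query the cost model receives from the
    ; vectorizer/SLP; with the stubs this extractelement is costed at 1
    ; instead of reaching getMinRVVVectorSizeInBits().
    define i16 @extract_lane(<4 x i16> %v) {
      %e = extractelement <4 x i16> %v, i32 2
      ret i16 %e
    }
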
@@ -2321,6 +2335,13 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, const Value *Op1) const { assert(Val->isVectorTy() && "This must be a vector type"); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->hasStdExtP() && isa(Val)) { + return 1; // Treat as single instruction cost for now + } + if (Opcode != Instruction::ExtractElement && Opcode != Instruction::InsertElement) return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll new file mode 100644 index 0000000000000..8a4ab1d545f41 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll @@ -0,0 +1,426 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s | FileCheck %s + +; Test basic add/sub operations for v2i16 +define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: padd.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = add <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psub.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = sub <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test basic add/sub operations for v4i8 +define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: padd.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = add <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psub.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = sub <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v2i16 +define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psaddu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + 
%res = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v2i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v2i16 +define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paadd.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %ext.a = sext <2 x i16> %a to <2 x i32> + %ext.b = sext <2 x i16> %b to <2 x i32> + %add = add nsw <2 x i32> %ext.a, %ext.b + %shift = ashr <2 x i32> %add, + %res = trunc <2 x i32> %shift to <2 x i16> + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test 
averaging floor unsigned operations for v2i16 +define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paaddu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %and = and <2 x i16> %a, %b + %xor = xor <2 x i16> %a, %b + %shift = lshr <2 x i16> %xor, + %res = add <2 x i16> %and, %shift + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v4i8 +define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paadd.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %ext.a = sext <4 x i8> %a to <4 x i16> + %ext.b = sext <4 x i8> %b to <4 x i16> + %add = add nsw <4 x i16> %ext.a, %ext.b + %shift = ashr <4 x i16> %add, + %res = trunc <4 x i16> %shift to <4 x i8> + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v4i8 +define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paaddu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %and = and <4 x i8> %a, %b + %xor = xor <4 x i8> %a, %b + %shift = lshr <4 x i8> %xor, + %res = add <4 x i8> %and, %shift + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v2i16 +define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdif.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b) + %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %a, <2 x i16> %b) + %res = sub <2 x i16> %max, %min + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v2i16 +define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdifu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %a, <2 x i16> %b) + %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %a, <2 x i16> %b) + %res = sub <2 x i16> %max, %min + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v4i8 +define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdif.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %min = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %a, <4 x i8> %b) + %max = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %a, <4 x i8> %b) + %res = sub <4 x i8> %max, %min + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v4i8 +define void 
@test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdifu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %min = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %a, <4 x i8> %b) + %max = call <4 x i8> @llvm.umax.v4i8(<4 x i8> %a, <4 x i8> %b) + %res = sub <4 x i8> %max, %min + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v2i16 +; pasub pattern: (a - b) arithmetic shift right 1 +define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasub.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %sub = sub <2 x i16> %a, %b + %res = ashr <2 x i16> %sub, + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v2i16 +; pasubu pattern: (a - b) logical shift right 1 +define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasubu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %sub = sub <2 x i16> %a, %b + %res = lshr <2 x i16> %sub, + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v4i8 +define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasub.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %sub = sub <4 x i8> %a, %b + %res = ashr <4 x i8> %sub, + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v4i8 +define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasubu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %sub = sub <4 x i8> %a, %b + %res = lshr <4 x i8> %sub, + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Intrinsic declarations +declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i8> @llvm.smin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.smax.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) diff --git 
a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll new file mode 100644 index 0000000000000..d4918e4e0aa62 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -0,0 +1,430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -verify-machineinstrs < %s | FileCheck %s + +; Test basic add/sub operations for v4i16 +define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = add <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = sub <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test basic add/sub operations for v8i8 +define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = add <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = sub <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i16 +define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr 
%a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v8i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v8i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v4i16 +; avgfloors pattern: (a + b) arithmetic shift right 1 +define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %ext.a = sext <4 x i16> %a to <4 x i32> + %ext.b = sext <4 x i16> %b to <4 x i32> + %add = add nsw <4 x i32> %ext.a, %ext.b + %shift = ashr <4 x i32> %add, + %res = trunc <4 x i32> %shift to <4 x i16> + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v4i16 +; avgflooru pattern: (a & b) + ((a ^ b) >> 1) +define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %and = and <4 x i16> %a, %b + %xor = xor <4 x i16> %a, %b + %shift = lshr <4 x i16> %xor, + %res = add <4 x i16> %and, %shift + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test 
averaging floor signed operations for v8i8 +define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %ext.a = sext <8 x i8> %a to <8 x i16> + %ext.b = sext <8 x i8> %b to <8 x i16> + %add = add nsw <8 x i16> %ext.a, %ext.b + %shift = ashr <8 x i16> %add, + %res = trunc <8 x i16> %shift to <8 x i8> + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v8i8 +define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %and = and <8 x i8> %a, %b + %xor = xor <8 x i8> %a, %b + %shift = lshr <8 x i8> %xor, + %res = add <8 x i8> %and, %shift + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v4i16 +; abds pattern: sub(smax(a,b), smin(a,b)) +define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v4i16 +; abdu pattern: sub(umax(a,b), umin(a,b)) +define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v8i8 +define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v8i8 +define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> 
%res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v4i16 +; pasub pattern: (a - b) arithmetic shift right 1 +define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %sub = sub <4 x i16> %a, %b + %res = ashr <4 x i16> %sub, + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v4i16 +; pasubu pattern: (a - b) logical shift right 1 +define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %sub = sub <4 x i16> %a, %b + %res = lshr <4 x i16> %sub, + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v8i8 +define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %sub = sub <8 x i8> %a, %b + %res = ashr <8 x i8> %sub, + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v8i8 +define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %sub = sub <8 x i8> %a, %b + %res = lshr <8 x i8> %sub, + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Intrinsic declarations +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>) From bc91902b4aae954be8efd5b505decc0b5570bf35 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Wed, 15 Oct 2025 02:05:07 -0700 Subject: [PATCH 2/4] fixup! 
fix pasub, add BUILD_VECTOR, fix comments --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 1 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 126 ++++++++++++++- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 8 + llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 146 ++++++++---------- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 3 +- llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll | 58 +++++-- llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll | 70 +++++++-- 7 files changed, 303 insertions(+), 109 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 21dbb7cbc9844..a50e19a85263f 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -809,6 +809,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isSImm5() const { return isSImm<5>(); } bool isSImm6() const { return isSImm<6>(); } + bool isSImm8() const { return isSImm<8>(); } bool isSImm10() const { return isSImm<10>(); } bool isSImm11() const { return isSImm<11>(); } bool isSImm12() const { return isSImm<12>(); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1eb8c9457ee6a..33c0e6bc66ad9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -491,11 +491,19 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FROUNDEVEN, ISD::FCANONICALIZE}; if (Subtarget.hasStdExtP()) { + setTargetDAGCombine(ISD::TRUNCATE); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); // load/store are already handled by pattern matching SmallVector VTs = {MVT::v2i16, MVT::v4i8}; - if (Subtarget.is64Bit()) + if (Subtarget.is64Bit()) { VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8}); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + } for (auto VT : VTs) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); @@ -4340,6 +4348,34 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, MVT XLenVT = Subtarget.getXLenVT(); SDLoc DL(Op); + // Handle P extension packed vector BUILD_VECTOR with PLI for splat constants + if (Subtarget.hasStdExtP()) { + bool IsPExtVector = + (VT == MVT::v2i16 || VT == MVT::v4i8) || + (Subtarget.is64Bit() && + (VT == MVT::v4i16 || VT == MVT::v8i8 || VT == MVT::v2i32)); + if (IsPExtVector) { + if (SDValue SplatValue = cast(Op)->getSplatValue()) { + if (auto *C = dyn_cast(SplatValue)) { + int64_t SplatImm = C->getSExtValue(); + bool IsValidImm = false; + + // Check immediate range based on vector type + if (VT == MVT::v8i8 || VT == MVT::v4i8) + // PLI_B uses 8-bit unsigned immediate + IsValidImm = isUInt<8>(SplatImm); + else + // PLI_H and PLI_W use 10-bit signed immediate + IsValidImm = isInt<10>(SplatImm); + + if (IsValidImm) { + SDValue Imm = DAG.getConstant(SplatImm, DL, XLenVT); + return DAG.getNode(RISCVISD::PLI, DL, VT, Imm); + } + } + } + } + } // Proper support for f16 requires Zvfh. bf16 always requires special // handling. 
We need to cast the scalar to integer and create an integer @@ -16025,11 +16061,99 @@ static SDValue combineTruncSelectToSMaxUSat(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::TRUNCATE, DL, VT, Min); } +// Handle P extension averaging subtraction pattern: +// (vXiY (trunc (srl (sub ([s|z]ext vXiY:$a), ([s|z]ext vXiY:$b)), 1))) +// -> PASUB/PASUBU +static SDValue combinePExtTruncate(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + if (!Subtarget.hasStdExtP() || !VT.isFixedLengthVector()) + return SDValue(); + + if (N0.getOpcode() != ISD::SRL) + return SDValue(); + + // Check if shift amount is 1 + SDValue ShAmt = N0.getOperand(1); + if (ShAmt.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + BuildVectorSDNode *BV = dyn_cast(ShAmt.getNode()); + if (!BV) + return SDValue(); + SDValue Splat = BV->getSplatValue(); + if (!Splat) + return SDValue(); + ConstantSDNode *C = dyn_cast(Splat); + if (!C) + return SDValue(); + if (C->getZExtValue() != 1) + return SDValue(); + + // Check for SUB operation + SDValue Sub = N0.getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return SDValue(); + + SDValue LHS = Sub.getOperand(0); + SDValue RHS = Sub.getOperand(1); + + // Check if both operands are sign/zero extends from the target + // type + bool IsSignExt = LHS.getOpcode() == ISD::SIGN_EXTEND && + RHS.getOpcode() == ISD::SIGN_EXTEND; + bool IsZeroExt = LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSignExt && !IsZeroExt) + return SDValue(); + + SDValue A = LHS.getOperand(0); + SDValue B = RHS.getOperand(0); + + // Check if the extends are from our target vector type + if (A.getValueType() != VT || B.getValueType() != VT) + return SDValue(); + + // Determine the instruction based on type and signedness + unsigned Opc; + MVT VecVT = VT.getSimpleVT(); + if (VecVT == MVT::v4i16 && IsSignExt) + Opc = RISCV::PASUB_H; + else if (VecVT == MVT::v4i16 && IsZeroExt) + Opc = RISCV::PASUBU_H; + else if (VecVT == MVT::v2i16 && IsSignExt) + Opc = RISCV::PASUB_H; + else if (VecVT == MVT::v2i16 && IsZeroExt) + Opc = RISCV::PASUBU_H; + else if (VecVT == MVT::v8i8 && IsSignExt) + Opc = RISCV::PASUB_B; + else if (VecVT == MVT::v8i8 && IsZeroExt) + Opc = RISCV::PASUBU_B; + else if (VecVT == MVT::v4i8 && IsSignExt) + Opc = RISCV::PASUB_B; + else if (VecVT == MVT::v4i8 && IsZeroExt) + Opc = RISCV::PASUBU_B; + else if (VecVT == MVT::v2i32 && IsSignExt) + Opc = RISCV::PASUB_W; + else if (VecVT == MVT::v2i32 && IsZeroExt) + Opc = RISCV::PASUBU_W; + else + return SDValue(); + + // Create the machine node directly + return SDValue(DAG.getMachineNode(Opc, SDLoc(N), VT, {A, B}), 0); +} + static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if (Subtarget.hasStdExtP() && VT.isFixedLengthVector()) + return combinePExtTruncate(N, DAG, Subtarget); + // Pre-promote (i1 (truncate (srl X, Y))) on RV64 with Zbs without zero // extending X. This is safe since we only need the LSB after the shift and // shift amounts larger than 31 would produce poison. 
If we wait until diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 96e1078467f19..7b143311d27e0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2895,6 +2895,12 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM9_LSB000: Ok = isShiftedUInt<6, 3>(Imm); break; + case RISCVOp::OPERAND_SIMM8_UNSIGNED: + Ok = isInt<8>(Imm); + break; + case RISCVOp::OPERAND_SIMM10_UNSIGNED: + Ok = isInt<10>(Imm); + break; case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO: Ok = isShiftedInt<6, 4>(Imm) && (Imm != 0); break; @@ -2916,6 +2922,8 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, // clang-format off CASE_OPERAND_SIMM(5) CASE_OPERAND_SIMM(6) + CASE_OPERAND_SIMM(8) + CASE_OPERAND_SIMM(10) CASE_OPERAND_SIMM(11) CASE_OPERAND_SIMM(12) CASE_OPERAND_SIMM(26) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index c5e2f12aafb1e..e6d4aa8070e0a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,7 +18,7 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// -def simm10 : RISCVSImmOp<10>; +def simm10 : RISCVSImmLeafOp<10>; def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { let RenderMethod = "addSImm8UnsignedOperands"; @@ -26,7 +26,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { // A 8-bit signed immediate allowing range [-128, 255] // but represented as [-128, 127]. -def simm8_unsigned : RISCVOp { +def simm8_unsigned : RISCVSImmLeafOp<8> { let ParserMatchClass = SImm8UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; let DecoderMethod = "decodeSImmOperand<8>"; @@ -1456,59 +1456,49 @@ let Predicates = [HasStdExtP, IsRV32] in { def PMAXU_DB : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">; } // Predicates = [HasStdExtP, IsRV32] +def SDT_RISCVPLI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; +def riscv_pli : RVSDNode<"PLI", SDT_RISCVPLI>; let Predicates = [HasStdExtP, IsRV64] in { // Basic arithmetic patterns for v4i16 (16-bit elements in 64-bit GPR) - def: Pat<(v4i16 (add v4i16:$rs1, v4i16:$rs2)), (!cast("PADD_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i16 (sub v4i16:$rs1, v4i16:$rs2)), (!cast("PSUB_H") GPR:$rs1, GPR:$rs2)>; - + def: Pat<(v4i16 (add (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (sub (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PSUB_H") GPR:$rs1, GPR:$rs2)>; + // Saturating add/sub patterns for v4i16 - def: Pat<(v4i16 (saddsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSADD_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i16 (uaddsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSADDU_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i16 (ssubsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSSUB_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i16 (usubsat v4i16:$rs1, v4i16:$rs2)), (!cast("PSSUBU_H") GPR:$rs1, GPR:$rs2)>; - + def: Pat<(v4i16 (saddsat (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PSADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (uaddsat (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PSADDU_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (ssubsat (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PSSUB_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (usubsat (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PSSUBU_H") GPR:$rs1, GPR:$rs2)>; + // Averaging patterns for v4i16 - def: Pat<(v4i16 (avgfloors v4i16:$rs1, 
v4i16:$rs2)), (!cast("PAADD_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i16 (avgflooru v4i16:$rs1, v4i16:$rs2)), (!cast("PAADDU_H") GPR:$rs1, GPR:$rs2)>; - - // Averaging subtraction patterns for v4i16 - // PASUB_H: signed (a - b) >> 1 - def: Pat<(v4i16 (sra (sub v4i16:$rs1, v4i16:$rs2), (v4i16 (build_vector (XLenVT 1))))), - (!cast("PASUB_H") GPR:$rs1, GPR:$rs2)>; - // PASUBU_H: unsigned (a - b) >> 1 - def: Pat<(v4i16 (srl (sub v4i16:$rs1, v4i16:$rs2), (v4i16 (build_vector (XLenVT 1))))), - (!cast("PASUBU_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (avgfloors (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PAADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (avgflooru (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PAADDU_H") GPR:$rs1, GPR:$rs2)>; // Absolute difference patterns for v4i16 - def: Pat<(v4i16 (abds v4i16:$rs1, v4i16:$rs2)), (!cast("PDIF_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i16 (abdu v4i16:$rs1, v4i16:$rs2)), (!cast("PDIFU_H") GPR:$rs1, GPR:$rs2)>; - - // Basic arithmetic patterns for v8i8 (8-bit elements in 64-bit GPR) - def: Pat<(v8i8 (add v8i8:$rs1, v8i8:$rs2)), (!cast("PADD_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v8i8 (sub v8i8:$rs1, v8i8:$rs2)), (!cast("PSUB_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (abds (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PDIF_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i16 (abdu (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PDIFU_H") GPR:$rs1, GPR:$rs2)>; + // Basic arithmetic patterns for v8i8 (8-bit elements in 64-bit GPR) + def: Pat<(v8i8 (add (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (sub (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PSUB_B") GPR:$rs1, GPR:$rs2)>; + // Saturating add/sub patterns for v8i8 - def: Pat<(v8i8 (saddsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSADD_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v8i8 (uaddsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSADDU_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v8i8 (ssubsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSSUB_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v8i8 (usubsat v8i8:$rs1, v8i8:$rs2)), (!cast("PSSUBU_B") GPR:$rs1, GPR:$rs2)>; - + def: Pat<(v8i8 (saddsat (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PSADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (uaddsat (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PSADDU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (ssubsat (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PSSUB_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (usubsat (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PSSUBU_B") GPR:$rs1, GPR:$rs2)>; + // Averaging patterns for v8i8 - def: Pat<(v8i8 (avgfloors v8i8:$rs1, v8i8:$rs2)), (!cast("PAADD_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v8i8 (avgflooru v8i8:$rs1, v8i8:$rs2)), (!cast("PAADDU_B") GPR:$rs1, GPR:$rs2)>; - - // Averaging subtraction patterns for v8i8 - // PASUB_B: signed (a - b) >> 1 - def: Pat<(v8i8 (sra (sub v8i8:$rs1, v8i8:$rs2), (v8i8 (build_vector (XLenVT 1))))), - (!cast("PASUB_B") GPR:$rs1, GPR:$rs2)>; - // PASUBU_B: unsigned (a - b) >> 1 - def: Pat<(v8i8 (srl (sub v8i8:$rs1, v8i8:$rs2), (v8i8 (build_vector (XLenVT 1))))), - (!cast("PASUBU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (avgfloors (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PAADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (avgflooru (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PAADDU_B") GPR:$rs1, GPR:$rs2)>; // Absolute difference patterns for v8i8 - def: Pat<(v8i8 (abds v8i8:$rs1, v8i8:$rs2)), (!cast("PDIF_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v8i8 (abdu v8i8:$rs1, v8i8:$rs2)), (!cast("PDIFU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (abds (v8i8 GPR:$rs1), 
(v8i8 GPR:$rs2))), (!cast("PDIF_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v8i8 (abdu (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), (!cast("PDIFU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (riscv_pli simm10:$imm10)), (PLI_W simm10:$imm10)>; + def: Pat<(v4i16 (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>; + def: Pat<(v8i8 (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; + // Load/Store patterns for v4i16 and v8i8 (use regular GPR load/store since they're in GPRs) def : StPat; def : LdPat; @@ -1520,62 +1510,48 @@ let Predicates = [HasStdExtP, IsRV64] in { def : LdPat; } // Predicates = [HasStdExtP, IsRV64] -let Predicates = [HasStdExtP, IsRV32] in { +let Predicates = [HasStdExtP] in { // Basic arithmetic patterns for v2i16 (16-bit elements in 32-bit GPR) - def: Pat<(v2i16 (add v2i16:$rs1, v2i16:$rs2)), (!cast("PADD_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v2i16 (sub v2i16:$rs1, v2i16:$rs2)), (!cast("PSUB_H") GPR:$rs1, GPR:$rs2)>; - + def: Pat<(v2i16 (add (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (sub (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PSUB_H") GPR:$rs1, GPR:$rs2)>; + // Saturating add/sub patterns for v2i16 - def: Pat<(v2i16 (saddsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSADD_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v2i16 (uaddsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSADDU_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v2i16 (ssubsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSSUB_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v2i16 (usubsat v2i16:$rs1, v2i16:$rs2)), (!cast("PSSUBU_H") GPR:$rs1, GPR:$rs2)>; - + def: Pat<(v2i16 (saddsat (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PSADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (uaddsat (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PSADDU_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (ssubsat (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PSSUB_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (usubsat (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PSSUBU_H") GPR:$rs1, GPR:$rs2)>; + // Averaging patterns for v2i16 - def: Pat<(v2i16 (avgfloors v2i16:$rs1, v2i16:$rs2)), (!cast("PAADD_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v2i16 (avgflooru v2i16:$rs1, v2i16:$rs2)), (!cast("PAADDU_H") GPR:$rs1, GPR:$rs2)>; - - // Averaging subtraction patterns for v2i16 - // PASUB_H: signed (a - b) >> 1 - def: Pat<(v2i16 (sra (sub v2i16:$rs1, v2i16:$rs2), (v2i16 (build_vector (XLenVT 1))))), - (!cast("PASUB_H") GPR:$rs1, GPR:$rs2)>; - // PASUBU_H: unsigned (a - b) >> 1 - def: Pat<(v2i16 (srl (sub v2i16:$rs1, v2i16:$rs2), (v2i16 (build_vector (XLenVT 1))))), - (!cast("PASUBU_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (avgfloors (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PAADD_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (avgflooru (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PAADDU_H") GPR:$rs1, GPR:$rs2)>; // Absolute difference patterns for v2i16 - def: Pat<(v2i16 (abds v2i16:$rs1, v2i16:$rs2)), (!cast("PDIF_H") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v2i16 (abdu v2i16:$rs1, v2i16:$rs2)), (!cast("PDIFU_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (abds (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PDIF_H") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (abdu (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), (!cast("PDIFU_H") GPR:$rs1, GPR:$rs2)>; // Basic arithmetic patterns for v4i8 (8-bit elements in 32-bit GPR) - def: Pat<(v4i8 (add v4i8:$rs1, v4i8:$rs2)), (!cast("PADD_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i8 (sub v4i8:$rs1, v4i8:$rs2)), (!cast("PSUB_B") GPR:$rs1, GPR:$rs2)>; - + def: Pat<(v4i8 (add (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PADD_B") 
GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (sub (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PSUB_B") GPR:$rs1, GPR:$rs2)>; + // Saturating add/sub patterns for v4i8 - def: Pat<(v4i8 (saddsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSADD_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i8 (uaddsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSADDU_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i8 (ssubsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSSUB_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i8 (usubsat v4i8:$rs1, v4i8:$rs2)), (!cast("PSSUBU_B") GPR:$rs1, GPR:$rs2)>; - + def: Pat<(v4i8 (saddsat (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PSADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (uaddsat (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PSADDU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (ssubsat (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PSSUB_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (usubsat (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PSSUBU_B") GPR:$rs1, GPR:$rs2)>; + // Averaging patterns for v4i8 - def: Pat<(v4i8 (avgfloors v4i8:$rs1, v4i8:$rs2)), (!cast("PAADD_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i8 (avgflooru v4i8:$rs1, v4i8:$rs2)), (!cast("PAADDU_B") GPR:$rs1, GPR:$rs2)>; - - // Averaging subtraction patterns for v4i8 - // PASUB_B: signed (a - b) >> 1 - def: Pat<(v4i8 (sra (sub v4i8:$rs1, v4i8:$rs2), (v4i8 (build_vector (XLenVT 1))))), - (!cast("PASUB_B") GPR:$rs1, GPR:$rs2)>; - // PASUBU_B: unsigned (a - b) >> 1 - def: Pat<(v4i8 (srl (sub v4i8:$rs1, v4i8:$rs2), (v4i8 (build_vector (XLenVT 1))))), - (!cast("PASUBU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (avgfloors (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PAADD_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (avgflooru (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PAADDU_B") GPR:$rs1, GPR:$rs2)>; // Absolute difference patterns for v4i8 - def: Pat<(v4i8 (abds v4i8:$rs1, v4i8:$rs2)), (!cast("PDIF_B") GPR:$rs1, GPR:$rs2)>; - def: Pat<(v4i8 (abdu v4i8:$rs1, v4i8:$rs2)), (!cast("PDIFU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (abds (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PDIF_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v4i8 (abdu (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), (!cast("PDIFU_B") GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i16 (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>; + def: Pat<(v4i8 (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; // Load/Store patterns for v2i16 and v4i8 (use regular GPR load/store since they're in GPRs) def : StPat; def : LdPat; def : StPat; def : LdPat; -} // Predicates = [HasStdExtP, IsRV32] +} // Predicates = [HasStdExtP] diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index e669175a3d8e1..a07d441b448d2 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1635,7 +1635,8 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) // For now, skip all fixed vector cost analysis when P extension is available // to avoid crashes in getMinRVVVectorSizeInBits() - if (ST->hasStdExtP() && (isa(Dst) || isa(Src))) { + if (ST->hasStdExtP() && + (isa(Dst) || isa(Src))) { return 1; // Treat as single instruction cost for now } diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll index 8a4ab1d545f41..c38f103d86bd6 100644 --- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll @@ -349,9 +349,12 @@ define void @test_pasub_h(ptr %ret_ptr, ptr 
%a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %a_ptr %b = load <2 x i16>, ptr %b_ptr - %sub = sub <2 x i16> %a, %b - %res = ashr <2 x i16> %sub, - store <2 x i16> %res, ptr %ret_ptr + %a_ext = sext <2 x i16> %a to <2 x i32> + %b_ext = sext <2 x i16> %b to <2 x i32> + %sub = sub <2 x i32> %a_ext, %b_ext + %res = ashr <2 x i32> %sub, + %res_trunc = trunc <2 x i32> %res to <2 x i16> + store <2 x i16> %res_trunc, ptr %ret_ptr ret void } @@ -367,9 +370,12 @@ define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %a_ptr %b = load <2 x i16>, ptr %b_ptr - %sub = sub <2 x i16> %a, %b - %res = lshr <2 x i16> %sub, - store <2 x i16> %res, ptr %ret_ptr + %a_ext = zext <2 x i16> %a to <2 x i32> + %b_ext = zext <2 x i16> %b to <2 x i32> + %sub = sub <2 x i32> %a_ext, %b_ext + %res = lshr <2 x i32> %sub, + %res_trunc = trunc <2 x i32> %res to <2 x i16> + store <2 x i16> %res_trunc, ptr %ret_ptr ret void } @@ -384,9 +390,12 @@ define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %a_ptr %b = load <4 x i8>, ptr %b_ptr - %sub = sub <4 x i8> %a, %b - %res = ashr <4 x i8> %sub, - store <4 x i8> %res, ptr %ret_ptr + %a_ext = sext <4 x i8> %a to <4 x i16> + %b_ext = sext <4 x i8> %b to <4 x i16> + %sub = sub <4 x i16> %a_ext, %b_ext + %res = ashr <4 x i16> %sub, + %res_trunc = trunc <4 x i16> %res to <4 x i8> + store <4 x i8> %res_trunc, ptr %ret_ptr ret void } @@ -401,8 +410,35 @@ define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %a_ptr %b = load <4 x i8>, ptr %b_ptr - %sub = sub <4 x i8> %a, %b - %res = lshr <4 x i8> %sub, + %a_ext = zext <4 x i8> %a to <4 x i16> + %b_ext = zext <4 x i8> %b to <4 x i16> + %sub = sub <4 x i16> %a_ext, %b_ext + %res = lshr <4 x i16> %sub, + %res_trunc = trunc <4 x i16> %res to <4 x i8> + store <4 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test PLI (pack load immediate) for v2i16 +define void @test_pli_h(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.h a1, 42 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %res = add <2 x i16> , + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test PLI for v4i8 with unsigned immediate +define void @test_pli_b(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.b a1, 32 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %res = add <4 x i8> , store <4 x i8> %res, ptr %ret_ptr ret void } diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll index d4918e4e0aa62..d4452b7ccbc65 100644 --- a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -353,9 +353,12 @@ define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %a_ptr %b = load <4 x i16>, ptr %b_ptr - %sub = sub <4 x i16> %a, %b - %res = ashr <4 x i16> %sub, - store <4 x i16> %res, ptr %ret_ptr + %a_ext = sext <4 x i16> %a to <4 x i32> + %b_ext = sext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = ashr <4 x i32> %sub, + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr ret void } @@ -371,9 +374,12 @@ define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %a_ptr %b = load <4 x i16>, ptr %b_ptr - %sub = sub <4 x i16> %a, %b - %res = lshr <4 x i16> %sub, - store <4 x i16> %res, ptr %ret_ptr + 
%a_ext = zext <4 x i16> %a to <4 x i32> + %b_ext = zext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = lshr <4 x i32> %sub, + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr ret void } @@ -388,9 +394,12 @@ define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %a_ptr %b = load <8 x i8>, ptr %b_ptr - %sub = sub <8 x i8> %a, %b - %res = ashr <8 x i8> %sub, - store <8 x i8> %res, ptr %ret_ptr + %a_ext = sext <8 x i8> %a to <8 x i16> + %b_ext = sext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = ashr <8 x i16> %sub, + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr ret void } @@ -405,12 +414,51 @@ define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %a_ptr %b = load <8 x i8>, ptr %b_ptr - %sub = sub <8 x i8> %a, %b - %res = lshr <8 x i8> %sub, + %a_ext = zext <8 x i8> %a to <8 x i16> + %b_ext = zext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = lshr <8 x i16> %sub, + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test PLI (pack load immediate) for v4i16 +define void @test_pli_h(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.h a1, 100 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <4 x i16> , + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test PLI for v8i8 with unsigned immediate +define void @test_pli_b(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.b a1, 64 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <8 x i8> , store <8 x i8> %res, ptr %ret_ptr ret void } +; Test PLI for v2i32 with signed immediate +define void @test_pli_w(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_w: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.w a1, -256 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <2 x i32> , + store <2 x i32> %res, ptr %ret_ptr + ret void +} + ; Intrinsic declarations declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) From 34dc464b646e0e196431212930e6d9c708387f98 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Wed, 15 Oct 2025 02:34:20 -0700 Subject: [PATCH 3/4] fixup! 
add a switch for codegen
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 14 ++++++++++----
 llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll     |  2 +-
 llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll     |  2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 33c0e6bc66ad9..9eb98766e4846 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -87,6 +87,12 @@ static cl::opt
                    "be combined with a shift"),
           cl::init(true));

+static cl::opt<bool> EnablePExtCodeGen(
+    DEBUG_TYPE "-enable-p-ext-codegen", cl::Hidden,
+    cl::desc("Turn on P Extension codegen (this is a temporary switch where "
+             "only partial codegen is currently supported)."),
+    cl::init(false));
+
 RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                                          const RISCVSubtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -280,7 +286,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   }

   // fixed vector is stored in GPRs for P extension packed operations
-  if (Subtarget.hasStdExtP()) {
+  if (Subtarget.hasStdExtP() && EnablePExtCodeGen) {
     addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass);
     addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass);
     if (Subtarget.is64Bit()) {
@@ -490,7 +496,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       ISD::FTRUNC,     ISD::FRINT,     ISD::FROUND,
       ISD::FROUNDEVEN, ISD::FCANONICALIZE};

-  if (Subtarget.hasStdExtP()) {
+  if (Subtarget.hasStdExtP() && EnablePExtCodeGen) {
     setTargetDAGCombine(ISD::TRUNCATE);
     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
@@ -4349,7 +4355,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   SDLoc DL(Op);

   // Handle P extension packed vector BUILD_VECTOR with PLI for splat constants
-  if (Subtarget.hasStdExtP()) {
+  if (Subtarget.hasStdExtP() && EnablePExtCodeGen) {
     bool IsPExtVector =
         (VT == MVT::v2i16 || VT == MVT::v4i8) ||
         (Subtarget.is64Bit() &&
@@ -16151,7 +16157,7 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);

-  if (Subtarget.hasStdExtP() && VT.isFixedLengthVector())
+  if (Subtarget.hasStdExtP() && VT.isFixedLengthVector() && EnablePExtCodeGen)
     return combinePExtTruncate(N, DAG, Subtarget);

   // Pre-promote (i1 (truncate (srl X, Y))) on RV64 with Zbs without zero
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
index c38f103d86bd6..2cb93f1faab91 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -riscv-lower-enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s

 ; Test basic add/sub operations for v2i16
 define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
index d4452b7ccbc65..67cfb0e2123a4 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64
-mattr=+experimental-p -riscv-lower-enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s ; Test basic add/sub operations for v4i16 define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { From 47c5eb436007750bf7563b669c88645840424ecd Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Sun, 19 Oct 2025 23:33:56 -0700 Subject: [PATCH 4/4] fixup! handle rv32 legalization --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 135 ++++++++++++++------ llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 + llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 29 +++++ llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll | 1 + 5 files changed, 138 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 437022f5cde9f..4c1a9929d2574 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2471,6 +2471,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->RemoveDeadNode(Node); return; } + if (Subtarget->hasStdExtP()) { + if (((VT == MVT::v4i16 || VT == MVT::v8i8) && SrcVT == MVT::i64) || + ((SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) && VT == MVT::i64)) { + ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); + CurDAG->RemoveDeadNode(Node); + } + return; + } break; } case ISD::INSERT_SUBVECTOR: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9eb98766e4846..7a5b6bb2b90d0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -287,12 +287,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // fixed vector is stored in GPRs for P extension packed operations if (Subtarget.hasStdExtP() && EnablePExtCodeGen) { - addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass); - addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass); if (Subtarget.is64Bit()) { addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass); addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass); addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass); + } else { + addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass); } } @@ -500,26 +501,31 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::TRUNCATE); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); - // load/store are already handled by pattern matching - SmallVector VTs = {MVT::v2i16, MVT::v4i8}; + SmallVector VTs; if (Subtarget.is64Bit()) { VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8}); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + setOperationAction(ISD::LOAD, MVT::v2i16, Custom); + setOperationAction(ISD::LOAD, MVT::v4i8, Custom); + setOperationAction(ISD::STORE, MVT::v2i16, Custom); + setOperationAction(ISD::STORE, MVT::v4i8, Custom); + } else { + VTs.append({MVT::v2i16, MVT::v4i8}); } - for (auto VT : VTs) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::UADDSAT, VT, Legal); - setOperationAction(ISD::SADDSAT, VT, Legal); - setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::SSUBSAT, VT, Legal); - setOperationAction(ISD::SSHLSAT, VT, Legal); - setOperationAction(ISD::USHLSAT, VT, Legal); - 
setOperationAction(ISD::BITCAST, VT, Custom); - setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VT, Legal); - setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Legal); - } + setOperationAction(ISD::UADDSAT, VTs, Legal); + setOperationAction(ISD::SADDSAT, VTs, Legal); + setOperationAction(ISD::USUBSAT, VTs, Legal); + setOperationAction(ISD::SSUBSAT, VTs, Legal); + setOperationAction(ISD::SSHLSAT, VTs, Legal); + setOperationAction(ISD::USHLSAT, VTs, Legal); + setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal); + setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal); + setOperationAction(ISD::BUILD_VECTOR, VTs, Custom); + setOperationAction(ISD::BITCAST, VTs, Custom); } if (Subtarget.hasStdExtZfbfmin()) { @@ -1739,6 +1745,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, MaxLoadsPerMemcmp = Subtarget.getMaxLoadsPerMemcmp(/*OptSize=*/false); } +TargetLoweringBase::LegalizeTypeAction +RISCVTargetLowering::getPreferredVectorAction(MVT VT) const { + if (Subtarget.hasStdExtP() && Subtarget.is64Bit()) + if (VT == MVT::v2i16 || VT == MVT::v4i8) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const { @@ -7533,6 +7548,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } + if (Subtarget.hasStdExtP()) { + bool Is32BitCast = + (VT == MVT::i32 && (Op0VT == MVT::v4i8 || Op0VT == MVT::v2i16)) || + (Op0VT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (Op0VT == MVT::v8i8 || Op0VT == MVT::v4i16 || + Op0VT == MVT::v2i32)) || + (Op0VT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) + return Op; + } + // Consider other scalar<->scalar casts as legal if the types are legal. // Otherwise expand them. 
if (!VT.isVector() && !Op0VT.isVector()) { @@ -8205,6 +8233,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, auto *Store = cast(Op); SDValue StoredVal = Store->getValue(); EVT VT = StoredVal.getValueType(); + if (Subtarget.hasStdExtP()) { + if (VT == MVT::v2i16 || VT == MVT::v4i8) { + SDValue DL(Op); + SDValue Cast = DAG.getBitcast(MVT::i32, StoredVal); + SDValue NewStore = + DAG.getStore(Store->getChain(), DL, Cast, Store->getBasePtr(), + Store->getPointerInfo(), Store->getBaseAlign(), + Store->getMemOperand()->getFlags()); + return NewStore; + } + } if (VT == MVT::f64) { assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -14632,6 +14671,19 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (Subtarget.hasStdExtP() && Subtarget.is64Bit()) { + SDLoc DL(N); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, MVT::i64, Ld->getChain(), + Ld->getBasePtr(), MVT::i32, Ld->getMemOperand()); + if (N->getValueType(0) == MVT::v2i16) + Results.push_back(DAG.getBitcast(MVT::v4i16, ExtLoad)); + else if (N->getValueType(0) == MVT::v4i8) + Results.push_back(DAG.getBitcast(MVT::v8i8, ExtLoad)); + Results.push_back(ExtLoad.getValue(1)); + return; + } + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -14960,6 +15012,24 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NewRes)); break; } + case RISCVISD::PASUB: + case RISCVISD::PASUBU: { + MVT VT = N->getSimpleValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + assert(VT == MVT::v2i16 || VT == MVT::v4i8); + MVT NewVT = MVT::v4i16; + if (VT == MVT::v4i8) + NewVT = MVT::v8i8; + Op0 = DAG.getBitcast(MVT::i32, Op0); + Op0 = DAG.getSExtOrTrunc(Op0, DL, MVT::i64); + Op0 = DAG.getBitcast(NewVT, Op0); + Op1 = DAG.getBitcast(MVT::i32, Op1); + Op1 = DAG.getSExtOrTrunc(Op1, DL, MVT::i64); + Op1 = DAG.getBitcast(NewVT, Op1); + Results.push_back(DAG.getNode(N->getOpcode(), DL, NewVT, {Op0, Op1})); + return; + } case ISD::EXTRACT_VECTOR_ELT: { // Custom-legalize an EXTRACT_VECTOR_ELT where XLEN, SDTCisInt<1>]>; def riscv_pli : RVSDNode<"PLI", SDT_RISCVPLI>; +def SDT_RISCVPASUB + : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>]>; +def riscv_pasub : RVSDNode<"PASUB", SDT_RISCVPASUB>; +def SDT_RISCVPASUBU + : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>]>; +def riscv_pasubu : RVSDNode<"PASUBU", SDT_RISCVPASUBU>; let Predicates = [HasStdExtP, IsRV64] in { // Basic arithmetic patterns for v4i16 (16-bit elements in 64-bit GPR) def: Pat<(v4i16 (add (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), (!cast("PADD_H") GPR:$rs1, GPR:$rs2)>; @@ -1499,6 +1505,19 @@ let Predicates = [HasStdExtP, IsRV64] in { def: Pat<(v4i16 (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>; def: Pat<(v8i8 (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; + def: Pat<(v8i8 (riscv_pasub (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), + (PASUB_B (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))>; + def: Pat<(v4i16 (riscv_pasub (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), + (PASUB_H (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))>; + def: Pat<(v2i32 (riscv_pasub (v2i32 GPR:$rs1), (v2i32 GPR:$rs2))), + (PASUB_W (v2i32 GPR:$rs1), (v2i32 GPR:$rs2))>; + def: Pat<(v8i8 (riscv_pasubu (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))), + (PASUBU_B (v8i8 GPR:$rs1), (v8i8 GPR:$rs2))>; + def: Pat<(v4i16 (riscv_pasubu (v4i16 GPR:$rs1), (v4i16 GPR:$rs2))), + (PASUBU_H (v4i16 
GPR:$rs1), (v4i16 GPR:$rs2))>; + def: Pat<(v2i32 (riscv_pasubu (v2i32 GPR:$rs1), (v2i32 GPR:$rs2))), + (PASUBU_W (v2i32 GPR:$rs1), (v2i32 GPR:$rs2))>; + // Load/Store patterns for v4i16 and v8i8 (use regular GPR load/store since they're in GPRs) def : StPat; def : LdPat; @@ -1549,6 +1568,16 @@ let Predicates = [HasStdExtP] in { def: Pat<(v2i16 (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>; def: Pat<(v4i8 (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; + + def: Pat<(v4i8 (riscv_pasub (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), + (PASUB_B (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))>; + def: Pat<(v2i16 (riscv_pasub (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), + (PASUB_H (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))>; + def: Pat<(v4i8 (riscv_pasubu (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))), + (PASUBU_B (v4i8 GPR:$rs1), (v4i8 GPR:$rs2))>; + def: Pat<(v2i16 (riscv_pasubu (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))), + (PASUBU_H (v2i16 GPR:$rs1), (v2i16 GPR:$rs2))>; + // Load/Store patterns for v2i16 and v4i8 (use regular GPR load/store since they're in GPRs) def : StPat; def : LdPat; diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll index 2cb93f1faab91..fce0a31d60335 100644 --- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -riscv-lower-enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -riscv-lower-enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s ; Test basic add/sub operations for v2i16 define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {