diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7123a2d706787..1eb8c9457ee6a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -279,6 +279,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::riscv_nxv32i8x2, &RISCV::VRN2M4RegClass);
   }
 
+  // Fixed vectors are stored in GPRs for P extension packed operations.
+  if (Subtarget.hasStdExtP()) {
+    addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass);
+    addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass);
+    if (Subtarget.is64Bit()) {
+      addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass);
+      addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass);
+    }
+  }
+
   // Compute derived properties from the register classes.
   computeRegisterProperties(STI.getRegisterInfo());
 
@@ -479,6 +490,24 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       ISD::FTRUNC, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FCANONICALIZE};
 
+  if (Subtarget.hasStdExtP()) {
+    // Loads and stores are already handled by pattern matching.
+    SmallVector<MVT> VTs = {MVT::v2i16, MVT::v4i8};
+    if (Subtarget.is64Bit())
+      VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8});
+    for (auto VT : VTs) {
+      setOperationAction(ISD::UADDSAT, VT, Legal);
+      setOperationAction(ISD::SADDSAT, VT, Legal);
+      setOperationAction(ISD::USUBSAT, VT, Legal);
+      setOperationAction(ISD::SSUBSAT, VT, Legal);
+      setOperationAction(ISD::SSHLSAT, VT, Legal);
+      setOperationAction(ISD::USHLSAT, VT, Legal);
+      setOperationAction(ISD::BITCAST, VT, Custom);
+      setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VT, Legal);
+      setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Legal);
+    }
+  }
+
   if (Subtarget.hasStdExtZfbfmin()) {
     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
     setOperationAction(ISD::ConstantFP, MVT::bf16, Expand);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 7d8a9192d9847..c5e2f12aafb1e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1455,3 +1455,127 @@ let Predicates = [HasStdExtP, IsRV32] in {
   def PMAXU_DW : RVPPairBinaryExchanged_rr<0b1111, 0b01, "pmaxu.dw">;
   def PMAXU_DB : RVPPairBinaryExchanged_rr<0b1111, 0b10, "pmaxu.db">;
 } // Predicates = [HasStdExtP, IsRV32]
+
+let Predicates = [HasStdExtP, IsRV64] in {
+  // Basic arithmetic patterns for v4i16 (16-bit elements in 64-bit GPR)
+  def: Pat<(v4i16 (add v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PADD_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i16 (sub v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PSUB_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Saturating add/sub patterns for v4i16
+  def: Pat<(v4i16 (saddsat v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PSADD_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i16 (uaddsat v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PSADDU_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i16 (ssubsat v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PSSUB_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i16 (usubsat v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PSSUBU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging patterns for v4i16
+  def: Pat<(v4i16 (avgfloors v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PAADD_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i16 (avgflooru v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PAADDU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging subtraction patterns for v4i16
+  // PASUB_H: signed (a - b) >> 1
+  def: Pat<(v4i16 (sra (sub v4i16:$rs1, v4i16:$rs2), (v4i16 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUB_H") GPR:$rs1, GPR:$rs2)>;
+  // PASUBU_H: unsigned (a - b) >> 1
+  def: Pat<(v4i16 (srl (sub v4i16:$rs1, v4i16:$rs2), (v4i16 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUBU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Absolute difference patterns for v4i16
+  def: Pat<(v4i16 (abds v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PDIF_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i16 (abdu v4i16:$rs1, v4i16:$rs2)), (!cast<Instruction>("PDIFU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Basic arithmetic patterns for v8i8 (8-bit elements in 64-bit GPR)
+  def: Pat<(v8i8 (add v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PADD_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v8i8 (sub v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PSUB_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Saturating add/sub patterns for v8i8
+  def: Pat<(v8i8 (saddsat v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PSADD_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v8i8 (uaddsat v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PSADDU_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v8i8 (ssubsat v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PSSUB_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v8i8 (usubsat v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PSSUBU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging patterns for v8i8
+  def: Pat<(v8i8 (avgfloors v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PAADD_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v8i8 (avgflooru v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PAADDU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging subtraction patterns for v8i8
+  // PASUB_B: signed (a - b) >> 1
+  def: Pat<(v8i8 (sra (sub v8i8:$rs1, v8i8:$rs2), (v8i8 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUB_B") GPR:$rs1, GPR:$rs2)>;
+  // PASUBU_B: unsigned (a - b) >> 1
+  def: Pat<(v8i8 (srl (sub v8i8:$rs1, v8i8:$rs2), (v8i8 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUBU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Absolute difference patterns for v8i8
+  def: Pat<(v8i8 (abds v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PDIF_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v8i8 (abdu v8i8:$rs1, v8i8:$rs2)), (!cast<Instruction>("PDIFU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Load/Store patterns for v4i16 and v8i8 (use regular GPR load/store since they're in GPRs)
+  def : StPat<store, SD, GPR, v4i16>;
+  def : LdPat<load, LD, v4i16>;
+  def : StPat<store, SD, GPR, v8i8>;
+  def : LdPat<load, LD, v8i8>;
+
+  // Load/Store patterns for v2i32 (32-bit elements in 64-bit GPR)
+  def : StPat<store, SD, GPR, v2i32>;
+  def : LdPat<load, LD, v2i32>;
+} // Predicates = [HasStdExtP, IsRV64]
+
+let Predicates = [HasStdExtP, IsRV32] in {
+  // Basic arithmetic patterns for v2i16 (16-bit elements in 32-bit GPR)
+  def: Pat<(v2i16 (add v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PADD_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v2i16 (sub v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PSUB_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Saturating add/sub patterns for v2i16
+  def: Pat<(v2i16 (saddsat v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PSADD_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v2i16 (uaddsat v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PSADDU_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v2i16 (ssubsat v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PSSUB_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v2i16 (usubsat v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PSSUBU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging patterns for v2i16
+  def: Pat<(v2i16 (avgfloors v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PAADD_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v2i16 (avgflooru v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PAADDU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging subtraction patterns for v2i16
+  // PASUB_H: signed (a - b) >> 1
+  def: Pat<(v2i16 (sra (sub v2i16:$rs1, v2i16:$rs2), (v2i16 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUB_H") GPR:$rs1, GPR:$rs2)>;
+  // PASUBU_H: unsigned (a - b) >> 1
+  def: Pat<(v2i16 (srl (sub v2i16:$rs1, v2i16:$rs2), (v2i16 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUBU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Absolute difference patterns for v2i16
+  def: Pat<(v2i16 (abds v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PDIF_H") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v2i16 (abdu v2i16:$rs1, v2i16:$rs2)), (!cast<Instruction>("PDIFU_H") GPR:$rs1, GPR:$rs2)>;
+
+  // Basic arithmetic patterns for v4i8 (8-bit elements in 32-bit GPR)
+  def: Pat<(v4i8 (add v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PADD_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i8 (sub v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PSUB_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Saturating add/sub patterns for v4i8
+  def: Pat<(v4i8 (saddsat v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PSADD_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i8 (uaddsat v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PSADDU_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i8 (ssubsat v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PSSUB_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i8 (usubsat v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PSSUBU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging patterns for v4i8
+  def: Pat<(v4i8 (avgfloors v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PAADD_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i8 (avgflooru v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PAADDU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Averaging subtraction patterns for v4i8
+  // PASUB_B: signed (a - b) >> 1
+  def: Pat<(v4i8 (sra (sub v4i8:$rs1, v4i8:$rs2), (v4i8 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUB_B") GPR:$rs1, GPR:$rs2)>;
+  // PASUBU_B: unsigned (a - b) >> 1
+  def: Pat<(v4i8 (srl (sub v4i8:$rs1, v4i8:$rs2), (v4i8 (build_vector (XLenVT 1))))),
+           (!cast<Instruction>("PASUBU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Absolute difference patterns for v4i8
+  def: Pat<(v4i8 (abds v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PDIF_B") GPR:$rs1, GPR:$rs2)>;
+  def: Pat<(v4i8 (abdu v4i8:$rs1, v4i8:$rs2)), (!cast<Instruction>("PDIFU_B") GPR:$rs1, GPR:$rs2)>;
+
+  // Load/Store patterns for v2i16 and v4i8 (use regular GPR load/store since they're in GPRs)
+  def : StPat<store, SW, GPR, v2i16>;
+  def : LdPat<load, LW, v2i16>;
+  def : StPat<store, SW, GPR, v4i8>;
+  def : LdPat<load, LW, v4i8>;
+} // Predicates = [HasStdExtP, IsRV32]
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 6605a5ccdfde2..fcbb93a55375b 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -238,7 +238,11 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList>
 }
 
 class GPRRegisterClass<dag regList>
-    : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> {
+    : RISCVRegisterClass<[XLenVT, XLenFVT,
+                          // P extension packed vector types:
+                          // RV32: v2i16, v4i8
+                          // RV64: v2i32, v4i16, v8i8
+                          v2i16, v4i8, v2i32, v4i16, v8i8], 32, regList> {
   let RegInfos = XLenRI;
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 7bc0b5b394828..e669175a3d8e1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -969,6 +969,13 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead(
   if (isa<ScalableVectorType>(Ty))
     return InstructionCost::getInvalid();
 
+  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
+  // For now, skip all fixed vector cost analysis when P extension is available
+  // to avoid crashes in getMinRVVVectorSizeInBits()
+  if (ST->hasStdExtP() && isa<FixedVectorType>(Ty)) {
+    return 1; // Treat as single instruction cost for now
+  }
+
   // A build_vector (which is m1 sized or smaller) can be done in no
   // worse than one vslide1down.vx per element in the type.  We could
   // in theory do an explode_vector in the inverse manner, but our
@@ -1625,6 +1632,13 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (!IsVectorType)
     return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
 
+  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
+  // For now, skip all fixed vector cost analysis when P extension is available
+  // to avoid crashes in getMinRVVVectorSizeInBits()
+  if (ST->hasStdExtP() && (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) {
+    return 1; // Treat as single instruction cost for now
+  }
+
   // FIXME: Need to compute legalizing cost for illegal types. The current
   // code handles only legal types and those which can be trivially
   // promoted to legal.
@@ -2321,6 +2335,13 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                  const Value *Op1) const {
   assert(Val->isVectorTy() && "This must be a vector type");
 
+  // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
+  // For now, skip all fixed vector cost analysis when P extension is available
+  // to avoid crashes in getMinRVVVectorSizeInBits()
+  if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
+    return 1; // Treat as single instruction cost for now
+  }
+
   if (Opcode != Instruction::ExtractElement &&
       Opcode != Instruction::InsertElement)
     return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
new file mode 100644
index 0000000000000..8a4ab1d545f41
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -0,0 +1,426 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -verify-machineinstrs < %s | FileCheck %s
+
+; Test basic add/sub operations for v2i16
+define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: padd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = add <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %res = sub <2 x i16> %a, %b
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test basic add/sub operations for v4i8
+define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_padd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: padd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = add <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_psub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: psub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %res = sub <4 x i8> %a, %b
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test saturating add operations for v2i16
+define void 
@test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psaddu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v2i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr 
%b_ptr
+  %res = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %a, <4 x i8> %b)
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v2i16
+define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paadd.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %ext.a = sext <2 x i16> %a to <2 x i32>
+  %ext.b = sext <2 x i16> %b to <2 x i32>
+  %add = add nsw <2 x i32> %ext.a, %ext.b
+  %shift = ashr <2 x i32> %add, <i32 1, i32 1>
+  %res = trunc <2 x i32> %shift to <2 x i16>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v2i16
+define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paaddu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %and = and <2 x i16> %a, %b
+  %xor = xor <2 x i16> %a, %b
+  %shift = lshr <2 x i16> %xor, <i16 1, i16 1>
+  %res = add <2 x i16> %and, %shift
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v4i8
+define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paadd.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %ext.a = sext <4 x i8> %a to <4 x i16>
+  %ext.b = sext <4 x i8> %b to <4 x i16>
+  %add = add nsw <4 x i16> %ext.a, %ext.b
+  %shift = ashr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <4 x i16> %shift to <4 x i8>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v4i8
+define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: paaddu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %and = and <4 x i8> %a, %b
+  %xor = xor <4 x i8> %a, %b
+  %shift = lshr <4 x i8> %xor, <i8 1, i8 1, i8 1, i8 1>
+  %res = add <4 x i8> %and, %shift
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v2i16
+define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdif.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v2i16
+define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdifu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %a, <2 x i16> %b)
+  %res = sub <2 x i16> %max, %min
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v4i8
+define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdif.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v4i8
+define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pdifu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %min = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %max = call <4 x i8> @llvm.umax.v4i8(<4 x i8> %a, <4 x i8> %b)
+  %res = sub <4 x i8> %max, %min
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v2i16
+; pasub pattern: (a - b) arithmetic shift right 1
+define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasub.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %sub = sub <2 x i16> %a, %b
+  %res = ashr <2 x i16> %sub, <i16 1, i16 1>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v2i16
+; pasubu pattern: (a - b) logical shift right 1
+define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasubu.h a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <2 x i16>, ptr %a_ptr
+  %b = load <2 x i16>, ptr %b_ptr
+  %sub = sub <2 x i16> %a, %b
+  %res = lshr <2 x i16> %sub, <i16 1, i16 1>
+  store <2 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v4i8
+define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasub.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %sub = sub <4 x i8> %a, %b
+  %res = ashr <4 x i8> %sub, <i8 1, i8 1, i8 1, i8 1>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v4i8
+define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lw a1, 0(a1)
+; CHECK-NEXT: lw a2, 0(a2)
+; CHECK-NEXT: pasubu.b a1, a1, a2
+; CHECK-NEXT: sw a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i8>, ptr %a_ptr
+  %b = load <4 x i8>, ptr %b_ptr
+  %sub = sub <4 x i8> %a, %b
+  %res = lshr <4 x i8> %sub, <i8 1, i8 1, i8 1, i8 1>
+  store <4 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Intrinsic declarations
+declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>)
+declare 
<2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i8> @llvm.smin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.smax.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll new file mode 100644 index 0000000000000..d4918e4e0aa62 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -0,0 +1,430 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -verify-machineinstrs < %s | FileCheck %s + +; Test basic add/sub operations for v4i16 +define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = add <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = sub <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test basic add/sub operations for v8i8 +define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = add <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = sub <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i16 +define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; 
CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v8i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v8i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v4i16 +; avgfloors pattern: (a + b) arithmetic shift right 1 +define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %ext.a = sext <4 x i16> %a to <4 x i32> + %ext.b = sext <4 x i16> %b to <4 x i32> + %add = add nsw <4 x i32> %ext.a, 
%ext.b
+  %shift = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <4 x i32> %shift to <4 x i16>
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v4i16
+; avgflooru pattern: (a & b) + ((a ^ b) >> 1)
+define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: paaddu.h a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %and = and <4 x i16> %a, %b
+  %xor = xor <4 x i16> %a, %b
+  %shift = lshr <4 x i16> %xor, <i16 1, i16 1, i16 1, i16 1>
+  %res = add <4 x i16> %and, %shift
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor signed operations for v8i8
+define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paadd_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: paadd.b a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %ext.a = sext <8 x i8> %a to <8 x i16>
+  %ext.b = sext <8 x i8> %b to <8 x i16>
+  %add = add nsw <8 x i16> %ext.a, %ext.b
+  %shift = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <8 x i16> %shift to <8 x i8>
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor unsigned operations for v8i8
+define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_paaddu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: paaddu.b a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %and = and <8 x i8> %a, %b
+  %xor = xor <8 x i8> %a, %b
+  %shift = lshr <8 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <8 x i8> %and, %shift
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v4i16
+; abds pattern: sub(smax(a,b), smin(a,b))
+define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pdif.h a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %min = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %max = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %res = sub <4 x i16> %max, %min
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v4i16
+; abdu pattern: sub(umax(a,b), umin(a,b))
+define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pdifu.h a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %min = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %max = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %res = sub <4 x i16> %max, %min
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference signed for v8i8
+define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdif_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pdif.b a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %min = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %max = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %res = sub <8 x i8> %max, %min
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test absolute difference unsigned for v8i8
+define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pdifu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pdifu.b a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %min = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %max = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %res = sub <8 x i8> %max, %min
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v4i16
+; pasub pattern: (a - b) arithmetic shift right 1
+define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pasub.h a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %sub = sub <4 x i16> %a, %b
+  %res = ashr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v4i16
+; pasubu pattern: (a - b) logical shift right 1
+define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pasubu.h a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <4 x i16>, ptr %a_ptr
+  %b = load <4 x i16>, ptr %b_ptr
+  %sub = sub <4 x i16> %a, %b
+  %res = lshr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1>
+  store <4 x i16> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction signed for v8i8
+define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasub_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pasub.b a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %sub = sub <8 x i8> %a, %b
+  %res = ashr <8 x i8> %sub, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Test averaging floor subtraction unsigned for v8i8
+define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) {
+; CHECK-LABEL: test_pasubu_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld a1, 0(a1)
+; CHECK-NEXT: ld a2, 0(a2)
+; CHECK-NEXT: pasubu.b a1, a1, a2
+; CHECK-NEXT: sd a1, 0(a0)
+; CHECK-NEXT: ret
+  %a = load <8 x i8>, ptr %a_ptr
+  %b = load <8 x i8>, ptr %b_ptr
+  %sub = sub <8 x i8> %a, %b
+  %res = lshr <8 x i8> %sub, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <8 x i8> %res, ptr %ret_ptr
+  ret void
+}
+
+; Intrinsic declarations
+declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>)
+declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.umax.v4i16(<4 x 
i16>, <4 x i16>) +declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>)
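
Note: the RISCVISelLowering.cpp hunk above also marks ISD::BITCAST as Custom for the packed vector types, but neither new test file exercises that path. A minimal IR sketch of the kind of input that would reach it on RV64 is shown below; the function name is hypothetical and the snippet is not part of this patch's test files.

; Hypothetical usage sketch (not part of this patch): bitcast between a packed
; vector type and the XLen-sized integer that occupies the same GPR.
define i64 @bitcast_v4i16_to_i64(<4 x i16> %a) {
  %r = bitcast <4 x i16> %a to i64
  ret i64 %r
}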