From cf815916f63d5db9b0431e5098c6dbaccc0e73af Mon Sep 17 00:00:00 2001 From: himadhith Date: Fri, 26 Sep 2025 06:51:21 +0000 Subject: [PATCH 1/4] [PowerPC] Replace vspltisw instruction with xxleqv as generation of vector of -1s is cheaper than vector of 1s --- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 979ba31b0431b..fc00883528dc2 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -3613,6 +3613,10 @@ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A, immSExt5NonZero:$A, immSExt5NonZero:$A)), (v4i32 (VSPLTISW imm:$A))>; +// Optimize for vector of 1s addition operation +def : Pat<(add v4i32:$A, (build_vector (i32 1), (i32 1), (i32 1), (i32 1))), + (VSUBUWM $A, (v4i32 (COPY_TO_REGCLASS (XXLEQVOnes), VSRC)))>; + // Splat loads. def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)), (v8i16 (VSPLTHs 3, (MTVSRWZ (LHZX ForceXForm:$A))))>; From 73bd0ed9097e210ab92fcbdbb94c665a931a2ec2 Mon Sep 17 00:00:00 2001 From: himadhith Date: Mon, 13 Oct 2025 12:46:26 +0000 Subject: [PATCH 2/4] DAG combiner method as tablegen does not work with v2i64s --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 40 +++++++++++++++++++++ llvm/lib/Target/PowerPC/PPCInstrVSX.td | 4 --- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8bf0d118da575..2bcce6004f0e2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -19208,6 +19208,44 @@ static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG, return MatPCRel; } +static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) { + EVT VT = N->getValueType(0); + + // Handle v2i64, v4i32, v8i16 and v16i8 types + if (!VT.isVector() || VT.getSizeInBits() != 
128) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Check if RHS is BUILD_VECTOR + // To satisfy commutative property a+b = b+a + if (RHS.getOpcode() != ISD::BUILD_VECTOR) + std::swap(LHS, RHS); + + if (RHS.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + // Check if all the elements are 1 + unsigned NumOfEles = RHS.getNumOperands(); + for (unsigned i = 0; i < NumOfEles; ++i) { + auto *CN = dyn_cast(RHS.getOperand(i)); + if (!CN || CN->getSExtValue() != 1) + return SDValue(); + } + SDLoc DL(N); + + SDValue MinusOne = DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32); + SmallVector Ops(4, MinusOne); + SDValue AllOnesVec = DAG.getBuildVector(MVT::v4i32, DL, Ops); + + // Bitcast to the target vector type + SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT, AllOnesVec); + + return DAG.getNode(ISD::SUB, DL, VT, LHS, Bitcast); +} + SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget)) return Value; @@ -19215,6 +19253,8 @@ SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget)) return Value; + if (auto Value = combineADDToSUB(N, DCI.DAG, Subtarget)) + return Value; return SDValue(); } diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index fc00883528dc2..979ba31b0431b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -3613,10 +3613,6 @@ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A, immSExt5NonZero:$A, immSExt5NonZero:$A)), (v4i32 (VSPLTISW imm:$A))>; -// Optimize for vector of 1s addition operation -def : Pat<(add v4i32:$A, (build_vector (i32 1), (i32 1), (i32 1), (i32 1))), - (VSUBUWM $A, (v4i32 (COPY_TO_REGCLASS (XXLEQVOnes), VSRC)))>; - // Splat loads. 
def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)), (v8i16 (VSPLTHs 3, (MTVSRWZ (LHZX ForceXForm:$A))))>; From d74869b2965328e696270c2ed3f55ebe29dcaaf1 Mon Sep 17 00:00:00 2001 From: himadhith Date: Thu, 16 Oct 2025 13:49:47 +0000 Subject: [PATCH 3/4] update checks for affected files --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 3 ++- .../PowerPC/addition-vector-all-ones.ll | 19 +++++++++---------- .../CodeGen/PowerPC/vec_add_sub_doubleword.ll | 7 +++---- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2bcce6004f0e2..0102637945c98 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -19213,7 +19213,8 @@ static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // Handle v2i64, v4i32, v8i16 and v16i8 types - if (!VT.isVector() || VT.getSizeInBits() != 128) + if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 || + VT == MVT::v2i64)) return SDValue(); SDValue LHS = N->getOperand(0); diff --git a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll index e67d031b1813f..4ec54fa8a0dee 100644 --- a/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll +++ b/llvm/test/CodeGen/PowerPC/addition-vector-all-ones.ll @@ -8,15 +8,14 @@ ; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ ; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s -; The addition of vector `A` with vector of 1s currently uses `vspltisw` to generate vector of 1s followed by add operation. +; Optimized version which uses `xxleqv` to generate a vector of -1s and `vsubu*` to subtract it, leveraging the identity A - (-1) = A + 1. 
; Function for the vector type v2i64 `a + {1, 1}` define <2 x i64> @test_v2i64(<2 x i64> %a) { ; CHECK-LABEL: test_v2i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vspltisw v3, 1 -; CHECK-NEXT: vupklsw v3, v3 -; CHECK-NEXT: vaddudm v2, v2, v3 +; CHECK-NEXT: xxleqv v3, v3, v3 +; CHECK-NEXT: vsubudm v2, v2, v3 ; CHECK-NEXT: blr entry: %add = add <2 x i64> %a, splat (i64 1) @@ -27,8 +26,8 @@ entry: define <4 x i32> @test_v4i32(<4 x i32> %a) { ; CHECK-LABEL: test_v4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vspltisw v3, 1 -; CHECK-NEXT: vadduwm v2, v2, v3 +; CHECK-NEXT: xxleqv v3, v3, v3 +; CHECK-NEXT: vsubuwm v2, v2, v3 ; CHECK-NEXT: blr entry: %add = add <4 x i32> %a, splat (i32 1) @@ -39,8 +38,8 @@ entry: define <8 x i16> @test_v8i16(<8 x i16> %a) { ; CHECK-LABEL: test_v8i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vspltish v3, 1 -; CHECK-NEXT: vadduhm v2, v2, v3 +; CHECK-NEXT: xxleqv v3, v3, v3 +; CHECK-NEXT: vsubuhm v2, v2, v3 ; CHECK-NEXT: blr entry: %add = add <8 x i16> %a, splat (i16 1) @@ -51,8 +50,8 @@ entry: define <16 x i8> @test_16i8(<16 x i8> %a) { ; CHECK-LABEL: test_16i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxspltib v3, 1 -; CHECK-NEXT: vaddubm v2, v2, v3 +; CHECK-NEXT: xxleqv v3, v3, v3 +; CHECK-NEXT: vsububm v2, v2, v3 ; CHECK-NEXT: blr entry: %add = add <16 x i8> %a, splat (i8 1) diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll index 210aee13486c3..033e0b76838df 100644 --- a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll +++ b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll @@ -16,9 +16,8 @@ define <2 x i64> @test_add(<2 x i64> %x, <2 x i64> %y) nounwind { define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind { ; VSX-LABEL: increment_by_one: ; VSX: # %bb.0: -; VSX-NEXT: vspltisw 3, 1 -; VSX-NEXT: vupklsw 3, 3 -; VSX-NEXT: vaddudm 2, 2, 3 +; VSX-NEXT: xxleqv 35, 35, 35 +; VSX-NEXT: vsubudm 2, 2, 3 ; VSX-NEXT: blr ; ; NOVSX-LABEL: increment_by_one: @@ -26,7 
+25,7 @@ define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind { ; NOVSX-NEXT: addis 3, 2, .LCPI1_0@toc@ha ; NOVSX-NEXT: addi 3, 3, .LCPI1_0@toc@l ; NOVSX-NEXT: lvx 3, 0, 3 -; NOVSX-NEXT: vaddudm 2, 2, 3 +; NOVSX-NEXT: vsubudm 2, 2, 3 ; NOVSX-NEXT: blr %result = add <2 x i64> %x, ret <2 x i64> %result From 432d6e01111e6e7e83bbad5fc8991bdbf3023673 Mon Sep 17 00:00:00 2001 From: himadhith Date: Fri, 17 Oct 2025 05:45:57 +0000 Subject: [PATCH 4/4] addressing review comments --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 12 ++++++++---- llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 0102637945c98..125c96c3f8008 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -19208,9 +19208,17 @@ static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG, return MatPCRel; } +// Transform (add X, (build_vector (T 1), (T 1), ...)) -> (sub X, (XXLEQVOnes)) +// XXLEQVOnes creates an all-1s vector (0xFFFFFFFF...) 
efficiently via xxleqv +// Mathematical identity: X + 1 = X - (-1) +// Applies to v4i32, v2i64, v8i16, v16i8 where all elements are constant 1 +// Requirement: VSX feature for efficient xxleqv generation static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG, const PPCSubtarget &Subtarget) { + EVT VT = N->getValueType(0); + if (!Subtarget.hasVSX()) + return SDValue(); // Handle v2i64, v4i32, v8i16 and v16i8 types if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 || @@ -19221,10 +19229,6 @@ static SDValue combineADDToSUB(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Check if RHS is BUILD_VECTOR - // To satisfy commutative property a+b = b+a - if (RHS.getOpcode() != ISD::BUILD_VECTOR) - std::swap(LHS, RHS); - if (RHS.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll index 033e0b76838df..d56b1be539b05 100644 --- a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll +++ b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll @@ -25,7 +25,7 @@ define <2 x i64> @increment_by_one(<2 x i64> %x) nounwind { ; NOVSX-NEXT: addis 3, 2, .LCPI1_0@toc@ha ; NOVSX-NEXT: addi 3, 3, .LCPI1_0@toc@l ; NOVSX-NEXT: lvx 3, 0, 3 -; NOVSX-NEXT: vsubudm 2, 2, 3 +; NOVSX-NEXT: vaddudm 2, 2, 3 ; NOVSX-NEXT: blr %result = add <2 x i64> %x, ret <2 x i64> %result