@@ -9679,6 +9679,13 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
9679
9679
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
9680
9680
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
9681
9681
9682
+ // Recognize build vector patterns to emit VSX vector instructions
9683
+ // instead of loading value from memory.
9684
+ if (Subtarget.isISA3_1() && Subtarget.hasVSX()) {
9685
+ if (SDValue VecPat = combineBVLoadsSpecialValue(Op, DAG))
9686
+ return VecPat;
9687
+ }
9688
+
9682
9689
if (Subtarget.hasP10Vector()) {
9683
9690
APInt BitMask(32, 0);
9684
9691
// If the value of the vector is all zeros or all ones,
@@ -15657,6 +15664,133 @@ combineElementTruncationToVectorTruncation(SDNode *N,
15657
15664
return SDValue();
15658
15665
}
15659
15666
15667
+ // The LXVKQ instruction loads a VSX vector with a special quadword value
15668
+ // based on an immediate value. This helper method returns the details of the
15669
+ // match as a tuple of {LXVKQ unsigned IMM Value, right_shift_amount}
15670
+ // to help generate the LXVKQ instruction and the subsequent shift instruction
15671
+ // required to match the original build vector pattern.
15672
+
15673
+ // LXVKQPattern: {LXVKQ unsigned IMM Value, right_shift_amount}
15674
+ using LXVKQPattern = std::tuple<uint32_t, uint8_t>;
15675
+
15676
+ static std::optional<LXVKQPattern> getPatternInfo(const APInt &FullVal) {
15677
+
15678
+ static const auto BaseLXVKQPatterns = []() {
15679
+ // LXVKQ instruction loads the Quadword value:
15680
+ // 0x8000_0000_0000_0000_0000_0000_0000_0000 when imm = 0b10000
15681
+ return std::array<std::pair<APInt, uint32_t>, 1>{
15682
+ {{APInt(128, 0x8000000000000000ULL) << 64, 16}}};
15683
+ }();
15684
+
15685
+ // Check for direct LXVKQ match (no shift needed)
15686
+ for (const auto &[BasePattern, Uim] : BaseLXVKQPatterns) {
15687
+ if (FullVal == BasePattern)
15688
+ return std::make_tuple(Uim, uint8_t{0});
15689
+ }
15690
+
15691
+ // Check if FullValue can be generated by (right) shifting a base pattern
15692
+ for (const auto &[BasePattern, Uim] : BaseLXVKQPatterns) {
15693
+ if (BasePattern.lshr(127) == FullVal)
15694
+ return std::make_tuple(Uim, uint8_t{127});
15695
+ }
15696
+
15697
+ return std::nullopt;
15698
+ }
15699
+
15700
+ /// Combine vector loads to a single load by recognising patterns in the Build
15701
+ /// Vector. LXVKQ instruction load VSX vector with a special quadword value
15702
+ /// based on an immediate value.
15703
+ SDValue PPCTargetLowering::combineBVLoadsSpecialValue(SDValue Op,
15704
+ SelectionDAG &DAG) const {
15705
+
15706
+ assert((Op.getNode() && Op.getOpcode() == ISD::BUILD_VECTOR) &&
15707
+ "Expected a BuildVectorSDNode in combineBVLoadsSpecialValue");
15708
+
15709
+ // This transformation is only supported if we are loading either a byte,
15710
+ // halfword, word, or doubleword.
15711
+ EVT VT = Op.getValueType();
15712
+ if (!(VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v4i32 ||
15713
+ VT == MVT::v2i64))
15714
+ return SDValue();
15715
+
15716
+ LLVM_DEBUG(llvm::dbgs() << "\ncombineBVLoadsSpecialValue: Build vector ("
15717
+ << VT.getEVTString() << "): ";
15718
+ Op->dump());
15719
+
15720
+ unsigned NumElems = VT.getVectorNumElements();
15721
+ unsigned ElemBits = VT.getScalarSizeInBits();
15722
+
15723
+ bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
15724
+
15725
+ // Check for Non-constant operand in the build vector.
15726
+ for (const SDValue &Operand : Op.getNode()->op_values()) {
15727
+ if (!isa<ConstantSDNode>(Operand))
15728
+ return SDValue();
15729
+ }
15730
+
15731
+ // Assemble build vector operands as a 128-bit register value
15732
+ // We need to reconstruct what the 128-bit register pattern would be
15733
+ // that produces this vector when interpreted with the current endianness
15734
+ APInt FullVal = APInt::getZero(128);
15735
+
15736
+ for (unsigned Index = 0; Index < NumElems; ++Index) {
15737
+ auto *C = cast<ConstantSDNode>(Op.getOperand(Index));
15738
+
15739
+ // Get element value as raw bits (zero-extended)
15740
+ uint64_t ElemValue = C->getZExtValue();
15741
+
15742
+ // Mask to element size to ensure we only get the relevant bits
15743
+ if (ElemBits < 64)
15744
+ ElemValue &= ((1ULL << ElemBits) - 1);
15745
+
15746
+ // Calculate bit position for this element in the 128-bit register
15747
+ unsigned BitPos =
15748
+ (IsLittleEndian) ? (Index * ElemBits) : (128 - (Index + 1) * ElemBits);
15749
+
15750
+ // Create APInt for the element value and shift it to correct position
15751
+ APInt ElemAPInt(128, ElemValue);
15752
+ ElemAPInt <<= BitPos;
15753
+
15754
+ // Place the element value at the correct bit position
15755
+ FullVal |= ElemAPInt;
15756
+ }
15757
+
15758
+ if (auto UIMOpt = getPatternInfo(FullVal)) {
15759
+ const auto &[Uim, ShiftAmount] = *UIMOpt;
15760
+ SDLoc Dl(Op);
15761
+
15762
+ // Generate LXVKQ instruction if the shift amount is zero.
15763
+ if (ShiftAmount == 0) {
15764
+ SDValue UimVal = DAG.getTargetConstant(Uim, Dl, MVT::i32);
15765
+ SDValue LxvkqInstr =
15766
+ SDValue(DAG.getMachineNode(PPC::LXVKQ, Dl, VT, UimVal), 0);
15767
+ LLVM_DEBUG(llvm::dbgs()
15768
+ << "combineBVLoadsSpecialValue: Instruction Emitted ";
15769
+ LxvkqInstr.dump());
15770
+ return LxvkqInstr;
15771
+ }
15772
+
15773
+ // The right shifted pattern can be constructed using a combination of
15774
+ // XXSPLITIB and VSRQ instruction. VSRQ uses the shift amount from the lower
15775
+ // 7 bits of byte 15. This can be specified using XXSPLITIB with immediate
15776
+ // value 255.
15777
+ SDValue ShiftAmountVec =
15778
+ SDValue(DAG.getMachineNode(PPC::XXSPLTIB, Dl, MVT::v4i32,
15779
+ DAG.getTargetConstant(255, Dl, MVT::i32)),
15780
+ 0);
15781
+ // Generate appropriate right shift instruction
15782
+ SDValue ShiftVec = SDValue(
15783
+ DAG.getMachineNode(PPC::VSRQ, Dl, VT, ShiftAmountVec, ShiftAmountVec),
15784
+ 0);
15785
+ LLVM_DEBUG(llvm::dbgs()
15786
+ << "\n combineBVLoadsSpecialValue: Instruction Emitted ";
15787
+ ShiftVec.dump());
15788
+ return ShiftVec;
15789
+ }
15790
+ // No patterns matched for build vectors.
15791
+ return SDValue();
15792
+ }
15793
+
15660
15794
/// Reduce the number of loads when building a vector.
15661
15795
///
15662
15796
/// Building a vector out of multiple loads can be converted to a load
0 commit comments