@@ -2620,6 +2620,122 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
2620
2620
getInstructionSize (const MCInst &Inst) const override {
2621
2621
return 4 ;
2622
2622
}
2623
+
2624
+ std::optional<uint64_t >
2625
+ extractMoveImmediate (const MCInst &Inst, MCPhysReg TargetReg) const override {
2626
+ // Match MOVZ instructions (both X and W register variants) with no shift.
2627
+ if ((Inst.getOpcode () == AArch64::MOVZXi ||
2628
+ Inst.getOpcode () == AArch64::MOVZWi) &&
2629
+ Inst.getOperand (2 ).getImm () == 0 &&
2630
+ getAliases (TargetReg)[Inst.getOperand (0 ).getReg ()])
2631
+ return Inst.getOperand (1 ).getImm ();
2632
+ return std::nullopt;
2633
+ }
2634
+
2635
+ std::optional<uint64_t >
2636
+ findMemcpySizeInBytes (const BinaryBasicBlock &BB,
2637
+ BinaryBasicBlock::iterator CallInst) const override {
2638
+ MCPhysReg SizeReg = getIntArgRegister (2 );
2639
+ if (SizeReg == getNoRegister ())
2640
+ return std::nullopt;
2641
+
2642
+ BitVector WrittenRegs (RegInfo->getNumRegs ());
2643
+ const BitVector &SizeRegAliases = getAliases (SizeReg);
2644
+
2645
+ for (auto InstIt = BB.begin (); InstIt != CallInst; ++InstIt) {
2646
+ const MCInst &Inst = *InstIt;
2647
+ WrittenRegs.reset ();
2648
+ getWrittenRegs (Inst, WrittenRegs);
2649
+
2650
+ if (WrittenRegs.anyCommon (SizeRegAliases))
2651
+ return extractMoveImmediate (Inst, SizeReg);
2652
+ }
2653
+ return std::nullopt;
2654
+ }
2655
+
2656
+ InstructionListType
2657
+ createInlineMemcpy (bool ReturnEnd,
2658
+ std::optional<uint64_t > KnownSize) const override {
2659
+ assert (KnownSize.has_value () &&
2660
+ " AArch64 memcpy inlining requires known size" );
2661
+ InstructionListType Code;
2662
+ uint64_t Size = *KnownSize;
2663
+
2664
+ generateSizeSpecificMemcpy (Code, Size);
2665
+
2666
+ // If _memcpy8, adjust X0 to return dest+size instead of dest.
2667
+ if (ReturnEnd)
2668
+ Code.emplace_back (MCInstBuilder (AArch64::ADDXri)
2669
+ .addReg (AArch64::X0)
2670
+ .addReg (AArch64::X0)
2671
+ .addImm (Size)
2672
+ .addImm (0 ));
2673
+ return Code;
2674
+ }
2675
+
2676
  /// Append into \p Code a load/store sequence copying exactly \p Size bytes
  /// from [X1] to [X0]. \p Code is the primary output; the same list is also
  /// returned by value for convenience (callers may ignore the return).
  /// Only sizes up to 64 bytes are supported (asserted below).
  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                 uint64_t Size) const {
    // Emit one load from [X1 + Offset] into Reg and one store of Reg to
    // [X0 + Offset]. NOTE: the unsigned-offset (…ui) load/store forms scale
    // the immediate by the access size, so Offset here is in units of the
    // access width, not bytes.
    auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
                                unsigned Reg, unsigned Offset = 0) {
      Code.emplace_back(MCInstBuilder(LoadOpc)
                            .addReg(Reg)
                            .addReg(AArch64::X1)
                            .addImm(Offset));
      Code.emplace_back(MCInstBuilder(StoreOpc)
                            .addReg(Reg)
                            .addReg(AArch64::X0)
                            .addImm(Offset));
    };

    // Generate optimal instruction sequences based on exact size.
    // W9/X9 and Q16/Q17 serve as scratch; they do not overlap the X0/X1
    // pointer arguments used here. NOTE(review): assumed free to clobber at a
    // memcpy call site — confirm against the surrounding pass.
    switch (Size) {
    case 1:
      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
      break;
    case 2:
      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
      break;
    case 4:
      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
      break;
    case 8:
      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
      break;
    case 16:
      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
      break;
    case 32:
      // Two 16-byte vector copies; scaled offset 1 addresses bytes 16..31.
      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
      break;

    default:
      // For sizes up to 64 bytes, greedily use the largest possible loads.
      // Caller should have already filtered out sizes > 64 bytes.
      assert(Size <= 64 &&
             "Size should be <= 64 bytes for AArch64 memcpy inlining");

      uint64_t Remaining = Size;
      uint64_t Offset = 0;

      // Descending power-of-two access widths with their load/store opcodes
      // and scratch register: {bytes, load, store, reg}.
      const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
          LoadStoreOps = {
              {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
               {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
               {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
               {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
               {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};

      // Because widths are consumed largest-first and are powers of two, the
      // running byte Offset is always a multiple of the current OpSize, so
      // the scaled division Offset / OpSize is exact.
      for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
        while (Remaining >= OpSize) {
          AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
          Remaining -= OpSize;
          Offset += OpSize;
        }
      break;
    }
    return Code;
  }
2623
2739
};
2624
2740
2625
2741
} // end anonymous namespace
0 commit comments