diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 34d46e5c37569..2844001199deb 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -65,6 +65,8 @@ STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation " "not passed the alignment check"); STATISTIC(NumConstOffsetFolded, "Number of const offset of index address folded"); +STATISTIC(NumUMOVFoldedToFPRStore, + "Number of UMOV + GPR stores folded to FPR stores"); DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", "Controls which pairs are considered for renaming"); @@ -219,6 +221,9 @@ struct AArch64LoadStoreOpt { // Find and merge an index ldr/st instruction into a base ld/st instruction. bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); + // Replace a UMOV (lane 0) + GPR store with a direct FPR sub-register store. + bool tryToReplaceUMOVStore(MachineBasicBlock::iterator &MBBI); + bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &MF); @@ -3012,6 +3017,110 @@ bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, return false; } +// Given a UMOV-lane-0 opcode and a GPR store opcode, return the corresponding +// FPR store opcode and the sub-register index to extract from the vector, or +// return false if the combination is not supported. +static bool getUMOVToFPRStoreInfo(unsigned UMOVOpc, unsigned GPRStoreOpc, + unsigned &FPRStoreOpc, unsigned &SubRegIdx) { + switch (UMOVOpc) { + case AArch64::UMOVvi8_idx0: + if (GPRStoreOpc != AArch64::STRBBui) + return false; + FPRStoreOpc = AArch64::STRBui; + SubRegIdx = AArch64::bsub; + return true; + case AArch64::UMOVvi16_idx0: + if (GPRStoreOpc != AArch64::STRHHui) + return false; + FPRStoreOpc = AArch64::STRHui; + SubRegIdx = AArch64::hsub; + return true; + case AArch64::UMOVvi32_idx0: + if (GPRStoreOpc != AArch64::STRWui) + return false; + FPRStoreOpc = AArch64::STRSui; + SubRegIdx = AArch64::ssub; + return true; + case AArch64::UMOVvi64_idx0: + if (GPRStoreOpc != AArch64::STRXui) + return false; + FPRStoreOpc = AArch64::STRDui; + SubRegIdx = AArch64::dsub; + return true; + default: + return false; + } +} + +bool AArch64LoadStoreOpt::tryToReplaceUMOVStore( + MachineBasicBlock::iterator &MBBI) { + MachineInstr &StoreMI = *MBBI; + unsigned StoreOpc = StoreMI.getOpcode(); + + if (StoreOpc != AArch64::STRBBui && StoreOpc != AArch64::STRHHui && + StoreOpc != AArch64::STRWui && StoreOpc != AArch64::STRXui) + return false; + + MachineBasicBlock *MBB = StoreMI.getParent(); + unsigned FPRStoreOpc = 0, SubRegIdx = 0; + MCPhysReg StoreValReg = StoreMI.getOperand(0).getReg(); + + if (!StoreMI.getOperand(0).isKill()) + return false; + + // Scan backward to find the UMOV that defines the store's value register. + MachineInstr *UMOVMI = nullptr; + for (auto It = MBBI; It != MBB->begin() && !UMOVMI;) { + MachineInstr &MI = *--It; + if (MI.readsRegister(StoreValReg, TRI)) + return false; + if (MI.definesRegister(StoreValReg, TRI)) { + if (!getUMOVToFPRStoreInfo(MI.getOpcode(), StoreMI.getOpcode(), + FPRStoreOpc, SubRegIdx)) + return false; + UMOVMI = &MI; + } + } + if (!UMOVMI) + return false; + + MCPhysReg VecReg = UMOVMI->getOperand(1).getReg(); + + // Check that no instruction between UMOV and store clobbers the vector + // register. Also track whether VecReg is killed anywhere from the UMOV + // (inclusive) through the intervening instructions -- we need this to decide + // whether the FPR sub-register can be marked killed on the new store. + bool VecRegKilled = UMOVMI->killsRegister(VecReg, TRI); + for (auto It = std::next(UMOVMI->getIterator()); It != MBBI; ++It) { + if (It->modifiesRegister(VecReg, TRI)) + return false; + if (!VecRegKilled && It->killsRegister(VecReg, TRI)) + VecRegKilled = true; + } + + // Safe to proceed. Clear kill flags on the vector register between UMOV and + // the new store so the FPR sub-register stays live. + UMOVMI->clearRegisterKills(VecReg, TRI); + for (auto It = std::next(UMOVMI->getIterator()); It != MBBI; ++It) + It->clearRegisterKills(VecReg, TRI); + + LLVM_DEBUG(dbgs() << "Folding UMOV + store: " << *UMOVMI << " + " + << StoreMI); + + MCPhysReg FPRReg = TRI->getSubReg(VecReg, SubRegIdx); + BuildMI(*MBB, MBBI, StoreMI.getDebugLoc(), TII->get(FPRStoreOpc)) + .addReg(FPRReg, getKillRegState(VecRegKilled)) + .add(StoreMI.getOperand(1)) + .add(StoreMI.getOperand(2)) + .setMemRefs(StoreMI.memoperands()); + + MBBI = MBB->erase(MBBI); + UMOVMI->eraseFromParent(); + + ++NumUMOVFoldedToFPRStore; + return true; +} + bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo(); @@ -3114,6 +3223,20 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; } + // 6) Replace UMOV (lane 0) + GPR store with a direct FPR sub-register store. + // e.g., + // umov w8, v0.h[0] + // strh w8, [x0] + // ; becomes + // str h0, [x0] + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + if (tryToReplaceUMOVStore(MBBI)) + Modified = true; + else + ++MBBI; + } + return Modified; } diff --git a/llvm/test/CodeGen/AArch64/ldst-opt-umov-fpr-store.mir b/llvm/test/CodeGen/AArch64/ldst-opt-umov-fpr-store.mir new file mode 100644 index 0000000000000..5758cbebdb376 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst-opt-umov-fpr-store.mir @@ -0,0 +1,169 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=aarch64-ldst-opt -o - %s | FileCheck %s + +# Test that UMOV (lane 0) + GPR store is folded into a direct FPR store +# when the UMOV result has no other uses. + +--- +# UMOVvi16_idx0 + STRHHui → STRHui, with intervening ST1i8 that kills the +# vector register. +# CHECK-LABEL: name: umov_i16_to_fpr_store +# CHECK: ST1i8 renamable $q0 +# CHECK-NEXT: STRHui killed $h0 +# CHECK-NOT: UMOVvi16_idx0 +# CHECK-NOT: STRHHui +name: umov_i16_to_fpr_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $x0, $x1 + + renamable $w8 = UMOVvi16_idx0 renamable $q0, 0 + ST1i8 killed renamable $q0, 8, killed renamable $x1 :: (store (s8)) + STRHHui killed renamable $w8, killed renamable $x0, 0 :: (store (s16)) + RET undef $lr +... +--- +# UMOVvi8_idx0 + STRBBui → STRBui +# CHECK-LABEL: name: umov_i8_to_fpr_store +# CHECK: STRBui killed $b0 +# CHECK-NOT: UMOVvi8_idx0 +# CHECK-NOT: STRBBui +name: umov_i8_to_fpr_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $x0 + + renamable $w8 = UMOVvi8_idx0 killed renamable $q0, 0 + STRBBui killed renamable $w8, killed renamable $x0, 0 :: (store (s8)) + RET undef $lr +... +--- +# UMOVvi32_idx0 + STRWui → STRSui +# CHECK-LABEL: name: umov_i32_to_fpr_store +# CHECK: STRSui killed $s0 +# CHECK-NOT: UMOVvi32_idx0 +# CHECK-NOT: STRWui +name: umov_i32_to_fpr_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $x0 + + renamable $w8 = UMOVvi32_idx0 killed renamable $q0, 0 + STRWui killed renamable $w8, killed renamable $x0, 0 :: (store (s32)) + RET undef $lr +... +--- +# UMOVvi64_idx0 + STRXui → STRDui +# CHECK-LABEL: name: umov_i64_to_fpr_store +# CHECK: STRDui killed $d0 +# CHECK-NOT: UMOVvi64_idx0 +# CHECK-NOT: STRXui +name: umov_i64_to_fpr_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $x0 + + renamable $x8 = UMOVvi64_idx0 killed renamable $q0, 0 + STRXui killed renamable $x8, killed renamable $x0, 0 :: (store (s64)) + RET undef $lr +... +--- +# Negative: UMOV result is modified before the store. +# CHECK-LABEL: name: umov_i16_used_before_store +# CHECK: UMOVvi16_idx0 +# CHECK: EORWrr +# CHECK: STRHHui +name: umov_i16_used_before_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $x0, $w9 + + renamable $w8 = UMOVvi16_idx0 killed renamable $q0, 0 + renamable $w8 = EORWrr killed renamable $w8, killed renamable $w9 + STRHHui killed renamable $w8, killed renamable $x0, 0 :: (store (s16)) + RET undef $lr +... +--- +# Negative: The vector register is clobbered between UMOV and store. +# CHECK-LABEL: name: umov_i16_vec_clobbered +# CHECK: UMOVvi16_idx0 +# CHECK: ORRv16i8 +# CHECK: STRHHui +name: umov_i16_vec_clobbered +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $q1, $x0 + + renamable $w8 = UMOVvi16_idx0 renamable $q0, 0 + renamable $q0 = ORRv16i8 killed renamable $q0, killed renamable $q1 + STRHHui killed renamable $w8, killed renamable $x0, 0 :: (store (s16)) + RET undef $lr +... +--- +# Negative: The GPR result is not killed at the store (has a later use). +# CHECK-LABEL: name: umov_i16_not_killed +# CHECK: UMOVvi16_idx0 +# CHECK: STRHHui renamable $w8 +# CHECK: STRHHui killed renamable $w8 +name: umov_i16_not_killed +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $x0, $x1 + + renamable $w8 = UMOVvi16_idx0 killed renamable $q0, 0 + STRHHui renamable $w8, renamable $x0, 0 :: (store (s16)) + STRHHui killed renamable $w8, killed renamable $x1, 0 :: (store (s16)) + RET undef $lr +... +--- +# Multiple folds in one block: two independent UMOV+store pairs using +# different vector registers and different element sizes. +# CHECK-LABEL: name: umov_multiple_folds +# CHECK: ORRv16i8 +# CHECK-NEXT: ST1i8 renamable $q0 +# CHECK-NEXT: STRHui killed $h0 +# CHECK-NEXT: STRSui killed $s1 +# CHECK-NOT: UMOVvi16_idx0 +# CHECK-NOT: UMOVvi32_idx0 +# CHECK-NOT: STRHHui +# CHECK-NOT: STRWui +name: umov_multiple_folds +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $q1, $x0, $x1, $x2 + + renamable $q0 = ORRv16i8 killed renamable $q0, renamable $q1 + renamable $w8 = UMOVvi16_idx0 renamable $q0, 0 + ST1i8 killed renamable $q0, 8, killed renamable $x2 :: (store (s8)) + STRHHui killed renamable $w8, killed renamable $x0, 0 :: (store (s16)) + renamable $w9 = UMOVvi32_idx0 killed renamable $q1, 0 + STRWui killed renamable $w9, killed renamable $x1, 0 :: (store (s32)) + RET undef $lr +... +--- +# Vector register still live after the store (used via $s0 alias). +# The fold should happen but the FPR sub-register must NOT be killed. +# CHECK-LABEL: name: umov_i16_vec_live_after_store +# CHECK: STRHui $h0 +# CHECK-NEXT: renamable $w9 = FMOVSWr renamable $s0 +# CHECK-NOT: UMOVvi16_idx0 +# CHECK-NOT: STRHHui +name: umov_i16_vec_live_after_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $x0, $x1 + + renamable $w8 = UMOVvi16_idx0 renamable $q0, 0 + STRHHui killed renamable $w8, killed renamable $x0, 0 :: (store (s16)) + renamable $w9 = FMOVSWr renamable $s0 + STRWui killed renamable $w9, killed renamable $x1, 0 :: (store (s32)) + RET undef $lr +...