From 24b8b479945ced40111017e219d12e21061246f7 Mon Sep 17 00:00:00 2001 From: Guy David Date: Sat, 8 Nov 2025 23:59:30 +0200 Subject: [PATCH] [AArch64] Treat COPY between cross-register banks as expensive The motivation is to allow passes such as MachineLICM to hoist trivial FMOV instructions out of loops, where previously it didn't do so even when the RHS is a constant. On most microarchitectures, these expensive move instructions have a latency of 2-6 cycles and are certainly not as cheap as a 0-1 cycle move. --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 25 +++ .../CodeGen/AArch64/licm-regclass-copy.mir | 197 ++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/licm-regclass-copy.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 221812f1ebc7b..00fe8ee8b9b4d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1144,6 +1144,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) { return Is.size() <= 2; } +// Check if a COPY instruction is cheap. +static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) { + assert(MI.isCopy() && "Expected COPY instruction"); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64, + // typically requiring an FMOV instruction with a 2-6 cycle latency. 
+ auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * { + if (Reg.isVirtual()) + return MRI.getRegClass(Reg); + if (Reg.isPhysical()) + return RI.getMinimalPhysRegClass(Reg); + return nullptr; + }; + const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg()); + if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC)) + return false; + + return MI.isAsCheapAsAMove(); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { @@ -1157,6 +1179,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { default: return MI.isAsCheapAsAMove(); + case TargetOpcode::COPY: + return isCheapCopy(MI, RI); + case AArch64::ADDWrs: case AArch64::ADDXrs: case AArch64::SUBWrs: diff --git a/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir new file mode 100644 index 0000000000000..6a10df68ddc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s + +# This test verifies that cross-bank copies (e.g., GPR to FPR, FPR to GPR) +# are hoisted out of loops by MachineLICM, as they are expensive on AArch64. + +--- | + declare void @use_float(float) + declare void @use_int(i32) + + define void @gpr_to_fpr_virtual_copy_hoisted() { + ret void + } + + define void @gpr_to_fpr_physical_copy_hoisted() { + ret void + } + + define void @fpr_to_gpr_virtual_copy_hoisted() { + ret void + } +... 
+--- +name: gpr_to_fpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $w1 + %1:gpr32 = COPY $w0 + %0:gpr32 = COPY $w1 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY %0:gpr32 + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
+--- +name: gpr_to_fpr_physical_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_physical_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY $wzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY2]], %bb.0, %4, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY3]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0 + %1:gpr32 = COPY $w0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY $wzr + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
+--- +name: fpr_to_gpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: fpr_to_gpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $w0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $s0 + %1:gpr32 = COPY $w0 + %0:fpr32 = COPY $s0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:gpr32 = COPY %0:fpr32 + $w0 = COPY %7:gpr32 + BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +...