From f050ab5844f1ee3c3fef59de1e3b55dd6afe8a2e Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Mon, 13 Oct 2025 17:04:14 +0100 Subject: [PATCH 1/4] [ARM][MVE] Invalid tail predication in LowOverheadLoop pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #162644. In the new test case, the output of the loop is a MVE_VADDf32 using the ambient tail predicate, taking its false lanes from an MQPRCopy of a value defined once before the loop began. So if the last loop iteration is short, you expect the tail of the output vector to contain elements from that pre-loop value. But that loop using explicit MVE predication was being converted into one using DLSTP/LETP to set FPSCR.LTPSIZE, causing _everything_ inside the loop to be _implicitly_ predicated, including that register copy. So the tail of the output vector was no longer set to a copy of the pre-loop value, because the register copy only copied the true lanes. Instead the output vector contained the false lanes of whatever happened to be in that physical register before the register copy. To fix that, I've made two changes in `ValidateLiveOuts`: - Removed the exceptional treatment for `MQPRCopy` which prevented register copies being added to the `FalseLanesUnknown` set. If the loop is tail-predicated, then the false lanes will not be modified by the copy, so they _are_ unknown, unless the analysis loop further down `ValidateLiveOuts` can make them known again. - Added a check of every predicated instruction with merge semantics, i.e. which has an explicit vector register input that contributes the false lanes of its output. If that input is received from a `FalseLanesUnknown` instruction, then it is `FalseLanesUnknown` as well. The first change causes the register copy in the example to be FalseLanesUnknown; the second change causes that to be inherited by the MVE_VADDf32 which uses the copy's output for its false lanes. These two changes together are enough to tag the output value of the loop as not proven invariant under the tail-predication transformation. An existing test is affected by this change: the `no_combination_diff_reg_value` function in `vcmp-vpst-combination-across-blocks.mir` also becomes non-tail-predicated, for the same reason – the VORR generating the loop's output takes its false lanes from an unpredicated register copy in the input, which becomes a predicated one if you convert the loop to use DLSTP. I think this too was incorrectly code-generated, and it's correct that this patch prevents the tail predication: if the last loop iteration is short, then before the transformation, the unpredicated copy would have taken the false lanes from the value loaded by the MVE_VLDRWU32_post earlier in the loop, which would have zeroed them, but after the transformation those lanes would come from whatever was left in q0 from the previous loop iteration. --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 17 +- llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 10 + llvm/lib/Target/ARM/Thumb2InstrInfo.h | 3 + .../vcmp-vpst-combination-across-blocks.mir | 20 +- .../vctp-vs-unpredicated-copy.mir | 237 ++++++++++++++++++ 5 files changed, 280 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 96ee69cf3f4ce..0f84f63b7a6e4 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -955,8 +955,23 @@ bool LowOverheadLoop::ValidateLiveOuts() { else if (!isPredicated && retainsOrReduces) { LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI); return false; - } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy) + } else if (!isPredicated) FalseLanesUnknown.insert(&MI); + else if (int InactiveIdx = findVPTInactiveOperandIdx(MI); + InactiveIdx != -1) { + auto MO = MI.getOperand(InactiveIdx); + SmallPtrSet Defs; + RDI.getGlobalReachingDefs(&MI, MO.getReg(), Defs); + for (auto *Def : Defs) { + if (Def != &MI && FalseLanesUnknown.count(Def)) { + LLVM_DEBUG(dbgs() << " we think this is FalseLanesUnknown: " << MI); + LLVM_DEBUG(dbgs() + << " because it takes the false lanes of this: " << *Def); + FalseLanesUnknown.insert(&MI); + break; + } + } + } } LLVM_DEBUG({ diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 431ce38ad6e99..bab7804be7d03 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -805,6 +805,16 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) { return -1; } +int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + + for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) + if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) + return i + 3; + + return -1; +} + ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, Register &PredReg) { int PIdx = findFirstVPTPredOperandIdx(MI); diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 3ec3a6216b9f6..1b0bf2d499510 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -90,6 +90,9 @@ inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) { Register PredReg; return getVPTInstrPredicate(MI, PredReg); } +// Identify the input operand in an MVE predicated instruction which +// contributes the values of any inactive vector lanes. +int findVPTInactiveOperandIdx(const MachineInstr &MI); // Recomputes the Block Mask of Instr, a VPT or VPST instruction. // This rebuilds the block mask of the instruction depending on the predicates diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination-across-blocks.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination-across-blocks.mir index fb714f80f76f0..6991006cede8e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination-across-blocks.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination-across-blocks.mir @@ -447,20 +447,28 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $r0, $r1 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r3, dead $cpsr = nuw nsw tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $q0 = MVE_VDUP32 renamable $r1, 0, $noreg, $noreg, undef renamable $q0 - ; CHECK-NEXT: $lr = MVE_DLSTP_32 killed renamable $r1 + ; CHECK-NEXT: renamable $r3 = t2ANDri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: renamable $lr = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (align 4): ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK-NEXT: liveins: $lr, $q0, $r0 + ; CHECK-NEXT: liveins: $lr, $q0, $r0, $r1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, $noreg - ; CHECK-NEXT: renamable $q1 = MVE_VORR killed renamable $q1, renamable $q0, 0, $noreg, $noreg, killed renamable $q1 - ; CHECK-NEXT: renamable $vpr = MVE_VCMPf32 renamable $q1, renamable $q0, 12, 0, killed $noreg, $noreg + ; CHECK-NEXT: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg, $noreg + ; CHECK-NEXT: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: MVE_VPST 8, implicit $vpr + ; CHECK-NEXT: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, $noreg + ; CHECK-NEXT: MVE_VPST 4, implicit $vpr + ; CHECK-NEXT: renamable $q1 = MVE_VORR killed renamable $q1, renamable $q0, 1, renamable $vpr, $noreg, killed renamable $q1 + ; CHECK-NEXT: renamable $vpr = MVE_VCMPf32 renamable $q1, renamable $q0, 12, 1, killed renamable $vpr, $noreg ; CHECK-NEXT: renamable $q0 = MVE_VORR renamable $q1, renamable $q1, 0, $noreg, $noreg, killed renamable $q0 ; CHECK-NEXT: MVE_VPST 8, implicit $vpr ; CHECK-NEXT: renamable $q0 = MVE_VORR killed renamable $q1, killed renamable $q1, 1, killed renamable $vpr, $noreg, killed renamable $q0 - ; CHECK-NEXT: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK-NEXT: $lr = t2LEUpdate killed renamable $lr, %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir new file mode 100644 index 0000000000000..cf68d60602e72 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-vs-unpredicated-copy.mir @@ -0,0 +1,237 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -verify-machineinstrs -o - | FileCheck %s + +# Wrong output is +# $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2 +# renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg, renamable $lr :: (load unknown-size from %ir.13, align 4) +# $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0 +# renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 0, killed $noreg, renamable $lr, killed renamable $q0 +# $lr = MVE_LETP killed renamable $lr, %bb.1 + +# in which the problem is that the second MVE_VORR, copying q1 into +# q0, is required to be unpredicated so that the inactive lanes will +# be copied. + +--- | + ; ModuleID = '162644.c' + source_filename = "162644.c" + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-unknown-none-eabihf" + + @inactive = dso_local local_unnamed_addr global <4 x float> zeroinitializer, align 16 + + ; Function Attrs: nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none) + define dso_local <4 x float> @test_func(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 { + %3 = load <4 x float>, ptr @inactive, align 16, !tbaa !3 + %4 = add i32 %1, 3 + %5 = call i32 @llvm.smin.i32(i32 %1, i32 4) + %6 = sub i32 %4, %5 + %7 = lshr i32 %6, 2 + %8 = add nuw nsw i32 %7, 1 + %9 = call i32 @llvm.start.loop.iterations.i32(i32 %8) + br label %10 + + 10: ; preds = %10, %2 + %11 = phi <4 x float> [ splat (float 0x3FB99999A0000000), %2 ], [ %17, %10 ] + %12 = phi i32 [ %1, %2 ], [ %19, %10 ] + %13 = phi ptr [ %0, %2 ], [ %18, %10 ] + %14 = phi i32 [ %9, %2 ], [ %20, %10 ] + %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12) + %16 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %13, i32 4, <4 x i1> %15, <4 x float> zeroinitializer) + %17 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %11, <4 x float> %16, <4 x i1> %15, <4 x float> %3) + %18 = getelementptr inbounds nuw i8, ptr %13, i32 16 + %19 = add i32 %12, -4 + %20 = call i32 @llvm.loop.decrement.reg.i32(i32 %14, i32 1) + %21 = icmp ne i32 %20, 0 + br i1 %21, label %10, label %22, !llvm.loop !6 + + 22: ; preds = %10 + ret <4 x float> %17 + } + + ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 + + ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) + declare <4 x float> @llvm.masked.load.v4f32.p0(ptr captures(none), i32 immarg, <4 x i1>, <4 x float>) #2 + + ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) + declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1 + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare i32 @llvm.smin.i32(i32, i32) #3 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i32 @llvm.start.loop.iterations.i32(i32) #4 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #4 + + attributes #0 = { nofree noinline norecurse nosync nounwind memory(read, inaccessiblemem: none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m52" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+pacbti,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" } + attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } + attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } + attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + attributes #4 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !2 = !{!"clang version 22.0.0git"} + !3 = !{!4, !4, i64 0} + !4 = !{!"omnipotent char", !5, i64 0} + !5 = !{!"Simple C/C++ TBAA"} + !6 = distinct !{!6, !7, !8} + !7 = !{!"llvm.loop.mustprogress"} + !8 = !{!"llvm.loop.unroll.disable"} +... +--- +name: test_func +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: true + localFrameSize: 0 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + isLRSpilled: true +body: | + ; CHECK-LABEL: name: test_func + ; CHECK: bb.0 (%ir-block.2): + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $lr, $r0, $r1, $r7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK-NEXT: $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg + ; CHECK-NEXT: tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK-NEXT: $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $r3 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive, !tbaa !3) + ; CHECK-NEXT: $r2 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK-NEXT: t2IT 10, 8, implicit-def $itstate + ; CHECK-NEXT: renamable $r2 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r2, implicit killed $itstate + ; CHECK-NEXT: renamable $r2, dead $cpsr = tSUBrr renamable $r1, killed renamable $r2, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 3, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 + ; CHECK-NEXT: $lr = t2DLS killed renamable $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.10, align 4): + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: liveins: $lr, $q0, $q1, $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg, $noreg + ; CHECK-NEXT: $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2 + ; CHECK-NEXT: MVE_VPST 8, implicit $vpr + ; CHECK-NEXT: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, renamable $lr :: (load unknown-size from %ir.13, align 4) + ; CHECK-NEXT: $q0 = MVE_VORR $q1, $q1, 0, $noreg, $noreg, undef $q0 + ; CHECK-NEXT: MVE_VPST 8, implicit $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 1, killed renamable $vpr, renamable $lr, killed renamable $q0 + ; CHECK-NEXT: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2 (%ir-block.22): + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0 + bb.0 (%ir-block.2): + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r7, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r2 = t2MOVi16 target-flags(arm-lo16) @inactive, 14 /* CC::al */, $noreg + tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr + $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @inactive, 14 /* CC::al */, $noreg + renamable $r3 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + renamable $q1 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg, $noreg :: (dereferenceable load (s128) from @inactive, !tbaa !3) + $r2 = tMOVr $r1, 14 /* CC::al */, $noreg + t2IT 10, 8, implicit-def $itstate + renamable $r2 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r2, implicit killed $itstate + renamable $r2, dead $cpsr = tSUBrr renamable $r1, killed renamable $r2, 14 /* CC::al */, $noreg + renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r2 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg + $r3 = t2MOVi16 52429, 14 /* CC::al */, $noreg + $r3 = t2MOVTi16 killed $r3, 15820, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VDUP32 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 + renamable $lr = t2DoLoopStartTP killed renamable $r2, renamable $r1 + + bb.1 (%ir-block.10, align 4): + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $q1, $r0, $r1 + + renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg, $noreg + $q2 = MQPRCopy killed $q0 + MVE_VPST 8, implicit $vpr + renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr, renamable $lr :: (load unknown-size from %ir.13, align 4) + $q0 = MQPRCopy $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDf32 killed renamable $q2, killed renamable $q3, 1, killed renamable $vpr, renamable $lr, killed renamable $q0 + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2 (%ir-block.22): + liveins: $q0 + + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $q0 +... From 7dc18e75588ecc744c0496e0a6bfe6aba549adb8 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 14 Oct 2025 14:02:05 +0100 Subject: [PATCH 2/4] Remove special case for MQPRCopy at end --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 8 +---- .../Thumb2/LowOverheadLoops/spillingmove.mir | 30 ++++++++++++------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 0f84f63b7a6e4..dae0e81ca4ccf 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1050,13 +1050,7 @@ bool LowOverheadLoop::ValidateLiveOuts() { SmallVector Worklist(LiveOutMIs.begin(), LiveOutMIs.end()); while (!Worklist.empty()) { MachineInstr *MI = Worklist.pop_back_val(); - if (MI->getOpcode() == ARM::MQPRCopy) { - VMOVCopies.insert(MI); - MachineInstr *CopySrc = - RDI.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg()); - if (CopySrc) - Worklist.push_back(CopySrc); - } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { + if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); VMOVCopies.clear(); return false; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir index 0995521957b98..3c1fc40df09d0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.mir @@ -309,34 +309,42 @@ body: | ; CHECK-NEXT: liveins: $r0, $r1, $r2, $r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $r3 = t2LDRHi12 $sp, 16, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $r6, dead $cpsr = nsw tADDi3 renamable $r2, 7, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $r5, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $r1, dead $cpsr = nsw tLSLri killed renamable $r1, 1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $r3 = t2RSBri killed renamable $r3, 256, 14 /* CC::al */, $noreg, $noreg ; CHECK-NEXT: renamable $q0 = MVE_VDUP16 killed renamable $r3, 0, $noreg, $noreg, undef renamable $q0 + ; CHECK-NEXT: renamable $r3 = t2BICri killed renamable $r6, 7, 14 /* CC::al */, $noreg, $noreg + ; CHECK-NEXT: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $r6 = nuw nsw t2ADDrs killed renamable $r5, killed renamable $r3, 27, 14 /* CC::al */, $noreg, $noreg ; CHECK-NEXT: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: liveins: $d0, $d1, $r0, $r1, $r2, $r3, $r6, $r12 + ; CHECK-NEXT: liveins: $q0, $r0, $r1, $r2, $r3, $r6, $r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $r4 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK-NEXT: $lr = MVE_DLSTP_16 renamable $r2 + ; CHECK-NEXT: $r5 = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK-NEXT: $lr = t2DLS renamable $r6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.3(0x7c000000), %bb.4(0x04000000) - ; CHECK-NEXT: liveins: $lr, $d0, $d1, $r0, $r1, $r2, $r3, $r4, $r6, $r12 + ; CHECK-NEXT: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r12 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $q1 = MVE_VLDRHU16 renamable $r4, 0, 0, $noreg, renamable $lr - ; CHECK-NEXT: $d4 = VMOVD killed $d0, 14 /* CC::al */, $noreg - ; CHECK-NEXT: $d5 = VMOVD killed $d1, 14 /* CC::al */, $noreg + ; CHECK-NEXT: renamable $vpr = MVE_VCTP16 renamable $r5, 0, $noreg, $noreg + ; CHECK-NEXT: renamable $r5, dead $cpsr = tSUBi8 killed renamable $r5, 8, 14 /* CC::al */, $noreg + ; CHECK-NEXT: MVE_VPST 8, implicit $vpr + ; CHECK-NEXT: renamable $q1 = MVE_VLDRHU16 renamable $r4, 0, 1, renamable $vpr, renamable $lr + ; CHECK-NEXT: $q2 = MVE_VORR killed $q0, killed $q0, 0, $noreg, $noreg, undef $q2 ; CHECK-NEXT: renamable $q1 = MVE_VAND killed renamable $q1, renamable $q2, 0, $noreg, renamable $lr, undef renamable $q1 - ; CHECK-NEXT: $d0 = VMOVD killed $d4, 14 /* CC::al */, $noreg - ; CHECK-NEXT: $d1 = VMOVD killed $d5, 14 /* CC::al */, $noreg - ; CHECK-NEXT: renamable $r4 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r4, 16, 0, killed $noreg, renamable $lr - ; CHECK-NEXT: $lr = MVE_LETP killed renamable $lr, %bb.3 + ; CHECK-NEXT: $q0 = MVE_VORR killed $q2, killed $q2, 0, $noreg, $noreg, undef $q0 + ; CHECK-NEXT: MVE_VPST 8, implicit $vpr + ; CHECK-NEXT: renamable $r4 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r4, 16, 1, killed renamable $vpr, renamable $lr + ; CHECK-NEXT: $lr = t2LEUpdate killed renamable $lr, %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x04000000), %bb.2(0x7c000000) - ; CHECK-NEXT: liveins: $d0, $d1, $r0, $r1, $r2, $r3, $r6, $r12 + ; CHECK-NEXT: liveins: $q0, $r0, $r1, $r2, $r3, $r6, $r12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $r3, dead $cpsr = nuw nsw tADDi8 killed renamable $r3, 1, 14 /* CC::al */, $noreg ; CHECK-NEXT: renamable $r0 = tADDhirr killed renamable $r0, renamable $r1, 14 /* CC::al */, $noreg From 477af29090dc1a06c58bb98e9934d40f75486316 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 14 Oct 2025 15:26:38 +0100 Subject: [PATCH 3/4] Add assertion justifying (and checking) the i+3 magic number --- llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index bab7804be7d03..361cdece7e808 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -809,8 +809,11 @@ int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) { const MCInstrDesc &MCID = MI.getDesc(); for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) - if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) + if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) { + assert(MCID.getOperandConstraint(i+3, MCOI::TIED_TO) != -1 && + "Operand #3 of VPRED_R is the one tied to the output register"); return i + 3; + } return -1; } From 0dc637a5c6f953771cda139fc7623e99e04db200 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 14 Oct 2025 15:44:02 +0100 Subject: [PATCH 4/4] git-clang-format --- llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 361cdece7e808..cb7c6a466aad4 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -809,8 +809,8 @@ int llvm::findVPTInactiveOperandIdx(const MachineInstr &MI) { const MCInstrDesc &MCID = MI.getDesc(); for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) - if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) { - assert(MCID.getOperandConstraint(i+3, MCOI::TIED_TO) != -1 && + if (MCID.operands()[i].OperandType == ARM::OPERAND_VPRED_R) { + assert(MCID.getOperandConstraint(i + 3, MCOI::TIED_TO) != -1 && "Operand #3 of VPRED_R is the one tied to the output register"); return i + 3; }