From ea8e67094ea25c7b1d4367de491afdec4d7d7212 Mon Sep 17 00:00:00 2001 From: reucru01 Date: Thu, 20 Nov 2025 16:10:39 +0000 Subject: [PATCH 1/4] Adds HasV8Ops, FeatureDotProd to InlineAllowed Fixes issue where functions are not inlined when caller has these features, but callee does not. --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 100 +++++++++++++------ 1 file changed, 68 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 919a6fc9fd0b0..e11bc7298aabf 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -40,13 +40,13 @@ class Type; class Value; namespace TailPredication { - enum Mode { - Disabled = 0, - EnabledNoReductions, - Enabled, - ForceEnabledNoReductions, - ForceEnabled - }; +enum Mode { + Disabled = 0, + EnabledNoReductions, + Enabled, + ForceEnabledNoReductions, + ForceEnabled +}; } // For controlling conversion of memcpy into Tail Predicated loop. @@ -70,31 +70,67 @@ class ARMTTIImpl final : public BasicTTIImplBase { // -thumb-mode in a caller with +thumb-mode, may cause the assembler to // fail if the callee uses ARM only instructions, e.g. in inline asm. 
const FeatureBitset InlineFeaturesAllowed = { - ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2, - ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8, - ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb, - ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex, - ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc, - ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt, - ARM::FeatureCrypto, ARM::FeatureCRC, ARM::FeatureRAS, - ARM::FeatureFPAO, ARM::FeatureFuseAES, ARM::FeatureZCZeroing, - ARM::FeatureProfUnpredicate, ARM::FeatureSlowVGETLNi32, - ARM::FeatureSlowVDUP32, ARM::FeaturePreferVMOVSR, - ARM::FeaturePrefISHSTBarrier, ARM::FeatureMuxedUnits, - ARM::FeatureSlowOddRegister, ARM::FeatureSlowLoadDSubreg, - ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx, - ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs, - ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign, - ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx, - ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb, - ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR, - ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack, - ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP, - ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass, - ARM::FeatureAClass, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, - ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, - ARM::FeatureNoNegativeImmediates - }; + ARM::FeatureDotProd, + ARM::HasV8Ops, + ARM::FeatureVFP2, + ARM::FeatureVFP3, + ARM::FeatureNEON, + ARM::FeatureThumb2, + ARM::FeatureFP16, + ARM::FeatureVFP4, + ARM::FeatureFPARMv8, + ARM::FeatureFullFP16, + ARM::FeatureFP16FML, + ARM::FeatureHWDivThumb, + ARM::FeatureHWDivARM, + ARM::FeatureDB, + ARM::FeatureV7Clrex, + ARM::FeatureAcquireRelease, + ARM::FeatureSlowFPBrcc, + ARM::FeaturePerfMon, + ARM::FeatureTrustZone, + ARM::Feature8MSecExt, + ARM::FeatureCrypto, + ARM::FeatureCRC, + 
ARM::FeatureRAS, + ARM::FeatureFPAO, + ARM::FeatureFuseAES, + ARM::FeatureZCZeroing, + ARM::FeatureProfUnpredicate, + ARM::FeatureSlowVGETLNi32, + ARM::FeatureSlowVDUP32, + ARM::FeaturePreferVMOVSR, + ARM::FeaturePrefISHSTBarrier, + ARM::FeatureMuxedUnits, + ARM::FeatureSlowOddRegister, + ARM::FeatureSlowLoadDSubreg, + ARM::FeatureDontWidenVMOVS, + ARM::FeatureExpandMLx, + ARM::FeatureHasVMLxHazards, + ARM::FeatureNEONForFPMovs, + ARM::FeatureNEONForFP, + ARM::FeatureCheckVLDnAlign, + ARM::FeatureHasSlowFPVMLx, + ARM::FeatureHasSlowFPVFMx, + ARM::FeatureVMLxForwarding, + ARM::FeaturePref32BitThumb, + ARM::FeatureAvoidPartialCPSR, + ARM::FeatureCheapPredicableCPSR, + ARM::FeatureAvoidMOVsShOp, + ARM::FeatureHasRetAddrStack, + ARM::FeatureHasNoBranchPredictor, + ARM::FeatureDSP, + ARM::FeatureMP, + ARM::FeatureVirtualization, + ARM::FeatureMClass, + ARM::FeatureRClass, + ARM::FeatureAClass, + ARM::FeatureStrictAlign, + ARM::FeatureLongCalls, + ARM::FeatureExecuteOnly, + ARM::FeatureReserveR9, + ARM::FeatureNoMovt, + ARM::FeatureNoNegativeImmediates}; const ARMSubtarget *getST() const { return ST; } const ARMTargetLowering *getTLI() const { return TLI; } From 0922a15f1c219eb5d11d8d32e0e8007fd36af39a Mon Sep 17 00:00:00 2001 From: reucru01 Date: Tue, 25 Nov 2025 10:06:24 +0000 Subject: [PATCH 2/4] Adds debug statements to ARM areInlineCompatible This makes it easier to see why your function isn't getting inlined. --- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index d12b802fe234f..f0d378b66883f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -102,6 +102,53 @@ bool ARMTTIImpl::areInlineCompatible(const Function *Caller, // the callers'. 
bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == (CalleeBits & InlineFeaturesAllowed); + + LLVM_DEBUG({ + dbgs() << "=== Inline compatibility debug ===\n"; + dbgs() << "Caller: " << Caller->getName() << "\n"; + dbgs() << "Callee: " << Callee->getName() << "\n"; + + // Bit diffs + FeatureBitset MissingInCaller = CalleeBits & ~CallerBits; // callee-only + FeatureBitset ExtraInCaller = CallerBits & ~CalleeBits; // caller-only + + // Counts + dbgs() << "Only-in-caller bit count: " << ExtraInCaller.count() << "\n"; + dbgs() << "Only-in-callee bit count: " << MissingInCaller.count() << "\n"; + + dbgs() << "Only-in-caller feature indices ["; + { + bool First = true; + for (size_t I = 0, E = ExtraInCaller.size(); I < E; ++I) { + if (ExtraInCaller.test(I)) { + if (!First) + dbgs() << ", "; + dbgs() << I; + First = false; + } + } + } + dbgs() << "]\n"; + + dbgs() << "Only-in-callee feature indices ["; + { + bool First = true; + for (size_t I = 0, E = MissingInCaller.size(); I < E; ++I) { + if (MissingInCaller.test(I)) { + if (!First) + dbgs() << ", "; + dbgs() << I; + First = false; + } + } + } + dbgs() << "]\n"; + + // Indices map to features as found in + // llvm-project/(your_build)/lib/Target/ARM/ARMGenSubtargetInfo.inc + dbgs() << "MatchExact=" << (MatchExact ? "true" : "false") + << " MatchSubset=" << (MatchSubset ? "true" : "false") << "\n"; + }); return MatchExact && MatchSubset; } From af3232b6f6ef270088b08a7eb780ec75980ee33d Mon Sep 17 00:00:00 2001 From: reucru01 Date: Tue, 25 Nov 2025 10:09:56 +0000 Subject: [PATCH 3/4] Adds many features to ARM InlineFeaturesAllowed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes FeatureD32 and FeatureFP64 from black list in comments as: - In https://reviews.llvm.org/D34697#805590 D16 and VFPOnlySP were added to this allowlist because they do "the opposite of what you would expect." 
- https://github.com/llvm/llvm-project/commit/760df47b778a530e9368a4b8706940ba103d57ba#diff-8165208908f69b3582d556451[…]6c4b474f2bf32c4ac7fec031cf2efd replaces the previous features with the inverse, but incorrectly keeps them in the allow list even though the original reasoning no longer applies. Some subtarget features provide different instructions depending on whether they are set or unset; these features are believed safe because *not* having these features present does not add instructions. --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 38 +++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index e11bc7298aabf..b4b828b9ef79b 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -64,14 +64,50 @@ class ARMTTIImpl final : public BasicTTIImplBase { const ARMTargetLowering *TLI; // Currently the following features are excluded from InlineFeaturesAllowed. - // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32 + // ModeThumb, FeatureNoARM, ModeSoftFloat. // Depending on whether they are set or unset, different // instructions/registers are available. For example, inlining a callee with // -thumb-mode in a caller with +thumb-mode, may cause the assembler to // fail if the callee uses ARM only instructions, e.g. in inline asm. 
const FeatureBitset InlineFeaturesAllowed = { + ARM::FeatureD32, + ARM::FeatureFPRegs64, + ARM::FeatureFPRegs16, + ARM::FeatureFPRegs, + ARM::FeatureAES, + ARM::FeatureVFP2_SP, + ARM::FeatureSHA2, + ARM::HasV5TEOps, + ARM::HasV6Ops, + ARM::HasV6KOps, + ARM::HasV6T2Ops, + ARM::HasV7Ops, + ARM::HasV5TOps, + ARM::HasV6MOps, + ARM::HasV8MBaselineOps, + ARM::HasV8MMainlineOps, + ARM::HasV8_1aOps, + ARM::HasV8_2aOps, + ARM::HasV8_3aOps, + ARM::HasV8_4aOps, + ARM::HasV8_5aOps, + ARM::HasV8_6aOps, + ARM::HasV8_7aOps, + ARM::HasV8_8aOps, + ARM::HasV8_9aOps, + ARM::HasV9_0aOps, + ARM::HasV9_1aOps, + ARM::HasV9_2aOps, + ARM::HasV9_3aOps, + ARM::HasV9_4aOps, + ARM::HasV9_5aOps, + ARM::HasV9_6aOps, + ARM::HasV9_7aOps, + ARM::HasV8_1MMainlineOps, ARM::FeatureDotProd, ARM::HasV8Ops, + ARM::FeatureSB, + ARM::FeatureBF16, ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, From 0f89803148119761e86f33bcb0c2e762b104d044 Mon Sep 17 00:00:00 2001 From: reucru01 Date: Wed, 26 Nov 2025 14:23:00 +0000 Subject: [PATCH 4/4] Adds tests for inlining with dotprod --- .../Transforms/Inline/ARM/inline-dotprod.ll | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 llvm/test/Transforms/Inline/ARM/inline-dotprod.ll diff --git a/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll b/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll new file mode 100644 index 0000000000000..2f8dbb7f01822 --- /dev/null +++ b/llvm/test/Transforms/Inline/ARM/inline-dotprod.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes=inline | FileCheck %s +; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s + +declare i32 @foo(...) #0 + +define i32 @callee() #0 { +entry: + %call = call i32 (...) @foo() + ret i32 %call +} + +define i32 @dotcallee() #1 { +entry: + %call = call i32 (...) @foo() + ret i32 %call +} + +define i32 @dotcaller() #1 { +entry: + %call = call i32 @callee() + ret i32 %call +; CHECK-LABEL: dotcaller +; CHECK: call i32 (...) 
@foo() +} + +define i32 @caller() #0 { +entry: + %call = call i32 @dotcallee() + ret i32 %call +; CHECK-LABEL: caller +; CHECK: call i32 @dotcallee() +} + +attributes #0 = { "target-cpu"="generic" "target-features"="+dsp,+neon" } +attributes #1 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+dotprod" }