From 0988d3607d86ca8baeb19d3edca1ef61ccb14d85 Mon Sep 17 00:00:00 2001 From: sgokhale Date: Tue, 23 Sep 2025 01:52:29 -0700 Subject: [PATCH 1/2] Add FeatureFuseLiterals as SubTargetFeature for Grace and Olympus With this, we gain significantly on the povray benchmark from SPEC17 (around 12% with -flto -Ofast). This is attributable to the transformation enabled by this feature and the subsequent shrink wrapping. We also see some improvement (around 2%) on the xalanc benchmark from SPEC17. There are some improvements on some internal benchmarks as well. --- llvm/lib/Target/AArch64/AArch64Processors.td | 6 ++++-- llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll | 9 ++++++++- llvm/test/CodeGen/AArch64/selectopt-const.ll | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 81f5d075729d9..1d07e82acae77 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -328,7 +328,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus", FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureUseFixedOverScalableIfEqualCost]>; + FeatureUseFixedOverScalableIfEqualCost, + FeatureFuseLiterals]>; // Note that cyclone does not fuse AES instructions, but newer apple chips do // perform the fusion and cyclone is used by default when targeting apple OSes. 
@@ -641,7 +642,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeatureUseFixedOverScalableIfEqualCost, FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive, - FeatureDisableLatencySchedHeuristic]>; + FeatureDisableLatencySchedHeuristic, + FeatureFuseLiterals]>; def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3", "Neoverse V3 ARM processors", [ diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll index 70b6b91d3cf66..4b77e9eb71faf 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll @@ -12,7 +12,8 @@ ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n1 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s --check-prefix FUSE-LITERALS +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=olympus | FileCheck %s --check-prefix FUSE-LITERALS ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=ampere1 -mattr=-fuse-literals | FileCheck %s @@ -38,6 +39,12 @@ define double @litf() { ; CHECK-LABEL: litf: ; CHECK: adrp [[ADDR:x[0-9]+]], [[CSTLABEL:.LCP.*]] ; CHECK-NEXT: ldr {{d[0-9]+}}, {{[[]}}[[ADDR]], :lo12:[[CSTLABEL]]{{[]]}} +; +; FUSE-LITERALS: mov [[R:x[0-9]+]], #11544 +; FUSE-LITERALS: movk [[R]], #21572, lsl #16 +; FUSE-LITERALS: movk [[R]], #8699, lsl #32 +; FUSE-LITERALS: movk [[R]], #16393, lsl #48 +; FUSE-LITERALS: fmov {{d[0-9]+}}, [[R]] entry: ret double 0x400921FB54442D18 } diff --git 
a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll index fe48dbaf1ab76..62ac297153962 100644 --- a/llvm/test/CodeGen/AArch64/selectopt-const.ll +++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -O3 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=olympus -O3 < %s | FileCheck %s define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) { ; CHECK-LABEL: test_const: @@ -8,10 +9,10 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) { ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w9, #1267 // =0x4f3 +; CHECK-NEXT: movk w9, #16309, lsl #16 ; CHECK-NEXT: fmov s1, #1.00000000 ; CHECK-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: movk w9, #16309, lsl #16 ; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: .p2align 5, , 16 From f7781b6aad1af21084ba234b6975fd13b27439fd Mon Sep 17 00:00:00 2001 From: sgokhale Date: Tue, 23 Sep 2025 23:04:58 -0700 Subject: [PATCH 2/2] Address comments Rather than adding the new feature, this changes the way constants are materialized for Grace and Olympus. 
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 +++++++++++ llvm/lib/Target/AArch64/AArch64Processors.td | 6 ++---- .../test/CodeGen/AArch64/misched-fusion-addadrp.ll | 14 +++++++------- llvm/test/CodeGen/AArch64/selectopt-const.ll | 3 +-- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cd7f0e719ad0c..d484fccb92d2f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12564,6 +12564,17 @@ bool AArch64TargetLowering::isOffsetFoldingLegal( bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool OptForSize) const { + // If the constant to be materialized is scalar, it may be more efficient to + // use a sequence of 'mov + fmov' rather than 'adrp + ldr' on specified CPUs. + // However, when materializing a vector of constants, there are two things to + // note: + // 1. Throughput of fmov instruction is very low. + // 2. ldr instruction can load multiple constants in one go. Also, its + // throughput is higher as compared to fmov. + if (!VT.isVector() && (Subtarget->getCPU() == "neoverse-v2" || + Subtarget->getCPU() == "olympus")) + return true; + bool IsLegal = false; // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and // 16-bit case when target has full fp16 support. 
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 1d07e82acae77..81f5d075729d9 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -328,8 +328,7 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus", FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureUseFixedOverScalableIfEqualCost, - FeatureFuseLiterals]>; + FeatureUseFixedOverScalableIfEqualCost]>; // Note that cyclone does not fuse AES instructions, but newer apple chips do // perform the fusion and cyclone is used by default when targeting apple OSes. @@ -642,8 +641,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeatureUseFixedOverScalableIfEqualCost, FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive, - FeatureDisableLatencySchedHeuristic, - FeatureFuseLiterals]>; + FeatureDisableLatencySchedHeuristic]>; def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3", "Neoverse V3 ARM processors", [ diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll index 4b77e9eb71faf..a30665cbbbc2a 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-addadrp.ll @@ -12,8 +12,8 @@ ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n1 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s --check-prefix FUSE-LITERALS -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=olympus | FileCheck %s --check-prefix FUSE-LITERALS +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s --check-prefix NO-CONST-POOL +; RUN: llc %s -o - -mtriple=aarch64-unknown 
-mcpu=olympus | FileCheck %s --check-prefix NO-CONST-POOL ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a16 -mattr=-fuse-literals | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=apple-a17 -mattr=-fuse-literals | FileCheck %s ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=ampere1 -mattr=-fuse-literals | FileCheck %s @@ -40,11 +40,11 @@ define double @litf() { ; CHECK: adrp [[ADDR:x[0-9]+]], [[CSTLABEL:.LCP.*]] ; CHECK-NEXT: ldr {{d[0-9]+}}, {{[[]}}[[ADDR]], :lo12:[[CSTLABEL]]{{[]]}} ; -; FUSE-LITERALS: mov [[R:x[0-9]+]], #11544 -; FUSE-LITERALS: movk [[R]], #21572, lsl #16 -; FUSE-LITERALS: movk [[R]], #8699, lsl #32 -; FUSE-LITERALS: movk [[R]], #16393, lsl #48 -; FUSE-LITERALS: fmov {{d[0-9]+}}, [[R]] +; NO-CONST-POOL: mov [[R:x[0-9]+]], #11544 +; NO-CONST-POOL: movk [[R]], #21572, lsl #16 +; NO-CONST-POOL: movk [[R]], #8699, lsl #32 +; NO-CONST-POOL: movk [[R]], #16393, lsl #48 +; NO-CONST-POOL: fmov {{d[0-9]+}}, [[R]] entry: ret double 0x400921FB54442D18 } diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll index 62ac297153962..fe48dbaf1ab76 100644 --- a/llvm/test/CodeGen/AArch64/selectopt-const.ll +++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -O3 < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=olympus -O3 < %s | FileCheck %s define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) { ; CHECK-LABEL: test_const: @@ -9,10 +8,10 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) { ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w9, #1267 // =0x4f3 -; CHECK-NEXT: movk w9, #16309, lsl #16 ; CHECK-NEXT: fmov s1, #1.00000000 ; CHECK-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: movk w9, #16309, lsl 
#16 ; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: .p2align 5, , 16