-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARC] Prefer RDPC over CALL to implement GETPCX for 64-bit target #78280
Conversation
On 64-bit target, prefer usng RDPC over CALL to get the value of %pc. This is faster on modern processors (Niagara T1 and newer) and avoids polluting the processor's predictor state. The old behavior of using a fake CALL is still done when tuning for classic UltraSPARC processors, since RDPC is much slower there. A quick pgbench test on a SPARC T4 shows about 2% speedup on SELECT loads, and about 7% speedup on INSERT/UPDATE loads.
@llvm/pr-subscribers-backend-sparc Author: Koakuma (koachan) ChangesOn 64-bit target, prefer usng RDPC over CALL to get the value of %pc. This is faster on modern processors (Niagara T1 and newer) and avoids polluting the processor's predictor state. The old behavior of using a fake CALL is still done when tuning for classic UltraSPARC processors, since RDPC is much slower there. A quick pgbench test on a SPARC T4 shows about 2% speedup on SELECT loads, and about 7% speedup on INSERT/UPDATE loads. Full diff: https://github.com/llvm/llvm-project/pull/78280.diff 4 Files Affected:
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index 1a71cfed3128f06..7b1033956524330 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -62,6 +62,13 @@ def UsePopc : SubtargetFeature<"popc", "UsePopc", "true",
def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software emulation for floating point">;
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget tuning features.
+//
+
+def TuneSlowRDPC : SubtargetFeature<"slow-rdpc", "HasSlowRDPC", "true",
+ "rd %pc, %XX is slow", [FeatureV9]>;
+
//==== Features added predmoninantly for LEON subtarget support
include "LeonFeatures.td"
@@ -89,8 +96,9 @@ def SparcAsmParserVariant : AsmParserVariant {
// SPARC processors supported.
//===----------------------------------------------------------------------===//
-class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
+class Proc<string Name, list<SubtargetFeature> Features,
+ list<SubtargetFeature> TuneFeatures = []>
+ : Processor<Name, NoItineraries, Features, TuneFeatures>;
def : Proc<"generic", []>;
def : Proc<"v7", [FeatureSoftMulDiv, FeatureNoFSMULD]>;
@@ -118,9 +126,11 @@ def : Proc<"ma2480", [FeatureLeon, LeonCASA]>;
def : Proc<"ma2485", [FeatureLeon, LeonCASA]>;
def : Proc<"ma2x8x", [FeatureLeon, LeonCASA]>;
def : Proc<"v9", [FeatureV9]>;
-def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated, FeatureVIS],
+ [TuneSlowRDPC]>;
def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
- FeatureVIS2]>;
+ FeatureVIS2],
+ [TuneSlowRDPC]>;
def : Proc<"niagara", [FeatureV9, FeatureV8Deprecated, FeatureVIS,
FeatureVIS2]>;
def : Proc<"niagara2", [FeatureV9, FeatureV8Deprecated, UsePopc,
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index cca624e09267962..215a8ea8319046f 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -13,6 +13,7 @@
#include "MCTargetDesc/SparcInstPrinter.h"
#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "MCTargetDesc/SparcTargetStreamer.h"
#include "Sparc.h"
#include "SparcInstrInfo.h"
@@ -111,6 +112,15 @@ static void EmitCall(MCStreamer &OutStreamer,
OutStreamer.emitInstruction(CallInst, STI);
}
+static void EmitRDPC(MCStreamer &OutStreamer, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ MCInst RDPCInst;
+ RDPCInst.setOpcode(SP::RDASR);
+ RDPCInst.addOperand(RD);
+ RDPCInst.addOperand(MCOperand::createReg(SP::ASR5));
+ OutStreamer.emitInstruction(RDPCInst, STI);
+}
+
static void EmitSETHI(MCStreamer &OutStreamer,
MCOperand &Imm, MCOperand &RD,
const MCSubtargetInfo &STI)
@@ -226,7 +236,7 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
MCOperand RegO7 = MCOperand::createReg(SP::O7);
// <StartLabel>:
- // call <EndLabel>
+ // <GET-PC> // This will be either `call <EndLabel>` or `rd %pc, %o7`.
// <SethiLabel>:
// sethi %hi(_GLOBAL_OFFSET_TABLE_+(<SethiLabel>-<StartLabel>)), <MO>
// <EndLabel>:
@@ -234,8 +244,17 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
// add <MO>, %o7, <MO>
OutStreamer->emitLabel(StartLabel);
- MCOperand Callee = createPCXCallOP(EndLabel, OutContext);
- EmitCall(*OutStreamer, Callee, STI);
+ if (!STI.getTargetTriple().isSPARC64() ||
+ STI.hasFeature(Sparc::TuneSlowRDPC)) {
+ MCOperand Callee = createPCXCallOP(EndLabel, OutContext);
+ EmitCall(*OutStreamer, Callee, STI);
+ } else {
+ // TODO find out whether it is possible to store PC
+ // in other registers, to enable leaf function optimization.
+ // (On the other hand, approx. over 97.8% of GETPCXes happen
+ // in non-leaf functions, so would this be worth the effort?)
+ EmitRDPC(*OutStreamer, RegO7, STI);
+ }
OutStreamer->emitLabel(SethiLabel);
MCOperand hiImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC22,
GOTLabel, StartLabel, SethiLabel,
diff --git a/llvm/test/CodeGen/SPARC/getpcx-call.ll b/llvm/test/CodeGen/SPARC/getpcx-call.ll
new file mode 100644
index 000000000000000..72d7b5a0bc2f5b4
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/getpcx-call.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -relocation-model=pic -mtriple=sparc | FileCheck --check-prefix=SPARC %s
+; RUN: llc < %s -relocation-model=pic -mtriple=sparcv9 | FileCheck --check-prefix=SPARC64 %s
+
+;; SPARC32 and SPARC64 for classic UltraSPARCs implement GETPCX
+;; with a fake `call`.
+;; All other SPARC64 targets implement it with `rd %pc, %o7`.
+;; Need to do the tests in separate files because apparently `tune-cpu`
+;; attribute applies to the entire file at once.
+
+@value = external global i32
+
+define i32 @testCall() nounwind #0 {
+; SPARC-LABEL: testCall:
+; SPARC: ! %bb.0:
+; SPARC-NEXT: save %sp, -96, %sp
+; SPARC-NEXT: .Ltmp0:
+; SPARC-NEXT: call .Ltmp1
+; SPARC-NEXT: .Ltmp2:
+; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC-NEXT: .Ltmp1:
+; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC-NEXT: add %i0, %o7, %i0
+; SPARC-NEXT: sethi %hi(value), %i1
+; SPARC-NEXT: add %i1, %lo(value), %i1
+; SPARC-NEXT: ld [%i0+%i1], %i0
+; SPARC-NEXT: ld [%i0], %i0
+; SPARC-NEXT: ret
+; SPARC-NEXT: restore
+;
+; SPARC64-LABEL: testCall:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: save %sp, -128, %sp
+; SPARC64-NEXT: .Ltmp0:
+; SPARC64-NEXT: call .Ltmp1
+; SPARC64-NEXT: .Ltmp2:
+; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC64-NEXT: .Ltmp1:
+; SPARC64-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC64-NEXT: add %i0, %o7, %i0
+; SPARC64-NEXT: sethi %hi(value), %i1
+; SPARC64-NEXT: add %i1, %lo(value), %i1
+; SPARC64-NEXT: ldx [%i0+%i1], %i0
+; SPARC64-NEXT: ld [%i0], %i0
+; SPARC64-NEXT: ret
+; SPARC64-NEXT: restore
+ %1 = load i32, ptr @value
+ ret i32 %1
+}
+
+attributes #0 = { "tune-cpu"="ultrasparc" }
diff --git a/llvm/test/CodeGen/SPARC/getpcx-rdpc.ll b/llvm/test/CodeGen/SPARC/getpcx-rdpc.ll
new file mode 100644
index 000000000000000..286750a014e82db
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/getpcx-rdpc.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -relocation-model=pic -mtriple=sparc | FileCheck --check-prefix=SPARC %s
+; RUN: llc < %s -relocation-model=pic -mtriple=sparcv9 | FileCheck --check-prefix=SPARC64 %s
+
+;; SPARC32 and SPARC64 for classic UltraSPARCs implement GETPCX
+;; with a fake `call`.
+;; All other SPARC64 targets implement it with `rd %pc, %o7`.
+;; Need to do the tests in separate files because apparently `tune-cpu`
+;; attribute applies to the entire file at once.
+
+@value = external global i32
+
+define i32 @testRdpc() nounwind #0 {
+; SPARC-LABEL: testRdpc:
+; SPARC: ! %bb.0:
+; SPARC-NEXT: save %sp, -96, %sp
+; SPARC-NEXT: .Ltmp0:
+; SPARC-NEXT: call .Ltmp1
+; SPARC-NEXT: .Ltmp2:
+; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC-NEXT: .Ltmp1:
+; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC-NEXT: add %i0, %o7, %i0
+; SPARC-NEXT: sethi %hi(value), %i1
+; SPARC-NEXT: add %i1, %lo(value), %i1
+; SPARC-NEXT: ld [%i0+%i1], %i0
+; SPARC-NEXT: ld [%i0], %i0
+; SPARC-NEXT: ret
+; SPARC-NEXT: restore
+;
+; SPARC64-LABEL: testRdpc:
+; SPARC64: ! %bb.0:
+; SPARC64-NEXT: save %sp, -128, %sp
+; SPARC64-NEXT: .Ltmp0:
+; SPARC64-NEXT: rd %pc, %o7
+; SPARC64-NEXT: .Ltmp2:
+; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC64-NEXT: .Ltmp1:
+; SPARC64-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC64-NEXT: add %i0, %o7, %i0
+; SPARC64-NEXT: sethi %hi(value), %i1
+; SPARC64-NEXT: add %i1, %lo(value), %i1
+; SPARC64-NEXT: ldx [%i0+%i1], %i0
+; SPARC64-NEXT: ld [%i0], %i0
+; SPARC64-NEXT: ret
+; SPARC64-NEXT: restore
+ %1 = load i32, ptr @value
+ ret i32 %1
+}
+
+attributes #0 = { "tune-cpu"="niagara" }
|
Note: this is a remake of PR #77196 because it was not merged to |
Thanks! |
On 64-bit target, prefer using RDPC over CALL to get the value of %pc. This is faster on modern processors (Niagara T1 and newer) and avoids polluting the processor's predictor state. The old behavior of using a fake CALL is still done when tuning for classic UltraSPARC processors, since RDPC is much slower there. A quick pgbench test on a SPARC T4 shows about 2% speedup on SELECT loads, and about 7% speedup on INSERT/UPDATE loads. Reviewed By: @s-barannikov Github PR: llvm#78280
On 64-bit target, prefer using RDPC over CALL to get the value of %pc. This is faster on modern processors (Niagara T1 and newer) and avoids polluting the processor's predictor state.
The old behavior of using a fake CALL is still done when tuning for classic UltraSPARC processors, since RDPC is much slower there.
A quick pgbench test on a SPARC T4 shows about 2% speedup on SELECT loads, and about 7% speedup on INSERT/UPDATE loads.