-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SME] Remove implicit-def's on smstart #69012
Conversation
This is an alternative approach to the problem that #68630 tries to address. |
@llvm/pr-subscribers-backend-aarch64 Author: Jon Roelofs (jroelofs) ChangesWhen we lower calls, the sequence of argument copy-to-reg nodes are glued to the smstart. In the InstrEmitter, these glued copies are turned into implicit defs, since the actual call instruction uses those physregs, resulting in the register allocator adding unnecessary copies of regs that are preserved anyway. Full diff: https://github.com/llvm/llvm-project/pull/69012.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 64d00dafd835b11..72e1e49040aac09 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7221,6 +7221,21 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}
+void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const {
+ // Live-in physreg copies that are glued to SMSTART are applied as
+ // implicit-def's in the InstrEmitter. Here we remove them, allowing the
+ // register allocator to pass call args in callee saved regs, without extra
+ // copies to avoid these fake clobbers of actually-preserved GPRs.
+ if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1)
+ for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
+ if (MachineOperand &MO = MI.getOperand(I);
+ MO.isReg() && MO.isImplicit() && MO.isDef() &&
+ (AArch64::GPR32RegClass.contains(MO.getReg()) ||
+ AArch64::GPR64RegClass.contains(MO.getReg())))
+ MI.removeOperand(I);
+}
+
SDValue AArch64TargetLowering::changeStreamingMode(
SelectionDAG &DAG, SDLoc DL, bool Enable,
SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9dcfba3a229cccd..fa234bd6273e093 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -963,6 +963,9 @@ class AArch64TargetLowering : public TargetLowering {
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const override;
+
SDValue LowerCall(CallLoweringInfo & /*CLI*/,
SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 823115c7d025005..8de5a1541a3bb5c 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -215,6 +215,7 @@ def MSRpstatesvcrImm1
let Inst{11-9} = pstatefield;
let Inst{8} = imm;
let Inst{7-5} = 0b011; // op2
+ let hasPostISelHook = 1;
}
def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>;
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 102ed896ce7b3e8..dd7d6470ad7b084 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -368,15 +368,11 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: addvl x9, sp, #2
-; CHECK-NEXT: addvl x10, sp, #1
-; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: rdsvl x3, #1
+; CHECK-NEXT: addvl x0, sp, #2
+; CHECK-NEXT: addvl x1, sp, #1
+; CHECK-NEXT: mov x2, sp
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: mov x0, x9
-; CHECK-NEXT: mov x1, x10
-; CHECK-NEXT: mov x2, x11
-; CHECK-NEXT: mov x3, x8
; CHECK-NEXT: bl foo
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ptrue p0.b
@@ -400,8 +396,37 @@ entry:
ret i8 %vecext
}
+define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 {
+; CHECK-LABEL: call_to_non_streaming_pass_args:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: ret
+entry:
+ call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
+ ret void
+}
+
declare i64 @llvm.aarch64.sme.cntsb()
declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
+declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }
|
ping |
1 similar comment
ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for the delay in reviewing this patch, I've been catching up after getting back from some annual leave.
I think this patch makes sense. The issue seems to be caused by SelectionDAG, which adds the unnecessary implicit defs because of Glue (used for the smstart/smstop nodes), for which it makes some special assumptions.
Just an FYI that I've started looking into redesigning the smstart/smstop mechanism to simplify its design and also remove the need to use Glue in the DAG. That might make the code in this patch unnecessary in the future. But until we have something better, I'm happy moving forward with these changes.
// implicit-def's in the InstrEmitter. Here we remove them, allowing the | ||
// register allocator to pass call args in callee saved regs, without extra | ||
// copies to avoid these fake clobbers of actually-preserved GPRs. | ||
if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we do the same thing for MSRpstatePseudo
?
cacfde5
to
fd79f50
Compare
// copies to avoid these fake clobbers of actually-preserved GPRs. | ||
if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 || | ||
MI.getOpcode() == AArch64::MSRpstatePseudo) | ||
for (unsigned I = MI.getNumOperands() - 1; I > 0; --I) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor nit: Can this be an incrementing loop instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This appears to be the common pattern for removing operands. Starting from the end means marginally ness shuffling of not-removed operands, too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doh! Of course, you're right, not sure how I missed that when I made the suggestion :)
@@ -400,8 +396,37 @@ entry: | |||
ret i8 %vecext | |||
} | |||
|
|||
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add a similar test to sme-streaming-compatible-interface.ll
?
When we lower calls, the sequence of argument copy-to-reg nodes are glued to the smstart. In the InstrEmitter, these glued copies are turned into implicit defs, since the actual call instruction uses those physregs, resulting in the register allocator adding unnecessary copies of regs that are preserved anyway.
fd79f50
to
09eb08d
Compare
When we lower calls, the sequence of argument copy-to-reg nodes are glued to the smstart. In the InstrEmitter, these glued copies are turned into implicit defs, since the actual call instruction uses those physregs, resulting in the register allocator adding unnecessary copies of regs that are preserved anyway.