[AArch64][SME] Prevent SVE object address calculations between smstop and call

This patch introduces a new AArch64 ISD node (OBSCURE_COPY) that can
be used when we want to prevent SVE object address calculations
from being rematerialised between an smstop/smstart and a call.
At the moment we use COPY to copy the frame index to a register,
which leads to problems because the "simple register coalescing"
pass understands the COPY instruction and attempts to rematerialise
the address calculation with 'addvl' between an smstop and a call.
When in streaming mode, the 'addvl' instruction may behave
differently because the streaming SVE vector length is not guaranteed
to equal the normal SVE vector length.
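
For illustration, the failure mode looks roughly like this (a hypothetical
sketch with made-up register choices, not actual compiler output). The stack
slot was laid out using the streaming vector length, so recomputing its
address after the smstop can yield a different address:

  smstop sm
  addvl x0, sp, #2    // rematerialised after the mode switch: computed
                      // with the non-streaming VL, so possibly wrong
  bl    foo

The intended sequence computes the address once, while still in streaming
mode, and uses only vector-length-independent moves afterwards:

  addvl x9, sp, #2    // correct VL: still in streaming mode
  smstop sm
  mov   x0, x9        // plain GPR move, unaffected by the VL change
  bl    foo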

The new ISD opcode OBSCURE_COPY gets lowered to a new pseudo
instruction, also called OBSCURE_COPY. This ensures the copy cannot be
rematerialised, and we expand it into a simple register move very late
in the machine instruction pipeline.
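
Schematically (pseudo-instruction notation is illustrative; OBSCURE_COPY
never survives into the final assembly), the two stages look like:

  after isel:      OBSCURE_COPY x0, x9   // target pseudo, opaque to the coalescer
  after expansion: orr x0, xzr, x9       // the canonical 'mov x0, x9', or no
                                         // instruction at all if the source and
                                         // destination registers match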

A new test is added here:

CodeGen/AArch64/sme-streaming-interface.ll

Differential Revision: https://reviews.llvm.org/D134940
david-arm committed Oct 5, 2022
1 parent e50e19a commit 991a36d
Showing 5 changed files with 76 additions and 0 deletions.
11 changes: 11 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1377,6 +1377,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
    NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
    return true;
  }
  case AArch64::OBSCURE_COPY: {
    // ORR Xd, XZR, Xs, LSL #0 is the canonical encoding of MOV Xd, Xs;
    // a self-copy needs no instruction at all.
    if (MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) {
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs))
          .add(MI.getOperand(0))
          .addReg(AArch64::XZR)
          .add(MI.getOperand(1))
          .addImm(0);
    }
    MI.eraseFromParent();
    return true;
  }
  }
  return false;
}
6 changes: 6 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2065,6 +2065,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AArch64ISD::NodeType)Opcode) {
  case AArch64ISD::FIRST_NUMBER:
    break;
    MAKE_CASE(AArch64ISD::OBSCURE_COPY)
    MAKE_CASE(AArch64ISD::SMSTART)
    MAKE_CASE(AArch64ISD::SMSTOP)
    MAKE_CASE(AArch64ISD::CALL)
@@ -7036,6 +7037,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
        return ArgReg.Reg == VA.getLocReg();
      });
    } else {
      // Add an extra level of indirection for streaming mode changes by
      // using a pseudo copy node that cannot be rematerialised between an
      // smstart/smstop and the call by the simple register coalescer.
      if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
        Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);
      RegsToPass.emplace_back(VA.getLocReg(), Arg);
      RegsUsed.insert(VA.getLocReg());
      const TargetOptions &Options = DAG.getTarget().Options;
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,6 +58,13 @@ enum NodeType : unsigned {

  CALL_BTI, // Function call followed by a BTI instruction.

  // Essentially like a normal COPY that works on GPRs, but cannot be
  // rematerialised by passes like the simple register coalescer. It's
  // required for SME when lowering calls because we cannot allow frame
  // index calculations using addvl to slip in between the smstart/smstop
  // and the bl instruction. The scalable vector length may change across
  // the smstart/smstop boundary.
  OBSCURE_COPY,
  SMSTART,
  SMSTOP,
6 changes: 6 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -19,6 +19,8 @@ def AArch64_smstop : SDNode<"AArch64ISD::SMSTOP", SDTypeProfile<0, 3,
                             [SDNPHasChain, SDNPSideEffect, SDNPVariadic,
                              SDNPOptInGlue, SDNPOutGlue]>;

def AArch64ObscureCopy : SDNode<"AArch64ISD::OBSCURE_COPY", SDTypeProfile<1, 1, []>, []>;

//===----------------------------------------------------------------------===//
// Add vector elements horizontally or vertically to ZA tile.
//===----------------------------------------------------------------------===//
@@ -202,6 +204,10 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
          (MRS 0xde85)>;

def OBSCURE_COPY : Pseudo<(outs GPR64:$dst), (ins GPR64:$idx), []>, Sched<[]> { }
def : Pat<(i64 (AArch64ObscureCopy (i64 GPR64:$idx))),
          (OBSCURE_COPY GPR64:$idx)>;
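
A note on the wiring above: AArch64ObscureCopy is declared with no SDNode
properties, the Pseudo carries an empty pattern list, and the separate Pat
maps the ISD node onto the pseudo during instruction selection. Because the
selected instruction is a target pseudo rather than a generic COPY, the
register coalescer cannot recognise it as a copy and therefore never
rematerialises its source.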

//===----------------------------------------------------------------------===//
// SVE2 instructions
//===----------------------------------------------------------------------===//
46 changes: 46 additions & 0 deletions llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -360,4 +360,50 @@ define void @disable_tailcallopt() nounwind {
  ret void;
}

define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #1 {
; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: addvl x9, sp, #2
; CHECK-NEXT: addvl x10, sp, #1
; CHECK-NEXT: mov x11, sp
; CHECK-NEXT: smstop sm
; CHECK-NEXT: mov x0, x9
; CHECK-NEXT: mov x1, x10
; CHECK-NEXT: mov x2, x11
; CHECK-NEXT: mov x3, x8
; CHECK-NEXT: bl foo
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
  %Data1 = alloca <vscale x 16 x i8>, align 16
  %Data2 = alloca <vscale x 16 x i8>, align 16
  %Data3 = alloca <vscale x 16 x i8>, align 16
  %0 = tail call i64 @llvm.aarch64.sme.cntsb()
  call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0)
  %1 = load <vscale x 16 x i8>, ptr %Data1, align 16
  %vecext = extractelement <vscale x 16 x i8> %1, i64 0
  ret i8 %vecext
}

declare i64 @llvm.aarch64.sme.cntsb()

declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)

attributes #0 = { nounwind "target-features"="+sve" }
attributes #1 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }
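
The key property in the CHECK lines above is that all three SVE object
addresses are computed with 'addvl' while streaming mode is still active,
and only vector-length-independent moves sit between the mode switch and
the call (condensed from the CHECK lines; rdsvl and stack setup elided):

  addvl x9, sp, #2
  addvl x10, sp, #1
  mov x11, sp
  smstop sm
  mov x0, x9
  mov x1, x10
  mov x2, x11
  mov x3, x8
  bl foo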
