[ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. #79287

AlfieRichardsArm · 2024-01-24T13:16:35Z

This ensures the odd/even pseudo instructions are allocated to the same register range.

This fixes #71763

llvmbot · 2024-01-24T13:17:15Z

@llvm/pr-subscribers-backend-arm

Author: Alfie Richards (AlfieRichardsArm)

Changes

This ensures the odd/even pseudo instructions are allocated to the same register range.

This fixes #71763 (#71763)

Full diff: https://github.com/llvm/llvm-project/pull/79287.diff

4 Files Affected:

(modified) llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp (+3-6)
(modified) llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp (-5)
(modified) llvm/lib/Target/ARM/ARMInstrNEON.td (+26-12)
(modified) llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll (+1-1)

diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 2f9236bb977fc9..f0b69b0b09809f 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -640,12 +640,9 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
   // has an extra operand that is a use of the super-register.  Record the
   // operand index and skip over it.
   unsigned SrcOpIdx = 0;
-  if (!IsVLD2DUP) {
-    if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
-        RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
-        RegSpc == SingleHighTSpc)
-      SrcOpIdx = OpIdx++;
-  }
+  if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || RegSpc == SingleLowSpc ||
+      RegSpc == SingleHighQSpc || RegSpc == SingleHighTSpc)
+    SrcOpIdx = OpIdx++;
 
   // Copy the predicate operands.
   MIB.add(MI.getOperand(OpIdx++));
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index e99ee299412a5f..20dd3e7baf8498 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3032,11 +3032,6 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
   }
   if (is64BitVector || NumVecs == 1) {
     // Double registers and VLD1 quad registers are directly supported.
-  } else if (NumVecs == 2) {
-    const SDValue OpsA[] = {MemAddr, Align, Pred, Reg0, Chain};
-    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy,
-                                          MVT::Other, OpsA);
-    Chain = SDValue(VLdA, 1);
   } else {
     SDValue ImplDef = SDValue(
         CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index f31e1e9f97892f..e7cf8b4657b2c8 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -1491,12 +1491,26 @@ def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
 def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
                            addrmode6dupalign64>;
 
-def VLD2DUPq8EvenPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudo   : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudo  : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+// Duplicate of VLDQQPseudo but with a constraint variable
+// to ensure the odd and even lanes use the same register range 
+class VLDQQPseudoConstrained<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr, QQPR: $src), itin, 
+                "$src = $dst">;
+class VLDQQWBPseudoConstrained<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR: $src), itin,
+                "$addr.addr = $wb, $src = $dst">;
+class VLDQQWBfixedPseudoConstrained<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, QQPR: $src), itin,
+                "$addr.addr = $wb, $src = $dst">;
+
+def VLD2DUPq8EvenPseudo  : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo   : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo  : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo  : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
 
 // ...with address register writeback:
 multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1534,12 +1548,12 @@ defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
 defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
                                 addrmode6dupalign64>;
 
-def VLD2DUPq8OddPseudoWB_fixed     : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudoWB_register  : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_fixed     : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_fixed    : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_fixed    : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_register  : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
 
 //   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
 class VLD3DUP<bits<4> op7_4, string Dt>
diff --git a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
index cccbdd04357650..e49128f53b1157 100644
--- a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
+++ b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
@@ -488,7 +488,7 @@ entry:
 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(ptr %ptr) {
 ; CHECK-LABEL: test_vld2q_dup_bf16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vld2.16 {d16[], d18[]}, [r0]
+; CHECK-NEXT:    vld2.16 {d0[], d2[]}, [r0]
 ; CHECK-NEXT:    vld2.16 {d1[], d3[]}, [r0]
 ; CHECK-NEXT:    bx lr
 entry:

efriedma-quic · 2024-01-24T18:13:04Z

Why does the original code have special cases for vld2dup? Looking at https://reviews.llvm.org/D48439, it had some special-cases specifically for VLD2... do you think the special-cases were just working around the bug in the definition of the pseudo-instruction?

Can we come up with better names for TableGen classes, so it's more clear how they're different from each other? In particular, the important thing isn't the constraint itself; it's whether or not there's an input register.

AlfieRichardsArm · 2024-01-25T09:53:01Z

Caveat that I'm relatively new to LLVM work and this code base.

There are multiple reasons for the special cases IMO and none of them bugs (but maybe outdated design choices). The encoding for VLDQQPseudo is intended to be more general than for the Q/QQQQ Pseudos as it is used in more cases, but that leads to some extra complexity and not being able to use it in this case.

In this case I think that VLD2DUPq8OddPseudo and VLD2DUPq8EvenPseudo have this need for glue between them that doesn't fit into the abstractions in VLDQQPseudo.

I am happy to rename, however I don't follow your point about the input register. By my interpretation the "input" register is there solely to allow the constraint that Odd and Even use the same output register range?

davemgreen · 2024-01-25T15:23:29Z

Could you add the testcase from https://godbolt.org/z/hevqGKfqj, or ideally change the arm-vlddup.ll test (and maybe vlddup-update.ll too), to have a hard-float calling convention and use update_llc_test_checks to generate the check lines? I think that should show the same issue. Is it possible to test the other variants? They don't have even pairs?

AlfieRichardsArm · 2024-01-25T16:46:40Z

I have updated the tests as suggested

I checked and the variants like VLDQQWBfixedPseudoConstrained with VLD2DUPq32OddPseudoWB_fixed are tested in vlddup-update.ll if that was what you meant?

AlfieRichardsArm · 2024-01-25T16:48:25Z

I believe only the odd variants have the WB variant as it is constraining the input register which is the same for both, so only one constraint is needed.

efriedma-quic · 2024-01-25T16:55:46Z

By my interpretation the "input" register is there solely to allow the constraint that Odd and Even use the same output register range?

You have two pseudo-instructions: the first one writes to the low half of a pair of Q registers, the second writes to the high half of a pair of Q registers. So each half of the pair is an "insert element" operation, which takes two vectors as input, and writes into half of each of them. (The expansion ends up referring to D registers, not Q registers, but that's not relevant to how we treat them when they're still pseudos.)

If you think about it that way, each pseudo-instruction obviously has two inputs: the address, and the pair of vectors we're inserting into.

AlfieRichardsArm · 2024-01-25T17:16:12Z

@efriedma-quic Ah yes I think I see your point. My view was that $dst value in both was where the insertion is happening and the $src register is only there to facilitate for the Odd and Even instructions to use the same destination range.

But it is more natural to view it as VLDQQPseudoConstrained has $src input which then sets the $dst output?

So in this understanding a name such as VLDQQPseudoInputDST may be better?

efriedma-quic · 2024-01-25T20:32:16Z

That name seems okay.

This ensures the odd/even pseudo instructions are allocated to the same register range. This fixes llvm#71763. (llvm#71763)

AlfieRichardsArm · 2024-01-30T14:18:54Z

Note as far as I can tell failing tests are CI failures, not related to the patch

AlfieRichardsArm · 2024-01-31T13:51:59Z

Ping

davemgreen

I think this LGTM, thanks.

AlfieRichardsArm requested a review from ostannard January 24, 2024 13:16

llvmbot added the backend:ARM label Jan 24, 2024

AlfieRichardsArm mentioned this pull request Jan 24, 2024

[NEON] Wrong result of NEON intrinsic vld2q_dup_p16 with the -march=armv7-a flag in CLANG-15. #71763

Closed

AlfieRichardsArm requested a review from davemgreen January 24, 2024 13:46

AlfieRichardsArm added 3 commits January 26, 2024 09:08

[ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions.

9e23dfe

This ensures the odd/even pseudo instructions are allocated to the same register range. This fixes llvm#71763. (llvm#71763)

Update tests to use hard-float calling convention

4670f14

Changed names

ff0891a

AlfieRichardsArm force-pushed the SDCOMP-64591 branch from 8ea21d7 to ff0891a Compare January 26, 2024 09:22

davemgreen reviewed Jan 31, 2024

View reviewed changes

AlfieRichardsArm merged commit de75e50 into llvm:main Jan 31, 2024
3 of 4 checks passed

AlfieRichardsArm deleted the SDCOMP-64591 branch January 31, 2024 14:09

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. #79287

[ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. #79287

AlfieRichardsArm commented Jan 24, 2024 •

edited

llvmbot commented Jan 24, 2024

efriedma-quic commented Jan 24, 2024

AlfieRichardsArm commented Jan 25, 2024

davemgreen commented Jan 25, 2024

AlfieRichardsArm commented Jan 25, 2024

AlfieRichardsArm commented Jan 25, 2024

efriedma-quic commented Jan 25, 2024

AlfieRichardsArm commented Jan 25, 2024

efriedma-quic commented Jan 25, 2024

AlfieRichardsArm commented Jan 30, 2024

AlfieRichardsArm commented Jan 31, 2024

davemgreen left a comment

[ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. #79287

[ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. #79287

Conversation

AlfieRichardsArm commented Jan 24, 2024 • edited

llvmbot commented Jan 24, 2024

efriedma-quic commented Jan 24, 2024

AlfieRichardsArm commented Jan 25, 2024

davemgreen commented Jan 25, 2024

AlfieRichardsArm commented Jan 25, 2024

AlfieRichardsArm commented Jan 25, 2024

efriedma-quic commented Jan 25, 2024

AlfieRichardsArm commented Jan 25, 2024

efriedma-quic commented Jan 25, 2024

AlfieRichardsArm commented Jan 30, 2024

AlfieRichardsArm commented Jan 31, 2024

davemgreen left a comment

Choose a reason for hiding this comment

AlfieRichardsArm commented Jan 24, 2024 •

edited