[ARM] Convert VDUPLANE to VDUP under MVE

Unlike Neon, MVE does not have a way of duplicating from a vector lane,
so a VDUPLANE currently selects to a VDUP(move_from_lane(..)). This
patch forces that conversion to be done earlier, as a dag combine, so
that other folds can happen.
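
In SelectionDAG terms the combine performs roughly this rewrite (an
illustrative sketch in DAG-dump notation; the node numbers and the
v4i32 type are invented for the example):

  t1: v4i32 = ARMISD::VDUPLANE t0, Constant:i32<3>
    -->
  t2: i32 = extract_vector_elt t0, Constant:i32<3>
  t3: v4i32 = ARMISD::VDUP t2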

The combine converts the VDUPLANE to a VDUP(EXTRACT). On FP16 this is
then folded to a VGETLANEu to prevent it from creating a vmovx;vmovrh
pair, using a single move_from_lane instead.
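
The effect is visible in the mve-vdup.ll changes below: splatting lane 3
of a v4i32 previously went through a q-register lane move and now uses a
plain S-register move:

  vmov.32 r0, q0[3]   @ before
  vdup.32 q0, r0

  vmov    r0, s3      @ after
  vdup.32 q0, r0

For f16 lanes, the new ARMvgetlaneu patterns should likewise select a
single vmov.u16 from the lane rather than a vmovx;vmovrh pair.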

Differential Revision: https://reviews.llvm.org/D79606
davemgreen committed May 9, 2020
1 parent 0e49ac7 commit 6eee2d9
Showing 10 changed files with 415 additions and 497 deletions.
24 changes: 21 additions & 3 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13090,6 +13090,12 @@ static SDValue PerformVMOVrhCombine(SDNode *N,
     return Load;
   }
 
+  // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
+  if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      isa<ConstantSDNode>(N0->getOperand(1)))
+    return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
+                           N0->getOperand(1));
+
   return SDValue();
 }

@@ -13840,8 +13846,21 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
 /// ARMISD::VDUPLANE.
 static SDValue PerformVDUPLANECombine(SDNode *N,
-                                      TargetLowering::DAGCombinerInfo &DCI) {
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const ARMSubtarget *Subtarget) {
   SDValue Op = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
+  if (Subtarget->hasMVEIntegerOps()) {
+    EVT ExtractVT = VT.getVectorElementType();
+    // We need to ensure we are creating a legal type.
+    if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
+      ExtractVT = MVT::i32;
+    SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
+                                      N->getOperand(0), N->getOperand(1));
+    return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
+  }
 
   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
@@ -13862,7 +13881,6 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
   unsigned EltBits;
   if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
     EltSize = 8;
-  EVT VT = N->getValueType(0);
   if (EltSize > VT.getScalarSizeInBits())
     return SDValue();

@@ -15343,7 +15361,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
-  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
   case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -264,7 +264,7 @@ def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
 def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
 def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
 
-def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>,
                                         SDTCisVT<2, i32>]>;
 def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
 def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
18 changes: 4 additions & 14 deletions llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1690,10 +1690,14 @@ let Predicates = [HasMVEInt] in {
             (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>;
   def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane),
             (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
+  def : Pat<(ARMvgetlanes (v8f16 MQPR:$src), imm:$lane),
+            (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
   def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane),
             (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>;
   def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane),
             (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
+  def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane),
+            (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
 
   def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
             (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
@@ -2227,25 +2231,11 @@ let Predicates = [HasMVEInt] in {
   def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))),
             (MVE_VDUP32 rGPR:$elem)>;
 
-  def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)),
-            (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
-  // For the 16-bit and 8-bit vduplanes we don't care about the signedness
-  // of the lane move operation as we only want the lowest 8/16 bits anyway.
-  def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)),
-            (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
-  def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
-            (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
-
   def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))),
             (MVE_VDUP16 rGPR:$elem)>;
   def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))),
             (MVE_VDUP32 rGPR:$elem)>;
 
-  def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
-            (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
-  def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)),
-            (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
-
   // Match a vselect with an ARMvdup as a predicated MVE_VDUP
   def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred),
                             (v16i8 (ARMvdup (i32 rGPR:$elem))),
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -243,9 +243,8 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vmov.32 r0, q0[1]
-; CHECK-NEXT:    vdup.32 q1, r0
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vadd.f32 q0, q0, r0
 ; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB1_4:
@@ -513,9 +512,8 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vmov.f32 s4, s2
 ; CHECK-NEXT:    vmov.f32 s5, s3
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vmov.32 r0, q0[1]
-; CHECK-NEXT:    vdup.32 q1, r0
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vadd.f32 q0, q0, r0
 ; CHECK-NEXT:    b .LBB2_23
 ; CHECK-NEXT:  .LBB2_22:
 ; CHECK-NEXT:    vldr s0, .LCPI2_0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll
@@ -148,7 +148,7 @@ define <4 x i32> @shuffle3_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    vcmp.i32 eq, q0, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vpsel q0, q1, q0
-; CHECK-NEXT:    vmov.32 r0, q0[0]
+; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vdup.32 q0, r0
 ; CHECK-NEXT:    add r0, sp, #16
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -162,7 +162,7 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vduplane_i32(<4 x i32> %src) {
 ; CHECK-LABEL: vduplane_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 r0, q0[3]
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vdup.32 q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -206,7 +206,7 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @vduplane_f32(<4 x float> %src) {
 ; CHECK-LABEL: vduplane_f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.32 r0, q0[3]
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vdup.32 q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
