[ARM] Change VDUP type to i32 for MVE
The MVE VDUP instruction takes a GPR and splats it into every lane of a
vector register. Unlike NEON, we do not have a VDUPLANE-equivalent
instruction that can do the same splat from an FP register. Previously a
VDUP to a v4f32/v8f16 would be represented as a (v4f32 VDUP f32), which
meant the instruction pattern needed to add a COPY_TO_REGCLASS to move
the value into the GPR.
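
As an illustrative sketch (assumed for exposition, using the arm_mve.h
intrinsics rather than anything taken from the commit's own testcases),
a splat of the affected kind looks like this at the source level:

#include <arm_mve.h>

// Splat a scalar float across all four lanes of a v4f32. Before this
// change the ARMvdup node produced for the splat kept its f32 operand,
// so selecting MVE_VDUP32 had to insert a COPY_TO_REGCLASS from the
// S register into a GPR.
float32x4_t splat_f32(float x) {
  return vdupq_n_f32(x);
}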

Instead, this now performs the conversion earlier, during an ISel DAG
combine, turning (VDUP x) into (VDUP (bitcast x)). This allows instruction
selection to tell that the input needs to be an i32, which in one of the
testcases lets it use ldr (or specifically ldm) instead of (vldr;vmov).
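
As a further hedged sketch (again an assumed example, not one of the
commit's testcases), the win shows up when the splatted scalar comes
from memory:

#include <arm_mve.h>

// With (VDUP (bitcast (load))) the load can be selected as an integer
// ldr (or folded into an ldm) straight into a GPR, instead of a vldr
// into an S register followed by a vmov into a GPR.
float32x4_t splat_from_mem(const float *p) {
  return vdupq_n_f32(*p);
}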

Whilst this is simple enough for floats, as the type sizes are the same,
there is no BITCAST equivalent for getting a half into an i32. This
instead uses a VMOVrh ARMISD node, which doesn't know the same tricks yet.
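
A hypothetical f16 counterpart (assumed example, same caveats as above):

#include <arm_mve.h>

// f16 and i32 have different sizes, so there is no f16-to-i32 BITCAST;
// the scalar half is instead moved into the low half of a GPR with an
// ARMISD::VMOVrh node before the VDUP.
float16x8_t splat_f16(float16_t x) {
  return vdupq_n_f16(x);
}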

Differential Revision: https://reviews.llvm.org/D76292
davemgreen committed Mar 20, 2020
1 parent 3c24aee commit b3499f5
Showing 14 changed files with 303 additions and 333 deletions.
12 changes: 12 additions & 0 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13705,6 +13705,18 @@ static SDValue PerformVDUPCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
SDValue Op = N->getOperand(0);
SDLoc dl(N);

if (Subtarget->hasMVEIntegerOps()) {
// Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
// need to come from a GPR.
if (Op.getValueType() == MVT::f32)
return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
else if (Op.getValueType() == MVT::f16)
return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
}

if (!Subtarget->hasNEON())
return SDValue();
126 changes: 62 additions & 64 deletions llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2107,10 +2107,10 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
(MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;

def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
(v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
(v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))),
(MVE_VDUP16 rGPR:$elem)>;
def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))),
(MVE_VDUP32 rGPR:$elem)>;

def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
(MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
@@ -2134,15 +2134,15 @@ let Predicates = [HasMVEInt] in {
(MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
(v4i32 MQPR:$inactive))>;
def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred),
(v4f32 (ARMvdup (f32 SPR:$elem))),
(v4f32 (ARMvdup (i32 rGPR:$elem))),
(v4f32 MQPR:$inactive))),
(MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR)),
ARMVCCThen, (v4i1 VCCR:$pred), (v4f32 MQPR:$inactive))>;
(MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
(v4f32 MQPR:$inactive))>;
def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred),
(v8f16 (ARMvdup (f16 HPR:$elem))),
(v8f16 (ARMvdup (i32 rGPR:$elem))),
(v8f16 MQPR:$inactive))),
(MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR)),
ARMVCCThen, (v8i1 VCCR:$pred), (v8f16 MQPR:$inactive))>;
(MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
(v8f16 MQPR:$inactive))>;
}


@@ -4024,12 +4024,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;

def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>;

def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
@@ -4038,12 +4038,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;

def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))),
(v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}

multiclass unpred_vcmpf_z<PatLeaf fc> {
@@ -4059,25 +4059,25 @@ multiclass unpred_vcmpf_z<PatLeaf fc> {
}

multiclass unpred_vcmpf_r<int fc> {
def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;

def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>;

def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;

def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))),
(v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}

let Predicates = [HasMVEInt] in {
@@ -4788,25 +4788,21 @@ multiclass MVE_vec_scalar_fp_pat_m<SDNode unpred_op, Intrinsic pred_int,
Instruction instr_f32> {
let Predicates = [HasMVEFloat] in {
// Unpredicated F16
def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)))),
(v8f16 (instr_f16 (v8f16 MQPR:$Qm),
(i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR))))>;
def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))),
(v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>;
// Unpredicated F32
def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)))),
(v4f32 (instr_f32 (v4f32 MQPR:$Qm),
(i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR))))>;
def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))),
(v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>;
// Predicated F16
def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)),
def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)),
(v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
(v8f16 (instr_f16 (v8f16 MQPR:$Qm),
(i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR)),
(v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val),
ARMVCCThen, (v8i1 VCCR:$mask),
(v8f16 MQPR:$inactive)))>;
// Predicated F32
def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)),
def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)),
(v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
(v4f32 (instr_f32 (v4f32 MQPR:$Qm),
(i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR)),
(v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val),
ARMVCCThen, (v4i1 VCCR:$mask),
(v4f32 MQPR:$inactive)))>;
}
@@ -5029,19 +5025,19 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>;
defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>;

let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
(v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
(v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
(v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
(v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
(v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
(v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;

def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
(v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
(v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
(v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
(v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
(v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
(v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;
}

class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
@@ -5223,19 +5219,21 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
defvar pred_int = int_arm_mve_fma_predicated;
defvar v1 = (VTI.Vec MQPR:$v1);
defvar v2 = (VTI.Vec MQPR:$v2);
defvar s = !if(VTI.Size{0}, (f16 HPR:$s), (f32 SPR:$s));
defvar vs = (VTI.Vec (ARMvdup s));
defvar is = (i32 (COPY_TO_REGCLASS s, rGPR));
defvar vs = (VTI.Vec (ARMvdup (i32 rGPR:$s)));
defvar is = (i32 rGPR:$s);
defvar pred = (VTI.Pred VCCR:$pred);

let Predicates = [HasMVEFloat] in {
if scalar_addend then {
def : Pat<(VTI.Vec (fma v1, v2, vs)), (VTI.Vec (Inst v1, v2, is))>;
def : Pat<(VTI.Vec (fma v1, v2, vs)),
(VTI.Vec (Inst v1, v2, is))>;
def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)),
(VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>;
} else {
def : Pat<(VTI.Vec (fma v1, vs, v2)), (VTI.Vec (Inst v2, v1, is))>;
def : Pat<(VTI.Vec (fma vs, v1, v2)), (VTI.Vec (Inst v2, v1, is))>;
def : Pat<(VTI.Vec (fma v1, vs, v2)),
(VTI.Vec (Inst v2, v1, is))>;
def : Pat<(VTI.Vec (fma vs, v1, v2)),
(VTI.Vec (Inst v2, v1, is))>;
def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)),
(VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)),
