Skip to content

Commit

Permalink
[ARM] Guard VMOVH and VINS patterns.
Browse files Browse the repository at this point in the history
These instructions are only available when FP is available, so they cannot be
used with just +mve. Add predicates to ensure we fall back under the
right circumstances.
  • Loading branch information
davemgreen committed Jul 17, 2022
1 parent decf385 commit cb806ce
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 91 deletions.
4 changes: 2 additions & 2 deletions llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
Expand Up @@ -3131,7 +3131,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {

// Else v8i16 pattern of an extract and an insert, with an optional vmovx for
// extracting odd lanes.
if (VT == MVT::v8i16) {
if (VT == MVT::v8i16 && Subtarget->hasFullFP16()) {
SDValue Inp1 = CurDAG->getTargetExtractSubreg(
ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0));
SDValue Inp2 = CurDAG->getTargetExtractSubreg(
Expand All @@ -3151,7 +3151,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {

// The inserted values are not extracted - if they are f16 then insert them
// directly using a VINS.
if (VT == MVT::v8f16) {
if (VT == MVT::v8f16 && Subtarget->hasFullFP16()) {
SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1);
SDValue NewIns =
CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/ARM/ARMInstrMVE.td
Expand Up @@ -1882,13 +1882,15 @@ let Predicates = [HasMVEInt] in {
// Unsigned lane read (ARMvgetlaneu) from a v8f16 vector: select
// MVE_VMOV_from_lane_u16 on the same source register and lane immediate.
def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
// For i16 inserts whose value is extracted from low lanes, we may use VINS.
// VINSH is an FP instruction and is not available with +mve alone, so this
// pattern is only enabled when HasFullFP16 holds.
// NOTE(review): this inner `let Predicates` presumably replaces, rather than
// extends, the enclosing [HasMVEInt] list - confirm that is intended.
let Predicates = [HasFullFP16] in {
// Matches an insert into an odd i16 lane of $src1 of a value read
// (ARMvgetlaneu) from an even lane of $src2. SSubReg_f16_reg picks the
// subregister of $src1 for the destination lane and of $src2 for the source
// lane; both are fed to VINSH and the result is written back into $src1 at
// the destination subregister via INSERT_SUBREG.
def : Pat<(ARMinsertelt (v8i16 MQPR:$src1),
(ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$extlane),
imm_odd:$inslane),
(COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
(VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$inslane)),
(EXTRACT_SUBREG MQPR:$src2, (SSubReg_f16_reg imm_even:$extlane))),
(SSubReg_f16_reg imm_odd:$inslane)), MQPR)>;
}

// scalar_to_vector of a GPR into v16i8: write the scalar into lane 0 of an
// otherwise undefined (IMPLICIT_DEF) vector using MVE_VMOV_to_lane_8.
def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
Expand All @@ -1905,17 +1907,21 @@ let Predicates = [HasMVEInt] in {

// Insert an f16 (HPR) value into an even lane of a v8f16 vector: copy the
// value into the rGPR register class and select MVE_VMOV_to_lane_16.
def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_even:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>;
// VINSH is an FP instruction and is not available with +mve alone, so the
// odd-lane f16 insert below is only enabled when HasFullFP16 holds.
// NOTE(review): this inner `let Predicates` presumably replaces, rather than
// extends, the enclosing [HasMVEInt] list - confirm that is intended.
let Predicates = [HasFullFP16] in {
// Insert an f16 (HPR) value into an odd lane of a v8f16 vector: take the
// subregister of $src1 chosen by SSubReg_f16_reg for that lane, VINSH the
// value (copied to SPR) into it, and write the result back into $src1 at the
// same subregister via INSERT_SUBREG.
def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_odd:$lane),
(COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
(VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$lane)),
(COPY_TO_REGCLASS HPR:$src2, SPR)),
(SSubReg_f16_reg imm_odd:$lane)), MQPR)>;
}
// Extract an f16 from an even lane of a v8f16 vector: no instruction is
// selected, only an EXTRACT_SUBREG of the subregister that SSubReg_f16_reg
// maps the lane to.
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
// VMOVH is an FP instruction and is not available with +mve alone, so the
// odd-lane f16 extract below is only enabled when HasFullFP16 holds.
// NOTE(review): this inner `let Predicates` presumably replaces, rather than
// extends, the enclosing [HasMVEInt] list - confirm that is intended.
let Predicates = [HasFullFP16] in {
// Extract an f16 from an odd lane of a v8f16 vector: apply VMOVH to the
// subregister that SSubReg_f16_reg maps the lane to, then place the result
// in the HPR register class.
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
(VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
}

// scalar_to_vector of an f64 (DPR) into v2f64: insert the value at dsub_0 of
// an otherwise undefined (IMPLICIT_DEF) vector; no instruction is selected.
def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
Expand Down
200 changes: 136 additions & 64 deletions llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECKBE
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVEFP
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECKBE

define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vmovn32_trunc1:
Expand Down Expand Up @@ -60,33 +61,52 @@ entry:
}

define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1_viabitcast(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vmovn32_trunc1_viabitcast:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov.f32 s8, s2
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vmov.f32 s9, s6
; CHECK-NEXT: vmov.f32 s10, s3
; CHECK-NEXT: vmov.f32 s11, s7
; CHECK-NEXT: vstrh.32 q2, [r0, #8]
; CHECK-NEXT: vmov.f32 s8, s0
; CHECK-NEXT: vmov.f32 s9, s4
; CHECK-NEXT: vmov.f32 s10, s1
; CHECK-NEXT: vmov.f32 s11, s5
; CHECK-NEXT: vstrh.32 q2, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
; CHECK-MVE-LABEL: vmovn32_trunc1_viabitcast:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: .pad #16
; CHECK-MVE-NEXT: sub sp, #16
; CHECK-MVE-NEXT: vmov.f32 s8, s2
; CHECK-MVE-NEXT: mov r0, sp
; CHECK-MVE-NEXT: vmov.f32 s9, s6
; CHECK-MVE-NEXT: vmov.f32 s10, s3
; CHECK-MVE-NEXT: vmov.f32 s11, s7
; CHECK-MVE-NEXT: vstrh.32 q2, [r0, #8]
; CHECK-MVE-NEXT: vmov.f32 s8, s0
; CHECK-MVE-NEXT: vmov.f32 s9, s4
; CHECK-MVE-NEXT: vmov.f32 s10, s1
; CHECK-MVE-NEXT: vmov.f32 s11, s5
; CHECK-MVE-NEXT: vstrh.32 q2, [r0]
; CHECK-MVE-NEXT: vldrw.u32 q0, [r0]
; CHECK-MVE-NEXT: add sp, #16
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: vmovn32_trunc1_viabitcast:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: .pad #16
; CHECK-MVEFP-NEXT: sub sp, #16
; CHECK-MVEFP-NEXT: mov r0, sp
; CHECK-MVEFP-NEXT: vmov.f32 s8, s2
; CHECK-MVEFP-NEXT: vmov.f32 s9, s6
; CHECK-MVEFP-NEXT: vmov.f32 s10, s3
; CHECK-MVEFP-NEXT: vmov.f32 s11, s7
; CHECK-MVEFP-NEXT: vstrh.32 q2, [r0, #8]
; CHECK-MVEFP-NEXT: vmov.f32 s8, s0
; CHECK-MVEFP-NEXT: vmov.f32 s9, s4
; CHECK-MVEFP-NEXT: vmov.f32 s10, s1
; CHECK-MVEFP-NEXT: vmov.f32 s11, s5
; CHECK-MVEFP-NEXT: vstrh.32 q2, [r0]
; CHECK-MVEFP-NEXT: vldrw.u32 q0, [r0]
; CHECK-MVEFP-NEXT: add sp, #16
; CHECK-MVEFP-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn32_trunc1_viabitcast:
; CHECKBE: @ %bb.0: @ %entry
; CHECKBE-NEXT: .pad #16
; CHECKBE-NEXT: sub sp, #16
; CHECKBE-NEXT: vrev64.32 q2, q1
; CHECKBE-NEXT: vrev64.32 q1, q0
; CHECKBE-NEXT: vmov.f32 s0, s6
; CHECKBE-NEXT: mov r0, sp
; CHECKBE-NEXT: vmov.f32 s0, s6
; CHECKBE-NEXT: vmov.f32 s1, s10
; CHECKBE-NEXT: vmov.f32 s2, s7
; CHECKBE-NEXT: vmov.f32 s3, s11
Expand Down Expand Up @@ -497,29 +517,50 @@ entry:
}

define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vmovn16_b2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovx.f16 s5, s5
; CHECK-NEXT: vmovx.f16 s4, s4
; CHECK-NEXT: vmovx.f16 s6, s6
; CHECK-NEXT: vmovx.f16 s7, s7
; CHECK-NEXT: vins.f16 s5, s1
; CHECK-NEXT: vins.f16 s4, s0
; CHECK-NEXT: vins.f16 s6, s2
; CHECK-NEXT: vins.f16 s7, s3
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
; CHECK-MVE-LABEL: vmovn16_b2:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov q2, q0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[1]
; CHECK-MVE-NEXT: vmov.16 q0[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[0]
; CHECK-MVE-NEXT: vmov.16 q0[1], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov.16 q0[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[2]
; CHECK-MVE-NEXT: vmov.16 q0[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov.16 q0[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[4]
; CHECK-MVE-NEXT: vmov.16 q0[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov.16 q0[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[6]
; CHECK-MVE-NEXT: vmov.16 q0[7], r0
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: vmovn16_b2:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vmovx.f16 s4, s4
; CHECK-MVEFP-NEXT: vmovx.f16 s5, s5
; CHECK-MVEFP-NEXT: vmovx.f16 s6, s6
; CHECK-MVEFP-NEXT: vmovx.f16 s7, s7
; CHECK-MVEFP-NEXT: vins.f16 s4, s0
; CHECK-MVEFP-NEXT: vins.f16 s5, s1
; CHECK-MVEFP-NEXT: vins.f16 s6, s2
; CHECK-MVEFP-NEXT: vins.f16 s7, s3
; CHECK-MVEFP-NEXT: vmov q0, q1
; CHECK-MVEFP-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn16_b2:
; CHECKBE: @ %bb.0: @ %entry
; CHECKBE-NEXT: vrev64.16 q2, q0
; CHECKBE-NEXT: vrev64.16 q0, q1
; CHECKBE-NEXT: vmovx.f16 s5, s1
; CHECKBE-NEXT: vmovx.f16 s4, s0
; CHECKBE-NEXT: vmovx.f16 s5, s1
; CHECKBE-NEXT: vmovx.f16 s6, s2
; CHECKBE-NEXT: vmovx.f16 s7, s3
; CHECKBE-NEXT: vins.f16 s5, s9
; CHECKBE-NEXT: vins.f16 s4, s8
; CHECKBE-NEXT: vins.f16 s5, s9
; CHECKBE-NEXT: vins.f16 s6, s10
; CHECKBE-NEXT: vins.f16 s7, s11
; CHECKBE-NEXT: vrev64.16 q0, q1
Expand All @@ -530,28 +571,49 @@ entry:
}

define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vmovn16_b3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovx.f16 s1, s1
; CHECK-NEXT: vmovx.f16 s0, s0
; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmovx.f16 s3, s3
; CHECK-NEXT: vins.f16 s1, s5
; CHECK-NEXT: vins.f16 s0, s4
; CHECK-NEXT: vins.f16 s2, s6
; CHECK-NEXT: vins.f16 s3, s7
; CHECK-NEXT: bx lr
; CHECK-MVE-LABEL: vmovn16_b3:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[1]
; CHECK-MVE-NEXT: vmov q2, q0
; CHECK-MVE-NEXT: vmov.16 q0[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[0]
; CHECK-MVE-NEXT: vmov.16 q0[1], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[3]
; CHECK-MVE-NEXT: vmov.16 q0[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[2]
; CHECK-MVE-NEXT: vmov.16 q0[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[5]
; CHECK-MVE-NEXT: vmov.16 q0[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vmov.16 q0[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[7]
; CHECK-MVE-NEXT: vmov.16 q0[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[6]
; CHECK-MVE-NEXT: vmov.16 q0[7], r0
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: vmovn16_b3:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vmovx.f16 s0, s0
; CHECK-MVEFP-NEXT: vmovx.f16 s1, s1
; CHECK-MVEFP-NEXT: vmovx.f16 s2, s2
; CHECK-MVEFP-NEXT: vmovx.f16 s3, s3
; CHECK-MVEFP-NEXT: vins.f16 s0, s4
; CHECK-MVEFP-NEXT: vins.f16 s1, s5
; CHECK-MVEFP-NEXT: vins.f16 s2, s6
; CHECK-MVEFP-NEXT: vins.f16 s3, s7
; CHECK-MVEFP-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn16_b3:
; CHECKBE: @ %bb.0: @ %entry
; CHECKBE-NEXT: vrev64.16 q2, q1
; CHECKBE-NEXT: vrev64.16 q1, q0
; CHECKBE-NEXT: vmovx.f16 s5, s5
; CHECKBE-NEXT: vmovx.f16 s4, s4
; CHECKBE-NEXT: vmovx.f16 s5, s5
; CHECKBE-NEXT: vmovx.f16 s6, s6
; CHECKBE-NEXT: vmovx.f16 s7, s7
; CHECKBE-NEXT: vins.f16 s5, s9
; CHECKBE-NEXT: vins.f16 s4, s8
; CHECKBE-NEXT: vins.f16 s5, s9
; CHECKBE-NEXT: vins.f16 s6, s10
; CHECKBE-NEXT: vins.f16 s7, s11
; CHECKBE-NEXT: vrev64.16 q0, q1
Expand Down Expand Up @@ -922,27 +984,37 @@ entry:
}

define arm_aapcs_vfpcc <8 x i16> @vmovn32_badlanes(<4 x i32> %src1) {
; CHECK-LABEL: vmovn32_badlanes:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov.16 q1[3], r1
; CHECK-NEXT: vmov.16 q1[5], r1
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
; CHECK-MVE-LABEL: vmovn32_badlanes:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov r0, r1, d0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r1
; CHECK-MVE-NEXT: vmov.16 q1[5], r1
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1
; CHECK-MVE-NEXT: bx lr
;
; CHECK-MVEFP-LABEL: vmovn32_badlanes:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vmov r1, r2, d0
; CHECK-MVEFP-NEXT: vmov r0, s2
; CHECK-MVEFP-NEXT: vmov.16 q0[1], r1
; CHECK-MVEFP-NEXT: vmov.16 q0[3], r2
; CHECK-MVEFP-NEXT: vmov.16 q0[5], r2
; CHECK-MVEFP-NEXT: vmov.16 q0[7], r0
; CHECK-MVEFP-NEXT: bx lr
;
; CHECKBE-LABEL: vmovn32_badlanes:
; CHECKBE: @ %bb.0: @ %entry
; CHECKBE-NEXT: vrev64.32 q1, q0
; CHECKBE-NEXT: vmov r0, r1, d2
; CHECKBE-NEXT: vmov.16 q2[1], r0
; CHECKBE-NEXT: vmov r0, s6
; CHECKBE-NEXT: vmov.16 q2[3], r1
; CHECKBE-NEXT: vmov.16 q2[5], r1
; CHECKBE-NEXT: vmov.16 q2[7], r0
; CHECKBE-NEXT: vrev64.16 q0, q2
; CHECKBE-NEXT: vmov r2, s6
; CHECKBE-NEXT: vmov.16 q1[1], r0
; CHECKBE-NEXT: vmov.16 q1[3], r1
; CHECKBE-NEXT: vmov.16 q1[5], r1
; CHECKBE-NEXT: vmov.16 q1[7], r2
; CHECKBE-NEXT: vrev64.16 q0, q1
; CHECKBE-NEXT: bx lr
entry:
%strided.vec = shufflevector <4 x i32> %src1, <4 x i32> undef, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 1, i32 7, i32 2>
Expand Down

0 comments on commit cb806ce

Please sign in to comment.