Skip to content

Commit

Permalink
[ARM] Fold predicate_cast(load) into vldr p0
Browse files Browse the repository at this point in the history
This adds a simple tablegen pattern for folding predicate_cast(load)
into vldr p0, provided that the alignment and offset are correct.

Differential Revision: https://reviews.llvm.org/D86702
  • Loading branch information
davemgreen committed Sep 4, 2020
1 parent 0faf393 commit 294c0cc
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 44 deletions.
11 changes: 11 additions & 0 deletions llvm/lib/Target/ARM/ARMInstrMVE.td
Expand Up @@ -4382,6 +4382,10 @@ let Predicates = [HasMVEInt] in {
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;

// Pattern fragment that matches a `load` of $ptr only when the LoadSDNode
// reports an alignment of at least 4. Used by the predicate_cast(load)
// folding pattern so that only sufficiently aligned loads are turned into
// a predicate load.
def load_align4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 4;
}]>;

let Predicates = [HasMVEInt] in {
foreach VT = [ v4i1, v8i1, v16i1 ] in {
def : Pat<(i32 (predicate_cast (VT VCCR:$src))),
Expand All @@ -4394,6 +4398,13 @@ let Predicates = [HasMVEInt] in {
(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
}

// If we happen to be casting from a load we can convert that straight
// into a predicate load (VLDR_P0_off), so long as the load is an i32 load
// with an alignment of at least 4 (load_align4) and its address matches
// the taddrmode_imm7<2> addressing mode.
// NOTE(review): the tests in this commit show offsets such as #4, #508 and
// #-508 being folded into the vldr, while #6 and +/-512 are first added to
// the base register -- confirm against the taddrmode_imm7 definition.
foreach VT = [ v4i1, v8i1, v16i1 ] in {
def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))),
(VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>;
}

// Here we match the specific SDNode type 'ARMVectorRegCastImpl'
// rather than the more general 'ARMVectorRegCast' which would also
// match some bitconverts. If we use the latter in cases where the
Expand Down
68 changes: 24 additions & 44 deletions llvm/test/CodeGen/Thumb2/mve-pred-loadstore.ll
Expand Up @@ -386,18 +386,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_predcastzext(i16* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <4 x i32> @load_bc4(i32* %i, <4 x i32> %a) {
; CHECK-LE-LABEL: load_bc4:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldr r0, [r0]
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_bc4:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr r0, [r0]
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
Expand All @@ -410,19 +408,17 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4(i32* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <8 x i16> @load_predcast8(i32* %i, <8 x i16> %a) {
; CHECK-LE-LABEL: load_predcast8:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldr r0, [r0]
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_predcast8:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr r0, [r0]
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vrev32.16 q0, q0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.16 q0, q1
; CHECK-BE-NEXT: bx lr
Expand All @@ -435,19 +431,17 @@ define arm_aapcs_vfpcc <8 x i16> @load_predcast8(i32* %i, <8 x i16> %a) {
define arm_aapcs_vfpcc <16 x i8> @load_predcast16(i32* %i, <16 x i8> %a) {
; CHECK-LE-LABEL: load_predcast16:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldr r0, [r0]
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_predcast16:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr r0, [r0]
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vrev32.8 q0, q0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.8 q0, q1
; CHECK-BE-NEXT: bx lr
Expand Down Expand Up @@ -484,18 +478,18 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_align2(i32* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <4 x i32> @load_bc4_offset(i16* %i, <4 x i32> %a) {
; CHECK-LE-LABEL: load_bc4_offset:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldr.w r0, [r0, #6]
; CHECK-LE-NEXT: adds r0, #6
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_bc4_offset:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr.w r0, [r0, #6]
; CHECK-BE-NEXT: adds r0, #6
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
Expand All @@ -510,18 +504,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_offset(i16* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <4 x i32> @load_bc4_range4(i32* %i, <4 x i32> %a) {
; CHECK-LE-LABEL: load_bc4_range4:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldr r0, [r0, #4]
; CHECK-LE-NEXT: vldr p0, [r0, #4]
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_bc4_range4:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr r0, [r0, #4]
; CHECK-BE-NEXT: vldr p0, [r0, #4]
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
Expand All @@ -535,18 +527,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range4(i32* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <4 x i32> @load_bc4_range(i32* %i, <4 x i32> %a) {
; CHECK-LE-LABEL: load_bc4_range:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldr.w r0, [r0, #508]
; CHECK-LE-NEXT: vldr p0, [r0, #508]
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_bc4_range:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr.w r0, [r0, #508]
; CHECK-BE-NEXT: vldr p0, [r0, #508]
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
Expand All @@ -560,22 +550,16 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range(i32* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <4 x i32> @load_bc4_range2(i32* %i, <4 x i32> %a) {
; CHECK-LE-LABEL: load_bc4_range2:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: movw r1, #65028
; CHECK-LE-NEXT: vldr p0, [r0, #-508]
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: movt r1, #65535
; CHECK-LE-NEXT: ldr r0, [r0, r1]
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_bc4_range2:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: movw r1, #65028
; CHECK-BE-NEXT: vldr p0, [r0, #-508]
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: movt r1, #65535
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: ldr r0, [r0, r1]
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
Expand All @@ -589,18 +573,18 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range2(i32* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <4 x i32> @load_bc4_range3(i32* %i, <4 x i32> %a) {
; CHECK-LE-LABEL: load_bc4_range3:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldr.w r0, [r0, #512]
; CHECK-LE-NEXT: add.w r0, r0, #512
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_bc4_range3:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: ldr.w r0, [r0, #512]
; CHECK-BE-NEXT: add.w r0, r0, #512
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
Expand All @@ -614,22 +598,18 @@ define arm_aapcs_vfpcc <4 x i32> @load_bc4_range3(i32* %i, <4 x i32> %a) {
define arm_aapcs_vfpcc <4 x i32> @load_bc4_range5(i32* %i, <4 x i32> %a) {
; CHECK-LE-LABEL: load_bc4_range5:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: movw r1, #65024
; CHECK-LE-NEXT: sub.w r0, r0, #512
; CHECK-LE-NEXT: vmov.i32 q1, #0x0
; CHECK-LE-NEXT: movt r1, #65535
; CHECK-LE-NEXT: ldr r0, [r0, r1]
; CHECK-LE-NEXT: vmsr p0, r0
; CHECK-LE-NEXT: vldr p0, [r0]
; CHECK-LE-NEXT: vpsel q0, q0, q1
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: load_bc4_range5:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: movw r1, #65024
; CHECK-BE-NEXT: sub.w r0, r0, #512
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: movt r1, #65535
; CHECK-BE-NEXT: vldr p0, [r0]
; CHECK-BE-NEXT: vmov.i32 q0, #0x0
; CHECK-BE-NEXT: ldr r0, [r0, r1]
; CHECK-BE-NEXT: vmsr p0, r0
; CHECK-BE-NEXT: vpsel q1, q1, q0
; CHECK-BE-NEXT: vrev64.32 q0, q1
; CHECK-BE-NEXT: bx lr
Expand Down

0 comments on commit 294c0cc

Please sign in to comment.