56 changes: 43 additions & 13 deletions llvm/test/CodeGen/Thumb2/mve-pred-not.ll
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,42 @@ declare <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>,
; Predicate sequence then/then/else/then, where the final "then" mask is
; produced by negating the already-negated compare result.
; Every ORR has %z for both inputs and shows up as a predicated vmov in the
; expected output. The expected output is not a single four-slot block: the
; compare result is spilled once, the first two moves share a VPSTT, and the
; remaining two each get their own reload + VPST (with a VPNOT before the
; third).
define arm_aapcs_vfpcc <4 x i32> @vpttet_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: vpttet_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vcmp.s32 ge, q0, q2
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %x, %z
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %ge, <4 x i32> %x)
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %ge, <4 x i32> %then1)
  ; First inversion: the "else" mask.
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %else3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %not.ge, <4 x i32> %then2)
  ; Second inversion: logically the original mask again.
  %not.not.ge = xor <4 x i1> %not.ge, <i1 true, i1 true, i1 true, i1 true>
  %then4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %not.not.ge, <4 x i32> %else3)
  ret <4 x i32> %then4
}

define arm_aapcs_vfpcc <4 x i32> @vpttee_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: vpttee_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vpttet.s32 ge, q0, q2
; CHECK-NEXT: vpttee.s32 ge, q0, q2
; CHECK-NEXT: vmaxt.s32 q3, q0, q1
; CHECK-NEXT: vcmpt.s32 gt, q0, zr
; CHECK-NEXT: vcmpe.s32 gt, q1, zr
; CHECK-NEXT: vmovt q3, q2
; CHECK-NEXT: vmove q3, q2
; CHECK-NEXT: vmove q3, q2
; CHECK-NEXT: vmov q0, q3
; CHECK-NEXT: bx lr
entry:
Expand All @@ -419,20 +449,19 @@ entry:
%2 = icmp sgt <4 x i32> %x, zeroinitializer
%3 = and <4 x i1> %0, %2
%4 = xor <4 x i1> %3, <i1 true, i1 true, i1 true, i1 true>
%5 = icmp sgt <4 x i32> %y, zeroinitializer
%6 = and <4 x i1> %5, %4
%7 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %6, <4 x i32> %1)
ret <4 x i32> %7
%5 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %4, <4 x i32> %1)
%6 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %4, <4 x i32> %5)
ret <4 x i32> %6
}

define arm_aapcs_vfpcc <4 x i32> @vpttee_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: vpttee_v4i1:
define arm_aapcs_vfpcc <4 x i32> @vpttee2_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: vpttee2_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vpttee.s32 ge, q0, q2
; CHECK-NEXT: vmaxt.s32 q3, q0, q1
; CHECK-NEXT: vcmpt.s32 gt, q0, zr
; CHECK-NEXT: vmove q3, q2
; CHECK-NEXT: vcmpe.s32 gt, q1, zr
; CHECK-NEXT: vmove q3, q2
; CHECK-NEXT: vmov q0, q3
; CHECK-NEXT: bx lr
Expand All @@ -442,9 +471,10 @@ entry:
%2 = icmp sgt <4 x i32> %x, zeroinitializer
%3 = and <4 x i1> %0, %2
%4 = xor <4 x i1> %3, <i1 true, i1 true, i1 true, i1 true>
%5 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %4, <4 x i32> %1)
%6 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %4, <4 x i32> %5)
ret <4 x i32> %6
%5 = icmp sgt <4 x i32> %y, zeroinitializer
%6 = and <4 x i1> %5, %4
%7 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %z, <4 x i32> %z, <4 x i1> %6, <4 x i32> %1)
ret <4 x i32> %7
}

define arm_aapcs_vfpcc <4 x i32> @vpttte_v4i1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
Expand Down
35 changes: 26 additions & 9 deletions llvm/test/CodeGen/Thumb2/mve-vpt-block-elses.mir
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ body: |
; CHECK: liveins: $q0, $q1, $q2
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit killed $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr {
; CHECK: MVE_VPTv4s32 5, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: MVE_VPTv4s32 7, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3
; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q0, $zr, 12, 1, internal killed renamable $vpr
; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal renamable $q3
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal renamable $q3
; CHECK: }
; CHECK: $q0 = MVE_VORR $q3, $q3, 0, $noreg, undef $q0
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
Expand All @@ -87,20 +87,20 @@ body: |
; CHECK: $q0 = MVE_VORR $q3, $q3, 0, $noreg, undef $q0
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr {
; CHECK: MVE_VPTv4s32 13, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: MVE_VPTv4s32 15, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal renamable $q3
; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal killed renamable $q3
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal killed renamable $q3
; CHECK: }
; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr {
; CHECK: MVE_VPTv4s32 9, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: MVE_VPTv4s32 15, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3
; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal renamable $vpr, internal renamable $q3
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal killed renamable $q3
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal renamable $q3
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal killed renamable $q3
; CHECK: }
; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
Expand All @@ -122,10 +122,10 @@ body: |
; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
; CHECK: BUNDLE implicit-def dead $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit $q1, implicit killed $q3, implicit $zr {
; CHECK: MVE_VPTv4s32 10, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: MVE_VPTv4s32 14, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: renamable $q3 = MVE_VMAXs32 renamable $q0, renamable $q1, 1, internal renamable $vpr, killed renamable $q3
; CHECK: renamable $vpr = MVE_VCMPs32r renamable $q1, $zr, 12, 2, internal killed renamable $vpr
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal killed renamable $vpr, internal killed renamable $q3
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal killed renamable $vpr, internal killed renamable $q3
; CHECK: }
; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
Expand All @@ -145,6 +145,14 @@ body: |
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal killed renamable $q3
; CHECK: }
; CHECK: $q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0
; CHECK: $q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
; CHECK: BUNDLE implicit-def $vpr, implicit-def $q3, implicit-def $d6, implicit-def $s12, implicit-def $s13, implicit-def $d7, implicit-def $s14, implicit-def $s15, implicit $q0, implicit $q2, implicit killed $q3 {
; CHECK: MVE_VPTv4s32 13, renamable $q0, renamable $q2, 10, implicit-def $vpr
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, internal renamable $vpr, killed renamable $q3
; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 2, internal killed renamable $vpr
; CHECK: renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 2, internal renamable $vpr, internal killed renamable $q3
; CHECK: renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 1, internal killed renamable $vpr
; CHECK: }
; CHECK: tBX_RET 14 /* CC::al */, $noreg, implicit $q0
renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
$q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
Expand Down Expand Up @@ -226,6 +234,15 @@ body: |
renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3
$q0 = MVE_VORR killed $q3, killed $q3, 0, $noreg, undef $q0
$q3 = MVE_VORR $q2, $q2, 0, $noreg, undef $q3
renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 10, 0, $noreg
renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3
renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 1, killed renamable $vpr
renamable $q3 = MVE_VORR renamable $q2, renamable $q2, 1, renamable $vpr, killed renamable $q3
renamable $vpr = MVE_VPNOT killed renamable $vpr, 0, $noreg
renamable $vpr = MVE_VCMPs32 renamable $q0, renamable $q2, 11, 1, killed renamable $vpr
tBX_RET 14, $noreg, implicit $q0
...
323 changes: 323 additions & 0 deletions llvm/test/CodeGen/Thumb2/mve-vpt-blocks.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=thumbv8.1m.main-arm-none-eabi --verify-machineinstrs -mattr=+mve.fp %s -o - | FileCheck %s

; Predicated MVE ORR intrinsic called by every test function in this file.
; Operands: two <4 x i32> inputs, a <4 x i1> lane predicate, and a further
; <4 x i32> value (presumably the result for inactive lanes -- confirm against
; the intrinsic definition).
declare <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)

; One predicated ORR guarded by `%a sge %c`.
; Expected output: a single VPT instruction followed by one "then" ORR.
define arm_aapcs_vfpcc <4 x i32> @vpt_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpt_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  ret <4 x i32> %then1
}

; Two predicated ORRs under the same `%a sge %c` mask. Unlike the other tests
; in this file, the second ORR takes the first result as its *first* input
; operand while its fourth operand stays %a.
; Expected output: q0 is copied to q3, then a two-slot VPTT block follows.
define arm_aapcs_vfpcc <4 x i32> @vptt_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vptt_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vptt.s32 ge, q0, q2
; CHECK-NEXT: vorrt q3, q1, q2
; CHECK-NEXT: vorrt q0, q3, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %then1, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  ret <4 x i32> %then2
}

; Three predicated ORRs sharing the `%a sge %c` mask; each call passes the
; previous result as its fourth operand.
; Expected output: one VPTTT block with three "then" ORRs.
define arm_aapcs_vfpcc <4 x i32> @vpttt_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpttt_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpttt.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then1)
  %then3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then2)
  ret <4 x i32> %then3
}

; Four predicated ORRs sharing the `%a sge %c` mask, chained through the
; fourth operand.
; Expected output: one VPTTTT block with four "then" ORRs.
define arm_aapcs_vfpcc <4 x i32> @vptttt_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vptttt_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vptttt.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then1)
  %then3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then2)
  %then4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then3)
  ret <4 x i32> %then4
}


; Predicate sequence then/else: one ORR under `%a sge %c`, then one under the
; inverted mask. The second ORR has %c for both inputs and appears as a
; predicated vmov in the expected output.
; Expected output: one VPTE block.
define arm_aapcs_vfpcc <4 x i32> @vpte_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpte_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpte.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vmove q0, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %else2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then1)
  ret <4 x i32> %else2
}

; Predicate sequence then/then/else. The inverted mask is computed up front,
; before any of the ORRs that use it.
; Expected output: one VPTTE block of three ORRs.
define arm_aapcs_vfpcc <4 x i32> @vptte_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vptte_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vptte.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorre q0, q1, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then1)
  %else3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then2)
  ret <4 x i32> %else3
}

; Predicate sequence then/else/else. The inverted mask is computed up front
; and used by the last two ORRs.
; Expected output: one VPTEE block of three ORRs.
define arm_aapcs_vfpcc <4 x i32> @vptee_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vptee_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vptee.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vorre q0, q1, q2
; CHECK-NEXT: vorre q0, q1, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %else2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then1)
  %else3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %else2)
  ret <4 x i32> %else3
}

; Predicate sequence then/else/then: the original mask is needed again after
; the inverted one has been used.
; The expected output does not fold this into one block. The compare result
; is spilled to a 4-byte stack slot, and each of the three operations gets its
; own VPST, with a reload (plus VPNOT for the "else") before the second and
; third.
define arm_aapcs_vfpcc <4 x i32> @vptet_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vptet_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vcmp.s32 ge, q0, q2
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpst
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %else2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then1)
  %then3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %else2)
  ret <4 x i32> %then3
}

; Predicate sequence then/then/else/then.
; The expected output keeps the first two operations together under a VPSTT
; but does not extend the block further: the spilled compare result is
; reloaded (and inverted with VPNOT) for the "else", then reloaded again for
; the final "then", each under its own VPST.
define arm_aapcs_vfpcc <4 x i32> @vpttet_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpttet_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vcmp.s32 ge, q0, q2
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then1)
  %else3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then2)
  %then4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %else3)
  ret <4 x i32> %then4
}

; Predicate sequence then/else/then/then.
; The expected output issues the first "then" and the "else" under separate
; VPST instructions (reload + VPNOT before the "else"), then reloads the
; spilled compare result and groups the last two operations under one VPSTT.
define arm_aapcs_vfpcc <4 x i32> @vptett_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vptett_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vcmp.s32 ge, q0, q2
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpst
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpstt
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %else2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then1)
  %then3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %else2)
  %then4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then3)
  ret <4 x i32> %then4
}

; Predicate sequence then/else/else/then.
; The expected output uses 8 bytes of stack holding two predicate slots: the
; compare result at [sp] and its VPNOT'd form at [sp, #4]. Each of the four
; operations is issued under its own VPST after the matching slot is loaded.
define arm_aapcs_vfpcc <4 x i32> @vpteet_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpteet_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vcmp.s32 ge, q0, q2
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpst
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpnot
; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %else2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then1)
  %else3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %else2)
  %then4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %else3)
  ret <4 x i32> %then4
}

; Predicate sequence then/else/else/else: one ORR under `%a sge %c`, then
; three under the inverted mask (each with %c for both inputs, appearing as a
; predicated vmov in the expected output).
; Expected output: one VPTEEE block.
define arm_aapcs_vfpcc <4 x i32> @vpteee_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpteee_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpteee.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vmove q0, q2
; CHECK-NEXT: vmove q0, q2
; CHECK-NEXT: vmove q0, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %else2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then1)
  %else3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %else2)
  %else4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %else3)
  ret <4 x i32> %else4
}

; Predicate sequence then/else/then/else, alternating between the compare
; result and its inverse.
; The expected output uses 8 bytes of stack holding two predicate slots: the
; compare result at [sp] and its VPNOT'd form at [sp, #4]. Each of the four
; operations is issued under its own VPST after the matching slot is loaded.
define arm_aapcs_vfpcc <4 x i32> @vptete_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vptete_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vcmp.s32 ge, q0, q2
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: vpst
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpnot
; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %else2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then1)
  %then3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %else2)
  %else4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then3)
  ret <4 x i32> %else4
}

; Predicate sequence then/then/then/else: three operations under
; `%a sge %c`, then one under the inverted mask. The inverted mask is
; computed after the first ORR but only used by the last one.
; Expected output: one VPTTTE block.
define arm_aapcs_vfpcc <4 x i32> @vpttte_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpttte_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpttte.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vmove q0, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then1)
  %then3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then2)
  %else4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then3)
  ret <4 x i32> %else4
}

; Predicate sequence then/then/else/else: two operations under `%a sge %c`,
; then two under the inverted mask.
; Expected output: one VPTTEE block.
define arm_aapcs_vfpcc <4 x i32> @vpttee_block(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: vpttee_block:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpttee.s32 ge, q0, q2
; CHECK-NEXT: vorrt q0, q1, q2
; CHECK-NEXT: vmovt q0, q2
; CHECK-NEXT: vmove q0, q2
; CHECK-NEXT: vmove q0, q2
; CHECK-NEXT: bx lr
entry:
  %ge = icmp sge <4 x i32> %a, %c
  %then1 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %b, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %a)
  %not.ge = xor <4 x i1> %ge, <i1 true, i1 true, i1 true, i1 true>
  %then2 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %ge, <4 x i32> %then1)
  %else3 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %then2)
  %else4 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %c, <4 x i32> %c, <4 x i1> %not.ge, <4 x i32> %else3)
  ret <4 x i32> %else4
}
547 changes: 547 additions & 0 deletions llvm/test/CodeGen/Thumb2/mve-vpt-optimisations.mir

Large diffs are not rendered by default.