16 changes: 10 additions & 6 deletions llvm/test/CodeGen/AArch64/insert-subvector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ define <16 x i8> @insert_v16i8_4_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
define <16 x i8> @insert_v16i8_4_15(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_15:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: // kill: def $q2 killed $q2 def $q2_q3
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
%s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %s2
Expand Down Expand Up @@ -145,10 +146,11 @@ define <8 x i16> @insert_v8i16_2_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
define <8 x i16> @insert_v8i16_2_15(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_15:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: // kill: def $q2 killed $q2 def $q2_q3
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
%s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i16> %s2
Expand Down Expand Up @@ -270,6 +272,7 @@ define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, ptr %a) {
define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_4_15:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT: adrp x8, .LCPI24_0
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_0]
Expand Down Expand Up @@ -490,6 +493,7 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_15:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT: adrp x8, .LCPI40_0
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI40_0]
Expand Down
12 changes: 10 additions & 2 deletions llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1349,14 +1349,18 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SD-LABEL: vselect_equivalent_shuffle_v8i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: adrp x8, .LCPI92_0
; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI92_0]
; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vselect_equivalent_shuffle_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI92_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI92_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7>
Expand All @@ -1382,8 +1386,9 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) {
;
; CHECK-GI-LABEL: vselect_equivalent_shuffle_v8i16_zero:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q0_q1
; CHECK-GI-NEXT: adrp x8, .LCPI93_0
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI93_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
Expand Down Expand Up @@ -1417,8 +1422,9 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16_zeroswap(<8 x i16> %a) {
;
; CHECK-GI-LABEL: vselect_equivalent_shuffle_v8i16_zeroswap:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v31.2d, #0000000000000000
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q31_q0
; CHECK-GI-NEXT: adrp x8, .LCPI94_0
; CHECK-GI-NEXT: movi v31.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI94_0]
; CHECK-GI-NEXT: tbl v0.16b, { v31.16b, v0.16b }, v1.16b
; CHECK-GI-NEXT: ret
Expand Down Expand Up @@ -1460,7 +1466,9 @@ define <4 x i32> @vselect_equivalent_shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-GI-LABEL: vselect_equivalent_shuffle_v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI96_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI96_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,12 @@ entry:
define <16 x i8> @extract_4_v4i32_badindex(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: extract_4_v4i32_badindex:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: adrp x8, .LCPI5_0
; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_0]
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: ret
entry:
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: v8i16_2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
Expand Down Expand Up @@ -80,7 +82,9 @@ define <16 x i8> @v16i8_2(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: v16i8_2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI7_0
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
Expand Down
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: shuffle_widen_faili1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
Expand All @@ -150,7 +152,9 @@ define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: shuffle_widen_fail2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
Expand All @@ -163,7 +167,9 @@ define <8 x i16> @shuffle_widen_fail3(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: shuffle_widen_fail3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .LCPI14_0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
entry:
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/seqpairspill.mir
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ body: |
bb.0:
; Check the spill/reload sequence for the %0 register
; CHECK: renamable $[[REG0:[a-z0-9]+]]_[[REG1:[a-z0-9]+]] = CASPALX
; CHECK-NEXT: STPXi killed renamable $[[REG0]], renamable $[[REG1]], %stack.0, 0 :: (store (s128) into %stack.0, align 8)
; CHECK-NEXT: STPXi renamable $[[REG0]], renamable $[[REG1]], %stack.0, 0, implicit killed $[[REG0]]_[[REG1]] :: (store (s128) into %stack.0, align 8)
; CHECK: INLINEASM
; CHECK: renamable $[[REG2:[a-z0-9]+]], renamable $[[REG3:[a-z0-9]+]] = LDPXi %stack.0, 0 :: (load (s128) from %stack.0, align 8)
; CHECK: renamable $[[REG2:[a-z0-9]+]], renamable $[[REG3:[a-z0-9]+]] = LDPXi %stack.0, 0, implicit-def $[[REG2]]_[[REG3]] :: (load (s128) from %stack.0, align 8)
; CHECK-NEXT: $xzr = COPY renamable $[[REG2]]
; CHECK-NEXT: $xzr = COPY killed renamable $[[REG3]]
; CHECK-NEXT: $xzr = COPY renamable $[[REG3]]
%0 : xseqpairsclass = IMPLICIT_DEF
%1 : xseqpairsclass = IMPLICIT_DEF
%2 : gpr64common = IMPLICIT_DEF
Expand All @@ -27,11 +27,11 @@ body: |
bb.0:
; Check the spill/reload sequence for the %0 register
; CHECK: $[[REG0:[a-z0-9]+]]_[[REG1:[a-z0-9]+]] = CASPALW
; CHECK-NEXT: STPWi killed renamable $[[REG0]], renamable $[[REG1]], %stack.0, 0 :: (store (s64) into %stack.0, align 4)
; CHECK-NEXT: STPWi renamable $[[REG0]], renamable $[[REG1]], %stack.0, 0, implicit killed $[[REG0]]_[[REG1]] :: (store (s64) into %stack.0, align 4)
; CHECK: INLINEASM
; CHECK: renamable $[[REG2:[a-z0-9]+]], renamable $[[REG3:[a-z0-9]+]] = LDPWi %stack.0, 0 :: (load (s64) from %stack.0, align 4)
; CHECK: renamable $[[REG2:[a-z0-9]+]], renamable $[[REG3:[a-z0-9]+]] = LDPWi %stack.0, 0, implicit-def $[[REG2]]_[[REG3]] :: (load (s64) from %stack.0, align 4)
; CHECK-NEXT: $xzr = COPY renamable $[[REG2]]
; CHECK-NEXT: $xzr = COPY killed renamable $[[REG3]]
; CHECK-NEXT: $xzr = COPY renamable $[[REG3]]
%0 : wseqpairsclass = IMPLICIT_DEF
%1 : wseqpairsclass = IMPLICIT_DEF
%2 : gpr64common = IMPLICIT_DEF
Expand Down
83 changes: 50 additions & 33 deletions llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@
define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-LABEL: shuffle4_v4i8_16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand All @@ -43,8 +47,12 @@ define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i
define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; CHECK-LABEL: shuffle4_v4i8_8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand Down Expand Up @@ -93,10 +101,10 @@ define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
define <16 x i8> @shuffle4_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: shuffle4_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: mov v2.d[1], v3.d[0]
Expand Down Expand Up @@ -206,10 +214,10 @@ define <8 x i16> @shuffle4_v8i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
; CHECK-LABEL: shuffle4_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d5, d2
; CHECK-NEXT: fmov d4, d0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: fmov d4, d0
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: mov v4.d[1], v1.d[0]
; CHECK-NEXT: mov v5.d[1], v3.d[0]
Expand Down Expand Up @@ -274,10 +282,10 @@ define <16 x i8> @shuffle4_v8i8_v16i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8
; CHECK-NEXT: mov v0.d[1], v0.d[0]
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI6_0]
; CHECK-NEXT: adrp x8, .LCPI6_1
; CHECK-NEXT: tbl v2.8b, { v2.16b }, v1.8b
; CHECK-NEXT: tbl v1.8b, { v0.16b }, v1.8b
; CHECK-NEXT: tbl v3.8b, { v2.16b }, v1.8b
; CHECK-NEXT: tbl v2.8b, { v0.16b }, v1.8b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_1]
; CHECK-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
%x = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
%y = shufflevector <8 x i8> %c, <8 x i8> %d, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
Expand Down Expand Up @@ -346,10 +354,10 @@ define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x
; CHECK-LABEL: shuffle4_v4i8_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d5, d2
; CHECK-NEXT: fmov d4, d0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: adrp x8, .LCPI8_0
; CHECK-NEXT: fmov d4, d0
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_0]
; CHECK-NEXT: mov v4.d[1], v1.d[0]
; CHECK-NEXT: mov v5.d[1], v3.d[0]
Expand Down Expand Up @@ -385,8 +393,12 @@ define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x
define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) {
; CHECK-LABEL: shuffle4_v4i16_trunc:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: adrp x8, .LCPI9_0
; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: ret
%a = trunc <4 x i16> %ae to <4 x i8>
Expand Down Expand Up @@ -420,13 +432,13 @@ define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %
define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) {
; CHECK-LABEL: shuffle4_v4i32_trunc:
; CHECK: // %bb.0:
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: xtn v1.4h, v1.4s
; CHECK-NEXT: xtn v4.4h, v0.4s
; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: xtn v2.4h, v2.4s
; CHECK-NEXT: xtn v3.4h, v3.4s
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: xtn v5.4h, v1.4s
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: xtn v6.4h, v2.4s
; CHECK-NEXT: xtn v7.4h, v3.4s
; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
; CHECK-NEXT: ret
%a = trunc <4 x i32> %ae to <4 x i8>
%b = trunc <4 x i32> %be to <4 x i8>
Expand Down Expand Up @@ -458,8 +470,11 @@ define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %
define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) {
; CHECK-LABEL: shuffle3_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: adrp x8, .LCPI11_0
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
; CHECK-NEXT: ret
%x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
Expand Down Expand Up @@ -489,9 +504,9 @@ define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: shuffle3_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d3, d2
; CHECK-NEXT: fmov d2, d0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: fmov d2, d0
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_0]
; CHECK-NEXT: mov v2.d[1], v1.d[0]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
Expand Down Expand Up @@ -548,12 +563,12 @@ define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8>
; CHECK-LABEL: insert4_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v4.16b, v3.16b
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: adrp x8, .LCPI14_0
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: adrp x9, .LCPI14_1
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1]
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
Expand Down Expand Up @@ -617,14 +632,16 @@ define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8>
define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: insert4_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v4.16b, v3.16b
; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q31_q0
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI15_0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_0]
; CHECK-NEXT: adrp x8, .LCPI15_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1]
; CHECK-NEXT: tbl v31.16b, { v2.16b, v3.16b }, v4.16b
; CHECK-NEXT: tbl v31.16b, { v3.16b, v4.16b }, v5.16b
; CHECK-NEXT: tbl v0.16b, { v31.16b, v0.16b }, v1.16b
; CHECK-NEXT: ret
%e1 = extractelement <8 x i8> %a, i32 4
Expand Down Expand Up @@ -688,7 +705,6 @@ define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l2
; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: frintm v1.2d, v1.2d
; CHECK-NEXT: frintm v5.2d, v5.2d
; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: frintm v2.2d, v2.2d
; CHECK-NEXT: frintm v6.2d, v6.2d
; CHECK-NEXT: frintm v3.2d, v3.2d
Expand All @@ -701,16 +717,17 @@ define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l2
; CHECK-NEXT: fcvtzs v6.2d, v6.2d
; CHECK-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-NEXT: fcvtzs v7.2d, v7.2d
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: xtn v4.2s, v4.2d
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: xtn v5.2s, v5.2d
; CHECK-NEXT: xtn v2.2s, v2.2d
; CHECK-NEXT: xtn v6.2s, v6.2d
; CHECK-NEXT: xtn v3.2s, v3.2d
; CHECK-NEXT: xtn v7.2s, v7.2d
; CHECK-NEXT: tbl v1.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v16.16b
; CHECK-NEXT: tbl v2.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v16.16b
; CHECK-NEXT: xtn v16.2s, v0.2d
; CHECK-NEXT: xtn v20.2s, v4.2d
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: xtn v17.2s, v1.2d
; CHECK-NEXT: xtn v21.2s, v5.2d
; CHECK-NEXT: xtn v18.2s, v2.2d
; CHECK-NEXT: xtn v22.2s, v6.2d
; CHECK-NEXT: xtn v19.2s, v3.2d
; CHECK-NEXT: xtn v23.2s, v7.2d
; CHECK-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
; CHECK-NEXT: tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
; CHECK-NEXT: uzp1 v0.8h, v1.8h, v2.8h
; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT: ret
Expand Down
60 changes: 36 additions & 24 deletions llvm/test/CodeGen/AArch64/shuffles.ll
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,9 @@ define <8 x i16> @test_shuf9(<8 x i16> %a, <8 x i16> %b)
; CHECKLE-LABEL: test_shuf9:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: adrp x8, .LCPI13_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
;
Expand All @@ -376,10 +378,10 @@ define <8 x i16> @test_shuf9(<8 x i16> %a, <8 x i16> %b)
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: adrp x8, .LCPI13_0
; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI13_0
; CHECKBE-NEXT: ld1 { v2.16b }, [x8]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
; CHECKBE-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ret
Expand Down Expand Up @@ -416,7 +418,9 @@ define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf11:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: adrp x8, .LCPI15_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI15_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
;
Expand All @@ -426,10 +430,10 @@ define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: adrp x8, .LCPI15_0
; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI15_0
; CHECKBE-NEXT: ld1 { v2.16b }, [x8]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
; CHECKBE-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ret
Expand All @@ -442,7 +446,9 @@ define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf12:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: adrp x8, .LCPI16_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
;
Expand All @@ -452,10 +458,10 @@ define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: adrp x8, .LCPI16_0
; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI16_0
; CHECKBE-NEXT: ld1 { v2.16b }, [x8]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
; CHECKBE-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ret
Expand All @@ -468,7 +474,9 @@ define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf13:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: adrp x8, .LCPI17_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
;
Expand All @@ -478,10 +486,10 @@ define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: adrp x8, .LCPI17_0
; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI17_0
; CHECKBE-NEXT: ld1 { v2.16b }, [x8]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
; CHECKBE-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ret
Expand All @@ -494,7 +502,9 @@ define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf14:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: adrp x8, .LCPI18_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI18_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
;
Expand All @@ -504,10 +514,10 @@ define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: adrp x8, .LCPI18_0
; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI18_0
; CHECKBE-NEXT: ld1 { v2.16b }, [x8]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
; CHECKBE-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ret
Expand All @@ -520,7 +530,9 @@ define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
; CHECKLE-LABEL: test_shuf15:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: adrp x8, .LCPI19_0
; CHECKLE-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: ldr q2, [x8, :lo12:.LCPI19_0]
; CHECKLE-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECKLE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKLE-NEXT: ret
;
Expand All @@ -530,10 +542,10 @@ define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: adrp x8, .LCPI19_0
; CHECKBE-NEXT: add x8, x8, :lo12:.LCPI19_0
; CHECKBE-NEXT: ld1 { v2.16b }, [x8]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECKBE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ld1 { v0.16b }, [x8]
; CHECKBE-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECKBE-NEXT: rev64 v0.16b, v0.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ret
Expand Down
115 changes: 77 additions & 38 deletions llvm/test/CodeGen/AArch64/shufflevector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,23 @@ define <8 x i8> @shufflevector_v8i8(<8 x i8> %a, <8 x i8> %b) {
}

define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: shufflevector_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
; CHECK-SD-LABEL: shufflevector_v16i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: adrp x8, .LCPI1_0
; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v16i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
ret <16 x i8> %c
}
Expand All @@ -53,12 +64,23 @@ define <4 x i16> @shufflevector_v4i16(<4 x i16> %a, <4 x i16> %b) {
}

define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: shufflevector_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
; CHECK-SD-LABEL: shufflevector_v8i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: adrp x8, .LCPI3_0
; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI3_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
ret <8 x i16> %c
}
Expand Down Expand Up @@ -215,25 +237,26 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b){
; CHECK-SD-LABEL: shufflevector_v32i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov v1.16b, v2.16b
; CHECK-SD-NEXT: // kill: def $q2 killed $q2 def $q1_q2
; CHECK-SD-NEXT: adrp x8, .LCPI16_0
; CHECK-SD-NEXT: adrp x9, .LCPI16_1
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI16_1]
; CHECK-SD-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT: tbl v1.16b, { v0.16b, v1.16b }, v3.16b
; CHECK-SD-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NEXT: mov v1.16b, v0.16b
; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI16_0]
; CHECK-SD-NEXT: ldr q4, [x9, :lo12:.LCPI16_1]
; CHECK-SD-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v4.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v32i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov v1.16b, v0.16b
; CHECK-GI-NEXT: mov v3.16b, v0.16b
; CHECK-GI-NEXT: adrp x8, .LCPI16_1
; CHECK-GI-NEXT: adrp x9, .LCPI16_0
; CHECK-GI-NEXT: mov v4.16b, v2.16b
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI16_1]
; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI16_0]
; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECK-GI-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-GI-NEXT: ldr q1, [x9, :lo12:.LCPI16_0]
; CHECK-GI-NEXT: tbl v0.16b, { v3.16b, v4.16b }, v0.16b
; CHECK-GI-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v1.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
ret <32 x i8> %c
Expand Down Expand Up @@ -275,25 +298,26 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){
; CHECK-SD-LABEL: shufflevector_v16i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov v1.16b, v2.16b
; CHECK-SD-NEXT: // kill: def $q2 killed $q2 def $q1_q2
; CHECK-SD-NEXT: adrp x8, .LCPI18_0
; CHECK-SD-NEXT: adrp x9, .LCPI18_1
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI18_0]
; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI18_1]
; CHECK-SD-NEXT: tbl v2.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT: tbl v1.16b, { v0.16b, v1.16b }, v3.16b
; CHECK-SD-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NEXT: mov v1.16b, v0.16b
; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI18_0]
; CHECK-SD-NEXT: ldr q4, [x9, :lo12:.LCPI18_1]
; CHECK-SD-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v4.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v16i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov v1.16b, v0.16b
; CHECK-GI-NEXT: mov v3.16b, v0.16b
; CHECK-GI-NEXT: adrp x8, .LCPI18_1
; CHECK-GI-NEXT: adrp x9, .LCPI18_0
; CHECK-GI-NEXT: mov v4.16b, v2.16b
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI18_1]
; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI18_0]
; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
; CHECK-GI-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-GI-NEXT: ldr q1, [x9, :lo12:.LCPI18_0]
; CHECK-GI-NEXT: tbl v0.16b, { v3.16b, v4.16b }, v0.16b
; CHECK-GI-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v1.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
ret <16 x i16> %c
Expand All @@ -320,8 +344,10 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK-GI-LABEL: shufflevector_v8i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI20_0
; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: uzp2 v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ldr q4, [x8, :lo12:.LCPI20_0]
; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
Expand Down Expand Up @@ -537,12 +563,23 @@ define <3 x i16> @shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) {
}

define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
; CHECK-LABEL: shufflevector_v7i16:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI33_0
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
; CHECK-SD-LABEL: shufflevector_v7i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: adrp x8, .LCPI33_0
; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: shufflevector_v7i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI33_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
ret <7 x i16> %c
}
Expand All @@ -557,7 +594,9 @@ define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) {
; CHECK-GI-LABEL: shufflevector_v3i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI34_0
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0]
; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 1, i32 2, i32 4>
Expand Down
24 changes: 24 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ target triple = "aarch64-linux"
define void @add_f16_vg1x2(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) #0 {
; CHECK-LABEL: add_f16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fadd za.h[w8, 0, vgx2], { z0.h, z1.h }
; CHECK-NEXT: fadd za.h[w8, 7, vgx2], { z0.h, z1.h }
; CHECK-NEXT: ret
Expand All @@ -19,7 +21,11 @@ define void @add_f16_vg1x2(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x h
define void @add_f16_vg1x4(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
; CHECK-LABEL: add_f16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fadd za.h[w8, 0, vgx4], { z0.h - z3.h }
; CHECK-NEXT: fadd za.h[w8, 7, vgx4], { z0.h - z3.h }
; CHECK-NEXT: ret
Expand All @@ -35,7 +41,9 @@ define void @add_f16_vg1x4(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x h
define void @sub_f16_vg1x2(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1) #1 {
; CHECK-LABEL: sub_f16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fsub za.h[w8, 0, vgx2], { z0.h, z1.h }
; CHECK-NEXT: fsub za.h[w8, 7, vgx2], { z0.h, z1.h }
; CHECK-NEXT: ret
Expand All @@ -48,7 +56,11 @@ define void @sub_f16_vg1x2(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x h
define void @sub_f16_vg1x4(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
; CHECK-LABEL: sub_f16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fsub za.h[w8, 0, vgx4], { z0.h - z3.h }
; CHECK-NEXT: fsub za.h[w8, 7, vgx4], { z0.h - z3.h }
; CHECK-NEXT: ret
Expand All @@ -64,7 +76,9 @@ define void @sub_f16_vg1x4(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x h
define void @add_bf16_vg1x2(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) #2 {
; CHECK-LABEL: add_bf16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfadd za.h[w8, 0, vgx2], { z0.h, z1.h }
; CHECK-NEXT: bfadd za.h[w8, 7, vgx2], { z0.h, z1.h }
; CHECK-NEXT: ret
Expand All @@ -77,7 +91,11 @@ define void @add_bf16_vg1x2(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8
define void @add_bf16_vg1x4(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
; CHECK-LABEL: add_bf16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: bfadd za.h[w8, 0, vgx4], { z0.h - z3.h }
; CHECK-NEXT: bfadd za.h[w8, 7, vgx4], { z0.h - z3.h }
; CHECK-NEXT: ret
Expand All @@ -93,7 +111,9 @@ define void @add_bf16_vg1x4(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8
define void @sub_bf16_vg1x2(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1) #2 {
; CHECK-LABEL: sub_bf16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfsub za.h[w8, 0, vgx2], { z0.h, z1.h }
; CHECK-NEXT: bfsub za.h[w8, 7, vgx2], { z0.h, z1.h }
; CHECK-NEXT: ret
Expand All @@ -106,7 +126,11 @@ define void @sub_bf16_vg1x2(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8
define void @sub_bf16_vg1x4(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
; CHECK-LABEL: sub_bf16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: bfsub za.h[w8, 0, vgx4], { z0.h - z3.h }
; CHECK-NEXT: bfsub za.h[w8, 7, vgx4], { z0.h - z3.h }
; CHECK-NEXT: ret
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT: add za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT: ret
Expand All @@ -25,7 +27,9 @@ define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4
define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT: add za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT: ret
Expand All @@ -46,7 +50,11 @@ define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2
define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT: add za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT: ret
Expand All @@ -67,7 +75,11 @@ define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4
define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT: add za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT: ret
Expand All @@ -93,7 +105,11 @@ define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: add za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: ret
Expand All @@ -112,7 +128,11 @@ define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32>
define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT: add za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT: ret
Expand All @@ -135,7 +155,15 @@ define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64>
define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: add za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: ret
Expand All @@ -159,7 +187,15 @@ define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32>
define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT: add za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT: ret
Expand Down Expand Up @@ -187,7 +223,9 @@ define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64>
define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT: add za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT: ret
Expand All @@ -200,7 +238,9 @@ define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0,
define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: add za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: add za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
Expand All @@ -213,7 +253,9 @@ define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0,
define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fadd za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT: fadd za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT: ret
Expand All @@ -228,7 +270,9 @@ define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0
define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: fadd za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
Expand All @@ -245,7 +289,11 @@ define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn
define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT: add za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT: ret
Expand All @@ -262,7 +310,11 @@ define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0,
define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: add za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: add za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
Expand All @@ -279,7 +331,11 @@ define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0,
define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fadd za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT: fadd za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT: ret
Expand All @@ -296,7 +352,11 @@ define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0
define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fadd za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: fadd za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtn.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
define <vscale x 8 x half> @multi_vector_cvtn_x2_f16(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: multi_vector_cvtn_x2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fcvtn z0.h, { z0.s, z1.s }
; CHECK-NEXT: ret
%res = call <vscale x 8 x half> @llvm.aarch64.sve.fcvtn.x2.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
Expand All @@ -20,6 +22,8 @@ define <vscale x 8 x half> @multi_vector_cvtn_x2_f16(<vscale x 4 x float> %zn1,
define <vscale x 8 x bfloat> @multi_vector_bfcvtn_x2(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: multi_vector_bfcvtn_x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfcvtn z0.h, { z0.s, z1.s }
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvtn.x2(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
Expand Down
100 changes: 98 additions & 2 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll

Large diffs are not rendered by default.

52 changes: 32 additions & 20 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,18 @@ define void @fdot_multi_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @fdot_multi_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
; CHECK-LABEL: fdot_multi_za32_f16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7) #0 {
call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
Expand Down Expand Up @@ -71,18 +71,18 @@ define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z31.d, z4.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z30.d, z3.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z28.d, z1.d
; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7) #0 {
call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
Expand All @@ -99,7 +99,9 @@ define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @fdot_single_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
; CHECK-LABEL: fdot_single_za32_f16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: fdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: fdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
Expand All @@ -112,7 +114,11 @@ define void @fdot_single_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @fdot_single_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) #0 {
; CHECK-LABEL: fdot_single_za32_f16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
Expand All @@ -128,7 +134,9 @@ define void @fdot_single_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @bfdot_single_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
; CHECK-LABEL: bfdot_single_za32_bf16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: bfdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: bfdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
Expand All @@ -141,7 +149,11 @@ define void @bfdot_single_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused
define void @bfdot_single_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) #0 {
; CHECK-LABEL: bfdot_single_za32_bf16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
Expand All @@ -158,8 +170,8 @@ define void @fdot_lane_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: fdot_lane_za32_f16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: fdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: fdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
Expand All @@ -173,8 +185,8 @@ define void @fdot_lane_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: fdot_lane_za32_f16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
Expand All @@ -195,8 +207,8 @@ define void @bfdot_lane_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: bfdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: bfdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
Expand All @@ -210,8 +222,8 @@ define void @bfdot_lane_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
Expand Down
144 changes: 144 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll

Large diffs are not rendered by default.

576 changes: 125 additions & 451 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll

Large diffs are not rendered by default.

400 changes: 218 additions & 182 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll

Large diffs are not rendered by default.

400 changes: 218 additions & 182 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll

Large diffs are not rendered by default.

244 changes: 152 additions & 92 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll

Large diffs are not rendered by default.

193 changes: 192 additions & 1 deletion llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll

Large diffs are not rendered by default.

208 changes: 104 additions & 104 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -324,20 +324,20 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_x2_s64
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_x4_s8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: srshl { z4.b - z7.b }, { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: srshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
@llvm.aarch64.sve.srshl.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
Expand All @@ -348,20 +348,20 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_x4_s16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: srshl { z4.h - z7.h }, { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: srshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
@llvm.aarch64.sve.srshl.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
Expand All @@ -372,20 +372,20 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_x4_s32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1w { z27.s }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: srshl { z4.s - z7.s }, { z4.s - z7.s }, { z24.s - z27.s }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: srshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
@llvm.aarch64.sve.srshl.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
Expand All @@ -396,20 +396,20 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_x4_s64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_s64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1d { z27.d }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: srshl { z4.d - z7.d }, { z4.d - z7.d }, { z24.d - z27.d }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: srshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
@llvm.aarch64.sve.srshl.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
Expand Down Expand Up @@ -484,20 +484,20 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_uhl_x2_u64
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_rounding_shl_x4_u8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: urshl { z4.b - z7.b }, { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: urshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
@llvm.aarch64.sve.urshl.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
Expand All @@ -508,20 +508,20 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_rounding_shl_x4_u16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: urshl { z4.h - z7.h }, { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: urshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
@llvm.aarch64.sve.urshl.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
Expand All @@ -532,20 +532,20 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_rounding_shl_x4_u32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1w { z27.s }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: urshl { z4.s - z7.s }, { z4.s - z7.s }, { z24.s - z27.s }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: urshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
@llvm.aarch64.sve.urshl.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
Expand All @@ -556,20 +556,20 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_rounding_shl_x4_u64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
; CHECK-LABEL: multi_vec_rounding_shl_x4_u64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1d { z27.d }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: urshl { z4.d - z7.d }, { z4.d - z7.d }, { z24.d - z27.d }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: urshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
@llvm.aarch64.sve.urshl.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ define <vscale x 2 x i64> @test_tileslice_no_add(i32 %idx) #0 {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1
; CHECK-NEXT: ret
entry:
%read = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %idx)
Expand All @@ -20,6 +21,7 @@ define <vscale x 2 x i64> @test_tileslice_add_nonconstant(i32 %idx1, i32 %idx2)
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add w8, w0, w1
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1
; CHECK-NEXT: ret
entry:
%add = add i32 %idx1, %idx2
Expand Down
104 changes: 52 additions & 52 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll
Original file line number Diff line number Diff line change
Expand Up @@ -196,20 +196,20 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_sat_double_mulh_mul
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_sat_double_mulh_multi_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sqdmulh { z4.b - z7.b }, { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sqdmulh { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
Expand All @@ -221,20 +221,20 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_sat_double_mulh_multi_x4_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sqdmulh { z4.h - z7.h }, { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sqdmulh { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
<vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
Expand All @@ -246,20 +246,20 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_sat_double_mulh_multi_x4_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1w { z27.s }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sqdmulh { z4.s - z7.s }, { z4.s - z7.s }, { z24.s - z27.s }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sqdmulh { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
<vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
Expand All @@ -271,20 +271,20 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_sat_double_mulh_multi_x4_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
; CHECK-NEXT: mov z7.d, z4.d
; CHECK-NEXT: mov z24.d, z5.d
; CHECK-NEXT: mov z6.d, z3.d
; CHECK-NEXT: ld1d { z27.d }, p0/z, [x0]
; CHECK-NEXT: mov z5.d, z2.d
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sqdmulh { z4.d - z7.d }, { z4.d - z7.d }, { z24.d - z27.d }
; CHECK-NEXT: mov z0.d, z4.d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: mov z29.d, z6.d
; CHECK-NEXT: mov z26.d, z3.d
; CHECK-NEXT: mov z28.d, z5.d
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sqdmulh { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
; CHECK-NEXT: mov z0.d, z24.d
; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: mov z2.d, z26.d
; CHECK-NEXT: mov z3.d, z27.d
; CHECK-NEXT: ret
<vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
define void @multi_vector_sub_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: sub za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT: sub za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT: ret
Expand All @@ -25,7 +27,9 @@ define void @multi_vector_sub_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4
define void @multi_vector_sub_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x2_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: sub za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT: sub za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT: ret
Expand All @@ -46,7 +50,11 @@ define void @multi_vector_sub_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2
define void @multi_vector_sub_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x4_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: sub za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT: sub za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT: ret
Expand All @@ -67,7 +75,11 @@ define void @multi_vector_sub_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4
define void @multi_vector_sub_write_single_za_vg1x4_i64(i32 %slice,
; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x4_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: sub za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT: sub za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT: ret
Expand All @@ -93,7 +105,11 @@ define void @multi_vector_sub_write_single_za_vg1x4_i64(i32 %slice,
define void @multi_vector_sub_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_sub_write_za_vg1x2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: sub za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: sub za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT: ret
Expand All @@ -112,7 +128,11 @@ define void @multi_vector_sub_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32>
define void @multi_vector_sub_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_sub_write_za_vg1x2_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: sub za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT: sub za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT: ret
Expand All @@ -135,7 +155,15 @@ define void @multi_vector_sub_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64>
define void @multi_vector_sub_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_sub_write_za_vg1x4_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: sub za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: sub za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT: ret
Expand All @@ -159,7 +187,15 @@ define void @multi_vector_sub_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32>
define void @multi_vector_sub_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_sub_write_za_vg1x4_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: sub za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT: sub za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT: ret
Expand Down Expand Up @@ -189,7 +225,9 @@ define void @multi_vector_sub_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64>
define void @multi_vector_sub_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_sub_za_vg1x2_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: sub za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT: sub za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT: ret
Expand All @@ -202,7 +240,9 @@ define void @multi_vector_sub_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0,
define void @multi_vector_sub_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
; CHECK-LABEL: multi_vector_sub_za_vg1x2_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: sub za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: sub za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
Expand All @@ -215,7 +255,9 @@ define void @multi_vector_sub_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0,
define void @multi_vector_sub_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_sub_za_vg1x2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fsub za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT: fsub za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT: ret
Expand All @@ -230,7 +272,9 @@ define void @multi_vector_sub_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0
define void @multi_vector_sub_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
; CHECK-LABEL: multi_vector_sub_za_vg1x2_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fsub za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: fsub za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
Expand All @@ -247,7 +291,11 @@ define void @multi_vector_sub_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn
define void @multi_vector_sub_za_vg1x4_i32(i32 %slice,
; CHECK-LABEL: multi_vector_sub_za_vg1x4_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: sub za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT: sub za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT: ret
Expand All @@ -266,7 +314,11 @@ define void @multi_vector_sub_za_vg1x4_i32(i32 %slice,
define void @multi_vector_sub_za_vg1x4_i64(i32 %slice,
; CHECK-LABEL: multi_vector_sub_za_vg1x4_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: sub za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: sub za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
Expand All @@ -285,7 +337,11 @@ define void @multi_vector_sub_za_vg1x4_i64(i32 %slice,
define void @multi_vector_sub_za_vg1x4_f32(i32 %slice,
; CHECK-LABEL: multi_vector_sub_za_vg1x4_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fsub za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT: fsub za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT: ret
Expand All @@ -304,7 +360,11 @@ define void @multi_vector_sub_za_vg1x4_f32(i32 %slice,
define void @multi_vector_sub_za_vg1x4_f64(i32 %slice,
; CHECK-LABEL: multi_vector_sub_za_vg1x4_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: fsub za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: fsub za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
Expand Down
Loading