From 8232ab76d0bae090b1720a8d096c795e400d2525 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov
Date: Tue, 3 Oct 2023 15:19:00 +0000
Subject: [PATCH] [AArch64][SVE][SVE2] Enable tbl, tbl2 for shuffle lowering
 for fixed vector types.

This change enables lowering some shuffles of fixed vector types with the
TBL instruction: the single-register SVE TBL form when indexing into one
register, and the SVE2 TBL2 form when indexing into both registers.

Differential Revision: https://reviews.llvm.org/D152205
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  76 ++++++
 .../AArch64/sve-fixed-length-permute-rev.ll   | 131 ++-------
 .../sve-fixed-length-vector-shuffle-tbl.ll    | 254 ++++++++++++++++++
 3 files changed, 358 insertions(+), 103 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 31bea4e7d9d1a..669cd8e6018b2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25830,6 +25830,77 @@ AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
   }
 }
 
+static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
+                                         ArrayRef<int> ShuffleMask, EVT VT,
+                                         EVT ContainerVT, SelectionDAG &DAG) {
+  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  SDLoc DL(Op);
+  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+  bool IsSingleOp = ShuffleVectorInst::isSingleSourceMask(ShuffleMask);
+
+  // Bail out on two-operand shuffles if we lack SVE2 or if the exact vector
+  // length is unknown, since then not all indices can be represented.
+  if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
+    return SDValue();
+
+  EVT VTOp1 = Op.getOperand(0).getValueType();
+  unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
+  unsigned IndexLen = MinSVESize / BitsPerElt;
+  unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
+  unsigned MaskSize = ShuffleMask.size();
+  uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
+  assert(ElementsPerVectorReg <= IndexLen && MaskSize <= IndexLen &&
+         "Incorrectly legalised shuffle operation");
+
+  SmallVector<SDValue, 8> TBLMask;
+  for (int Index : ShuffleMask) {
+    // A negative value marks a poison lane; any index works there, so use 0.
+    if (Index < 0)
+      Index = 0;
+    // Indices that refer to the second operand must be rebased by the number
+    // of elements in a hardware register minus the number of elements in the
+    // fixed-length type.
+    if ((unsigned)Index >= ElementsPerVectorReg)
+      Index += IndexLen - ElementsPerVectorReg;
+    // With 8-bit elements and 1024-bit SVE registers, MaxOffset equals 255
+    // and such an index might point to the last element of the second
+    // operand of the shufflevector, so we reject the transform here.
+    if ((unsigned)Index >= MaxOffset)
+      return SDValue();
+    TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
+  }
+
+  // Pad the remaining mask lanes with out-of-range indices: TBL sets a lane
+  // to zero when its index is out of range, rather than duplicating the
+  // first lane for such elements. Note that for i8 elements and a 2048-bit
+  // vector register the out-of-range value MaxOffset would itself be a
+  // valid index.
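+  // For example, with a 128-bit vector length (IndexLen = 16 for i8) and
+  // v8i8 operands, shuffle index 9 (element 1 of the second operand) becomes
+  // TBL index 9 + 16 - 8 = 17, and the trailing 16 - 8 = 8 mask lanes are
+  // padded with MaxOffset (255) so that TBL zeroes them.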
+  for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i)
+    TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
+
+  EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt);
+  EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
+  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
+  SDValue VecMask =
+      DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
+  SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
+
+  SDValue Shuffle;
+  if (IsSingleOp)
+    Shuffle =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+                    DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
+                    Op1, SVEMask);
+  else if (Subtarget.hasSVE2())
+    Shuffle =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+                    DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
+                    Op1, Op2, SVEMask);
+  else
+    llvm_unreachable("Cannot lower shuffle without SVE2 TBL");
+  Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
+
 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -25975,6 +26046,11 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     }
   }
 
+  // Avoid producing a TBL instruction if we don't know the minimum SVE
+  // register size.
+  if (MinSVESize)
+    return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
+                                     DAG);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
index 24ece2873adb8..6871fd53fa6ad 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -194,28 +194,13 @@ define void @test_revhv32i16(ptr %a) #0 {
 define void @test_rev_elts_fail(ptr %a) #1 {
 ; CHECK-LABEL: test_rev_elts_fail:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: adrp x8, .LCPI11_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI11_0
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: mov z1.d, z0.d[2]
-; CHECK-NEXT: mov z2.d, z0.d[3]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: fmov x10, d2
-; CHECK-NEXT: stp x10, x8, [sp, #16]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: stp x9, x8, [sp]
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8]
+; CHECK-NEXT: tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT: ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -260,39 +245,26 @@ define void @test_revdv4f64_sve2p1(ptr %a) #2 {
 
 ; sve-vector-bits-min=256, sve-vector-bits-max is not set, REV inst can't be generated.
 define void @test_revv8i32(ptr %a) #0 {
-; CHECK-LABEL: test_revv8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[3]
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: mov z1.s, z0.s[4]
-; CHECK-NEXT: mov z2.s, z0.s[5]
-; CHECK-NEXT: mov z3.s, z0.s[6]
-; CHECK-NEXT: mov z0.s, z0.s[7]
-; CHECK-NEXT: stp w8, w11, [sp, #24]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: stp w10, w9, [sp, #16]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: stp w9, w8, [sp, #8]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: stp w9, w8, [sp]
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT: ret
+; VBITS_GE_256-LABEL: test_revv8i32:
+; VBITS_GE_256: // %bb.0:
+; VBITS_GE_256-NEXT: ptrue p0.s, vl8
+; VBITS_GE_256-NEXT: index z0.s, #7, #-1
+; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT: tbl z0.s, { z1.s }, z0.s
+; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT: ret
+;
+; VBITS_GE_512-LABEL: test_revv8i32:
+; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: adrp x8, .LCPI14_0
+; VBITS_GE_512-NEXT: add x8, x8, :lo12:.LCPI14_0
+; VBITS_GE_512-NEXT: ptrue p1.s, vl16
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x8]
+; VBITS_GE_512-NEXT: tbl z0.s, { z0.s }, z1.s
+; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT: ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   store <8 x i32> %tmp2, ptr %a
@@ -379,60 +351,13 @@ define void @test_revv8i32v8i32(ptr %a, ptr %b) #1 {
 
 define void @test_rev_fail(ptr %a) #1 {
 ; CHECK-LABEL: test_rev_fail:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: sub x9, sp, #48
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
-; CHECK-NEXT: .cfi_def_cfa w29, 16
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
 ; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: adrp x8, .LCPI20_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI20_0
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT: mov z1.h, z0.h[8]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov z2.h, z0.h[9]
-; CHECK-NEXT: mov z3.h, z0.h[10]
-; CHECK-NEXT: mov z4.h, z0.h[11]
-; CHECK-NEXT: strh w8, [sp, #14]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov z1.h, z0.h[12]
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.h, z0.h[13]
-; CHECK-NEXT: strh w8, [sp, #30]
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov z3.h, z0.h[14]
-; CHECK-NEXT: strh w9, [sp, #28]
-; CHECK-NEXT: fmov w9, s4
-; CHECK-NEXT: mov z4.h, z0.h[15]
-; CHECK-NEXT: fmov w10, s2
-; CHECK-NEXT: strh w8, [sp, #26]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: fmov w11, s3
-; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: fmov w12, s4
-; CHECK-NEXT: strh w10, [sp, #20]
-; CHECK-NEXT: umov w10, v0.h[3]
-; CHECK-NEXT: strh w8, [sp, #22]
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: strh w11, [sp, #18]
-; CHECK-NEXT: umov w11, v0.h[4]
-; CHECK-NEXT: strh w12, [sp, #16]
-; CHECK-NEXT: umov w12, v0.h[5]
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: umov w9, v0.h[6]
-; CHECK-NEXT: strh w8, [sp, #10]
-; CHECK-NEXT: umov w8, v0.h[7]
-; CHECK-NEXT: strh w10, [sp, #8]
-; CHECK-NEXT: strh w11, [sp, #6]
-; CHECK-NEXT: strh w12, [sp, #4]
-; CHECK-NEXT: strh w9, [sp, #2]
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8]
+; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT: ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
new file mode 100644
index 0000000000000..f646319ba5fcc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128
+; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128_NOMAX
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; SVE2_128: .LCPI0_0:
+; SVE2_128-NEXT: .byte 0 // 0x0
+; SVE2_128-NEXT: .byte 7 // 0x7
+; SVE2_128-NEXT: .byte 2 // 0x2
+; SVE2_128-NEXT: .byte 3 // 0x3
+; SVE2_128-NEXT: .byte 4 // 0x4
+; SVE2_128-NEXT: .byte 5 // 0x5
+; SVE2_128-NEXT: .byte 6 // 0x6
+; SVE2_128-NEXT: .byte 7 // 0x7
+; SVE2_128-NEXT: .byte 255 // 0xff
+; SVE2_128-NEXT: .byte 255 // 0xff
+define <8 x i8> @shuffle_index_indices_from_op1(ptr %a, ptr %b) {
+; CHECK-LABEL: shuffle_index_indices_from_op1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 0, i32 7, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %1
+}
+
+; SVE2_128: .LCPI1_0:
+; SVE2_128-NEXT: .byte 0 // 0x0
+; SVE2_128-NEXT: .byte 1 // 0x1
+; SVE2_128-NEXT: .byte 1 // 0x1
+; SVE2_128-NEXT: .byte 3 // 0x3
+; SVE2_128-NEXT: .byte 4 // 0x4
+; SVE2_128-NEXT: .byte 7 // 0x7
+; SVE2_128-NEXT: .byte 6 // 0x6
+; SVE2_128-NEXT: .byte 7 // 0x7
+; SVE2_128-NEXT: .byte 255 // 0xff
+; SVE2_128-NEXT: .byte 255 // 0xff
+define <8 x i8> @shuffle_index_indices_from_op2(ptr %a, ptr %b) {
+; CHECK-LABEL: shuffle_index_indices_from_op2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 8, i32 9, i32 9, i32 11, i32 12, i32 15, i32 14, i32 15>
+  ret <8 x i8> %1
+}
+
+; SVE2_128: .LCPI2_0:
+; SVE2_128-NEXT: .byte 1 // 0x1
+; SVE2_128-NEXT: .byte 17 // 0x11
+; SVE2_128-NEXT: .byte 18 // 0x12
+; SVE2_128-NEXT: .byte 19 // 0x13
+; SVE2_128-NEXT: .byte 20 // 0x14
+; SVE2_128-NEXT: .byte 20 // 0x14
+; SVE2_128-NEXT: .byte 22 // 0x16
+; SVE2_128-NEXT: .byte 23 // 0x17
+; SVE2_128-NEXT: .byte 255 // 0xff
+; SVE2_128-NEXT: .byte 255 // 0xff
+define <8 x i8> @shuffle_index_indices_from_both_ops(ptr %a, ptr %b) {
+; SVE2_128-LABEL: shuffle_index_indices_from_both_ops:
+; SVE2_128: // %bb.0:
+; SVE2_128-NEXT: adrp x8, .LCPI2_0
+; SVE2_128-NEXT: ldr d0, [x0]
+; SVE2_128-NEXT: ldr d1, [x1]
+; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
+; SVE2_128-NEXT: tbl z0.b, { z0.b, z1.b }, z2.b
+; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2_128-NEXT: ret
+;
+; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops:
+; SVE2_128_NOMAX: // %bb.0:
+; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
+; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
+; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[7]
+; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: fmov w8, s1
+; SVE2_128_NOMAX-NEXT: ldr d1, [x0]
+; SVE2_128_NOMAX-NEXT: fmov w9, s2
+; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z1.b[1]
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #15]
+; SVE2_128_NOMAX-NEXT: fmov w8, s3
+; SVE2_128_NOMAX-NEXT: mov z3.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT: strb w9, [sp, #14]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: fmov w9, s2
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #13]
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #12]
+; SVE2_128_NOMAX-NEXT: fmov w8, s3
+; SVE2_128_NOMAX-NEXT: strb w9, [sp, #11]
+; SVE2_128_NOMAX-NEXT: fmov w9, s0
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #10]
+; SVE2_128_NOMAX-NEXT: fmov w8, s1
+; SVE2_128_NOMAX-NEXT: strb w9, [sp, #9]
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #8]
+; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
+; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 15>
+  ret <8 x i8> %1
+}
+
+; SVE2_128: .LCPI3_0:
+; SVE2_128-NEXT: .byte 1 // 0x1
+; SVE2_128-NEXT: .byte 17 // 0x11
+; SVE2_128-NEXT: .byte 18 // 0x12
+; SVE2_128-NEXT: .byte 19 // 0x13
+; SVE2_128-NEXT: .byte 20 // 0x14
+; SVE2_128-NEXT: .byte 20 // 0x14
+; SVE2_128-NEXT: .byte 22 // 0x16
+; SVE2_128-NEXT: .byte 0 // 0x0
+; SVE2_128-NEXT: .byte 255 // 0xff
+; SVE2_128-NEXT: .byte 255 // 0xff
+define <8 x i8> @shuffle_index_poison_value(ptr %a, ptr %b) {
+; SVE2_128-LABEL: shuffle_index_poison_value:
+; SVE2_128: // %bb.0:
+; SVE2_128-NEXT: adrp x8, .LCPI3_0
+; SVE2_128-NEXT: ldr d0, [x0]
+; SVE2_128-NEXT: ldr d1, [x1]
+; SVE2_128-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
+; SVE2_128-NEXT: tbl z0.b, { z0.b, z1.b }, z2.b
+; SVE2_128-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2_128-NEXT: ret
+;
+; SVE2_128_NOMAX-LABEL: shuffle_index_poison_value:
+; SVE2_128_NOMAX: // %bb.0:
+; SVE2_128_NOMAX-NEXT: sub sp, sp, #16
+; SVE2_128_NOMAX-NEXT: .cfi_def_cfa_offset 16
+; SVE2_128_NOMAX-NEXT: ldr d0, [x1]
+; SVE2_128_NOMAX-NEXT: ldr d3, [x0]
+; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[6]
+; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[4]
+; SVE2_128_NOMAX-NEXT: fmov w8, s1
+; SVE2_128_NOMAX-NEXT: mov z1.b, z0.b[3]
+; SVE2_128_NOMAX-NEXT: fmov w9, s2
+; SVE2_128_NOMAX-NEXT: mov z2.b, z0.b[2]
+; SVE2_128_NOMAX-NEXT: mov z0.b, z0.b[1]
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #14]
+; SVE2_128_NOMAX-NEXT: fmov w8, s1
+; SVE2_128_NOMAX-NEXT: mov z1.b, z3.b[1]
+; SVE2_128_NOMAX-NEXT: strb w9, [sp, #13]
+; SVE2_128_NOMAX-NEXT: strb w9, [sp, #12]
+; SVE2_128_NOMAX-NEXT: fmov w9, s2
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #11]
+; SVE2_128_NOMAX-NEXT: fmov w8, s0
+; SVE2_128_NOMAX-NEXT: strb w9, [sp, #10]
+; SVE2_128_NOMAX-NEXT: fmov w9, s1
+; SVE2_128_NOMAX-NEXT: strb w8, [sp, #9]
+; SVE2_128_NOMAX-NEXT: strb w9, [sp, #8]
+; SVE2_128_NOMAX-NEXT: ldr d0, [sp, #8]
+; SVE2_128_NOMAX-NEXT: add sp, sp, #16
+; SVE2_128_NOMAX-NEXT: ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 poison>
+  ret <8 x i8> %1
+}
+
+define <8 x i8> @shuffle_op1_poison(ptr %a, ptr %b) {
+; CHECK-LABEL: shuffle_op1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> poison, <8 x i8> %op2, <8 x i32>
+  ret <8 x i8> %1
+}
+
+; In this function, indices that point into the second operand cannot be
+; represented: with a 2048-bit register, an i8 mask element can hold at most
+; 255, but elements of the second operand would need indices of 256 and above.
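+; For example, shuffle index 9 (element 1 of the second operand) would need
+; TBL index 9 + 256 - 8 = 257 here, which does not fit in an i8 mask element.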
+define <8 x i8> @negative_test_shuffle_index_size_op_both_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
+; CHECK-LABEL: negative_test_shuffle_index_size_op_both_maxhw:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: mov z1.b, z0.b[7]
+; CHECK-NEXT: mov z2.b, z0.b[6]
+; CHECK-NEXT: mov z3.b, z0.b[4]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: mov z2.b, z0.b[3]
+; CHECK-NEXT: mov z1.b, z1.b[1]
+; CHECK-NEXT: strb w8, [sp, #15]
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: mov z3.b, z0.b[2]
+; CHECK-NEXT: strb w9, [sp, #14]
+; CHECK-NEXT: mov z0.b, z0.b[1]
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: strb w8, [sp, #13]
+; CHECK-NEXT: strb w8, [sp, #12]
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: strb w9, [sp, #11]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: strb w8, [sp, #10]
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: strb w9, [sp, #9]
+; CHECK-NEXT: strb w8, [sp, #8]
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 15>
+  ret <8 x i8> %1
+}
+
+; CHECK: .LCPI6_0:
+; CHECK-NEXT: .byte 0 // 0x0
+; CHECK-NEXT: .byte 7 // 0x7
+; CHECK-NEXT: .byte 2 // 0x2
+; CHECK-NEXT: .byte 3 // 0x3
+; CHECK-NEXT: .byte 4 // 0x4
+; CHECK-NEXT: .byte 5 // 0x5
+; CHECK-NEXT: .byte 6 // 0x6
+; CHECK-NEXT: .byte 7 // 0x7
+; CHECK-NEXT: .byte 255 // 0xff
+; CHECK-NEXT: .byte 255 // 0xff
+define <8 x i8> @shuffle_index_size_op1_maxhw(ptr %a, ptr %b) "target-features"="+sve2" vscale_range(16,16) {
+; CHECK-LABEL: shuffle_index_size_op1_maxhw:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: adrp x8, .LCPI6_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI6_0
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8]
+; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+  %op1 = load <8 x i8>, ptr %a
+  %op2 = load <8 x i8>, ptr %b
+  %1 = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 0, i32 7, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %1
+}
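
For reference, here is a minimal standalone sketch (not part of the patch) of
the mask remapping that GenerateFixedLengthSVETBL performs. The helper name
and signature are hypothetical, and the padding assumes the shuffle mask has
one entry per fixed-vector element:

#include <cstdint>
#include <optional>
#include <vector>

// Remap a fixed-length shuffle mask to SVE TBL indices. Returns std::nullopt
// when an index would not fit in the element type (the rejected case above).
// Assumes BitsPerElt < 64.
std::optional<std::vector<uint64_t>>
buildTBLMask(const std::vector<int> &ShuffleMask,
             unsigned ElementsPerVectorReg, unsigned IndexLen,
             unsigned BitsPerElt) {
  // All-ones value of the element type, e.g. 255 for i8.
  const uint64_t MaxOffset = (1ULL << BitsPerElt) - 1;
  std::vector<uint64_t> TBLMask;
  for (int Index : ShuffleMask) {
    if (Index < 0) // Poison lane: any index works, use 0.
      Index = 0;
    // Rebase indices into the second operand onto the second register.
    if (static_cast<unsigned>(Index) >= ElementsPerVectorReg)
      Index += IndexLen - ElementsPerVectorReg;
    // Index not representable in the element type: give up.
    if (static_cast<uint64_t>(Index) >= MaxOffset)
      return std::nullopt;
    TBLMask.push_back(static_cast<uint64_t>(Index));
  }
  // Pad to the register length with out-of-range indices so that TBL zeroes
  // the extra lanes.
  while (TBLMask.size() < IndexLen)
    TBLMask.push_back(MaxOffset);
  return TBLMask;
}

For example, with IndexLen = 16, ElementsPerVectorReg = 8, and BitsPerElt = 8,
shuffle index 9 maps to TBL index 17 and the last 8 lanes are padded with 255,
matching the .LCPI2_0 constant in the test above.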