diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dc8e7c84f5e2c..3f338e1f5c282 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30605,6 +30605,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
 
+  if (Op->getNumOperands() == 3) {
+    // aarch64_sve_ld3 only supports packed datatypes.
+    EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+    Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+    SDValue StackPtr =
+        DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+    // Write out unmodified operands.
+    SmallVector<SDValue> Chains;
+    for (unsigned I = 0; I < 3; ++I) {
+      SDValue Ptr =
+          DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+      SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
+      Chains.push_back(
+          DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
+    }
+
+    Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
+    EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+    SmallVector<SDValue> Ops;
+    Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
+    Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+    Ops.push_back(DAG.getConstant(1, DL, PredVT));
+    Ops.push_back(StackPtr);
+
+    // Read back and deinterleave data.
+    SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
+    SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
+
+    SmallVector<SDValue> Results;
+    Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
+    Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
+    Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
+    return DAG.getMergeValues(Results, DL);
+  }
+
   // Are multi-register uzp instructions available?
   if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
       OpVT.getVectorElementType() != MVT::i1) {
@@ -30646,6 +30683,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
 
+  if (Op->getNumOperands() == 3) {
+    // aarch64_sve_st3 only supports packed datatypes.
+    EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+    SmallVector<SDValue> InVecs;
+    for (SDValue V : Op->ops())
+      InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
+
+    Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+    SDValue StackPtr =
+        DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+    Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
+    EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+    SmallVector<SDValue> Ops;
+    Ops.push_back(DAG.getEntryNode());
+    Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+    Ops.append(InVecs);
+    Ops.push_back(DAG.getConstant(1, DL, PredVT));
+    Ops.push_back(StackPtr);
+
+    // Interleave operands and store.
+    SDValue Ch = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
+
+    // Read back the interleaved data.
+    SmallVector<SDValue> Results;
+    for (unsigned I = 0; I < 3; ++I) {
+      SDValue Ptr =
+          DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+      SDValue L = DAG.getLoad(PackedVT, DL, Ch, Ptr, MachinePointerInfo());
+      Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
+    }
+
+    return DAG.getMergeValues(Results, DL);
+  }
+
   // Are multi-register zip instructions available?
   if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
       OpVT.getVectorElementType() != MVT::i1) {
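
Editorial note (not part of the patch): without SME2 multi-register uzp, the lowering above round-trips through the stack — the three operands are stored contiguously to a stack temporary and an ld3 structured load reads them back de-interleaved; the interleave case mirrors this with st3 followed by contiguous loads. A minimal IR sketch of the node being lowered (hypothetical function name, using the llvm.vector.deinterleave3 intrinsic exercised by the tests below):

define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @deinterleave3_sketch(<vscale x 12 x i32> %v) {
  ; Splits %v = <a0,b0,c0,a1,b1,c1,...> into <a0,a1,...>, <b0,b1,...> and <c0,c1,...>.
  %r = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %v)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %r
}
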
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 139ecafaff0eb..67197b3fe4e80 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -231,6 +231,274 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv6f16(<vscale x 6 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv6f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    str z1, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave3.nxv6f16(<vscale x 6 x half> %vec)
+  ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %retval
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv12f16(<vscale x 12 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv12f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpkhi z2.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    str z1, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave3.nxv12f16(<vscale x 12 x half> %vec)
+  ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %retval
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv24f16(<vscale x 24 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv24f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave3.nxv24f16(<vscale x 24 x half> %vec)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %retval
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv6f32(<vscale x 6 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv6f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    str z1, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave3.nxv6f32(<vscale x 6 x float> %vec)
+  ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %retval
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv12f32(<vscale x 12 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv12f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave3.nxv12f32(<vscale x 12 x float> %vec)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %retval
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv6f64(<vscale x 6 x double> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv6f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave3.nxv6f64(<vscale x 6 x double> %vec)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %retval
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv6bf16(<vscale x 6 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv6bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    str z1, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave3.nxv6bf16(<vscale x 6 x bfloat> %vec)
+  ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv12bf16(<vscale x 12 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv12bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpkhi z2.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    str z1, [sp, #2, mul vl]
+; CHECK-NEXT:    str z2, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave3.nxv12bf16(<vscale x 12 x bfloat> %vec)
+  ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv24bf16(<vscale x 24 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv24bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave3.nxv24bf16(<vscale x 24 x bfloat> %vec)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %retval
+}
+
+; Integers
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv48i8(<vscale x 48 x i8> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv48i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave3.nxv48i8(<vscale x 48 x i8> %vec)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv24i16(<vscale x 24 x i16> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv24i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave3.nxv24i16(<vscale x 24 x i16> %vec)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv12i32(<vscale x 12 x i32> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv12i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %vec)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv6i64(<vscale x 6 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv6i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    str z2, [sp, #2, mul vl]
+; CHECK-NEXT:    str z1, [sp, #1, mul vl]
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> %vec)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
 define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
 ; SVE-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
 ; SVE:       // %bb.0:
@@ -599,31 +867,3 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
   %retval = call {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
   ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
 }
-
-; Floating declarations
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-
-; Integer declarations
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-
-; Predicated declarations
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
-declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
-declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
-
-; Illegal size type
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
-
-declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
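
Editorial note (not part of the patch): the interleave tests below exercise the mirrored st3 path. llvm.vector.interleave3 is the inverse of deinterleave3, so the lowering stores the three operands interleaved with st3 and reloads the wide result contiguously. A minimal IR sketch (hypothetical function name):

define <vscale x 12 x i32> @interleave3_sketch(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
  ; Produces <a0,b0,c0,a1,b1,c1,...> from %a, %b and %c.
  %r = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
  ret <vscale x 12 x i32> %r
}
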
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index c7fb2db53d2a3..49f185c4312a2 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -221,6 +221,318 @@ define <vscale x 8 x i64> @interleave2_nxv4i64(<vscale x 4 x i64> %vec0, <vscal
   ret <vscale x 8 x i64> %retval
 }
 
+define <vscale x 6 x half> @interleave3_nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp, #2, mul vl]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z2.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2)
+  ret <vscale x 6 x half> %retval
+}
+
+define <vscale x 12 x half> @interleave3_nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv12f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-5
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z1, [sp]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    st1h { z2.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    str z0, [sp, #3, mul vl]
+; CHECK-NEXT:    ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT:    ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #5
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2)
+  ret <vscale x 12 x half> %retval
+}
+
+define <vscale x 24 x half> @interleave3_nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv24f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2)
+  ret <vscale x 24 x half> %retval
+}
+
+define <vscale x 6 x float> @interleave3_nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-5
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z1, [sp]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    st1w { z2.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    str z0, [sp, #3, mul vl]
+; CHECK-NEXT:    ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT:    ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #5
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2)
+  ret <vscale x 6 x float> %retval
+}
+
+define <vscale x 12 x float> @interleave3_nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2) {
+; CHECK-LABEL: interleave3_nxv12f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2)
+  ret <vscale x 12 x float> %retval
+}
+
+define <vscale x 6 x double> @interleave3_nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 6 x double> @llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2)
+  ret <vscale x 6 x double> %retval
+}
+
+define <vscale x 6 x bfloat> @interleave3_nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv6bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp, #2, mul vl]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z2.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2)
+  ret <vscale x 6 x bfloat> %retval
+}
+
+define <vscale x 12 x bfloat> @interleave3_nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv12bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-5
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    addpl x8, sp, #4
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z1, [sp]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    st1h { z2.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT:    str z0, [sp, #3, mul vl]
+; CHECK-NEXT:    ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT:    ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #5
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2)
+  ret <vscale x 12 x bfloat> %retval
+}
+
+define <vscale x 24 x bfloat> @interleave3_nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv24bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2)
+  ret <vscale x 24 x bfloat> %retval
+}
+
+; Integers
+
+define <vscale x 48 x i8> @interleave3_nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2) {
+; CHECK-LABEL: interleave3_nxv48i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3b { z0.b - z2.b }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 48 x i8> @llvm.vector.interleave3.nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2)
+  ret <vscale x 48 x i8> %retval
+}
+
+define <vscale x 24 x i16> @interleave3_nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2) {
+; CHECK-LABEL: interleave3_nxv24i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 24 x i16> @llvm.vector.interleave3.nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2)
+  ret <vscale x 24 x i16> %retval
+}
+
+define <vscale x 12 x i32> @interleave3_nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2) {
+; CHECK-LABEL: interleave3_nxv12i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
+  ret <vscale x 12 x i32> %retval
+}
+
+define <vscale x 6 x i64> @interleave3_nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2) {
+; CHECK-LABEL: interleave3_nxv6i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT:    st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT:    ldr z0, [sp]
+; CHECK-NEXT:    ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT:    ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2)
+  ret <vscale x 6 x i64> %retval
+}
+
 define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) {
 ; SVE-LABEL: interleave4_nxv16i8:
 ; SVE:       // %bb.0:
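
Editorial note (not part of the patch): a typical source of these nodes is a stride-3 access pattern such as RGB channel splitting, sketched below with a hypothetical function; the wide load plus deinterleave3 is what reaches instruction selection as ISD::VECTOR_DEINTERLEAVE with three results.

define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @rgb_split_sketch(ptr %p) {
  ; Load 3 x VL bytes of packed RGB data, then separate the channels.
  %wide = load <vscale x 48 x i8>, ptr %p, align 16
  %rgb = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave3.nxv48i8(<vscale x 48 x i8> %wide)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %rgb
}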