[AArch64][SVE] Lower vector.insert to predicated merged MOV
Use predicated SEL for vector.insert instead of going through memory

Differential Revision: https://reviews.llvm.org/D115259
MDevereau committed Dec 13, 2021
1 parent e90630e commit 2e585dd
Showing 4 changed files with 88 additions and 116 deletions.
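For context (not part of the commit), the simplest case this patch affects is inserting a fixed-length vector into the low elements of a scalable vector at index 0. Below is a minimal IR sketch; the run line and declaration are assumptions, but the function mirrors insert_v2i64_nxv2i64 in sve-insert-vector.ll, and the before/after code in the comments is taken from that test's updated checks:

; Assumed run line: llc -mtriple=aarch64 -mattr=+sve
; Before the patch: z0 was stored to a stack slot, q1 was stored over its low
; 128 bits, and the result was reloaded with ld1d.
; After the patch:  ptrue p0.d, vl2
;                   mov   z0.d, p0/m, z1.d
define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
  ret <vscale x 2 x i64> %retval
}

declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)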
26 changes: 17 additions & 9 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10958,16 +10958,15 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
  EVT InVT = Op.getOperand(1).getValueType();
  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();

  if (InVT.isScalableVector()) {
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
  SDValue Vec0 = Op.getOperand(0);
  SDValue Vec1 = Op.getOperand(1);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (InVT.isScalableVector()) {
    if (!isTypeLegal(VT))
      return SDValue();

    SDValue Vec0 = Op.getOperand(0);
    SDValue Vec1 = Op.getOperand(1);

    // Ensure the subvector is half the size of the main vector.
    if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
      return SDValue();
@@ -10997,9 +10996,18 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
    return SDValue();
  }

  // This will be matched by custom code during ISelDAGToDAG.
  if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
    return Op;
  if (Idx == 0 && isPackedVectorType(VT, DAG)) {
    // This will be matched by custom code during ISelDAGToDAG.
    if (Vec0.isUndef())
      return Op;

    unsigned int PredPattern =
        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
    auto PredTy = VT.changeVectorElementType(MVT::i1);
    SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern);
    SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
    return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
  }

  return SDValue();
}
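In rough terms, the added block fires only when the insert is at index 0 and the destination container VT is a packed SVE type (one that fills a whole Z register): it builds a ptrue predicate whose VL pattern matches the number of fixed elements being inserted, promotes the fixed subvector to the scalable container type, and emits a VSELECT that ISel turns into a predicated, merging MOV. Unpacked containers still take the old path through memory, which is why insert_nxv4bf16_v4bf16 further down is unchanged. A hedged IR sketch of the two cases (function names and the feature string are illustrative, not from the commit):

define <vscale x 8 x bfloat> @packed_container(<vscale x 8 x bfloat> %acc, <8 x bfloat> %sub) #0 {
  ; nxv8bf16 is packed, so this now lowers to: ptrue p0.h, vl8 ; mov z0.h, p0/m, z1.h
  %r = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %acc, <8 x bfloat> %sub, i64 0)
  ret <vscale x 8 x bfloat> %r
}

define <vscale x 4 x bfloat> @unpacked_container(<vscale x 4 x bfloat> %acc, <4 x bfloat> %sub) #0 {
  ; nxv4bf16 only uses every other .h lane, so isPackedVectorType(VT) fails and
  ; the insert still round-trips through a stack slot.
  %r = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %acc, <4 x bfloat> %sub, i64 0)
  ret <vscale x 4 x bfloat> %r
}

declare <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
declare <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
attributes #0 = { "target-features"="+sve,+bf16" }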
27 changes: 6 additions & 21 deletions llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -74,17 +74,12 @@ define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_nonzero_i16(<vscale
define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
%subvec = load <8 x i8>, <8 x i8>* %b
@@ -123,17 +118,12 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x
define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_zero_i16(<vscale x 4 x i16>* %a, <4 x i16>* %b) #0 {
; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
%subvec = load <4 x i16>, <4 x i16>* %b
@@ -172,17 +162,12 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(<vscale x 4
define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_zero_i32(<vscale x 2 x i32>* %a, <2 x i32>* %b) #0 {
; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
%subvec = load <2 x i32>, <2 x i32>* %b
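The tests above exercise containers whose element type is narrower than the lane it is legalized to (for example <vscale x 8 x i8>, which is widened to .h lanes). A hedged sketch of the first case follows; the intrinsic declaration and attribute definition are spelled out here for illustration and are not copied from the truncated test body:

define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8>* %a, <8 x i8>* %b) #0 {
  ; The fixed <8 x i8> is zero-extended to .h lanes (ushll v1.8h, v1.8b, #0) and
  ; then merged into the loaded scalable vector with
  ;   ptrue p0.h, vl8
  ;   mov   z0.h, p0/m, z1.h
  ; instead of both values being spilled to and reloaded from the stack.
  %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
  %subvec = load <8 x i8>, <8 x i8>* %b
  %r = call <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8> %vec, <8 x i8> %subvec, i64 0)
  ret <vscale x 8 x i8> %r
}

declare <vscale x 8 x i8> @llvm.experimental.vector.insert.nxv8i8.v8i8(<vscale x 8 x i8>, <8 x i8>, i64)
attributes #0 = { "target-features"="+sve" }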
72 changes: 38 additions & 34 deletions llvm/test/CodeGen/AArch64/split-vector-insert.ll
@@ -17,44 +17,46 @@ define <vscale x 2 x i64> @test_nxv2i64_v8i64(<vscale x 2 x i64> %a, <8 x i64> %
; CHECK-LABEL: test_nxv2i64_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sub x8, x8, #2
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: sub x8, x8, #2
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: cmp x8, #2
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: mov x10, sp
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: addvl x10, sp, #1
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: lsl x9, x9, #3
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: cmp x8, #4
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q2, [x10, x9]
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: lsl x9, x9, #3
; CHECK-NEXT: addvl x10, sp, #2
; CHECK-NEXT: addvl x10, sp, #1
; CHECK-NEXT: cmp x8, #6
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q3, [x10, x9]
; CHECK-NEXT: mov w9, #6
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: addvl x9, sp, #3
; CHECK-NEXT: addvl x9, sp, #2
; CHECK-NEXT: lsl x8, x8, #3
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: str q4, [x9, x8]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret




%r = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> %a, <8 x i64> %b, i64 0)
ret <vscale x 2 x i64> %r
}
@@ -68,44 +70,46 @@ define <vscale x 2 x double> @test_nxv2f64_v8f64(<vscale x 2 x double> %a, <8 x
; CHECK-LABEL: test_nxv2f64_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sub x8, x8, #2
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: sub x8, x8, #2
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: cmp x8, #2
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: mov x10, sp
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: addvl x10, sp, #1
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: lsl x9, x9, #3
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: cmp x8, #4
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q2, [x10, x9]
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: lsl x9, x9, #3
; CHECK-NEXT: addvl x10, sp, #2
; CHECK-NEXT: addvl x10, sp, #1
; CHECK-NEXT: cmp x8, #6
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
; CHECK-NEXT: str q3, [x10, x9]
; CHECK-NEXT: mov w9, #6
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: addvl x9, sp, #3
; CHECK-NEXT: addvl x9, sp, #2
; CHECK-NEXT: lsl x8, x8, #3
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #3, mul vl]
; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
; CHECK-NEXT: str q4, [x9, x8]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret




%r = call <vscale x 2 x double> @llvm.experimental.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> %a, <8 x double> %b, i64 0)
ret <vscale x 2 x double> %r
}
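Worth noting (an observation about the diff above, not text from the commit): when the fixed vector spans several scalable chunks, as with <8 x i64> or <8 x double> into an nxv2-element container, only the first chunk benefits. Its insert becomes the predicated mov z0.d, p0/m, z1.d and one stack slot is saved (addvl sp, sp, #-3 instead of #-4), while the remaining chunks are still written through the stack because their insertion offsets depend on the runtime vector length.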
79 changes: 27 additions & 52 deletions llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -4,14 +4,9 @@
define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
ret <vscale x 2 x i64> %retval
@@ -43,14 +38,9 @@ define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2
define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
ret <vscale x 4 x i32> %retval
@@ -82,14 +72,9 @@ define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4
define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%retval = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0)
ret <vscale x 8 x i16> %retval
@@ -121,14 +106,9 @@ define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8
define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z0.b, p0/m, z1.b
; CHECK-NEXT: ret
%retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0)
ret <vscale x 16 x i8> %retval
@@ -469,7 +449,7 @@ define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vs
define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv2bf16_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: z0.d, z1.d
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%v0 = call <vscale x 2 x bfloat> @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
ret <vscale x 2 x bfloat> %v0
@@ -478,7 +458,7 @@ define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv
define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: z0.d, z1.d
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
ret <vscale x 4 x bfloat> %v0
@@ -487,15 +467,15 @@ define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv
define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_v4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addpl x8, sp, #4
; CHECK-NEXT: str d1, [x8]
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT: addpl x8, sp, #4
; CHECK-NEXT: str d1, [x8]
; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0)
ret <vscale x 4 x bfloat> %v0
@@ -504,7 +484,7 @@ define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0,
define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: z0.d, z1.d
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
%v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0)
ret <vscale x 8 x bfloat> %v0
@@ -513,14 +493,9 @@ define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv
define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_v8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0)
ret <vscale x 8 x bfloat> %v0
