[ARM] Lower MVETRUNC to stack operations
The MVETRUNC node truncates two wide vectors to a single vector with
narrower elements. This is usually lowered to a series of extract/insert
elements, going via GPR registers. This patch changes that to instead
use a pair of truncating stores and a stack reload. This cuts down the
number of instructions at the expense of some stack space.

Differential Revision: https://reviews.llvm.org/D104515
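As an illustration of the new lowering (a sketch based on the comment added in the patch and on the updated tests; register numbers and stack offsets are illustrative and will vary with the surrounding code), truncating an <8 x i32> value held in two q registers down to <8 x i16> is now expected to produce roughly:

    vstrh.32 q0, [sp]      @ truncating store of the low <4 x i32> half
    vstrh.32 q1, [sp, #8]  @ truncating store of the high <4 x i32> half
    vldrw.u32 q0, [sp]     @ reload the full <8 x i16> result in lane order

rather than moving each lane through a GPR with vmov and rebuilding the vector element by element.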
davemgreen committed Jun 26, 2021
1 parent d6144c3 commit 41d8149
Showing 5 changed files with 162 additions and 261 deletions.
55 changes: 38 additions & 17 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17217,7 +17217,7 @@ static SDValue PerformBITCASTCombine(SDNode *N,
}

// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
// node into a buildvector after legalizeOps.
// node into stack operations after legalizeOps.
SDValue ARMTargetLowering::PerformMVETruncCombine(
SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -17265,7 +17265,14 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
}
}

auto LowerToBuildVec = [&]() {
// For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
// truncate to a buildvector to allow the generic optimisations to kick in.
if (all_of(N->ops(), [](SDValue Op) {
return Op.getOpcode() == ISD::BUILD_VECTOR ||
Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
(Op.getOpcode() == ISD::BITCAST &&
Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
})) {
SmallVector<SDValue, 8> Extracts;
for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
SDValue O = N->getOperand(Op);
@@ -17276,26 +17283,40 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
}
}
return DAG.getBuildVector(VT, DL, Extracts);
};

// For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
// truncate to a buildvector to allow the generic optimisations to kick in.
if (all_of(N->ops(), [](SDValue Op) {
return Op.getOpcode() == ISD::BUILD_VECTOR ||
Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
(Op.getOpcode() == ISD::BITCAST &&
Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
}))
return LowerToBuildVec();
}

// If we are late in the legalization process and nothing has optimised
// the trunc to anything better lower it to a series of extracts and a
// buildvector.
// the trunc to anything better, lower it to a stack store and reload,
// performing the truncation whilst keeping the lanes in the correct order:
// VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
if (DCI.isBeforeLegalizeOps())
return SDValue();

SDValue BuildVec = LowerToBuildVec();
return LowerBUILD_VECTOR(BuildVec, DCI.DAG, Subtarget);
SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
int NumIns = N->getNumOperands();
assert((NumIns == 2 || NumIns == 4) &&
"Expected 2 or 4 inputs to an MVETrunc");
EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
if (N->getNumOperands() == 4)
StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());

SmallVector<SDValue> Chains;
for (int I = 0; I < NumIns; I++) {
SDValue Ptr = DAG.getNode(
ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
Ptr, MPI, StoreVT, Align(4));
Chains.push_back(Ch);
}

SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
}

SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
159 changes: 71 additions & 88 deletions llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -275,105 +275,88 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i8i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vmov.u8 r0, q1[14]
; CHECK-NEXT: vmov.u8 r1, q1[12]
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u8 r0, q1[15]
; CHECK-NEXT: vmov.u8 r1, q1[13]
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov.u8 r1, q1[14]
; CHECK-NEXT: vmov.u8 r2, q1[12]
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[15]
; CHECK-NEXT: vmov.u8 r2, q1[13]
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u8 r0, q0[14]
; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[14]
; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmov.u8 r1, q0[13]
; CHECK-NEXT: vmov.u8 r4, q1[6]
; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
; CHECK-NEXT: vmov.u8 r0, q1[2]
; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[15]
; CHECK-NEXT: vmov.u8 r2, q0[13]
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[10]
; CHECK-NEXT: vmovlb.s8 q4, q4
; CHECK-NEXT: vmov.u8 r1, q1[0]
; CHECK-NEXT: vmov.u8 r2, q1[8]
; CHECK-NEXT: vmovlb.s16 q4, q4
; CHECK-NEXT: vmov.u8 r5, q1[4]
; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vshr.u32 q3, q3, #1
; CHECK-NEXT: vmov lr, r12, d7
; CHECK-NEXT: vmov r3, r2, d6
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u8 r0, q1[3]
; CHECK-NEXT: vmov.u8 r1, q1[1]
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: vstrb.32 q3, [r0, #12]
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[11]
; CHECK-NEXT: vmov.u8 r2, q1[9]
; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[10]
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[11]
; CHECK-NEXT: vmov.u8 r2, q0[9]
; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[6]
; CHECK-NEXT: vmovlb.s8 q4, q4
; CHECK-NEXT: vmov.u8 r2, q1[4]
; CHECK-NEXT: vmovlb.s16 q4, q4
; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
; CHECK-NEXT: vmov.u8 r4, q1[7]
; CHECK-NEXT: vmov.u8 r5, q1[5]
; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
; CHECK-NEXT: vmov.u8 r4, q0[6]
; CHECK-NEXT: vmov.u8 r5, q0[4]
; CHECK-NEXT: vshr.u32 q3, q3, #1
; CHECK-NEXT: vmov q5[2], q5[0], r5, r4
; CHECK-NEXT: vmov.u8 r4, q0[7]
; CHECK-NEXT: vmov.u8 r5, q0[5]
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov q5[3], q5[1], r5, r4
; CHECK-NEXT: vmov.u8 r4, q0[10]
; CHECK-NEXT: vmovlb.s8 q5, q5
; CHECK-NEXT: vmov.u8 r5, q0[8]
; CHECK-NEXT: vmovlb.s16 q5, q5
; CHECK-NEXT: vmov r1, r0, d6
; CHECK-NEXT: vadd.i32 q4, q5, q4
; CHECK-NEXT: vmov q5[2], q5[0], r5, r4
; CHECK-NEXT: vmov.u8 r4, q0[11]
; CHECK-NEXT: vmov.u8 r5, q0[9]
; CHECK-NEXT: vmov q5[3], q5[1], r5, r4
; CHECK-NEXT: vmov.8 q0[0], r1
; CHECK-NEXT: vmov.u8 r4, q1[10]
; CHECK-NEXT: vmov.u8 r5, q1[8]
; CHECK-NEXT: vmov q6[2], q6[0], r5, r4
; CHECK-NEXT: vmov.8 q0[1], r0
; CHECK-NEXT: vmov r0, r1, d7
; CHECK-NEXT: vmov.u8 r4, q1[11]
; CHECK-NEXT: vmov.u8 r5, q1[9]
; CHECK-NEXT: vmov.8 q0[2], r0
; CHECK-NEXT: vmov q6[3], q6[1], r5, r4
; CHECK-NEXT: vshr.u32 q4, q4, #1
; CHECK-NEXT: vmov.8 q0[3], r1
; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: vand q1, q6, q2
; CHECK-NEXT: vmovlb.s8 q2, q5
; CHECK-NEXT: vmov.8 q0[4], r0
; CHECK-NEXT: vmovlb.s16 q2, q2
; CHECK-NEXT: vadd.i32 q1, q2, q1
; CHECK-NEXT: vmov r4, r5, d9
; CHECK-NEXT: vmov.8 q0[5], r1
; CHECK-NEXT: vshr.u32 q1, q1, #1
; CHECK-NEXT: vmov.8 q0[6], r4
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmov.8 q0[7], r5
; CHECK-NEXT: vmov r4, r5, d2
; CHECK-NEXT: vmov.8 q0[8], r4
; CHECK-NEXT: vmov.8 q0[9], r5
; CHECK-NEXT: vmov.8 q0[10], r0
; CHECK-NEXT: vmov.8 q0[11], r1
; CHECK-NEXT: vmov.8 q0[12], r3
; CHECK-NEXT: vmov.8 q0[13], r2
; CHECK-NEXT: vmov.8 q0[14], lr
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: vstrb.32 q3, [r0, #8]
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[7]
; CHECK-NEXT: vmov.u8 r2, q1[5]
; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[6]
; CHECK-NEXT: vmov.u8 r2, q0[4]
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[7]
; CHECK-NEXT: vmov.u8 r2, q0[5]
; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[2]
; CHECK-NEXT: vmovlb.s8 q4, q4
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: vmovlb.s16 q4, q4
; CHECK-NEXT: vadd.i32 q3, q4, q3
; CHECK-NEXT: vshr.u32 q3, q3, #1
; CHECK-NEXT: vstrb.32 q3, [r0, #4]
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[3]
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[2]
; CHECK-NEXT: vmov.u8 r2, q0[0]
; CHECK-NEXT: vand q1, q3, q2
; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[3]
; CHECK-NEXT: vmov.u8 r2, q0[1]
; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
; CHECK-NEXT: vmovlb.s8 q0, q2
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: vshr.u32 q0, q0, #1
; CHECK-NEXT: vstrb.32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%sa = sext <16 x i8> %a to <16 x i32>
%sb = zext <16 x i8> %b to <16 x i32>
27 changes: 10 additions & 17 deletions llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll
@@ -362,23 +362,16 @@ entry:
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vmov.16 q2[0], r4
; CHECK-NEXT: vmov lr, r12, d3
; CHECK-NEXT: vmov r3, r2, d2
; CHECK-NEXT: vldrb.u16 q1, [r1]
; CHECK-NEXT: vmov r1, r4, d1
; CHECK-NEXT: vmov.16 q2[1], r5
; CHECK-NEXT: vmov.16 q2[2], r1
; CHECK-NEXT: vmov.16 q2[3], r4
; CHECK-NEXT: vmov.16 q2[4], r3
; CHECK-NEXT: vmov.16 q2[5], r2
; CHECK-NEXT: vmov.16 q2[6], lr
; CHECK-NEXT: vmov.16 q2[7], r12
; CHECK-NEXT: vstrh.16 q2, [r0, q1]
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov r2, sp
; CHECK-NEXT: vstrh.32 q1, [r2, #8]
; CHECK-NEXT: vstrh.32 q0, [r2]
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vstrh.16 q1, [r0, q0]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
entry:
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
%offs.zext = zext <8 x i8> %offs to <8 x i32>
85 changes: 20 additions & 65 deletions llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll
@@ -374,38 +374,18 @@ entry:
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov r4, r5, d0
; CHECK-NEXT: vmov.8 q4[0], r4
; CHECK-NEXT: vmov lr, r12, d7
; CHECK-NEXT: vmov r3, r2, d6
; CHECK-NEXT: vldrb.u8 q3, [r1]
; CHECK-NEXT: vmov r1, r4, d1
; CHECK-NEXT: vmov.8 q4[1], r5
; CHECK-NEXT: vmov.8 q4[2], r1
; CHECK-NEXT: vmov r1, r5, d2
; CHECK-NEXT: vmov.8 q4[3], r4
; CHECK-NEXT: vmov.8 q4[4], r1
; CHECK-NEXT: vmov r1, r4, d3
; CHECK-NEXT: vmov.8 q4[5], r5
; CHECK-NEXT: vmov.8 q4[6], r1
; CHECK-NEXT: vmov r1, r5, d4
; CHECK-NEXT: vmov.8 q4[7], r4
; CHECK-NEXT: vmov.8 q4[8], r1
; CHECK-NEXT: vmov r1, r4, d5
; CHECK-NEXT: vmov.8 q4[9], r5
; CHECK-NEXT: vmov.8 q4[10], r1
; CHECK-NEXT: vmov.8 q4[11], r4
; CHECK-NEXT: vmov.8 q4[12], r3
; CHECK-NEXT: vmov.8 q4[13], r2
; CHECK-NEXT: vmov.8 q4[14], lr
; CHECK-NEXT: vmov.8 q4[15], r12
; CHECK-NEXT: vstrb.8 q4, [r0, q3]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov r2, sp
; CHECK-NEXT: vstrb.32 q3, [r2, #12]
; CHECK-NEXT: vstrb.32 q2, [r2, #8]
; CHECK-NEXT: vstrb.32 q1, [r2, #4]
; CHECK-NEXT: vstrb.32 q0, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vstrb.8 q1, [r0, q0]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
%offs.zext = zext <16 x i8> %offs to <16 x i32>
@@ -418,40 +398,15 @@ entry:
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov.u16 r2, q1[7]
; CHECK-NEXT: vmov.8 q2[0], r3
; CHECK-NEXT: vmov.u16 r3, q0[1]
; CHECK-NEXT: vmov.8 q2[1], r3
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: vmov.8 q2[2], r3
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.8 q2[3], r3
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vmov.8 q2[4], r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: vmov.8 q2[5], r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov.8 q2[6], r3
; CHECK-NEXT: vmov.u16 r3, q0[7]
; CHECK-NEXT: vmov.8 q2[7], r3
; CHECK-NEXT: vmov.u16 r3, q1[0]
; CHECK-NEXT: vmov.8 q2[8], r3
; CHECK-NEXT: vmov.u16 r3, q1[1]
; CHECK-NEXT: vmov.8 q2[9], r3
; CHECK-NEXT: vmov.u16 r3, q1[2]
; CHECK-NEXT: vmov.8 q2[10], r3
; CHECK-NEXT: vmov.u16 r3, q1[3]
; CHECK-NEXT: vmov.8 q2[11], r3
; CHECK-NEXT: vmov.u16 r3, q1[4]
; CHECK-NEXT: vmov.8 q2[12], r3
; CHECK-NEXT: vmov.u16 r3, q1[5]
; CHECK-NEXT: vmov.8 q2[13], r3
; CHECK-NEXT: vmov.u16 r3, q1[6]
; CHECK-NEXT: vmov.8 q2[14], r3
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov r2, sp
; CHECK-NEXT: vstrb.16 q1, [r2, #8]
; CHECK-NEXT: vstrb.16 q0, [r2]
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vmov.8 q2[15], r2
; CHECK-NEXT: vstrb.8 q2, [r0, q0]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vstrb.8 q1, [r0, q0]
; CHECK-NEXT: add sp, #16
; CHECK-NEXT: bx lr
entry:
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
