diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 332fb37655288..48bb2d346b483 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21471,6 +21471,53 @@ bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
          (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
 }
 
+// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
+static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  SDValue Value = ST->getValue();
+  EVT ValueVT = Value.getValueType();
+
+  if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+      Value.getOpcode() != ISD::TRUNCATE ||
+      ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
+    return SDValue();
+
+  assert(ST->getOffset().isUndef() && "undef offset expected");
+  SDLoc DL(ST);
+  auto WideVT = EVT::getVectorVT(
+      *DAG.getContext(),
+      Value->getOperand(0).getValueType().getVectorElementType(), 4);
+  SDValue UndefVector = DAG.getUNDEF(WideVT);
+  SDValue WideTrunc = DAG.getNode(
+      ISD::INSERT_SUBVECTOR, DL, WideVT,
+      {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
+  SDValue Cast = DAG.getNode(
+      ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
+      WideTrunc);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain = ST->getChain();
+  MachineMemOperand *MMO = ST->getMemOperand();
+  unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
+  SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+                           DAG.getConstant(2 * IdxScale, DL, MVT::i64));
+  TypeSize Offset2 = TypeSize::getFixed(2);
+  SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
+  Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
+
+  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+                           DAG.getConstant(1 * IdxScale, DL, MVT::i64));
+  TypeSize Offset1 = TypeSize::getFixed(1);
+  SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
+  Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
+
+  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+                           DAG.getConstant(0, DL, MVT::i64));
+  Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
+                       MF.getMachineMemOperand(MMO, 0, 1));
+  return Chain;
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -21486,6 +21533,9 @@ static SDValue performSTORECombine(SDNode *N,
     return EltVT == MVT::f32 || EltVT == MVT::f64;
   };
 
+  if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
+    return Res;
+
   // If this is an FP_ROUND followed by a store, fold this into a truncating
   // store. We can do this even if this is already a truncstore.
   // We purposefully don't care about legality of the nodes here as we know
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 0ef87c3293055..21079ef778776 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -348,17 +348,15 @@ define <3 x i32> @load_v3i32(ptr %src) {
 define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
 ; CHECK-LABEL: store_trunc_from_64bits:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ldrh w8, [x0, #4]
-; CHECK-NEXT:    mov.h v0[2], w8
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    str s0, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x1, #2]
-; CHECK-NEXT:    strh w9, [x1]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    add x9, x0, #4
+; CHECK-NEXT:    ld1r.4h { v0 }, [x9]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    strb w8, [x1]
+; CHECK-NEXT:    add x8, x1, #1
+; CHECK-NEXT:    st1.b { v1 }[2], [x8]
+; CHECK-NEXT:    add x8, x1, #2
+; CHECK-NEXT:    st1.b { v0 }[4], [x8]
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: store_trunc_from_64bits:
@@ -387,23 +385,19 @@ entry:
 define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
 ; CHECK-LABEL: store_trunc_add_from_64bits:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    add x9, x0, #4
 ; CHECK-NEXT:  Lloh0:
 ; CHECK-NEXT:    adrp x8, lCPI9_0@PAGE
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    ldr d1, [x8, lCPI9_0@PAGEOFF]
+; CHECK-NEXT:    add x8, x1, #1
 ; CHECK-NEXT:    ld1.h { v0 }[2], [x9]
+; CHECK-NEXT:    add x9, x1, #2
 ; CHECK-NEXT:    add.4h v0, v0, v1
-; CHECK-NEXT:    xtn.8b v1, v0
-; CHECK-NEXT:    umov.h w8, v0[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x1, #2]
-; CHECK-NEXT:    strh w9, [x1]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    st1.b { v0 }[2], [x8]
+; CHECK-NEXT:    st1.b { v0 }[4], [x9]
+; CHECK-NEXT:    st1.b { v0 }[0], [x1]
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 ;
@@ -594,17 +588,13 @@ entry:
 define void @shift_trunc_store(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shift_trunc_store:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.4h v0, v0, #16
-; CHECK-NEXT:    xtn.8b v1, v0
-; CHECK-NEXT:    umov.h w8, v0[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x1, #2]
-; CHECK-NEXT:    strh w9, [x1]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    add x8, x1, #1
+; CHECK-NEXT:    add x9, x1, #2
+; CHECK-NEXT:    ushr.4s v0, v0, #16
+; CHECK-NEXT:    st1.b { v0 }[4], [x8]
+; CHECK-NEXT:    st1.b { v0 }[8], [x9]
+; CHECK-NEXT:    st1.b { v0 }[0], [x1]
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: shift_trunc_store:
@@ -632,17 +622,13 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
 define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shift_trunc_store_default_align:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.4h v0, v0, #16
-; CHECK-NEXT:    xtn.8b v1, v0
-; CHECK-NEXT:    umov.h w8, v0[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x1, #2]
-; CHECK-NEXT:    strh w9, [x1]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    add x8, x1, #1
+; CHECK-NEXT:    add x9, x1, #2
+; CHECK-NEXT:    ushr.4s v0, v0, #16
+; CHECK-NEXT:    st1.b { v0 }[4], [x8]
+; CHECK-NEXT:    st1.b { v0 }[8], [x9]
+; CHECK-NEXT:    st1.b { v0 }[0], [x1]
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: shift_trunc_store_default_align:
@@ -670,17 +656,13 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
 define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shift_trunc_store_align_4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.4h v0, v0, #16
-; CHECK-NEXT:    xtn.8b v1, v0
-; CHECK-NEXT:    umov.h w8, v0[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x1, #2]
-; CHECK-NEXT:    strh w9, [x1]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    add x8, x1, #1
+; CHECK-NEXT:    add x9, x1, #2
+; CHECK-NEXT:    ushr.4s v0, v0, #16
+; CHECK-NEXT:    st1.b { v0 }[4], [x8]
+; CHECK-NEXT:    st1.b { v0 }[8], [x9]
+; CHECK-NEXT:    st1.b { v0 }[0], [x1]
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: shift_trunc_store_align_4:
@@ -708,17 +690,14 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
 define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shift_trunc_store_const_offset_1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.4h v0, v0, #16
-; CHECK-NEXT:    xtn.8b v1, v0
-; CHECK-NEXT:    umov.h w8, v0[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x1, #3]
-; CHECK-NEXT:    sturh w9, [x1, #1]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    add x8, x1, #2
+; CHECK-NEXT:    add x9, x1, #3
+; CHECK-NEXT:    ushr.4s v0, v0, #16
+; CHECK-NEXT:    st1.b { v0 }[4], [x8]
+; CHECK-NEXT:    add x8, x1, #1
+; CHECK-NEXT:    st1.b { v0 }[8], [x9]
+; CHECK-NEXT:    st1.b { v0 }[0], [x8]
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: shift_trunc_store_const_offset_1:
@@ -747,17 +726,14 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
 define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shift_trunc_store_const_offset_3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    shrn.4h v0, v0, #16
-; CHECK-NEXT:    xtn.8b v1, v0
-; CHECK-NEXT:    umov.h w8, v0[2]
-; CHECK-NEXT:    str s1, [sp, #12]
-; CHECK-NEXT:    ldrh w9, [sp, #12]
-; CHECK-NEXT:    strb w8, [x1, #5]
-; CHECK-NEXT:    sturh w9, [x1, #3]
-; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    add x8, x1, #4
+; CHECK-NEXT:    add x9, x1, #5
+; CHECK-NEXT:    ushr.4s v0, v0, #16
+; CHECK-NEXT:    st1.b { v0 }[4], [x8]
+; CHECK-NEXT:    add x8, x1, #3
+; CHECK-NEXT:    st1.b { v0 }[8], [x9]
+; CHECK-NEXT:    st1.b { v0 }[0], [x8]
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: shift_trunc_store_const_offset_3:
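
For reference, the kind of input the new combine targets looks roughly like the IR below. This is an illustrative sketch only; the function name and the <3 x i16> source type are assumptions and are not copied from the test file:

define void @store_trunc_example(ptr %src, ptr %dst) {
entry:
  %l = load <3 x i16>, ptr %src, align 1        ; load a 3-element vector
  %t = trunc <3 x i16> %l to <3 x i8>           ; narrow each lane to i8
  store <3 x i8> %t, ptr %dst, align 1          ; 3-byte store handled by combineI8TruncStore
  ret void
}

With combineI8TruncStore, the <3 x i8> store is emitted as three single-byte stores of lanes 0, 1 and 2 of the (bitcast) wide vector, which is why the updated CHECK lines use st1.b with lane indices scaled by the source element size (e.g. [4] and [8] for a v4i32 source), instead of spilling the truncated vector to a stack slot and reloading it as in the removed CHECK lines.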