[SVE][CodeGen] Call refineIndexType & refineUniformBase from visitMGATHER

The refineIndexType & refineUniformBase functions added by D90942 can also be used to
improve CodeGen of masked gathers.

These changes were split out from D91092.
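
For reference, the kind of pattern this combine improves is a masked gather whose 32-bit offsets are sign-extended to form the 64-bit index. The sketch below is adapted from masked_gather_nxv2i64 in the updated sve-masked-gather-32b-signed-scaled.ll test; the alignment argument (i32 8) is illustrative.

define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; The i32 offsets are implicitly sign-extended when forming the gather index.
  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ret <vscale x 2 x i64> %vals
}
declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)

With refineIndexType now applied to gathers, the sign-extension of the offsets is folded into the addressing mode, so this selects a single ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] rather than a separate ptrue/sxtw pair followed by the gather (see the updated CHECK lines below).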

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D92319
kmclaughlin-arm committed Dec 7, 2020
1 parent ecaff13 commit 111f559
Showing 8 changed files with 103 additions and 365 deletions.
29 changes: 24 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9410,13 +9410,13 @@ bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) {
}

// Fold sext/zext of index into index type.
bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
SelectionDAG &DAG) {
bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index,
bool Scaled, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();

if (Index.getOpcode() == ISD::ZERO_EXTEND) {
SDValue Op = Index.getOperand(0);
MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
Index = Op;
return true;
@@ -9425,7 +9425,7 @@ bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,

if (Index.getOpcode() == ISD::SIGN_EXTEND) {
SDValue Op = Index.getOperand(0);
MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
Index = Op;
return true;
@@ -9494,11 +9494,30 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
SDValue DAGCombiner::visitMGATHER(SDNode *N) {
MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
SDValue Mask = MGT->getMask();
SDValue Chain = MGT->getChain();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
SDValue PassThru = MGT->getPassThru();
SDValue BasePtr = MGT->getBasePtr();
SDLoc DL(N);

// Zap gathers with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MGT->getPassThru(), MGT->getChain());
return CombineTo(N, PassThru, MGT->getChain());

if (refineUniformBase(BasePtr, Index, DAG)) {
SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
PassThru.getValueType(), DL, Ops,
MGT->getMemOperand(), MGT->getIndexType());
}

if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
PassThru.getValueType(), DL, Ops,
MGT->getMemOperand(), MGT->getIndexType());
}

return SDValue();
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3894,6 +3894,9 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,

SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);

if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);

SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
VTs, Ops);
22 changes: 4 additions & 18 deletions llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -8,8 +8,6 @@
define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
@@ -22,8 +20,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
@@ -36,8 +32,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT: ret
%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
@@ -48,8 +42,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32>
define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
@@ -60,8 +52,6 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32
define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ret
%ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets
@@ -72,8 +62,6 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i
define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT: ret
%ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets
@@ -84,10 +72,9 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: sxth z0.d, p1/m, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -98,10 +85,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
134 changes: 19 additions & 115 deletions llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -8,8 +8,6 @@
define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
@@ -22,12 +20,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
@@ -40,12 +33,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
@@ -58,12 +46,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
@@ -74,12 +57,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
@@ -90,12 +68,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
@@ -106,12 +79,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
@@ -122,10 +90,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: sxtb z0.d, p1/m, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -136,13 +103,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: sxth z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -154,13 +117,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -188,18 +147,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
@@ -212,18 +160,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
@@ -234,18 +171,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
@@ -256,18 +182,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
@@ -291,19 +206,8 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
