[SVE][CodeGen] Call refineIndexType & refineUniformBase from visitMGATHER

The refineIndexType & refineUniformBase functions added by D90942 can also be used to
improve CodeGen of masked gathers.

These changes were split out from D91092.
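
For reference, the kind of pattern this combine improves is a masked gather whose 32-bit offsets are sign-extended to form the 64-bit index. The sketch below is adapted from masked_gather_nxv2i64 in the updated sve-masked-gather-32b-signed-scaled.ll test; the alignment argument (i32 8) is illustrative.

define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; The i32 offsets are implicitly sign-extended when forming the gather index.
  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ret <vscale x 2 x i64> %vals
}
declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)

With refineIndexType now applied to gathers, the sign-extension of the offsets is folded into the addressing mode, so this selects a single ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] rather than a separate ptrue/sxtw pair followed by the gather (see the updated CHECK lines below).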

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D92319
kmclaughlin-arm committed Dec 7, 2020
1 parent ecaff13 commit 111f559
Showing 8 changed files with 103 additions and 365 deletions.
29 changes: 24 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9410,13 +9410,13 @@ bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) {
}

// Fold sext/zext of index into index type.
bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
SelectionDAG &DAG) {
bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index,
bool Scaled, SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();

if (Index.getOpcode() == ISD::ZERO_EXTEND) {
SDValue Op = Index.getOperand(0);
MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
Index = Op;
return true;
@@ -9425,7 +9425,7 @@ bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,

if (Index.getOpcode() == ISD::SIGN_EXTEND) {
SDValue Op = Index.getOperand(0);
MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
Index = Op;
return true;
@@ -9494,11 +9494,30 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
SDValue DAGCombiner::visitMGATHER(SDNode *N) {
MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
SDValue Mask = MGT->getMask();
SDValue Chain = MGT->getChain();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
SDValue PassThru = MGT->getPassThru();
SDValue BasePtr = MGT->getBasePtr();
SDLoc DL(N);

// Zap gathers with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MGT->getPassThru(), MGT->getChain());
return CombineTo(N, PassThru, MGT->getChain());

if (refineUniformBase(BasePtr, Index, DAG)) {
SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
PassThru.getValueType(), DL, Ops,
MGT->getMemOperand(), MGT->getIndexType());
}

if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
PassThru.getValueType(), DL, Ops,
MGT->getMemOperand(), MGT->getIndexType());
}

return SDValue();
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3894,6 +3894,9 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,

SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);

if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);

SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
VTs, Ops);
22 changes: 4 additions & 18 deletions llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -8,8 +8,6 @@
define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
@@ -22,8 +20,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
@@ -36,8 +32,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT: ret
%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
@@ -48,8 +42,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32>
define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
@@ -60,8 +52,6 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32
define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ret
%ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets
@@ -72,8 +62,6 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i
define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
; CHECK-NEXT: ret
%ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets
@@ -84,10 +72,9 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: sxth z0.d, p1/m, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -98,10 +85,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
134 changes: 19 additions & 115 deletions llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -8,8 +8,6 @@
define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
@@ -22,12 +20,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
@@ -40,12 +33,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
@@ -58,12 +46,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
@@ -74,12 +57,7 @@ define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
@@ -90,12 +68,7 @@ define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
@@ -106,12 +79,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
@@ -122,10 +90,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: sxtb z0.d, p1/m, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -136,13 +103,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: sxth z0.d, p1/m, z0.d
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -154,13 +117,9 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -188,18 +147,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
@@ -212,18 +160,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
@@ -234,18 +171,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
@@ -256,18 +182,7 @@ define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
@@ -291,19 +206,8 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, x0
; CHECK-NEXT: sunpklo z2.d, z0.s
; CHECK-NEXT: sunpkhi z0.d, z0.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z2.d, z1.d, z2.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
