From db04d3e30b3878ae39ef64eb0b0a1538644c7f6a Mon Sep 17 00:00:00 2001
From: David Truby
Date: Tue, 16 Nov 2021 11:33:12 +0000
Subject: [PATCH] [AArch64][SVE][VLS] Move extends into arguments of comparisons

When a comparison is extended and it would be free to extend the arguments
to that comparison, we can propagate the extend into those arguments
instead. This prevents extra instructions from being generated to extend
the result of the comparison, which is not free to extend.

Differential Revision: https://reviews.llvm.org/D116812
---
 .../Target/AArch64/AArch64ISelLowering.cpp   |  39 ++++
 .../AArch64/sve-fixed-length-masked-loads.ll | 196 ++++++------------
 2 files changed, 105 insertions(+), 130 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e2ef4ff786376..3f385ba91887e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15316,6 +15316,40 @@ static SDValue performIntrinsicCombine(SDNode *N,
   return SDValue();
 }
 
+static bool isCheapToExtend(const SDValue &N) {
+  unsigned OC = N->getOpcode();
+  return OC == ISD::LOAD || OC == ISD::MLOAD ||
+         ISD::isConstantSplatVectorAllZeros(N.getNode());
+}
+
+static SDValue
+performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              SelectionDAG &DAG) {
+  // If we have (sext (setcc A B)) and A and B are cheap to extend,
+  // we can move the sext into the arguments and have the same result. For
+  // example, if A and B are both loads, we can make those extending loads and
+  // avoid an extra instruction. This pattern appears often in VLS code
+  // generation where the inputs to the setcc have a different size to the
+  // instruction that wants to use the result of the setcc.
+  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
+         N->getOperand(0)->getOpcode() == ISD::SETCC);
+  const SDValue SetCC = N->getOperand(0);
+
+  if (isCheapToExtend(SetCC.getOperand(0)) &&
+      isCheapToExtend(SetCC.getOperand(1))) {
+    const SDValue Ext1 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
+                                     N->getValueType(0), SetCC.getOperand(0));
+    const SDValue Ext2 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
+                                     N->getValueType(0), SetCC.getOperand(1));
+
+    return DAG.getSetCC(
+        SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
+        cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
+  }
+
+  return SDValue();
+}
+
 static SDValue performExtendCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -15334,6 +15368,11 @@ static SDValue performExtendCombine(SDNode *N,
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
   }
+
+  if (N->getOpcode() == ISD::SIGN_EXTEND &&
+      N->getOperand(0)->getOpcode() == ISD::SETCC)
+    return performSignExtendSetCCCombine(N, DCI, DAG);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index d3ad7df60e669..6bb85a815389e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -265,12 +265,10 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
 define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.b, vl32
-; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
 %b = load <32 x i8>, <32 x i8>* %bp
@@ -283,12 +281,9 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: cmeq v0.16b, v0.16b, #0
-; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -302,13 +297,9 @@ define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
 define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr d0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0
-; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -322,12 +313,10 @@ define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.h, vl16
-; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
 %b = load <16 x i16>, <16 x i16>* %bp
@@ -340,12 +329,9 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -359,12 +345,10 @@ define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
 %b = load <8 x i32>, <8 x i32>* %bp
@@ -377,12 +361,10 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.b, vl32
-; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
 %b = load <32 x i8>, <32 x i8>* %bp
@@ -395,12 +377,9 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: cmeq v0.16b, v0.16b, #0
-; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -414,13 +393,9 @@ define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
 define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr d0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0
-; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -434,12 +409,10 @@ define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
 define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.h, vl16
-; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
 %b = load <16 x i16>, <16 x i16>* %bp
@@ -452,12 +425,9 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr q0, [x1]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -471,12 +441,10 @@ define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
 define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_512-NEXT: ret
 %b = load <8 x i32>, <8 x i32>* %bp
@@ -681,12 +649,10 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp)
 define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
+; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <128 x i8>, <128 x i8>* %bp
@@ -699,13 +665,10 @@ define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
 define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl64
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <64 x i8>, <64 x i8>* %bp
@@ -718,14 +681,10 @@ define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
 define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <32 x i8>, <32 x i8>* %bp
@@ -738,12 +697,10 @@ define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <64 x i16>, <64 x i16>* %bp
@@ -756,13 +713,10 @@ define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
 define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <32 x i16>, <32 x i16>* %bp
@@ -775,12 +729,10 @@ define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
 define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <32 x i32>, <32 x i32>* %bp
@@ -793,12 +745,10 @@ define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp)
 define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
+; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: ld1b { z0.h }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <128 x i8>, <128 x i8>* %bp
@@ -811,13 +761,10 @@ define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
 define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl64
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1b { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: ld1b { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <64 x i8>, <64 x i8>* %bp
@@ -830,14 +777,10 @@ define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
 define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <32 x i8>, <32 x i8>* %bp
@@ -850,12 +793,10 @@ define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
 define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <64 x i16>, <64 x i16>* %bp
@@ -868,13 +809,10 @@ define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
 define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <32 x i16>, <32 x i16>* %bp
@@ -887,12 +825,10 @@ define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
 define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
 ; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64:
 ; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x0]
 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
 ; VBITS_GE_2048-NEXT: ret
 %b = load <32 x i32>, <32 x i32>* %bp
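
Editor's illustration (not part of the patch): the combine added above fires on the DAG
pattern sign_extend (setcc A, B, cc) when both setcc operands are cheap to extend, i.e.
plain loads, masked loads, or all-zeros splats. Below is a minimal IR sketch of the kind
of function that exercises it; the function name, alignment, and attribute body are
illustrative assumptions rather than text copied from the test file, and the expected
codegen change corresponds to the masked_load_sext_v8i32i64 hunk above (compiled with
something like llc -aarch64-sve-vector-bits-min=512).

; Hypothetical reduced test, assuming the standard masked.load intrinsic signature.
declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32 immarg, <8 x i1>, <8 x i32>)

define <8 x i64> @sext_setcc_of_loads(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
  ; The compare result drives a masked load whose value is then sign-extended to
  ; i64 elements, so during lowering the DAG contains
  ; sign_extend (setcc (load %bp), splat(0), eq), which the new combine rewrites
  ; into setcc (sextload %bp), splat(0), eq.
  %b = load <8 x i32>, <8 x i32>* %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %ap, i32 4, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  ret <8 x i64> %ext
}

attributes #0 = { "target-features"="+sve" }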