From 9682d0d5dcc50d4583709bcb2e63f1ad630fe9a0 Mon Sep 17 00:00:00 2001
From: Petre-Ionut Tudor
Date: Tue, 21 Apr 2020 14:11:13 +0100
Subject: [PATCH] [ARM] Refactor lower to S[LR]I optimization

Summary:
The optimization has been refactored to fix certain bugs and limitations.
The condition for lowering to S[LR]I has been changed to match the
pseudocode description of the SLI and SRI operations in the Arm
Architecture Reference Manual. The optimization now handles more
combinations of operand types and operand order.

Subscribers: kristof.beyls, hiraditya, danielkiss, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79233
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 105 +++--
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  10 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +-
 .../test/CodeGen/AArch64/arm64-sli-sri-opt.ll | 439 +++++++++++++++++-
 4 files changed, 512 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 594be76f7ad7d..3270ba27d5904 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -99,11 +99,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 
-static cl::opt<bool>
-EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
-                           cl::desc("Allow AArch64 SLI/SRI formation"),
-                           cl::init(false));
-
 // FIXME: The necessary dtprel relocations don't seem to be supported
 // well in the GNU bfd and gold linkers at the moment. Therefore, by
 // default, for now, fall back to GeneralDynamic code generation.
@@ -1340,6 +1335,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
   case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
   case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
+  case AArch64ISD::VSLI: return "AArch64ISD::VSLI";
+  case AArch64ISD::VSRI: return "AArch64ISD::VSRI";
   case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
   case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
   case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
@@ -3173,6 +3170,23 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
            "llvm.eh.recoverfp must take a function as the first argument");
     return IncomingFPOp;
   }
+
+  case Intrinsic::aarch64_neon_vsri:
+  case Intrinsic::aarch64_neon_vsli: {
+    EVT Ty = Op.getValueType();
+
+    if (!Ty.isVector())
+      report_fatal_error("Unexpected type for aarch64_neon_vsli");
+
+    uint64_t ShiftAmount = Op.getConstantOperandVal(3);
+    unsigned ElemSizeInBits = Ty.getScalarSizeInBits();
+    assert(ShiftAmount <= ElemSizeInBits);
+
+    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
+    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
+                       Op.getOperand(3));
+  }
   }
 }
 
@@ -7950,8 +7964,10 @@ static unsigned getIntrinsicID(const SDNode *N) {
 
 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
-// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
-// Also, logical shift right -> sri, with the same structure.
+// BUILD_VECTOR with constant element C1, C2 is a constant, and:
+//   - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
+//   - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
+// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
 
@@ -7960,49 +7976,70 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
 
   SDLoc DL(N);
 
-  // Is the first op an AND?
-  const SDValue And = N->getOperand(0);
-  if (And.getOpcode() != ISD::AND)
+  SDValue And;
+  SDValue Shift;
+
+  SDValue FirstOp = N->getOperand(0);
+  unsigned FirstOpc = FirstOp.getOpcode();
+  SDValue SecondOp = N->getOperand(1);
+  unsigned SecondOpc = SecondOp.getOpcode();
+
+  // Is one of the operands an AND or a BICi? The AND may have been optimised to
+  // a BICi in order to use an immediate instead of a register.
+  // Is the other operand a shl or lshr? This will have been turned into:
+  // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
+  if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
+      (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
+    And = FirstOp;
+    Shift = SecondOp;
+
+  } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
+             (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
+    And = SecondOp;
+    Shift = FirstOp;
+  } else
     return SDValue();
 
-  // Is the second op a shl or lshr?
-  SDValue Shift = N->getOperand(1);
-  // This will have been turned into: AArch64ISD::VSHL vector, #shift
-  // or AArch64ISD::VLSHR vector, #shift
-  unsigned ShiftOpc = Shift.getOpcode();
-  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
-    return SDValue();
-
-  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+  bool IsAnd = And.getOpcode() == ISD::AND;
+  bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
 
   // Is the shift amount constant?
   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
   if (!C2node)
     return SDValue();
 
-  // Is the and mask vector all constant?
   uint64_t C1;
-  if (!isAllConstantBuildVector(And.getOperand(1), C1))
-    return SDValue();
+  if (IsAnd) {
+    // Is the and mask vector all constant?
+    if (!isAllConstantBuildVector(And.getOperand(1), C1))
+      return SDValue();
+  } else {
+    // Reconstruct the corresponding AND immediate from the two BICi immediates.
+    ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
+    ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
+    assert(C1nodeImm && C1nodeShift);
+    C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
+  }
 
-  // Is C1 == ~C2, taking into account how much one can shift elements of a
-  // particular size?
+  // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
+  // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
+  // how much one can shift elements of a particular size?
   uint64_t C2 = C2node->getZExtValue();
   unsigned ElemSizeInBits = VT.getScalarSizeInBits();
   if (C2 > ElemSizeInBits)
     return SDValue();
-  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
-  if ((C1 & ElemMask) != (~C2 & ElemMask))
+
+  APInt C1AsAPInt(ElemSizeInBits, C1);
+  APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
+                                  : APInt::getLowBitsSet(ElemSizeInBits, C2);
+  if (C1AsAPInt != RequiredC1)
    return SDValue();
 
   SDValue X = And.getOperand(0);
   SDValue Y = Shift.getOperand(0);
 
-  unsigned Intrin =
-      IsShiftRight ? 
Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, DL, MVT::i32), X, Y, - Shift.getOperand(1)); + unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; + SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1)); LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); LLVM_DEBUG(N->dump(&DAG)); @@ -8016,10 +8053,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableAArch64SlrGeneration) { - if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) - return Res; - } + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) + return Res; EVT VT = Op.getValueType(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 176b8941bd047..1e0c875da3271 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -132,6 +132,10 @@ enum NodeType : unsigned { SRSHR_I, URSHR_I, + // Vector shift by constant and insert + VSLI, + VSRI, + // Vector comparisons CMEQ, CMGE, @@ -207,8 +211,10 @@ enum NodeType : unsigned { UMULL, // Reciprocal estimates and steps. - FRECPE, FRECPS, - FRSQRTE, FRSQRTS, + FRECPE, + FRECPS, + FRSQRTE, + FRSQRTS, SUNPKHI, SUNPKLO, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 19b320d3cc440..458bff9e7af5e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -237,6 +237,10 @@ def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; +def SDT_AArch64vshiftinsert : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<3>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; @@ -475,6 +479,8 @@ def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; +def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>; +def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>; def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; @@ -5886,8 +5892,8 @@ defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; -def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", AArch64vsli>; +def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", @@ -5900,8 +5906,8 @@ defm SQSHRN : 
SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", int_aarch64_neon_sqshrn>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", int_aarch64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; -def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; +def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll index b26542d759e4f..1503df251cc3a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll @@ -1,18 +1,86 @@ -; RUN: llc < %s -aarch64-shift-insert-generation=true -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s -define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { -; CHECK-LABEL: testLeftGood: -; CHECK: sli.16b v0, v1, #3 - %and.i = and <16 x i8> %src1, +define void @testLeftGood8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind { +; CHECK-LABEL: testLeftGood8x8: +; CHECK: // %bb.0: +; CHECK-NEXT: sli.8b v0, v1, #3 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i8> %src1, + %vshl_n = shl <8 x i8> %src2, + %result = or <8 x i8> %and.i, %vshl_n + store <8 x i8> %result, <8 x i8>* %dest, align 8 + ret void +} + +define void @testLeftBad8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind { +; CHECK-LABEL: testLeftBad8x8: +; CHECK: // %bb.0: +; CHECK-NEXT: movi.8b v2, #165 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: shl.8b v1, v1, #1 +; CHECK-NEXT: orr.8b v0, v0, v1 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i8> %src1, + %vshl_n = shl <8 x i8> %src2, + %result = or <8 x i8> %and.i, %vshl_n + store <8 x i8> %result, <8 x i8>* %dest, align 8 + ret void +} + +define void @testRightGood8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind { +; CHECK-LABEL: testRightGood8x8: +; CHECK: // %bb.0: +; CHECK-NEXT: sri.8b v0, v1, #3 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i8> %src1, + %vshl_n = lshr <8 x i8> %src2, + %result = or <8 x i8> %and.i, %vshl_n + store <8 x i8> %result, <8 x i8>* %dest, align 8 + ret void +} + +define void @testRightBad8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind { +; CHECK-LABEL: testRightBad8x8: +; CHECK: // %bb.0: +; CHECK-NEXT: movi.8b v2, #165 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: ushr.8b v1, v1, #1 +; CHECK-NEXT: orr.8b v0, v0, v1 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i8> %src1, + %vshl_n = lshr <8 x i8> %src2, + %result = or <8 x i8> %and.i, %vshl_n + store <8 x i8> %result, <8 x i8>* %dest, align 8 + ret void +} + +define void @testLeftGood16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { +; CHECK-LABEL: testLeftGood16x8: +; CHECK: // %bb.0: +; CHECK-NEXT: sli.16b v0, v1, #3 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <16 x i8> %src1, %vshl_n = shl <16 x i8> %src2, %result = or <16 x i8> %and.i, %vshl_n store <16 x i8> %result, <16 x i8>* %dest, align 16 ret void } -define void @testLeftBad(<16 x i8> %src1, <16 x 
i8> %src2, <16 x i8>* %dest) nounwind { -; CHECK-LABEL: testLeftBad: -; CHECK-NOT: sli +define void @testLeftBad16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { +; CHECK-LABEL: testLeftBad16x8: +; CHECK: // %bb.0: +; CHECK-NEXT: movi.16b v2, #165 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: shl.16b v1, v1, #1 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret %and.i = and <16 x i8> %src1, %vshl_n = shl <16 x i8> %src2, %result = or <16 x i8> %and.i, %vshl_n @@ -20,22 +88,363 @@ define void @testLeftBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nou ret void } -define void @testRightGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { -; CHECK-LABEL: testRightGood: -; CHECK: sri.16b v0, v1, #3 - %and.i = and <16 x i8> %src1, +define void @testRightGood16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { +; CHECK-LABEL: testRightGood16x8: +; CHECK: // %bb.0: +; CHECK-NEXT: sri.16b v0, v1, #3 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <16 x i8> %src1, %vshl_n = lshr <16 x i8> %src2, %result = or <16 x i8> %and.i, %vshl_n store <16 x i8> %result, <16 x i8>* %dest, align 16 ret void } -define void @testRightBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { -; CHECK-LABEL: testRightBad: -; CHECK-NOT: sri +define void @testRightBad16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind { +; CHECK-LABEL: testRightBad16x8: +; CHECK: // %bb.0: +; CHECK-NEXT: movi.16b v2, #165 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: ushr.16b v1, v1, #1 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret %and.i = and <16 x i8> %src1, %vshl_n = lshr <16 x i8> %src2, %result = or <16 x i8> %and.i, %vshl_n store <16 x i8> %result, <16 x i8>* %dest, align 16 ret void } + +define void @testLeftGood4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind { +; CHECK-LABEL: testLeftGood4x16: +; CHECK: // %bb.0: +; CHECK-NEXT: sli.4h v0, v1, #14 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 x i16> %src1, + %vshl_n = shl <4 x i16> %src2, + %result = or <4 x i16> %and.i, %vshl_n + store <4 x i16> %result, <4 x i16>* %dest, align 8 + ret void +} + +define void @testLeftBad4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind { +; CHECK-LABEL: testLeftBad4x16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16500 +; CHECK-NEXT: dup.4h v2, w8 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: shl.4h v1, v1, #14 +; CHECK-NEXT: orr.8b v0, v0, v1 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 x i16> %src1, + %vshl_n = shl <4 x i16> %src2, + %result = or <4 x i16> %and.i, %vshl_n + store <4 x i16> %result, <4 x i16>* %dest, align 8 + ret void +} + +define void @testRightGood4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind { +; CHECK-LABEL: testRightGood4x16: +; CHECK: // %bb.0: +; CHECK-NEXT: sri.4h v0, v1, #14 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 x i16> %src1, + %vshl_n = lshr <4 x i16> %src2, + %result = or <4 x i16> %and.i, %vshl_n + store <4 x i16> %result, <4 x i16>* %dest, align 8 + ret void +} + +define void @testRightBad4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind { +; CHECK-LABEL: testRightBad4x16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16500 +; CHECK-NEXT: dup.4h v2, w8 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: ushr.4h v1, v1, #14 +; CHECK-NEXT: orr.8b v0, v0, v1 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 
x i16> %src1, + %vshl_n = lshr <4 x i16> %src2, + %result = or <4 x i16> %and.i, %vshl_n + store <4 x i16> %result, <4 x i16>* %dest, align 8 + ret void +} + +define void @testLeftGood8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind { +; CHECK-LABEL: testLeftGood8x16: +; CHECK: // %bb.0: +; CHECK-NEXT: sli.8h v0, v1, #14 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i16> %src1, + %vshl_n = shl <8 x i16> %src2, + %result = or <8 x i16> %and.i, %vshl_n + store <8 x i16> %result, <8 x i16>* %dest, align 16 + ret void +} + +define void @testLeftBad8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind { +; CHECK-LABEL: testLeftBad8x16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16500 +; CHECK-NEXT: dup.8h v2, w8 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: shl.8h v1, v1, #14 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i16> %src1, + %vshl_n = shl <8 x i16> %src2, + %result = or <8 x i16> %and.i, %vshl_n + store <8 x i16> %result, <8 x i16>* %dest, align 16 + ret void +} + +define void @testRightGood8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind { +; CHECK-LABEL: testRightGood8x16: +; CHECK: // %bb.0: +; CHECK-NEXT: sri.8h v0, v1, #14 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i16> %src1, + %vshl_n = lshr <8 x i16> %src2, + %result = or <8 x i16> %and.i, %vshl_n + store <8 x i16> %result, <8 x i16>* %dest, align 16 + ret void +} + +define void @testRightBad8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind { +; CHECK-LABEL: testRightBad8x16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16500 +; CHECK-NEXT: dup.8h v2, w8 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: ushr.8h v1, v1, #14 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i16> %src1, + %vshl_n = lshr <8 x i16> %src2, + %result = or <8 x i16> %and.i, %vshl_n + store <8 x i16> %result, <8 x i16>* %dest, align 16 + ret void +} + +define void @testLeftGood2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind { +; CHECK-LABEL: testLeftGood2x32: +; CHECK: // %bb.0: +; CHECK-NEXT: sli.2s v0, v1, #22 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i32> %src1, + %vshl_n = shl <2 x i32> %src2, + %result = or <2 x i32> %and.i, %vshl_n + store <2 x i32> %result, <2 x i32>* %dest, align 8 + ret void +} + +define void @testLeftBad2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind { +; CHECK-LABEL: testLeftBad2x32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4194300 +; CHECK-NEXT: dup.2s v2, w8 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: shl.2s v1, v1, #22 +; CHECK-NEXT: orr.8b v0, v0, v1 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i32> %src1, + %vshl_n = shl <2 x i32> %src2, + %result = or <2 x i32> %and.i, %vshl_n + store <2 x i32> %result, <2 x i32>* %dest, align 8 + ret void +} + +define void @testRightGood2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind { +; CHECK-LABEL: testRightGood2x32: +; CHECK: // %bb.0: +; CHECK-NEXT: sri.2s v0, v1, #22 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i32> %src1, + %vshl_n = lshr <2 x i32> %src2, + %result = or <2 x i32> %and.i, %vshl_n + store <2 x i32> %result, <2 x i32>* %dest, align 8 + ret void +} + +define void @testRightBad2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind { +; CHECK-LABEL: testRightBad2x32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4194300 
+; CHECK-NEXT: dup.2s v2, w8 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: ushr.2s v1, v1, #22 +; CHECK-NEXT: orr.8b v0, v0, v1 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i32> %src1, + %vshl_n = lshr <2 x i32> %src2, + %result = or <2 x i32> %and.i, %vshl_n + store <2 x i32> %result, <2 x i32>* %dest, align 8 + ret void +} + +define void @testLeftGood4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind { +; CHECK-LABEL: testLeftGood4x32: +; CHECK: // %bb.0: +; CHECK-NEXT: sli.4s v0, v1, #22 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 x i32> %src1, + %vshl_n = shl <4 x i32> %src2, + %result = or <4 x i32> %and.i, %vshl_n + store <4 x i32> %result, <4 x i32>* %dest, align 16 + ret void +} + +define void @testLeftBad4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind { +; CHECK-LABEL: testLeftBad4x32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4194300 +; CHECK-NEXT: dup.4s v2, w8 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: shl.4s v1, v1, #22 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 x i32> %src1, + %vshl_n = shl <4 x i32> %src2, + %result = or <4 x i32> %and.i, %vshl_n + store <4 x i32> %result, <4 x i32>* %dest, align 16 + ret void +} + +define void @testRightGood4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind { +; CHECK-LABEL: testRightGood4x32: +; CHECK: // %bb.0: +; CHECK-NEXT: sri.4s v0, v1, #22 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 x i32> %src1, + %vshl_n = lshr <4 x i32> %src2, + %result = or <4 x i32> %and.i, %vshl_n + store <4 x i32> %result, <4 x i32>* %dest, align 16 + ret void +} + +define void @testRightBad4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind { +; CHECK-LABEL: testRightBad4x32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #4194300 +; CHECK-NEXT: dup.4s v2, w8 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: ushr.4s v1, v1, #22 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <4 x i32> %src1, + %vshl_n = lshr <4 x i32> %src2, + %result = or <4 x i32> %and.i, %vshl_n + store <4 x i32> %result, <4 x i32>* %dest, align 16 + ret void +} + +define void @testLeftGood2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind { +; CHECK-LABEL: testLeftGood2x64: +; CHECK: // %bb.0: +; CHECK-NEXT: sli.2d v0, v1, #48 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i64> %src1, + %vshl_n = shl <2 x i64> %src2, + %result = or <2 x i64> %and.i, %vshl_n + store <2 x i64> %result, <2 x i64>* %dest, align 16 + ret void +} + +define void @testLeftBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind { +; CHECK-LABEL: testLeftBad2x64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #10 +; CHECK-NEXT: movk x8, #1, lsl #48 +; CHECK-NEXT: dup.2d v2, x8 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: shl.2d v1, v1, #48 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i64> %src1, + %vshl_n = shl <2 x i64> %src2, + %result = or <2 x i64> %and.i, %vshl_n + store <2 x i64> %result, <2 x i64>* %dest, align 16 + ret void +} + +define void @testRightGood2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind { +; CHECK-LABEL: testRightGood2x64: +; CHECK: // %bb.0: +; CHECK-NEXT: sri.2d v0, v1, #48 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i64> %src1, + %vshl_n = lshr <2 x i64> %src2, + %result = or <2 x i64> %and.i, %vshl_n + store <2 x 
i64> %result, <2 x i64>* %dest, align 16 + ret void +} + +define void @testRightBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind { +; CHECK-LABEL: testRightBad2x64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #10 +; CHECK-NEXT: movk x8, #1, lsl #48 +; CHECK-NEXT: dup.2d v2, x8 +; CHECK-NEXT: and.16b v0, v0, v2 +; CHECK-NEXT: ushr.2d v1, v1, #48 +; CHECK-NEXT: orr.16b v0, v0, v1 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %and.i = and <2 x i64> %src1, + %vshl_n = lshr <2 x i64> %src2, + %result = or <2 x i64> %and.i, %vshl_n + store <2 x i64> %result, <2 x i64>* %dest, align 16 + ret void +} + +define void @testLeftShouldNotCreateSLI1x128(<1 x i128> %src1, <1 x i128> %src2, <1 x i128>* %dest) nounwind { +; CHECK-LABEL: testLeftShouldNotCreateSLI1x128: +; CHECK: // %bb.0: +; CHECK-NEXT: bfi x1, x2, #6, #58 +; CHECK-NEXT: stp x0, x1, [x4] +; CHECK-NEXT: ret + %and.i = and <1 x i128> %src1, + %vshl_n = shl <1 x i128> %src2, + %result = or <1 x i128> %and.i, %vshl_n + store <1 x i128> %result, <1 x i128>* %dest, align 16 + ret void +} + +define void @testLeftNotAllConstantBuildVec8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind { +; CHECK-LABEL: testLeftNotAllConstantBuildVec8x8: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI29_0 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI29_0] +; CHECK-NEXT: shl.8b v1, v1, #3 +; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: orr.8b v0, v0, v1 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %and.i = and <8 x i8> %src1, + %vshl_n = shl <8 x i8> %src2, + %result = or <8 x i8> %and.i, %vshl_n + store <8 x i8> %result, <8 x i8>* %dest, align 8 + ret void +}
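
Note on the mask condition used above: the check that the refactored tryLowerToSLI performs
(C1 == ~(Ones(ElemSizeInBits) << C2) for SLI, C1 == ~(Ones(ElemSizeInBits) >> C2) for SRI)
can be illustrated with the small standalone C++ sketch below. It is not part of the patch and
has no LLVM dependencies; it mirrors the APInt comparison using plain 64-bit integers, and the
helper name canLowerToShiftInsert is made up for this example.

// Standalone illustration only: models the mask check of tryLowerToSLI with
// plain integers instead of APInt. Supports element sizes up to 64 bits.
#include <cstdint>
#include <cstdio>

// Can (or (and X, C1), (shl/lshr Y, C2)) on ElemSizeInBits-wide lanes be
// rewritten as SLI/SRI X, Y, C2? (Hypothetical helper, not an LLVM API.)
static bool canLowerToShiftInsert(uint64_t C1, uint64_t C2,
                                  unsigned ElemSizeInBits, bool IsShiftRight) {
  if (ElemSizeInBits > 64 || C2 > ElemSizeInBits)
    return false;
  const uint64_t Ones =
      ElemSizeInBits == 64 ? ~0ULL : ((1ULL << ElemSizeInBits) - 1);
  // SLI requires C1 == ~(Ones << C2); SRI requires C1 == ~(Ones >> C2),
  // both taken within the element width.
  const uint64_t Shifted =
      C2 == 64 ? 0 : (IsShiftRight ? (Ones >> C2) : (Ones << C2));
  const uint64_t RequiredC1 = ~Shifted & Ones;
  return (C1 & Ones) == RequiredC1;
}

int main() {
  // Values from the 8-bit tests: sli.8b #3 needs mask 0x07, sri.8b #3 needs
  // mask 0xe0, and the 0xa5 mask of the "Bad" tests satisfies neither form.
  std::printf("sli #3, C1=0x07 -> %d\n", canLowerToShiftInsert(0x07, 3, 8, false));
  std::printf("sri #3, C1=0xe0 -> %d\n", canLowerToShiftInsert(0xe0, 3, 8, true));
  std::printf("sli #1, C1=0xa5 -> %d\n", canLowerToShiftInsert(0xa5, 1, 8, false));
  return 0;
}

This is also why the "Bad" test cases keep the separate and/shift/orr sequence: their masks do
not match the required pattern for any shift amount, so the combine correctly refuses to form
SLI or SRI.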