From c464896dbe2ad5a8641a675fff525656e88be38a Mon Sep 17 00:00:00 2001
From: Vladislav Dzhidzhoev
Date: Fri, 15 Sep 2023 14:03:48 +0200
Subject: [PATCH] [AArch64][GlobalISel] Select llvm.aarch64.neon.ld* intrinsics (#65630)

Similar to llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp.
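For illustration, the lane-load case from the updated arm64-ld1.ll test below:

  %tmp2 = call %struct.__neon_int32x4x2_t
      @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, i64 1, ptr %A)

is now selected by GlobalISel to a single structured lane load:

  ld2.s { v0, v1 }[1], [x0]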
---
 .../GISel/AArch64InstructionSelector.cpp      | 327 ++++++++++++-
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |  53 ++-
 llvm/test/CodeGen/AArch64/arm64-ld1.ll        | 431 ++++++++++++------
 3 files changed, 650 insertions(+), 161 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 94d0dbca838de..89d1b96bdacb2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -143,6 +143,12 @@ class AArch64InstructionSelector : public InstructionSelector {
                                  const TargetRegisterClass *DstRC,
                                  Register Scalar,
                                  MachineIRBuilder &MIRBuilder) const;
+  /// Helper to narrow a vector that was widened by emitScalarToVector.
+  /// Copy the lowest part of a 128-bit or 64-bit vector to a 64-bit or
+  /// 32-bit vector, respectively.
+  MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
+                                 MachineIRBuilder &MIRBuilder,
+                                 MachineRegisterInfo &MRI) const;
 
   /// Emit a lane insert into \p DstReg, or a new vector register if
   /// std::nullopt is provided.
@@ -186,6 +192,8 @@ class AArch64InstructionSelector : public InstructionSelector {
   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                  MachineInstr &I);
+  bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
+                                     MachineInstr &I);
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                       MachineRegisterInfo &MRI);
   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -3897,6 +3905,31 @@ MachineInstr *AArch64InstructionSelector::emitScalarToVector(
   }
 }
 
+MachineInstr *
+AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
+                                             MachineIRBuilder &MIB,
+                                             MachineRegisterInfo &MRI) const {
+  LLT DstTy = MRI.getType(DstReg);
+  const TargetRegisterClass *RC =
+      getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
+  if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
+    LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
+    return nullptr;
+  }
+  unsigned SubReg = 0;
+  if (!getSubRegForClass(RC, TRI, SubReg))
+    return nullptr;
+  if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
+    LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
+                      << DstTy.getSizeInBits() << "\n");
+    return nullptr;
+  }
+  auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
+                  .addReg(SrcReg, 0, SubReg);
+  RBI.constrainGenericRegister(DstReg, *RC, MRI);
+  return Copy;
+}
+
 bool AArch64InstructionSelector::selectMergeValues(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
@@ -5384,24 +5417,8 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
   if (VecSize < 128) {
     // If we had to widen to perform the insert, then we have to demote back to
     // the original size to get the result we want.
-    Register DemoteVec = InsMI->getOperand(0).getReg();
-    const TargetRegisterClass *RC =
-        getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
-    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
-      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
-      return false;
-    }
-    unsigned SubReg = 0;
-    if (!getSubRegForClass(RC, TRI, SubReg))
+    if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI))
       return false;
-    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
-      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
-                        << "\n");
-      return false;
-    }
-    MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
-        .addReg(DemoteVec, 0, SubReg);
-    RBI.constrainGenericRegister(DstReg, *RC, MRI);
   } else {
     // No widening needed.
     InsMI->getOperand(0).setReg(DstReg);
@@ -5630,6 +5647,60 @@ bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
   return true;
 }
 
+bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
+    unsigned Opc, unsigned NumVecs, MachineInstr &I) {
+  assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+  assert(Opc && "Expected an opcode?");
+  assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
+  auto &MRI = *MIB.getMRI();
+  LLT Ty = MRI.getType(I.getOperand(0).getReg());
+  bool Narrow = Ty.getSizeInBits() == 64;
+
+  auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
+  SmallVector<Register, 4> Regs(NumVecs);
+  std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
+                 [](auto MO) { return MO.getReg(); });
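+  // 64-bit source vectors are widened to 128 bits here because the lane-load
+  // instructions operate on a tuple of Q registers (built by createQTuple
+  // below); the results are narrowed back down after the load.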
+  if (Narrow) {
+    transform(Regs, Regs.begin(), [this](Register Reg) {
+      return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
+          ->getOperand(0)
+          .getReg();
+    });
+    Ty = Ty.multiplyElements(2);
+  }
+
+  Register Tuple = createQTuple(Regs, MIB);
+  auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
+  if (!LaneNo)
+    return false;
+
+  Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
+  auto Load = MIB.buildInstr(Opc, {Ty}, {})
+                  .addReg(Tuple)
+                  .addImm(LaneNo->getZExtValue())
+                  .addReg(Ptr);
+  Load.cloneMemRefs(I);
+  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
+  Register SelectedLoadDst = Load->getOperand(0).getReg();
+  unsigned SubReg = AArch64::qsub0;
+  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+    auto Vec = MIB.buildInstr(TargetOpcode::COPY,
+                              {Narrow ? DstOp(&AArch64::FPR128RegClass)
+                                      : DstOp(I.getOperand(Idx).getReg())},
+                              {})
+                   .addReg(SelectedLoadDst, 0, SubReg + Idx);
+    Register WideReg = Vec.getReg(0);
+    // Emit the subreg copies and immediately select them.
+    selectCopy(*Vec, TII, MRI, TRI, RBI);
+    if (Narrow &&
+        !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
+      return false;
+  }
+
+  return true;
+}
+
 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   // Find the intrinsic ID.
@@ -5664,6 +5735,78 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     MIB.buildInstr(AArch64::BRK, {}, {})
         .addImm(I.getOperand(1).getImm() | ('U' << 8));
     break;
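+  // The ld1x2/ld1x3/ld1x4 intrinsics map to the multi-register LD1 forms,
+  // which load consecutive elements without the de-interleaving performed by
+  // LD2/LD3/LD4.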
+  case Intrinsic::aarch64_neon_ld1x2: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD1Twov8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD1Twov16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD1Twov4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD1Twov8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD1Twov2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD1Twov4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD1Twov2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Twov1d;
+    else
+      llvm_unreachable("Unexpected type for ld1x2!");
+    selectVectorLoadIntrinsic(Opc, 2, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld1x3: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD1Threev8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD1Threev16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD1Threev4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD1Threev8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD1Threev2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD1Threev4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD1Threev2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Threev1d;
+    else
+      llvm_unreachable("Unexpected type for ld1x3!");
+    selectVectorLoadIntrinsic(Opc, 3, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld1x4: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD1Fourv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD1Fourv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD1Fourv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD1Fourv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD1Fourv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD1Fourv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD1Fourv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Fourv1d;
+    else
+      llvm_unreachable("Unexpected type for ld1x4!");
+    selectVectorLoadIntrinsic(Opc, 4, I);
+    break;
+  }
   case Intrinsic::aarch64_neon_ld2: {
     LLT Ty = MRI.getType(I.getOperand(0).getReg());
     unsigned Opc = 0;
@@ -5688,6 +5831,114 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     selectVectorLoadIntrinsic(Opc, 2, I);
     break;
   }
+  case Intrinsic::aarch64_neon_ld2lane: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc;
+    if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD2i8;
+    else if (Ty == LLT::fixed_vector(4, S16) ||
+             Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD2i16;
+    else if (Ty == LLT::fixed_vector(2, S32) ||
+             Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD2i32;
+    else if (Ty == LLT::fixed_vector(2, S64) ||
+             Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+      Opc = AArch64::LD2i64;
+    else
+      llvm_unreachable("Unexpected type for ld2lane!");
+    if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
+      return false;
+    break;
+  }
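+  // The LD*R ("load and replicate") forms broadcast a single loaded
+  // structure to all lanes, so, like plain ld2/ld3/ld4, they take only a
+  // pointer operand and can reuse selectVectorLoadIntrinsic.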
+  case Intrinsic::aarch64_neon_ld2r: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD2Rv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD2Rv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD2Rv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD2Rv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD2Rv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD2Rv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD2Rv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD2Rv1d;
+    else
+      llvm_unreachable("Unexpected type for ld2r!");
+    selectVectorLoadIntrinsic(Opc, 2, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld3: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD3Threev8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD3Threev16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD3Threev4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD3Threev8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD3Threev2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD3Threev4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD3Threev2d;
+    else if (Ty == S64 || Ty == P0)
+      // There is no LD3 form for a single d-register element; the equivalent
+      // three-register LD1 encoding is used instead.
+      Opc = AArch64::LD1Threev1d;
+    else
+      llvm_unreachable("Unexpected type for ld3!");
+    selectVectorLoadIntrinsic(Opc, 3, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld3lane: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc;
+    if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD3i8;
+    else if (Ty == LLT::fixed_vector(4, S16) ||
+             Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD3i16;
+    else if (Ty == LLT::fixed_vector(2, S32) ||
+             Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD3i32;
+    else if (Ty == LLT::fixed_vector(2, S64) ||
+             Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+      Opc = AArch64::LD3i64;
+    else
+      llvm_unreachable("Unexpected type for ld3lane!");
+    if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
+      return false;
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld3r: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD3Rv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD3Rv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD3Rv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD3Rv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD3Rv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD3Rv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD3Rv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD3Rv1d;
+    else
+      llvm_unreachable("Unexpected type for ld3r!");
+    selectVectorLoadIntrinsic(Opc, 3, I);
+    break;
+  }
   case Intrinsic::aarch64_neon_ld4: {
     LLT Ty = MRI.getType(I.getOperand(0).getReg());
     unsigned Opc = 0;
@@ -5712,6 +5963,48 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     selectVectorLoadIntrinsic(Opc, 4, I);
     break;
   }
+  case Intrinsic::aarch64_neon_ld4lane: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc;
+    if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD4i8;
+    else if (Ty == LLT::fixed_vector(4, S16) ||
+             Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD4i16;
+    else if (Ty == LLT::fixed_vector(2, S32) ||
+             Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD4i32;
+    else if (Ty == LLT::fixed_vector(2, S64) ||
+             Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
+      Opc = AArch64::LD4i64;
+    else
+      llvm_unreachable("Unexpected type for ld4lane!");
+    if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
+      return false;
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld4r: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD4Rv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD4Rv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD4Rv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD4Rv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD4Rv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD4Rv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD4Rv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD4Rv1d;
+    else
+      llvm_unreachable("Unexpected type for ld4r!");
+    selectVectorLoadIntrinsic(Opc, 4, I);
+    break;
+  }
   case Intrinsic::aarch64_neon_st2: {
     Register Src1 = I.getOperand(1).getReg();
     Register Src2 = I.getOperand(2).getReg();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 8ca2bc641b14a..26954c62e03f1 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -580,6 +580,25 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
   case TargetOpcode::G_BUILD_VECTOR:
   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
     return true;
+  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+    switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_ld1x2:
+    case Intrinsic::aarch64_neon_ld1x3:
+    case Intrinsic::aarch64_neon_ld1x4:
+    case Intrinsic::aarch64_neon_ld2:
+    case Intrinsic::aarch64_neon_ld2lane:
+    case Intrinsic::aarch64_neon_ld2r:
+    case Intrinsic::aarch64_neon_ld3:
+    case Intrinsic::aarch64_neon_ld3lane:
+    case Intrinsic::aarch64_neon_ld3r:
+    case Intrinsic::aarch64_neon_ld4:
+    case Intrinsic::aarch64_neon_ld4lane:
+    case Intrinsic::aarch64_neon_ld4r:
+      return true;
+    default:
+      break;
+    }
+    break;
   default:
     break;
   }
@@ -722,10 +741,13 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     Register ScalarReg = MI.getOperand(1).getReg();
     LLT ScalarTy = MRI.getType(ScalarReg);
     auto ScalarDef = MRI.getVRegDef(ScalarReg);
+    // We want to select dup(load) into LD1R.
+    if (ScalarDef->getOpcode() == TargetOpcode::G_LOAD)
+      OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
     // s8 is an exception for G_DUP, which we always want on gpr.
-    if (ScalarTy.getSizeInBits() != 8 &&
-        (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
-         onlyDefinesFP(*ScalarDef, MRI, TRI)))
+    else if (ScalarTy.getSizeInBits() != 8 &&
+             (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
+              onlyDefinesFP(*ScalarDef, MRI, TRI)))
       OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
     else
       OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
@@ -1015,17 +1037,26 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     // Assign them FPR for now.
OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR, PMI_FirstFPR}; break; - case TargetOpcode::G_INTRINSIC: { + case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: { // Check if we know that the intrinsic has any constraints on its register // banks. If it does, then update the mapping accordingly. unsigned Idx = 0; - if (!isFPIntrinsic(MRI, MI)) - break; - for (const auto &Op : MI.explicit_operands()) { - if (Op.isReg()) - OpRegBankIdx[Idx] = PMI_FirstFPR; - ++Idx; - } + if (onlyDefinesFP(MI, MRI, TRI)) + for (const auto &Op : MI.defs()) { + if (Op.isReg()) + OpRegBankIdx[Idx] = PMI_FirstFPR; + ++Idx; + } + else + Idx += MI.getNumExplicitDefs(); + + if (onlyUsesFP(MI, MRI, TRI)) + for (const auto &Op : MI.explicit_uses()) { + if (Op.isReg()) + OpRegBankIdx[Idx] = PMI_FirstFPR; + ++Idx; + } break; } case TargetOpcode::G_LROUND: diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll index 47fb3308175b0..96468b2cfa8ac 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> } %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } @@ -350,39 +351,63 @@ declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0(ptr) nounwi define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld2lane_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ld2.b { v0, v1 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld2lane_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: ld2.b { v0, v1 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld2lane_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: ld2.b { v0, v1 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, i64 1, ptr %A) ret %struct.__neon_int8x16x2_t %tmp2 } define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld3lane_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: ld3.b { v0, v1, v2 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld3lane_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; 
CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: ld3.b { v0, v1, v2 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld3lane_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: ld3.b { v0, v1, v2 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, ptr %A) ret %struct.__neon_int8x16x3_t %tmp2 } define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld4lane_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ld4.b { v0, v1, v2, v3 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld4lane_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ld4.b { v0, v1, v2, v3 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld4lane_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: ld4.b { v0, v1, v2, v3 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, ptr %A) ret %struct.__neon_int8x16x4_t %tmp2 } @@ -393,39 +418,63 @@ declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld2lane_8h: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ld2.h { v0, v1 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld2lane_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: ld2.h { v0, v1 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld2lane_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: ld2.h { v0, v1 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call 
%struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, i64 1, ptr %A) ret %struct.__neon_int16x8x2_t %tmp2 } define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld3lane_8h: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: ld3.h { v0, v1, v2 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld3lane_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: ld3.h { v0, v1, v2 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld3lane_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: ld3.h { v0, v1, v2 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, ptr %A) ret %struct.__neon_int16x8x3_t %tmp2 } define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld4lane_8h: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ld4.h { v0, v1, v2, v3 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld4lane_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ld4.h { v0, v1, v2, v3 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld4lane_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: ld4.h { v0, v1, v2, v3 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, ptr %A) ret %struct.__neon_int16x8x4_t %tmp2 } @@ -436,39 +485,63 @@ declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld2lane_4s: -; 
CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ld2.s { v0, v1 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld2lane_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: ld2.s { v0, v1 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld2lane_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: ld2.s { v0, v1 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, i64 1, ptr %A) ret %struct.__neon_int32x4x2_t %tmp2 } define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld3lane_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: ld3.s { v0, v1, v2 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld3lane_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: ld3.s { v0, v1, v2 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld3lane_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: ld3.s { v0, v1, v2 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, ptr %A) ret %struct.__neon_int32x4x3_t %tmp2 } define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld4lane_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ld4.s { v0, v1, v2, v3 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld4lane_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ld4.s { v0, v1, v2, v3 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld4lane_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 
killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: ld4.s { v0, v1, v2, v3 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, ptr %A) ret %struct.__neon_int32x4x4_t %tmp2 } @@ -479,39 +552,63 @@ declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld2lane_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ld2.d { v0, v1 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld2lane_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-SD-NEXT: ld2.d { v0, v1 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld2lane_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: ld2.d { v0, v1 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, i64 1, ptr %A) ret %struct.__neon_int64x2x2_t %tmp2 } define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld3lane_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 -; CHECK-NEXT: ld3.d { v0, v1, v2 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld3lane_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-SD-NEXT: ld3.d { v0, v1, v2 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld3lane_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 +; CHECK-GI-NEXT: ld3.d { v0, v1, v2 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, ptr %A) ret %struct.__neon_int64x2x3_t %tmp2 } define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, ptr %A) nounwind { ; Make sure we are using the operands defined by the ABI -; CHECK-LABEL: ld4lane_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 
killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: ld4.d { v0, v1, v2, v3 }[1], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld4lane_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-SD-NEXT: ld4.d { v0, v1, v2, v3 }[1], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld4lane_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 +; CHECK-GI-NEXT: ld4.d { v0, v1, v2, v3 }[1], [x0] +; CHECK-GI-NEXT: ret %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, ptr %A) ret %struct.__neon_int64x2x4_t %tmp2 } @@ -907,10 +1004,16 @@ declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0(ptr) nounwin declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0(ptr) nounwind readonly define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) { -; CHECK-LABEL: ld1_16b: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1.b { v0 }[0], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_16b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_16b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr b1, [x0] +; CHECK-GI-NEXT: mov.b v0[0], v1[0] +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load i8, ptr %bar %tmp2 = insertelement <16 x i8> %V, i8 %tmp1, i32 0 @@ -918,10 +1021,16 @@ define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) { } define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) { -; CHECK-LABEL: ld1_8h: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1.h { v0 }[0], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1.h { v0 }[0], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr h1, [x0] +; CHECK-GI-NEXT: mov.h v0[0], v1[0] +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load i16, ptr %bar %tmp2 = insertelement <8 x i16> %V, i16 %tmp1, i32 0 @@ -929,10 +1038,16 @@ define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) { } define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) { -; CHECK-LABEL: ld1_4s: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1.s { v0 }[0], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s1, [x0] +; CHECK-GI-NEXT: mov.s v0[0], v1[0] +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load i32, ptr %bar %tmp2 = insertelement <4 x i32> %V, i32 %tmp1, i32 0 @@ -940,10 +1055,16 @@ define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) { } define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) { -; CHECK-LABEL: ld1_4s_float: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1.s { v0 }[0], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_4s_float: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0] +; CHECK-SD-NEXT: 
ret +; +; CHECK-GI-LABEL: ld1_4s_float: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s1, [x0] +; CHECK-GI-NEXT: mov.s v0[0], v1[0] +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load float, ptr %bar %tmp2 = insertelement <4 x float> %V, float %tmp1, i32 0 @@ -951,10 +1072,16 @@ define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) { } define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) { -; CHECK-LABEL: ld1_2d: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1.d { v0 }[0], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1.d { v0 }[0], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: mov.d v0[0], v1[0] +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load i64, ptr %bar %tmp2 = insertelement <2 x i64> %V, i64 %tmp1, i32 0 @@ -962,10 +1089,16 @@ define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) { } define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) { -; CHECK-LABEL: ld1_2d_double: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1.d { v0 }[0], [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_2d_double: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1.d { v0 }[0], [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_2d_double: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: mov.d v0[0], v1[0] +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load double, ptr %bar %tmp2 = insertelement <2 x double> %V, double %tmp1, i32 0 @@ -983,12 +1116,20 @@ define <1 x i64> @ld1_1d(ptr %p) { } define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) { -; CHECK-LABEL: ld1_8b: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.b { v0 }[0], [x0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_8b: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr b1, [x0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov.b v0[0], v1[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load i8, ptr %bar %tmp2 = insertelement <8 x i8> %V, i8 %tmp1, i32 0 @@ -996,12 +1137,20 @@ define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) { } define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) { -; CHECK-LABEL: ld1_4h: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.h { v0 }[0], [x0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_4h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: ld1.h { v0 }[0], [x0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_4h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr h1, [x0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov.h v0[0], v1[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load i16, ptr %bar %tmp2 = insertelement <4 x i16> %V, i16 %tmp1, i32 0 @@ -1009,12 +1158,20 @@ define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) { } define <2 x i32> 
@ld1_2s(<2 x i32> %V, ptr %bar) { -; CHECK-LABEL: ld1_2s: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.s { v0 }[0], [x0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_2s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_2s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s1, [x0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov.s v0[0], v1[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load i32, ptr %bar %tmp2 = insertelement <2 x i32> %V, i32 %tmp1, i32 0 @@ -1022,12 +1179,20 @@ define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) { } define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) { -; CHECK-LABEL: ld1_2s_float: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.s { v0 }[0], [x0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ld1_2s_float: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: ld1.s { v0 }[0], [x0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ld1_2s_float: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s1, [x0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov.s v0[0], v1[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret ; Make sure we are using the operands defined by the ABI %tmp1 = load float, ptr %bar %tmp2 = insertelement <2 x float> %V, float %tmp1, i32 0