diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 6a2bbc1ae8e2e1..4981b8051657ab 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -488,27 +488,44 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
   // The size of the gather was already checked in isLegalTypeAndAlignment;
   // if it was not a full vector width an appropriate extend should follow.
   auto *Extend = Root;
+  bool TruncResult = false;
   if (MemoryTy->getPrimitiveSizeInBits() < 128) {
-    // Only transform gathers with exactly one use
-    if (!I->hasOneUse())
-      return nullptr;
+    if (I->hasOneUse()) {
+      // If the gather has a single extend of the correct type, use an extending
+      // gather and replace the ext. In which case the correct root to replace
+      // is not the CallInst itself, but the instruction which extends it.
+      Instruction* User = cast<Instruction>(*I->users().begin());
+      if (isa<SExtInst>(User) &&
+          User->getType()->getPrimitiveSizeInBits() == 128) {
+        LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: "
+                          << *User << "\n");
+        Extend = User;
+        ResultTy = User->getType();
+        Unsigned = 0;
+      } else if (isa<ZExtInst>(User) &&
+                 User->getType()->getPrimitiveSizeInBits() == 128) {
+        LLVM_DEBUG(dbgs() << "masked gathers: Incorporating extend: "
+                          << *ResultTy << "\n");
+        Extend = User;
+        ResultTy = User->getType();
+      }
+    }
 
-    // The correct root to replace is not the CallInst itself, but the
-    // instruction which extends it
-    Extend = cast<Instruction>(*I->users().begin());
-    if (isa<SExtInst>(Extend)) {
-      Unsigned = 0;
-    } else if (!isa<ZExtInst>(Extend)) {
-      LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
-                        << "Expanding\n");
-      return nullptr;
+    // If an extend hasn't been found and the type is an integer, create an
+    // extending gather and truncate back to the original type.
+    if (ResultTy->getPrimitiveSizeInBits() < 128 &&
+        ResultTy->isIntOrIntVectorTy()) {
+      ResultTy = ResultTy->getWithNewBitWidth(
+          128 / cast<FixedVectorType>(ResultTy)->getNumElements());
+      TruncResult = true;
+      LLVM_DEBUG(dbgs() << "masked gathers: Small input type, truncing to: "
+                        << *ResultTy << "\n");
     }
-    LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
-    ResultTy = Extend->getType();
+
+    // The final size of the gather must be a full vector width
     if (ResultTy->getPrimitiveSizeInBits() != 128) {
-      LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
-                        << "Expanding\n");
+      LLVM_DEBUG(dbgs() << "masked gathers: Extend needed but not provided "
+                           "from the correct type. Expanding\n");
       return nullptr;
     }
   }
@@ -522,18 +539,25 @@ Instruction *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
     Root = Extend;
 
   Value *Mask = I->getArgOperand(2);
+  Instruction *Load = nullptr;
   if (!match(Mask, m_One()))
-    return Builder.CreateIntrinsic(
+    Load = Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset_predicated,
         {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
         {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
         Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
   else
-    return Builder.CreateIntrinsic(
+    Load = Builder.CreateIntrinsic(
        Intrinsic::arm_mve_vldr_gather_offset,
        {ResultTy, BasePtr->getType(), Offsets->getType()},
        {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
         Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
+
+  if (TruncResult) {
+    Load = TruncInst::Create(Instruction::Trunc, Load, MemoryTy);
+    Builder.Insert(Load);
+  }
+  return Load;
 }
 
 Instruction *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
index 79c60dcf95dedb..1685e9ea3c7bc8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll
@@ -18,33 +18,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: unscaled_v8i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, r12, d0
-; CHECK-NEXT:    vmov r3, lr, d1
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r4, r5, d0
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb.w r12, [r12]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    ldrb.w lr, [lr]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov.16 q0[0], r4
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[1], r5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov.16 q0[5], r12
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], lr
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vldrb.u16 q1, [r1]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index 668abe4c43cdff..816969209ff8dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -314,15 +314,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -658,15 +652,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
@@ -897,33 +885,25 @@ entry:
 define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, lr}
-; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, r12, d1
-; CHECK-NEXT:    vmov r3, lr, d0
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r4, r5, d0
-; CHECK-NEXT:    ldrb r6, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    ldrb.w r12, [r12]
-; CHECK-NEXT:    ldrb.w r2, [lr]
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r12
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov q0[3], q0[1], r5, r1
+; CHECK-NEXT:    vldrb.u16 q0, [r1]
+; CHECK-NEXT:    vldrb.u16 q1, [r0, q0]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT:    vmovlb.s8 q1, q2
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>