diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 5f2d09f0765aa..9ddee9e4edd58 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2996,7 +2996,7 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { InstructionCost AArch64TTIImpl::getGatherScatterOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { - if (useNeonVector(DataTy)) + if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy)) return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I); auto *VT = cast<VectorType>(DataTy); @@ -3004,6 +3004,10 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost( if (!LT.first.isValid()) return InstructionCost::getInvalid(); + if (!LT.second.isVector() || + !isElementTypeLegalForScalableVector(VT->getElementType())) + return InstructionCost::getInvalid(); + // The code-generator is currently not able to handle scalable vectors // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting // it. 
This change will be removed when code-generation for these types is diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll index 432dfb5726406..c05339d89d35c 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll @@ -105,9 +105,27 @@ define void @masked_gathers_no_vscale_range() #2 { ret void } +define <2 x i128> @masked_gather_v1i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) #3 { +; CHECK-LABEL: 'masked_gather_v1i128' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res +; +; CHECK-VSCALE-2-LABEL: 'masked_gather_v1i128' +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res +; +; CHECK-VSCALE-1-LABEL: 'masked_gather_v1i128' +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res +; + %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) + ret <2 x i128> %res +} + attributes #0 = { "target-features"="+sve" vscale_range(1, 8) } attributes #1 = { "target-features"="+sve" vscale_range(1, 16) "tune-cpu"="generic" } attributes #2 = { "target-features"="+sve" } +attributes #3 = { "target-features"="+sve" vscale_range(2, 2) } declare @llvm.masked.gather.nxv4i32(, 
i32, , ) declare @llvm.masked.gather.nxv8i32(, i32, , ) @@ -120,3 +138,4 @@ declare @llvm.masked.gather.nxv2f32(, i32 declare @llvm.masked.gather.nxv16i16(, i32, , ) declare @llvm.masked.gather.nxv8i16(, i32, , ) declare @llvm.masked.gather.nxv4i16(, i32, , ) +declare <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i128>) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll new file mode 100644 index 0000000000000..3f02f974e59e6 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-load-128.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -mcpu=neoverse-512tvb < %s | FileCheck %s + +define void @gather_load_fp128(ptr %arg) #0 { +; CHECK-LABEL: @gather_load_fp128( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16 +; CHECK-NEXT: [[LOAD0:%.*]] = load fp128, ptr [[ARG]], align 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load fp128, ptr [[GEP]], align 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load fp128, ptr null, align 1 +; CHECK-NEXT: [[LOAD3:%.*]] = load fp128, ptr null, align 1 +; CHECK-NEXT: [[FCMP0:%.*]] = fcmp oeq fp128 [[LOAD0]], 0xL00000000000000000000000000000000 +; CHECK-NEXT: [[FCMP1:%.*]] = fcmp oeq fp128 [[LOAD1]], 0xL00000000000000000000000000000000 +; CHECK-NEXT: [[FCMP2:%.*]] = fcmp oeq fp128 [[LOAD2]], 0xL00000000000000000000000000000000 +; CHECK-NEXT: [[FCMP3:%.*]] = fcmp oeq fp128 [[LOAD3]], 0xL00000000000000000000000000000000 +; CHECK-NEXT: ret void +; + %gep = getelementptr i8, ptr %arg, i64 16 + %load0 = load fp128, ptr %arg, align 1 + %load1 = load fp128, ptr %gep, align 1 + %load2 = load fp128, ptr null, align 1 + %load3 = load fp128, ptr null, align 1 + %fcmp0 = fcmp oeq fp128 %load0, 0xL0 + %fcmp1 = fcmp oeq fp128 %load1, 0xL0 + %fcmp2 = fcmp oeq fp128 %load2, 0xL0 + %fcmp3 = fcmp oeq fp128 %load3, 
0xL0 + ret void +} + +define void @gather_load_i128(ptr %arg) #0 { +; CHECK-LABEL: @gather_load_i128( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 16 +; CHECK-NEXT: [[LOAD0:%.*]] = load i128, ptr [[ARG]], align 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load i128, ptr [[GEP]], align 1 +; CHECK-NEXT: [[LOAD2:%.*]] = load i128, ptr null, align 1 +; CHECK-NEXT: [[LOAD3:%.*]] = load i128, ptr null, align 1 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i128 [[LOAD0]], 0 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i128 [[LOAD1]], 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i128 [[LOAD2]], 0 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i128 [[LOAD3]], 0 +; CHECK-NEXT: ret void +; + %gep = getelementptr i8, ptr %arg, i64 16 + %load0 = load i128, ptr %arg, align 1 + %load1 = load i128, ptr %gep, align 1 + %load2 = load i128, ptr null, align 1 + %load3 = load i128, ptr null, align 1 + %cmp0 = icmp eq i128 %load0, 0 + %cmp1 = icmp eq i128 %load1, 0 + %cmp2 = icmp eq i128 %load2, 0 + %cmp3 = icmp eq i128 %load3, 0 + ret void +} + +attributes #0 = { vscale_range(2,2) }