diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 135e260f26f759..ab80b18d12c2e0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -140,3 +140,29 @@ Optional<unsigned> RISCVTTIImpl::getMaxVScale() const { return MaxVectorSizeInBits / RISCV::RVVBitsPerBlock; return BaseT::getMaxVScale(); } + +unsigned RISCVTTIImpl::getGatherScatterOpCost( + unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + + // FIXME: Only supporting fixed vectors for now. + if (!isa<FixedVectorType>(DataTy) || ST->getMinRVVVectorSizeInBits() == 0) + return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + + if ((Opcode == Instruction::Load && + !isLegalMaskedGather(DataTy, Align(Alignment))) || + (Opcode == Instruction::Store && + !isLegalMaskedScatter(DataTy, Align(Alignment)))) + return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + + auto *VTy = cast<FixedVectorType>(DataTy); + unsigned NumLoads = VTy->getNumElements(); + unsigned MemOpCost = + getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0, CostKind, I); + return NumLoads * MemOpCost; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 36e7bebd1f5a32..5b0af6812d5ff3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -69,6 +69,11 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { llvm_unreachable("Unsupported register kind"); } + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, + const Instruction *I); + bool isLegalElementTypeForRVV(Type *ScalarTy) { if (ScalarTy->isPointerTy()) return true; diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll new file mode 100644 index 00000000000000..a456c95313c349 --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -cost-model -analyze -mtriple=riscv64 -mattr=+experimental-v,+f,+d,+experimental-zfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s 2>%t | FileCheck %s + +define i32 @masked_gather() { +; CHECK-LABEL: 'masked_gather' +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 8, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 8, <1 x i1> undef, <1 x double> undef) +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 1, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 4, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 4, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> undef, i32 4, <1 x i1> undef, <1 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0f16(<32 x half*> undef, i32 1, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 1, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 2, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 2, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 2, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> undef, i32 2, <1 x i1> undef, <1 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 8, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 8, <1 x i1> undef, <1 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call 
<2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 4, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> undef, i32 4, <1 x i1> undef, <1 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 2, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 2, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 2, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> undef, i32 2, <1 x i1> undef, <1 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> undef, i32 1, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> undef, i32 1, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> undef, i32 1, <1 x i1> undef, <1 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> undef, i32 1, <8 x i1> undef, <8 x double> undef) + %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 1, <4 x i1> undef, <4 x double> undef) + %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 8, <2 x i1> undef, <2 x double> undef) + %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*> undef, i32 8, <1 x i1> undef, <1 x double> undef) + + %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 1, <16 x i1> undef, <16 x float> undef) + %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, 
i32 1, <8 x i1> undef, <8 x float> undef) + %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 4, <4 x i1> undef, <4 x float> undef) + %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 4, <2 x i1> undef, <2 x float> undef) + %V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> undef, i32 4, <1 x i1> undef, <1 x float> undef) + + %V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0f16(<32 x half*> undef, i32 1, <32 x i1> undef, <32 x half> undef) + %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 1, <16 x i1> undef, <16 x half> undef) + %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 2, <8 x i1> undef, <8 x half> undef) + %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 2, <4 x i1> undef, <4 x half> undef) + %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 2, <2 x i1> undef, <2 x half> undef) + %V1F16 = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> undef, i32 2, <1 x i1> undef, <1 x half> undef) + + %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> undef, i32 1, <8 x i1> undef, <8 x i64> undef) + %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 1, <4 x i1> undef, <4 x i64> undef) + %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 8, <2 x i1> undef, <2 x i64> undef) + %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*> undef, i32 8, <1 x i1> undef, <1 x i64> undef) + + %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 1, <16 x i1> undef, <16 x i32> undef) + %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 1, <8 x i1> undef, <8 x i32> undef) + %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 4, <4 x i1> undef, <4 x i32> undef) + %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 4, <2 x i1> undef, <2 x i32> undef) + %V1I32 = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> undef, i32 4, <1 x i1> undef, <1 x i32> undef) + + %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*> undef, i32 1, <32 x i1> undef, <32 x i16> undef) + %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 1, <16 x i1> undef, <16 x i16> undef) + %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 2, <8 x i1> undef, <8 x i16> undef) + %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 2, <4 x i1> undef, <4 x i16> undef) + %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 2, <2 x i1> undef, <2 x i16> undef) + %V1I16 = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> undef, i32 2, <1 x i1> undef, <1 x i16> undef) + + %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef) + %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef) + %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef) + %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef) + %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> undef, i32 1, <4 x i1> undef, <4 x i8> undef) + %V2I8 = call <2 
x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> undef, i32 1, <2 x i1> undef, <2 x i8> undef) + %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> undef, i32 1, <1 x i1> undef, <1 x i8> undef) + + ret i32 0 +} + +declare <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*>, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>) +declare <1 x double> @llvm.masked.gather.v1f64.v1p0f64(<1 x double*>, i32, <1 x i1>, <1 x double>) + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>) +declare <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*>, i32, <1 x i1>, <1 x float>) + +declare <32 x half> @llvm.masked.gather.v32f16.v32p0f16(<32 x half*>, i32, <32 x i1>, <32 x half>) +declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>) +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) +declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>) +declare <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*>, i32, <2 x i1>, <2 x half>) +declare <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*>, i32, <1 x i1>, <1 x half>) + +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>) +declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) +declare <1 x i64> @llvm.masked.gather.v1i64.v1p0i64(<1 x i64*>, i32, <1 x i1>, <1 x i64>) + +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) +declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>) + +declare <32 x i16> @llvm.masked.gather.v32i16.v32p0i16(<32 x i16*>, i32, <32 x i1>, <32 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) +declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>) +declare <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*>, i32, <1 x i1>, <1 x i16>) + +declare <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*>, i32, <64 x i1>, <64 x i8>) +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) +declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) +declare <2 x i8> 
@llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) +declare <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*>, i32, <1 x i1>, <1 x i8>) diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll new file mode 100644 index 00000000000000..23f0fbd24afb0a --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -cost-model -analyze -mtriple=riscv64 -mattr=+experimental-v,+f,+d,+experimental-zfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s 2>%t | FileCheck %s + +define i32 @masked_scatter() { +; CHECK-LABEL: 'masked_scatter' +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 8, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 8, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 2, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f32.v1p0f32(<1 x float> undef, <1 x float*> undef, i32 2, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32f16.v32p0f16(<32 x half> undef, <32 x half*> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> undef, <16 x half*> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> undef, <8 x half*> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> undef, <4 x half*> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> undef, <2 x half*> undef, i32 2, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f16.v1p0f16(<1 x half> undef, <1 x half*> undef, i32 2, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 8, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 8, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 4, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 4, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> undef, <1 x i32*> undef, i32 4, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 2, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 2, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> undef, <2 x i16*> undef, i32 2, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i16.v1p0i16(<1 x i16> undef, <1 x i16*> undef, i32 2, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> undef, <4 x i8*> undef, i32 1, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> undef, <2 x i8*> undef, i32 1, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i8.v1p0i8(<1 x i8> undef, <1 x i8*> undef, i32 1, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0 +; + call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> undef, <8 x double*> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 8, <2 x i1> undef) + call void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double> undef, <1 x double*> undef, i32 8, <1 x i1> undef) + + call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 2, <4 x i1> undef) + call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 2, <2 x i1> undef) + call void @llvm.masked.scatter.v1f32.v1p0f32(<1 x float> undef, <1 x float*> undef, i32 2, <1 x i1> undef) + + call void @llvm.masked.scatter.v32f16.v32p0f16(<32 x half> undef, <32 x half*> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> undef, <16 x half*> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> undef, <8 x half*> undef, i32 2, <8 x i1> undef) + call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> undef, <4 x half*> undef, i32 2, <4 x i1> undef) + call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> undef, <2 x half*> undef, i32 2, <2 x i1> undef) + call void @llvm.masked.scatter.v1f16.v1p0f16(<1 x half> undef, <1 x half*> undef, i32 2, <1 x i1> undef) + + call void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64> undef, <8 x i64*> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 8, <2 x i1> undef) + call void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64> undef, <1 x i64*> undef, i32 8, <1 x i1> undef) + + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 4, <4 x i1> undef) + call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 4, <2 x i1> undef) + call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> undef, <1 x i32*> undef, i32 4, <1 x i1> undef) + + call void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16> undef, <32 x i16*> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 2, <8 x i1> undef) + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 2, <4 x i1> undef) + call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> undef, <2 x i16*> undef, i32 2, <2 x i1> undef) + call void @llvm.masked.scatter.v1i16.v1p0i16(<1 x i16> 
undef, <1 x i16*> undef, i32 2, <1 x i1> undef) + + call void @llvm.masked.scatter.v64i8.v64p0i8(<64 x i8> undef, <64 x i8*> undef, i32 1, <64 x i1> undef) + call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef) + call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef) + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> undef, <4 x i8*> undef, i32 1, <4 x i1> undef) + call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> undef, <2 x i8*> undef, i32 1, <2 x i1> undef) + call void @llvm.masked.scatter.v1i8.v1p0i8(<1 x i8> undef, <1 x i8*> undef, i32 1, <1 x i1> undef) + + ret i32 0 +} + +declare void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double>, <8 x double*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double>, <4 x double*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f64.v1p0f64(<1 x double>, <1 x double*>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f32.v1p0f32(<1 x float>, <1 x float*>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v32f16.v32p0f16(<32 x half>, <32 x half*>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half>, <16 x half*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half>, <2 x half*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1f16.v1p0f16(<1 x half>, <1 x half*>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v8i64.v8p0i64(<8 x i64>, <8 x i64*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64>, <4 x i64*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64>, <2 x i64*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i64.v1p0i64(<1 x i64>, <1 x i64*>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v32i16.v32p0i16(<32 x i16>, <32 x i16*>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16>, <16 x i16*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i16.v1p0i16(<1 x i16>, <1 x i16*>, i32, <1 x i1>) + +declare void @llvm.masked.scatter.v64i8.v64p0i8(<64 x 
i8>, <64 x i8*>, i32, <64 x i1>) +declare void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8>, <32 x i8*>, i32, <32 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v1i8.v1p0i8(<1 x i8>, <1 x i8*>, i32, <1 x i1>) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll new file mode 100644 index 00000000000000..1bc3b419ea2269 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -mtriple=riscv32 -mattr=+experimental-v,+d -riscv-v-vector-bits-min=256 -S | FileCheck %s -check-prefixes=RV32 +; RUN: opt < %s -loop-vectorize -mtriple=riscv64 -mattr=+experimental-v,+d -riscv-v-vector-bits-min=256 -S | FileCheck %s -check-prefixes=RV64 + +; The source code: +; +;void foo4(double *A, double *B, int *trigger) { +; +; for (int i=0; i<10000; i += 16) { +; if (trigger[i] < 100) { +; A[i] = B[i*2] + trigger[i]; << non-consecutive access +; } +; } +;} + +define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { +; RV32-LABEL: @foo4( +; RV32-NEXT: entry: +; RV32-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* +; RV32-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* +; RV32-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* +; RV32-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; RV32: vector.memcheck: +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985 +; RV32-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* +; RV32-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985 +; RV32-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* +; RV32-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969 +; RV32-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* +; RV32-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] +; RV32-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; RV32-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] +; RV32-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] +; RV32-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; RV32-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; RV32-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; RV32: vector.ph: +; RV32-NEXT: br label [[VECTOR_BODY:%.*]] +; RV32: vector.body: +; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV32-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 16, i64 32, i64 48>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <4 x i64> [[VEC_IND]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !alias.scope !0 
+; RV32-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100> +; RV32-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1> +; RV32-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <4 x i64> [[TMP2]] +; RV32-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP3]], i32 8, <4 x i1> [[TMP1]], <4 x double> undef), !alias.scope !3 +; RV32-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[WIDE_MASKED_GATHER]] to <4 x double> +; RV32-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]] +; RV32-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <4 x i64> [[VEC_IND]] +; RV32-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP5]], <4 x double*> [[TMP6]], i32 8, <4 x i1> [[TMP1]]), !alias.scope !5, !noalias !7 +; RV32-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; RV32-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 64, i64 64, i64 64, i64 64> +; RV32-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 +; RV32-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32: middle.block: +; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624 +; RV32-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; RV32: scalar.ph: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; RV32-NEXT: br label [[FOR_BODY:%.*]] +; RV32: for.body: +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; RV32-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100 +; RV32-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; RV32: if.then: +; RV32-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]] +; RV32-NEXT: [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP8]] to double +; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]] +; RV32-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] +; RV32-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; RV32-NEXT: br label [[FOR_INC]] +; RV32: for.inc: +; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 +; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; RV32: for.end: +; RV32-NEXT: ret void +; +; RV64-LABEL: @foo4( +; RV64-NEXT: entry: +; RV64-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* +; RV64-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* +; RV64-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* +; RV64-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; RV64: vector.memcheck: +; RV64-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985 +; RV64-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8* +; RV64-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985 +; RV64-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8* +; RV64-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969 +; RV64-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8* +; RV64-NEXT: 
[[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]] +; RV64-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]] +; RV64-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; RV64-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]] +; RV64-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]] +; RV64-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; RV64-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; RV64-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; RV64: vector.ph: +; RV64-NEXT: br label [[VECTOR_BODY:%.*]] +; RV64: vector.body: +; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV64-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 16, i64 32, i64 48>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <4 x i64> [[VEC_IND]] +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !alias.scope !0 +; RV64-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100> +; RV64-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1> +; RV64-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <4 x i64> [[TMP2]] +; RV64-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP3]], i32 8, <4 x i1> [[TMP1]], <4 x double> undef), !alias.scope !3 +; RV64-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[WIDE_MASKED_GATHER]] to <4 x double> +; RV64-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]] +; RV64-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <4 x i64> [[VEC_IND]] +; RV64-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP5]], <4 x double*> [[TMP6]], i32 8, <4 x i1> [[TMP1]]), !alias.scope !5, !noalias !7 +; RV64-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; RV64-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 64, i64 64, i64 64, i64 64> +; RV64-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 +; RV64-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64: middle.block: +; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624 +; RV64-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; RV64: scalar.ph: +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; RV64-NEXT: br label [[FOR_BODY:%.*]] +; RV64: for.body: +; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; RV64-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100 +; RV64-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; RV64: if.then: +; RV64-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]] +; RV64-NEXT: [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP8]] to double +; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]] +; RV64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 
[[INDVARS_IV]] +; RV64-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; RV64-NEXT: br label [[FOR_INC]] +; RV64: for.inc: +; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 +; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; RV64: for.end: +; RV64-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.inc + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, 100 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %1 = shl nuw nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds double, double* %B, i64 %1 + %2 = load double, double* %arrayidx3, align 8 + %conv = sitofp i32 %0 to double + %add = fadd double %2, %conv + %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %add, double* %arrayidx7, align 8 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16 + %cmp = icmp ult i64 %indvars.iv.next, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.inc + ret void +}
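
Note (not part of the patch itself): the new getGatherScatterOpCost hook above prices a legal fixed-length gather or scatter as one scalar memory operation per vector element, i.e. cost = NumElements x scalar memory-op cost. The standalone C++ sketch below, with a hypothetical helper name, merely restates that arithmetic; with a unit scalar memory-op cost it reproduces the CHECK values in the two cost-model tests (for example, the <8 x double> gather costs 8).

// Illustrative sketch of the heuristic, assuming a unit scalar memory-op cost.
#include <cstdio>

static unsigned gatherScatterCost(unsigned NumElements,
                                  unsigned ScalarMemOpCost) {
  // Mirrors `return NumLoads * MemOpCost;` in the patch.
  return NumElements * ScalarMemOpCost;
}

int main() {
  // <8 x double> gather -> 8, matching the %V8F64 CHECK line.
  std::printf("v8f64 gather cost: %u\n", gatherScatterCost(8, 1));
  // <64 x i8> gather -> 64, matching the %V64I8 CHECK line.
  std::printf("v64i8 gather cost: %u\n", gatherScatterCost(64, 1));
  return 0;
}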