-
Notifications
You must be signed in to change notification settings - Fork 11k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Support interleaved accesses for scalable vector. #90583
Conversation
@llvm/pr-subscribers-backend-risc-v Author: Mel Chen (Mel-Chen) ChangesThe support for interleaved accesses for scalable vector with a factor of 2 is enabled in vectorizer. Therefore, the patch removed the restriction for scalable vector with a factor of 2. Patch is 155.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90583.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ce26e61880fd05..5f84175da703d6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -599,12 +599,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
- if (isa<ScalableVectorType>(VecTy))
+ if (isa<ScalableVectorType>(VecTy) && Factor != 2)
return InstructionCost::getInvalid();
- auto *FVTy = cast<FixedVectorType>(VecTy);
- InstructionCost MemCost =
- getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
- unsigned VF = FVTy->getNumElements() / Factor;
// The interleaved memory access pass will lower interleaved memory ops (i.e
// a load and store followed by a specific shuffle) to vlseg/vsseg
@@ -612,24 +608,35 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
// memory op
if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
- std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
+ auto *VTy = cast<VectorType>(VecTy);
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
// Need to make sure type has't been scalarized
- if (LT.second.isFixedLengthVector()) {
- auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
- LT.second.getVectorNumElements());
+ if (LT.second.isVector()) {
+ auto *LegalVTy = VectorType::get(VTy->getElementType(),
+ LT.second.getVectorElementCount());
// FIXME: We use the memory op cost of the *legalized* type here, becuase
// it's getMemoryOpCost returns a really expensive cost for types like
// <6 x i8>, which show up when doing interleaves of Factor=3 etc.
// Should the memory op cost of these be cheaper?
- if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
+ if (TLI->isLegalInterleavedAccessType(LegalVTy, Factor, Alignment,
AddressSpace, DL)) {
InstructionCost LegalMemCost = getMemoryOpCost(
- Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
+ Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
return LT.first + LegalMemCost;
}
}
}
+ // TODO: Return the cost of interleaved accesses for scalable vector when
+ // unable to convert to segment accesses instructions.
+ if (isa<ScalableVectorType>(VecTy))
+ return InstructionCost::getInvalid();
+
+ auto *FVTy = cast<FixedVectorType>(VecTy);
+ InstructionCost MemCost =
+ getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
+ unsigned VF = FVTy->getNumElements() / Factor;
+
// An interleaved load will look like this for Factor=3:
// %wide.vec = load <12 x i32>, ptr %3, align 4
// %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 6fa197591ab335..61a17e6857abfc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -1,54 +1,113 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -mtriple=riscv64 -mattr=+v -S | FileCheck %s --check-prefix=FIXED
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple=riscv64 -mattr=+v -S | FileCheck %s --check-prefix=SCALABLE
define void @load_store_factor2_i32(ptr %p) {
-; CHECK-LABEL: @load_store_factor2_i32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; FIXED-LABEL: @load_store_factor2_i32(
+; FIXED-NEXT: entry:
+; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED: vector.ph:
+; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
+; FIXED: vector.body:
+; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
+; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
+; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
+; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
+; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FIXED: middle.block:
+; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED: scalar.ph:
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: br label [[LOOP:%.*]]
+; FIXED: loop:
+; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
+; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
+; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
+; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; FIXED: exit:
+; FIXED-NEXT: ret void
+;
+; SCALABLE-LABEL: @load_store_factor2_i32(
+; SCALABLE-NEXT: entry:
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE: vector.ph:
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
+; SCALABLE: vector.body:
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP13:%.*]] = add i64 [[TMP7]], 1
+; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
+; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
+; SCALABLE-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE: middle.block:
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE: scalar.ph:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: br label [[LOOP:%.*]]
+; SCALABLE: loop:
+; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
+; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
+; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
+; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; SCALABLE: exit:
+; SCALABLE-NEXT: ret void
;
entry:
br label %loop
@@ -75,53 +134,111 @@ exit:
}
define void @load_store_factor2_i64(ptr %p) {
-; CHECK-LABEL: @load_store_factor2_i64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; FIXED-LABEL: @load_store_factor2_i64(
+; FIXED-NEXT: entry:
+; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED: vector.ph:
+; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
+; FIXED: vector.body:
+; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
+; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
+; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
+; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
+; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT: br i...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: Mel Chen (Mel-Chen) ChangesThe support for interleaved accesses for scalable vector with a factor of 2 is enabled in vectorizer. Therefore, the patch removed the restriction for scalable vector with a factor of 2. Patch is 155.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90583.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ce26e61880fd05..5f84175da703d6 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -599,12 +599,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {
- if (isa<ScalableVectorType>(VecTy))
+ if (isa<ScalableVectorType>(VecTy) && Factor != 2)
return InstructionCost::getInvalid();
- auto *FVTy = cast<FixedVectorType>(VecTy);
- InstructionCost MemCost =
- getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
- unsigned VF = FVTy->getNumElements() / Factor;
// The interleaved memory access pass will lower interleaved memory ops (i.e
// a load and store followed by a specific shuffle) to vlseg/vsseg
@@ -612,24 +608,35 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
// memory op
if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
- std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
+ auto *VTy = cast<VectorType>(VecTy);
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
// Need to make sure type has't been scalarized
- if (LT.second.isFixedLengthVector()) {
- auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
- LT.second.getVectorNumElements());
+ if (LT.second.isVector()) {
+ auto *LegalVTy = VectorType::get(VTy->getElementType(),
+ LT.second.getVectorElementCount());
// FIXME: We use the memory op cost of the *legalized* type here, becuase
// it's getMemoryOpCost returns a really expensive cost for types like
// <6 x i8>, which show up when doing interleaves of Factor=3 etc.
// Should the memory op cost of these be cheaper?
- if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
+ if (TLI->isLegalInterleavedAccessType(LegalVTy, Factor, Alignment,
AddressSpace, DL)) {
InstructionCost LegalMemCost = getMemoryOpCost(
- Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
+ Opcode, LegalVTy, Alignment, AddressSpace, CostKind);
return LT.first + LegalMemCost;
}
}
}
+ // TODO: Return the cost of interleaved accesses for scalable vector when
+ // unable to convert to segment accesses instructions.
+ if (isa<ScalableVectorType>(VecTy))
+ return InstructionCost::getInvalid();
+
+ auto *FVTy = cast<FixedVectorType>(VecTy);
+ InstructionCost MemCost =
+ getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
+ unsigned VF = FVTy->getNumElements() / Factor;
+
// An interleaved load will look like this for Factor=3:
// %wide.vec = load <12 x i32>, ptr %3, align 4
// %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 6fa197591ab335..61a17e6857abfc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -1,54 +1,113 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -mtriple=riscv64 -mattr=+v -S | FileCheck %s --check-prefix=FIXED
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple=riscv64 -mattr=+v -S | FileCheck %s --check-prefix=SCALABLE
define void @load_store_factor2_i32(ptr %p) {
-; CHECK-LABEL: @load_store_factor2_i32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; FIXED-LABEL: @load_store_factor2_i32(
+; FIXED-NEXT: entry:
+; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED: vector.ph:
+; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
+; FIXED: vector.body:
+; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
+; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
+; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP5]]
+; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP6]], i32 -1
+; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FIXED: middle.block:
+; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED: scalar.ph:
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: br label [[LOOP:%.*]]
+; FIXED: loop:
+; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
+; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
+; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
+; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
+; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
+; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; FIXED: exit:
+; FIXED-NEXT: ret void
+;
+; SCALABLE-LABEL: @load_store_factor2_i32(
+; SCALABLE-NEXT: entry:
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE: vector.ph:
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
+; SCALABLE: vector.body:
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
+; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
+; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP13:%.*]] = add i64 [[TMP7]], 1
+; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP13]]
+; SCALABLE-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 -1
+; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP15]])
+; SCALABLE-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP16]], align 4
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE: middle.block:
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE: scalar.ph:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: br label [[LOOP:%.*]]
+; SCALABLE: loop:
+; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
+; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
+; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
+; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
+; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
+; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
+; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
+; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
+; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
+; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
+; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; SCALABLE: exit:
+; SCALABLE-NEXT: ret void
;
entry:
br label %loop
@@ -75,53 +134,111 @@ exit:
}
define void @load_store_factor2_i64(ptr %p) {
-; CHECK-LABEL: @load_store_factor2_i64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; FIXED-LABEL: @load_store_factor2_i64(
+; FIXED-NEXT: entry:
+; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED: vector.ph:
+; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
+; FIXED: vector.body:
+; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
+; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
+; FIXED-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
+; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
+; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
+; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-NEXT: br i...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, after applying this patch I can see plenty of VLA segmented loads and stores emitted on SPEC CPU 2017 and llvm-test-suite
The vlseg and vsseg intrinsic functions are not overloaded on pointer type, so cannot handle non-default address spaces. This fixes an error we see after #90583.
The support for interleaved accesses for scalable vector with a factor of 2 is enabled in vectorizer. Therefore, the patch removed the restriction for scalable vector with a factor of 2.