-
Notifications
You must be signed in to change notification settings - Fork 15.2k
VectorCombine: Improve the insert/extract fold in the narrowing case #168820
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/nhaehnle/spr/main/a960175d
Are you sure you want to change the base?
VectorCombine: Improve the insert/extract fold in the narrowing case #168820
Conversation
|
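@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-amdgpu Author: Nicolai Hähnle (nhaehnle) Changes: Keeping the extracted element in a natural position in the narrowed vector has two beneficial effects: 1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which allows the insert/extract fold to trigger. 2. It makes the narrowing shuffles in a chain of extract/insert compatible, which allows foldLengthChangingShuffles to successfully recognize a chain that can be folded.
There are minor X86 test changes that look reasonable to me. The IR change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2` at all. Stack: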
Full diff: https://github.com/llvm/llvm-project/pull/168820.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fc39f4123fac4..9025b93f75458 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -4455,22 +4455,15 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
- bool IsExtIdxInBounds = ExtIdx < NumDstElts;
bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
if (NeedDstSrcSwap) {
SK = TargetTransformInfo::SK_PermuteSingleSrc;
- if (!IsExtIdxInBounds && NeedExpOrNarrow)
- Mask[InsIdx] = 0;
- else
- Mask[InsIdx] = ExtIdx;
+ Mask[InsIdx] = ExtIdx % NumDstElts;
std::swap(DstVec, SrcVec);
} else {
SK = TargetTransformInfo::SK_PermuteTwoSrc;
std::iota(Mask.begin(), Mask.end(), 0);
- if (!IsExtIdxInBounds && NeedExpOrNarrow)
- Mask[InsIdx] = NumDstElts;
- else
- Mask[InsIdx] = ExtIdx + NumDstElts;
+ Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
}
// Cost
@@ -4491,14 +4484,11 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
nullptr, {DstVec, SrcVec});
} else {
- // When creating length-changing-vector, always create with a Mask whose
- // first element has an ExtIdx, so that the first element of the vector
- // being created is always the target to be extracted.
+ // When creating a length-changing-vector, always try to keep the relevant
+ // element in an equivalent position, so that bulk shuffles are more likely
+ // to be useful.
ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
- if (IsExtIdxInBounds)
- ExtToVecMask[ExtIdx] = ExtIdx;
- else
- ExtToVecMask[0] = ExtIdx;
+ ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
// Add cost for expanding or narrowing
NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
index eaab7199a3cf3..442a93689a791 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
@@ -91,21 +91,8 @@ entry:
define <8 x i8> @extract_insert_chain_shortening(<32 x i8> %in) {
; OPT-LABEL: define <8 x i8> @extract_insert_chain_shortening(
; OPT-SAME: <32 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[I_1:%.*]] = extractelement <32 x i8> [[IN]], i64 17
-; OPT-NEXT: [[I_2:%.*]] = extractelement <32 x i8> [[IN]], i64 18
-; OPT-NEXT: [[I_3:%.*]] = extractelement <32 x i8> [[IN]], i64 19
-; OPT-NEXT: [[I_5:%.*]] = extractelement <32 x i8> [[IN]], i64 21
-; OPT-NEXT: [[I_6:%.*]] = extractelement <32 x i8> [[IN]], i64 22
-; OPT-NEXT: [[I_7:%.*]] = extractelement <32 x i8> [[IN]], i64 23
-; OPT-NEXT: [[O_0:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT: [[O_1:%.*]] = insertelement <8 x i8> [[O_0]], i8 [[I_1]], i32 1
-; OPT-NEXT: [[O_2:%.*]] = insertelement <8 x i8> [[O_1]], i8 [[I_2]], i32 2
-; OPT-NEXT: [[O_3:%.*]] = insertelement <8 x i8> [[O_2]], i8 [[I_3]], i32 3
-; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 20, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT: [[O_4:%.*]] = shufflevector <8 x i8> [[O_3]], <8 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7>
-; OPT-NEXT: [[O_5:%.*]] = insertelement <8 x i8> [[O_4]], i8 [[I_5]], i32 5
-; OPT-NEXT: [[O_6:%.*]] = insertelement <8 x i8> [[O_5]], i8 [[I_6]], i32 6
-; OPT-NEXT: [[O_7:%.*]] = insertelement <8 x i8> [[O_6]], i8 [[I_7]], i32 7
+; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; OPT-NEXT: [[O_7:%.*]] = shufflevector <8 x i8> poison, <8 x i8> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; OPT-NEXT: ret <8 x i8> [[O_7]]
;
%i.0 = extractelement <32 x i8> %in, i64 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
index e85c092b1b213..228f161698bb2 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
@@ -140,10 +140,14 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
}
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
-; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
-; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
-; CHECK-NEXT: ret <2 x double> [[INS]]
+; SSE-LABEL: @src_ins0_v2f64_ext3_v4f64(
+; SSE-NEXT: [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
+; SSE-NEXT: ret <2 x double> [[INS]]
+;
+; AVX-LABEL: @src_ins0_v2f64_ext3_v4f64(
+; AVX-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
+; AVX-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
+; AVX-NEXT: ret <2 x double> [[INS]]
;
%ext = extractelement <4 x double> %b, i32 3
%ins = insertelement <2 x double> poison, double %ext, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
index 193ad36616a4a..e591ea55a453d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
@@ -136,8 +136,8 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
-; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
; CHECK-NEXT: ret <2 x double> [[INS]]
;
%ext = extractelement <4 x double> %b, i32 3
@@ -185,8 +185,8 @@ define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
define <2 x double> @src_ins1_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @src_ins1_v2f64_ext3_v4f64(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
-; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: ret <2 x double> [[INS]]
;
%ext = extractelement <4 x double> %b, i32 3
diff --git a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
index f596807027db6..d29cdb3d95c81 100644
--- a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
@@ -6,8 +6,8 @@ define i32 @test(ptr %a0) {
; CHECK-SAME: ptr [[A0:%.*]]) {
; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[A0]], align 1
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <16 x i8> [[LOAD]], i64 11
-; CHECK-NEXT: [[INS:%.*]] = insertelement <4 x i8> [[SHUF]], i8 [[ELT]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 11>
+; CHECK-NEXT: [[INS:%.*]] = shufflevector <4 x i8> [[SHUF]], <4 x i8> [[TMP1]], <4 x i32> <i32 0, i32 7, i32 2, i32 3>
; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i8> [[INS]] to i32
; CHECK-NEXT: ret i32 [[RES]]
;
|
|
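@llvm/pr-subscribers-llvm-transforms Author: Nicolai Hähnle (nhaehnle) Changes: Keeping the extracted element in a natural position in the narrowed vector has two beneficial effects: 1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which allows the insert/extract fold to trigger. 2. It makes the narrowing shuffles in a chain of extract/insert compatible, which allows foldLengthChangingShuffles to successfully recognize a chain that can be folded.
There are minor X86 test changes that look reasonable to me. The IR change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2` at all. Stack: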
Full diff: https://github.com/llvm/llvm-project/pull/168820.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fc39f4123fac4..9025b93f75458 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -4455,22 +4455,15 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
- bool IsExtIdxInBounds = ExtIdx < NumDstElts;
bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
if (NeedDstSrcSwap) {
SK = TargetTransformInfo::SK_PermuteSingleSrc;
- if (!IsExtIdxInBounds && NeedExpOrNarrow)
- Mask[InsIdx] = 0;
- else
- Mask[InsIdx] = ExtIdx;
+ Mask[InsIdx] = ExtIdx % NumDstElts;
std::swap(DstVec, SrcVec);
} else {
SK = TargetTransformInfo::SK_PermuteTwoSrc;
std::iota(Mask.begin(), Mask.end(), 0);
- if (!IsExtIdxInBounds && NeedExpOrNarrow)
- Mask[InsIdx] = NumDstElts;
- else
- Mask[InsIdx] = ExtIdx + NumDstElts;
+ Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
}
// Cost
@@ -4491,14 +4484,11 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
nullptr, {DstVec, SrcVec});
} else {
- // When creating length-changing-vector, always create with a Mask whose
- // first element has an ExtIdx, so that the first element of the vector
- // being created is always the target to be extracted.
+ // When creating a length-changing-vector, always try to keep the relevant
+ // element in an equivalent position, so that bulk shuffles are more likely
+ // to be useful.
ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
- if (IsExtIdxInBounds)
- ExtToVecMask[ExtIdx] = ExtIdx;
- else
- ExtToVecMask[0] = ExtIdx;
+ ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
// Add cost for expanding or narrowing
NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
index eaab7199a3cf3..442a93689a791 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll
@@ -91,21 +91,8 @@ entry:
define <8 x i8> @extract_insert_chain_shortening(<32 x i8> %in) {
; OPT-LABEL: define <8 x i8> @extract_insert_chain_shortening(
; OPT-SAME: <32 x i8> [[IN:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[I_1:%.*]] = extractelement <32 x i8> [[IN]], i64 17
-; OPT-NEXT: [[I_2:%.*]] = extractelement <32 x i8> [[IN]], i64 18
-; OPT-NEXT: [[I_3:%.*]] = extractelement <32 x i8> [[IN]], i64 19
-; OPT-NEXT: [[I_5:%.*]] = extractelement <32 x i8> [[IN]], i64 21
-; OPT-NEXT: [[I_6:%.*]] = extractelement <32 x i8> [[IN]], i64 22
-; OPT-NEXT: [[I_7:%.*]] = extractelement <32 x i8> [[IN]], i64 23
-; OPT-NEXT: [[O_0:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT: [[O_1:%.*]] = insertelement <8 x i8> [[O_0]], i8 [[I_1]], i32 1
-; OPT-NEXT: [[O_2:%.*]] = insertelement <8 x i8> [[O_1]], i8 [[I_2]], i32 2
-; OPT-NEXT: [[O_3:%.*]] = insertelement <8 x i8> [[O_2]], i8 [[I_3]], i32 3
-; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 20, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; OPT-NEXT: [[O_4:%.*]] = shufflevector <8 x i8> [[O_3]], <8 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7>
-; OPT-NEXT: [[O_5:%.*]] = insertelement <8 x i8> [[O_4]], i8 [[I_5]], i32 5
-; OPT-NEXT: [[O_6:%.*]] = insertelement <8 x i8> [[O_5]], i8 [[I_6]], i32 6
-; OPT-NEXT: [[O_7:%.*]] = insertelement <8 x i8> [[O_6]], i8 [[I_7]], i32 7
+; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; OPT-NEXT: [[O_7:%.*]] = shufflevector <8 x i8> poison, <8 x i8> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; OPT-NEXT: ret <8 x i8> [[O_7]]
;
%i.0 = extractelement <32 x i8> %in, i64 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
index e85c092b1b213..228f161698bb2 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
@@ -140,10 +140,14 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
}
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
-; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
-; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
-; CHECK-NEXT: ret <2 x double> [[INS]]
+; SSE-LABEL: @src_ins0_v2f64_ext3_v4f64(
+; SSE-NEXT: [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
+; SSE-NEXT: ret <2 x double> [[INS]]
+;
+; AVX-LABEL: @src_ins0_v2f64_ext3_v4f64(
+; AVX-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3
+; AVX-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0
+; AVX-NEXT: ret <2 x double> [[INS]]
;
%ext = extractelement <4 x double> %b, i32 3
%ins = insertelement <2 x double> poison, double %ext, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
index 193ad36616a4a..e591ea55a453d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll
@@ -136,8 +136,8 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
-; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
; CHECK-NEXT: ret <2 x double> [[INS]]
;
%ext = extractelement <4 x double> %b, i32 3
@@ -185,8 +185,8 @@ define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b)
define <2 x double> @src_ins1_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @src_ins1_v2f64_ext3_v4f64(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 3, i32 poison>
-; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: ret <2 x double> [[INS]]
;
%ext = extractelement <4 x double> %b, i32 3
diff --git a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
index f596807027db6..d29cdb3d95c81 100644
--- a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll
@@ -6,8 +6,8 @@ define i32 @test(ptr %a0) {
; CHECK-SAME: ptr [[A0:%.*]]) {
; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[A0]], align 1
; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[ELT:%.*]] = extractelement <16 x i8> [[LOAD]], i64 11
-; CHECK-NEXT: [[INS:%.*]] = insertelement <4 x i8> [[SHUF]], i8 [[ELT]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 11>
+; CHECK-NEXT: [[INS:%.*]] = shufflevector <4 x i8> [[SHUF]], <4 x i8> [[TMP1]], <4 x i32> <i32 0, i32 7, i32 2, i32 3>
; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i8> [[INS]] to i32
; CHECK-NEXT: ret i32 [[RES]]
;
|
🐧 Linux x64 Test Results
|
Keeping the extracted element in a natural position in the narrowed vector has two beneficial effects: 1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which allows the insert/extract fold to trigger. 2. It makes the narrowing shuffles in a chain of extract/insert compatible, which allows foldLengthChangingShuffles to successfully recognize a chain that can be folded. There are minor X86 test changes that look reasonable to me. The IR change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2` at all. commit-id:c151bb04
21c74b5 to
aaee5f6
Compare
19a4296 to
b43293a
Compare
Keeping the extracted element in a natural position in the narrowed
vector has two beneficial effects:
1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which
allows the insert/extract fold to trigger.
2. It makes the narrowing shuffles in a chain of extract/insert
compatible, which allows foldLengthChangingShuffles to successfully
recognize a chain that can be folded.
There are minor X86 test changes that look reasonable to me. The IR
change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
doesn't change the assembly generated by
`llc -mtriple=x86_64-- -mattr=AVX2` at all.
Stack: