From b43293a52737145abd98c771156b633ab3e94167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Wed, 19 Nov 2025 18:00:32 -0800 Subject: [PATCH] VectorCombine: Improve the insert/extract fold in the narrowing case Keeping the extracted element in a natural position in the narrowed vector has two beneficial effects: 1. It makes the narrowing shuffles cheaper (at least on AMDGPU), which allows the insert/extract fold to trigger. 2. It makes the narrowing shuffles in a chain of extract/insert compatible, which allows foldLengthChangingShuffles to successfully recognize a chain that can be folded. There are minor X86 test changes that look reasonable to me. The IR change for AVX2 in llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll doesn't change the assembly generated by `llc -mtriple=x86_64-- -mattr=AVX2` at all. commit-id:c151bb04 --- .../Transforms/Vectorize/VectorCombine.cpp | 22 +++++-------------- .../VectorCombine/AMDGPU/extract-insert-i8.ll | 17 ++------------ .../X86/extract-insert-poison.ll | 12 ++++++---- .../VectorCombine/X86/extract-insert.ll | 8 +++---- .../Transforms/VectorCombine/X86/pr126085.ll | 4 ++-- 5 files changed, 22 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index fc39f4123fac4..9025b93f75458 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -4455,22 +4455,15 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) { SmallVector<int> Mask(NumDstElts, PoisonMaskElem); bool NeedExpOrNarrow = NumSrcElts != NumDstElts; - bool IsExtIdxInBounds = ExtIdx < NumDstElts; bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<PoisonValue>(SrcVec); if (NeedDstSrcSwap) { SK = TargetTransformInfo::SK_PermuteSingleSrc; - if (!IsExtIdxInBounds && NeedExpOrNarrow) - Mask[InsIdx] = 0; - else - Mask[InsIdx] = ExtIdx; + Mask[InsIdx] = ExtIdx % NumDstElts; std::swap(DstVec, SrcVec); } else 
{ SK = TargetTransformInfo::SK_PermuteTwoSrc; std::iota(Mask.begin(), Mask.end(), 0); - if (!IsExtIdxInBounds && NeedExpOrNarrow) - Mask[InsIdx] = NumDstElts; - else - Mask[InsIdx] = ExtIdx + NumDstElts; + Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts; } // Cost @@ -4491,14 +4484,11 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) { NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0, nullptr, {DstVec, SrcVec}); } else { - // When creating length-changing-vector, always create with a Mask whose - // first element has an ExtIdx, so that the first element of the vector - // being created is always the target to be extracted. + // When creating a length-changing-vector, always try to keep the relevant + // element in an equivalent position, so that bulk shuffles are more likely + // to be useful. ExtToVecMask.assign(NumDstElts, PoisonMaskElem); - if (IsExtIdxInBounds) - ExtToVecMask[ExtIdx] = ExtIdx; - else - ExtToVecMask[0] = ExtIdx; + ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx; // Add cost for expanding or narrowing NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DstVecTy, SrcVecTy, ExtToVecMask, CostKind); diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll index eaab7199a3cf3..442a93689a791 100644 --- a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-i8.ll @@ -91,21 +91,8 @@ entry: define <8 x i8> @extract_insert_chain_shortening(<32 x i8> %in) { ; OPT-LABEL: define <8 x i8> @extract_insert_chain_shortening( ; OPT-SAME: <32 x i8> [[IN:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[I_1:%.*]] = extractelement <32 x i8> [[IN]], i64 17 -; OPT-NEXT: [[I_2:%.*]] = extractelement <32 x i8> [[IN]], i64 18 -; OPT-NEXT: [[I_3:%.*]] = extractelement <32 x i8> [[IN]], i64 19 -; OPT-NEXT: [[I_5:%.*]] = extractelement <32 x i8> [[IN]], i64 21 -; OPT-NEXT: 
[[I_6:%.*]] = extractelement <32 x i8> [[IN]], i64 22 -; OPT-NEXT: [[I_7:%.*]] = extractelement <32 x i8> [[IN]], i64 23 -; OPT-NEXT: [[O_0:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> -; OPT-NEXT: [[O_1:%.*]] = insertelement <8 x i8> [[O_0]], i8 [[I_1]], i32 1 -; OPT-NEXT: [[O_2:%.*]] = insertelement <8 x i8> [[O_1]], i8 [[I_2]], i32 2 -; OPT-NEXT: [[O_3:%.*]] = insertelement <8 x i8> [[O_2]], i8 [[I_3]], i32 3 -; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> -; OPT-NEXT: [[O_4:%.*]] = shufflevector <8 x i8> [[O_3]], <8 x i8> [[TMP1]], <8 x i32> -; OPT-NEXT: [[O_5:%.*]] = insertelement <8 x i8> [[O_4]], i8 [[I_5]], i32 5 -; OPT-NEXT: [[O_6:%.*]] = insertelement <8 x i8> [[O_5]], i8 [[I_6]], i32 6 -; OPT-NEXT: [[O_7:%.*]] = insertelement <8 x i8> [[O_6]], i8 [[I_7]], i32 7 +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[IN]], <32 x i8> poison, <8 x i32> +; OPT-NEXT: [[O_7:%.*]] = shufflevector <8 x i8> poison, <8 x i8> [[TMP1]], <8 x i32> ; OPT-NEXT: ret <8 x i8> [[O_7]] ; %i.0 = extractelement <32 x i8> %in, i64 16 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll index e85c092b1b213..228f161698bb2 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll @@ -140,10 +140,14 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b) } define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) { -; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64( -; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3 -; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0 -; CHECK-NEXT: ret <2 x double> [[INS]] +; SSE-LABEL: @src_ins0_v2f64_ext3_v4f64( +; SSE-NEXT: [[INS:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> +; 
SSE-NEXT: ret <2 x double> [[INS]] +; +; AVX-LABEL: @src_ins0_v2f64_ext3_v4f64( +; AVX-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[B:%.*]], i32 3 +; AVX-NEXT: [[INS:%.*]] = insertelement <2 x double> poison, double [[EXT]], i32 0 +; AVX-NEXT: ret <2 x double> [[INS]] ; %ext = extractelement <4 x double> %b, i32 3 %ins = insertelement <2 x double> poison, double %ext, i32 0 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll index 193ad36616a4a..e591ea55a453d 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert.ll @@ -136,8 +136,8 @@ define <2 x double> @src_ins0_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b) define <2 x double> @src_ins0_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @src_ins0_v2f64_ext3_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> ; CHECK-NEXT: ret <2 x double> [[INS]] ; %ext = extractelement <4 x double> %b, i32 3 @@ -185,8 +185,8 @@ define <2 x double> @src_ins1_v2f64_ext2_v4f64(<2 x double> %a, <4 x double> %b) define <2 x double> @src_ins1_v2f64_ext3_v4f64(<2 x double> %a, <4 x double> %b) { ; CHECK-LABEL: @src_ins1_v2f64_ext3_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> 
[[TMP1]], <2 x i32> ; CHECK-NEXT: ret <2 x double> [[INS]] ; %ext = extractelement <4 x double> %b, i32 3 diff --git a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll index f596807027db6..d29cdb3d95c81 100644 --- a/llvm/test/Transforms/VectorCombine/X86/pr126085.ll +++ b/llvm/test/Transforms/VectorCombine/X86/pr126085.ll @@ -6,8 +6,8 @@ define i32 @test(ptr %a0) { ; CHECK-SAME: ptr [[A0:%.*]]) { ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, ptr [[A0]], align 1 ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> -; CHECK-NEXT: [[ELT:%.*]] = extractelement <16 x i8> [[LOAD]], i64 11 -; CHECK-NEXT: [[INS:%.*]] = insertelement <4 x i8> [[SHUF]], i8 [[ELT]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[INS:%.*]] = shufflevector <4 x i8> [[SHUF]], <4 x i8> [[TMP1]], <4 x i32> ; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i8> [[INS]] to i32 ; CHECK-NEXT: ret i32 [[RES]] ;