diff --git a/clang/test/CodeGen/X86/avx-shuffle-builtins.c b/clang/test/CodeGen/X86/avx-shuffle-builtins.c
index 9109247e534f4..82be43bc05049 100644
--- a/clang/test/CodeGen/X86/avx-shuffle-builtins.c
+++ b/clang/test/CodeGen/X86/avx-shuffle-builtins.c
@@ -60,7 +60,8 @@ __m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) {
 
 __m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_permute2f128_si256
-  // CHECK: shufflevector{{.*}} <8 x i32>
+  // X64: shufflevector{{.*}}
+  // X86: shufflevector{{.*}}
   return _mm256_permute2f128_si256(a, b, 0x20);
 }
 
@@ -104,7 +105,8 @@ __m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
 
 __m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
   // CHECK-LABEL: test_mm256_insertf128_si256_0
-  // CHECK: shufflevector{{.*}}
+  // X64: shufflevector{{.*}}
+  // X86: shufflevector{{.*}}
   return _mm256_insertf128_si256(a, b, 0);
 }
 
@@ -122,7 +124,8 @@ __m256d test_mm256_insertf128_pd_1(__m256d a, __m128d b) {
 
 __m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
   // CHECK-LABEL: test_mm256_insertf128_si256_1
-  // CHECK: shufflevector{{.*}}
+  // X64: shufflevector{{.*}}
+  // X86: shufflevector{{.*}}
   return _mm256_insertf128_si256(a, b, 1);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0b16a8b767692..23494314f132c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -684,10 +684,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
 /// destination type followed by shuffle. This can enable further transforms by
 /// moving bitcasts or shuffles together.
 bool VectorCombine::foldBitcastShuffle(Instruction &I) {
-  Value *V0;
+  Value *V0, *V1;
   ArrayRef<int> Mask;
   if (!match(&I, m_BitCast(m_OneUse(
-                     m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))))
+                     m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
     return false;
 
   // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
@@ -728,17 +728,21 @@
       FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
   auto *OldShuffleTy =
       FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
+  bool IsUnary = isa<UndefValue>(V1);
+  unsigned NumOps = IsUnary ? 1 : 2;
 
   // The new shuffle must not cost more than the old shuffle.
   TargetTransformInfo::TargetCostKind CK =
       TargetTransformInfo::TCK_RecipThroughput;
   TargetTransformInfo::ShuffleKind SK =
-      TargetTransformInfo::SK_PermuteSingleSrc;
+      IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
+              : TargetTransformInfo::SK_PermuteTwoSrc;
 
   InstructionCost DestCost =
       TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
-      TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
-                           TargetTransformInfo::CastContextHint::None, CK);
+      (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
+                                     TargetTransformInfo::CastContextHint::None,
+                                     CK));
   InstructionCost SrcCost =
       TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
       TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
@@ -746,10 +750,11 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
   if (DestCost > SrcCost || !DestCost.isValid())
     return false;
 
-  // bitcast (shuf V0, MaskC) --> shuf (bitcast V0), MaskC'
+  // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
   ++NumShufOfBitcast;
-  Value *CastV = Builder.CreateBitCast(V0, NewShuffleTy);
-  Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
+  Value *CastV0 = Builder.CreateBitCast(V0, NewShuffleTy);
+  Value *CastV1 = Builder.CreateBitCast(V1, NewShuffleTy);
+  Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
   replaceValue(I, *Shuf);
   return true;
 }
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
index 211c90b5604e6..e61b254b7a5f1 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
 
 define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b) {
 ; CHECK-LABEL: @PR67803(
@@ -35,6 +35,35 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
 ; CHECK-NEXT:    [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP20]], <4 x i32>
 ; CHECK-NEXT:    ret <4 x i64> [[SHUFFLE_I23]]
 ;
+; AVX512-LABEL: @PR67803(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
+; AVX512-NEXT:    [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; AVX512-NEXT:    [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
+; AVX512-NEXT:    [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32>
+; AVX512-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32>
+; AVX512-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <16 x i32>
+; AVX512-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
+; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <16 x i32>
+; AVX512-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; AVX512-NEXT:    [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32>
+; AVX512-NEXT:    [[TMP10:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]])
+; AVX512-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+; AVX512-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[A]] to <32 x i8>
+; AVX512-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32>
+; AVX512-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[B]] to <32 x i8>
+; AVX512-NEXT:    [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> poison, <16 x i32>
+; AVX512-NEXT:    [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
+; AVX512-NEXT:    [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> poison, <16 x i32>
+; AVX512-NEXT:    [[TMP18:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP13]], <16 x i8> [[TMP15]], <16 x i8> [[TMP17]])
+; AVX512-NEXT:    [[TMP19:%.*]] = bitcast <16 x i8> [[TMP18]] to <2 x i64>
+; AVX512-NEXT:    [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP19]], <4 x i32>
+; AVX512-NEXT:    ret <4 x i64> [[SHUFFLE_I23]]
+;
 entry:
   %0 = bitcast <4 x i64> %x to <8 x i32>
   %extract = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32>
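
Note: a minimal before/after sketch of the extended fold in LLVM IR (hand-written for illustration; the function name and the <4 x i32> -> <2 x i64> types are hypothetical, not taken from the patch's tests). With a binary shuffle, the bitcast is now pushed through both operands when the rescaled mask exists and the cost model allows it:

  ; before: bitcast (shuf V0, V1, MaskC)
  define <2 x i64> @two_src_fold(<4 x i32> %v0, <4 x i32> %v1) {
    %shuf = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    %cast = bitcast <4 x i32> %shuf to <2 x i64>
    ret <2 x i64> %cast
  }

  ; after: shuf (bitcast V0), (bitcast V1), MaskC'
  ; each i64 lane covers two adjacent i32 lanes, so the mask rescales to <0, 2>
  define <2 x i64> @two_src_fold(<4 x i32> %v0, <4 x i32> %v1) {
    %cast0 = bitcast <4 x i32> %v0 to <2 x i64>
    %cast1 = bitcast <4 x i32> %v1 to <2 x i64>
    %shuf = shufflevector <2 x i64> %cast0, <2 x i64> %cast1, <2 x i32> <i32 0, i32 2>
    ret <2 x i64> %shuf
  }

Because the rewritten form needs one bitcast per shuffle operand, the cost check above charges NumOps casts and uses SK_PermuteTwoSrc for binary shuffles; the fold only fires when that total is no more expensive than the original shuffle plus the single bitcast it replaces.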