diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3661de89e8070..4fe1d72127927 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17277,8 +17277,13 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
     return SDValue();
 
   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
-  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
-  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
+  // (with PERMQ/PERMPD). On AVX512BW targets, permuting 64-bit sub-lanes, even
+  // with a variable shuffle, is worth it for 64xi8 vectors. Otherwise we can
+  // only permute whole 128-bit lanes.
+  int SubLaneScale = 1;
+  if ((Subtarget.hasAVX2() && VT.is256BitVector()) ||
+      (Subtarget.hasBWI() && VT == MVT::v64i8))
+    SubLaneScale = 2;
   int NumSubLanes = NumLanes * SubLaneScale;
   int NumSubLaneElts = NumLaneElts / SubLaneScale;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 7b2217925faea..92fede6ec0abd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -815,19 +815,9 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,6,7],zmm1[2,3,6,7]
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14>
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[0,1,4,5]
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
-; AVX512BW-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,1,3,5,7]
+; AVX512BW-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
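
Side note, not part of the patch: VPACKUSWB on ZMM registers packs each 128-bit lane independently, which is why the new AVX512BW sequence in the updated test needs the trailing variable VPERMQ with indices [0,2,4,6,1,3,5,7]. Below is a minimal scalar C++ model of that qword reordering, written purely for illustration; the names Packed and Idx are made up here and do not come from LLVM.

// Illustrative scalar model of the new AVX512BW sequence's qword layout.
// After "vpackuswb %zmm1, %zmm0, %zmm0", each 128-bit lane i holds the packed
// bytes A_i (from zmm0 lane i) followed by B_i (from zmm1 lane i), so viewed
// as eight 64-bit qwords the register is [A0,B0,A1,B1,A2,B2,A3,B3]. The
// variable vpermq with indices [0,2,4,6,1,3,5,7] restores concatenated order.
#include <array>
#include <cstdio>

int main() {
  std::array<const char *, 8> Packed = {"A0", "B0", "A1", "B1",
                                        "A2", "B2", "A3", "B3"};
  std::array<int, 8> Idx = {0, 2, 4, 6, 1, 3, 5, 7}; // vpermq index vector
  for (int I = 0; I != 8; ++I)
    std::printf("%s ", Packed[Idx[I]]); // prints: A0 A1 A2 A3 B0 B1 B2 B3
  std::printf("\n");
  return 0;
}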