[X86] X86FixupVectorConstantsPass - attempt to replace full width fp vector constant loads with broadcasts on AVX+ targets

lowerBuildVectorAsBroadcast will not broadcast splat constants in all cases, so we often end up with a full-width vector load that failed to fold but is loading splat constant values. Such a load could use a broadcast load instruction just as cheaply, and save constant pool space.

NOTE: SSE3 targets can use MOVDDUP, but not all SSE-era CPUs perform it as cheaply as a vector load; we will need to add scheduler model checks if we want to pursue this.
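For illustration only (this example is not part of the commit message), the fneg test in avx-basic.ll below shows the pattern being fixed. Compiled with something like llc -mtriple=x86_64-- -mattr=+avx, fneg lowers to an XOR against a splat of -0.0; because the <16 x float> operation is split into two 256-bit XORs that share the constant in a register, the load cannot be folded into either XOR and previously stayed a full-width vmovaps:

define <16 x float> @fneg(<16 x float> %a) nounwind {
  %r = fneg <16 x float> %a
  ret <16 x float> %r
}
; Before this patch: vmovaps loads all 8 copies of -0.0 from a 32-byte
; constant-pool entry; after it, vbroadcastss loads a single 4-byte
; element instead (see the avx-basic.ll diff below for the CHECK lines).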
RKSimon committed May 29, 2023
1 parent 7fb60b0 commit 9806101
Showing 81 changed files with 836 additions and 853 deletions.
47 changes: 46 additions & 1 deletion llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -229,7 +229,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                                                     MachineBasicBlock &MBB,
                                                     MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
  bool HasDQI = ST->hasDQI();

  auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
                                unsigned OpBcst64, unsigned OpBcst32,
@@ -262,6 +263,50 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
    return false;
  };

  // Attempt to convert full width vector loads into broadcast loads.
  switch (Opc) {
  /* FP Loads */
  case X86::MOVAPDrm:
  case X86::MOVAPSrm:
  case X86::MOVUPDrm:
  case X86::MOVUPSrm:
    // TODO: SSE3 MOVDDUP Handling
    return false;
  case X86::VMOVAPDrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPDrm:
  case X86::VMOVUPSrm:
    return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
                              1);
  case X86::VMOVAPDYrm:
  case X86::VMOVAPSYrm:
  case X86::VMOVUPDYrm:
  case X86::VMOVUPSYrm:
    return ConvertToBroadcast(0, X86::VBROADCASTF128, X86::VBROADCASTSDYrm,
                              X86::VBROADCASTSSYrm, 0, 0, 1);
  case X86::VMOVAPDZ128rm:
  case X86::VMOVAPSZ128rm:
  case X86::VMOVUPDZ128rm:
  case X86::VMOVUPSZ128rm:
    return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
                              X86::VBROADCASTSSZ128rm, 0, 0, 1);
  case X86::VMOVAPDZ256rm:
  case X86::VMOVAPSZ256rm:
  case X86::VMOVUPDZ256rm:
  case X86::VMOVUPSZ256rm:
    return ConvertToBroadcast(
        0, HasDQI ? X86::VBROADCASTF64X2Z128rm : X86::VBROADCASTF32X4Z256rm,
        X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm, 0, 0, 1);
  case X86::VMOVAPDZrm:
  case X86::VMOVAPSZrm:
  case X86::VMOVUPDZrm:
  case X86::VMOVUPSZrm:
    return ConvertToBroadcast(
        HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm,
        HasDQI ? X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm,
        X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1);
  }

  // Attempt to find an AVX512 mapping from a full width memory-fold
  // instruction to a broadcast-fold instruction variant.
  if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) {
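The body of the ConvertToBroadcast lambda is collapsed in the hunk above. As a rough sketch of its shape only (this is not the committed code: rebuildSplatConstant() is a hypothetical helper, and just the 64/32-bit slots are shown where the real lambda takes 256/128/64/32/16/8-bit opcodes), it inspects the instruction's constant-pool operand, finds the narrowest splat representation available, adds the shrunk constant to the pool, and swaps in the broadcast opcode:

#include "MCTargetDesc/X86BaseInfo.h" // X86::AddrDisp (in-tree path)
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constant.h"
#include <utility>

using namespace llvm;

// Hypothetical helper: return C rewritten as a single SplatBitWidth-bit
// splat element, or nullptr if C is not a splat at that width.
static Constant *rebuildSplatConstant(const Constant *C,
                                      unsigned SplatBitWidth);

static bool convertToBroadcast(MachineInstr &MI, const TargetInstrInfo &TII,
                               MachineConstantPool &CP, unsigned OperandNo,
                               unsigned OpBcst64, unsigned OpBcst32) {
  // The displacement operand of the folded load holds the constant-pool
  // index of the full-width constant.
  MachineOperand &CstOp = MI.getOperand(OperandNo + X86::AddrDisp);
  if (!CstOp.isCPI())
    return false;
  const MachineConstantPoolEntry &Entry = CP.getConstants()[CstOp.getIndex()];
  if (Entry.isMachineConstantPoolEntry()) // Target-specific entry: skip.
    return false;
  const Constant *C = Entry.Val.ConstVal;

  // Try the narrowest splat first: a 32-bit splat makes the smallest
  // constant-pool entry (e.g. 4 bytes instead of 32 for a ymm load).
  const std::pair<unsigned, unsigned> Broadcasts[] = {{32, OpBcst32},
                                                      {64, OpBcst64}};
  for (auto [BitWidth, OpBcst] : Broadcasts) {
    if (!OpBcst)
      continue;
    if (Constant *Splat = rebuildSplatConstant(C, BitWidth)) {
      // Point the operand at a new, smaller pool entry and swap in the
      // broadcast-load opcode (e.g. VBROADCASTSSrm / VMOVDDUPrm).
      CstOp.setIndex(CP.getConstantPoolIndex(Splat, Align(BitWidth / 8)));
      MI.setDesc(TII.get(OpBcst));
      return true;
    }
  }
  return false;
}

Call sites, like the switch above, then simply pass 0 for any broadcast width the target cannot use.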
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx-basic.ll
@@ -87,7 +87,7 @@ define <8 x i32> @VMOVZQI2PQI(ptr nocapture %aFOO) nounwind {
define <16 x float> @fneg(<16 x float> %a) nounwind {
; CHECK-LABEL: fneg:
; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: retq
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -300,12 +300,12 @@ entry:
define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT: retq
entry:
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/X86/avx2-conversions.ll
@@ -16,7 +16,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
;
; X86-FAST-ALL-LABEL: trunc4:
; X86-FAST-ALL: # %bb.0:
; X86-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; X86-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; X86-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; X86-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X86-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X86-FAST-ALL-NEXT: vzeroupper
@@ -38,7 +39,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
;
; X64-FAST-ALL-LABEL: trunc4:
; X64-FAST-ALL: # %bb.0:
; X64-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; X64-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; X64-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; X64-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-FAST-ALL-NEXT: vzeroupper
56 changes: 32 additions & 24 deletions llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -72,30 +72,34 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_packsswb_fold() {
; X86-AVX-LABEL: test_x86_avx2_packsswb_fold:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
; X86-AVX512VL: # %bb.0:
; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: test_x86_avx2_packsswb_fold:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
; X64-AVX512VL: # %bb.0:
; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
ret <32 x i8> %res
Expand All @@ -121,30 +125,34 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_packuswb_fold() {
; X86-AVX-LABEL: test_x86_avx2_packuswb_fold:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl # encoding: [0xc3]
;
; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
; X86-AVX512VL: # %bb.0:
; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X86-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-AVX-LABEL: test_x86_avx2_packuswb_fold:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
; X64-AVX512VL: # %bb.0:
; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A]
; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
ret <32 x i8> %res
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -657,12 +657,12 @@ define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
define <8 x i8> @_e4(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e4:
; X86: ## %bb.0:
; X86-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X86-NEXT: retl
;
; X64-LABEL: _e4:
; X64: ## %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u>
; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52]
; X64-NEXT: retq
%vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -98,9 +98,10 @@ define dso_local i64 @caller_argv64i1() #0 {
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %edi
; X32-NEXT: subl $88, %esp
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1]
; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2,1,2,1]
; X32-NEXT: # xmm0 = mem[0,0]
; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
; X32-NEXT: vmovups %zmm0, (%esp)
; X32-NEXT: movl $1, {{[0-9]+}}(%esp)
; X32-NEXT: movl $2, {{[0-9]+}}(%esp)
15 changes: 10 additions & 5 deletions llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -3630,7 +3630,8 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u>
; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,10,6,15,0,10,6,15]
; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
Expand All @@ -3648,7 +3649,8 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u>
; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,10,6,15,0,10,6,15]
; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -3892,7 +3894,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp,
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3]
; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,3,7,3]
; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
Expand All @@ -3902,7 +3905,8 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3]
; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3]
; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
Expand All @@ -3917,7 +3921,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3]
; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [7,3,7,3]
; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/bitreverse.ll
@@ -592,12 +592,12 @@ define <2 x i16> @fold_v2i16() {
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP: # %bb.0:
; X86XOP-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; X86XOP-NEXT: retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI: # %bb.0:
; GFNI-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; GFNI-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -296,7 +296,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
@@ -328,7 +328,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/cast-vsel.ll
@@ -194,7 +194,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; AVX1-LABEL: trunc:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
@@ -337,7 +337,7 @@ define dso_local void @example25() nounwind {
; AVX1-LABEL: example25:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB5_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
