54 changes: 54 additions & 0 deletions llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};

// Is ADD(X,X) more efficient than SHL(X,1)?
auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
if (MI.getOperand(NumOperands - 1).getImm() != 1)
return false;
if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
return false;
LLVM_DEBUG(dbgs() << "Replacing: " << MI);
{
MI.setDesc(TII->get(AddOpc));
MI.removeOperand(NumOperands - 1);
MI.addOperand(MI.getOperand(NumOperands - 2));
}
LLVM_DEBUG(dbgs() << " With: " << MI);
return true;
};

switch (Opc) {
case X86::BLENDPDrri:
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
case X86::VUNPCKHPSZrmkz:
return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);

case X86::PSLLWri:
return ProcessShiftLeftToAdd(X86::PADDWrr);
case X86::VPSLLWri:
return ProcessShiftLeftToAdd(X86::VPADDWrr);
case X86::VPSLLWYri:
return ProcessShiftLeftToAdd(X86::VPADDWYrr);
case X86::VPSLLWZ128ri:
return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
case X86::VPSLLWZ256ri:
return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
case X86::VPSLLWZri:
return ProcessShiftLeftToAdd(X86::VPADDWZrr);
case X86::PSLLDri:
return ProcessShiftLeftToAdd(X86::PADDDrr);
case X86::VPSLLDri:
return ProcessShiftLeftToAdd(X86::VPADDDrr);
case X86::VPSLLDYri:
return ProcessShiftLeftToAdd(X86::VPADDDYrr);
case X86::VPSLLDZ128ri:
return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
case X86::VPSLLDZ256ri:
return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
case X86::VPSLLDZri:
return ProcessShiftLeftToAdd(X86::VPADDDZrr);
case X86::PSLLQri:
return ProcessShiftLeftToAdd(X86::PADDQrr);
case X86::VPSLLQri:
return ProcessShiftLeftToAdd(X86::VPADDQrr);
case X86::VPSLLQYri:
return ProcessShiftLeftToAdd(X86::VPADDQYrr);
case X86::VPSLLQZ128ri:
return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
case X86::VPSLLQZ256ri:
return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
case X86::VPSLLQZri:
return ProcessShiftLeftToAdd(X86::VPADDQZrr);

default:
return false;
}
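For reference, a minimal IR sketch (not one of this patch's tests; the function name and llc invocation are illustrative assumptions) of the pattern the new ProcessShiftLeftToAdd hook targets:

; Build with: llc -mtriple=x86_64-- shl-by-one.ll -o -
; The splat shift-by-1 lowers to PSLLW $1, %xmm0, which the tuning pass
; may rewrite to PADDW %xmm0, %xmm0 where the scheduler model prefers the add.
define <8 x i16> @shl_by_one(<8 x i16> %x) {
  %r = shl <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %r
}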
16 changes: 1 addition & 15 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,

uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();

if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
// Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl: (shl V, 1) -> (add (freeze V), (freeze V))
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
// R may be undef at run-time, but (shl R, 1) must be an even number (LSB
// must be 0). (add undef, undef) however can be any value. To make this
// safe, we must freeze R to ensure that register allocation uses the same
// register for an undefined value. This ensures that the result will
// still be even and preserves the original semantics.
R = DAG.getFreeze(R);
return DAG.getNode(ISD::ADD, dl, VT, R, R);
}

if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
}

// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
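The comment removed above carries the correctness argument for the old DAG fold; a small IR sketch (illustrative, not from this patch) of why freeze was required before rewriting the shift as an add:

; (shl %x, 1) always has a zero low bit in each lane, even if %x is undef.
; (add %x, %x) without freeze could pick two different values for an undef %x,
; so the old lowering froze the operand first to keep the result even.
define <4 x i32> @shl_one_as_add(<4 x i32> %x) {
  %f = freeze <4 x i32> %x
  %r = add <4 x i32> %f, %f
  ret <4 x i32> %r
}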
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/combine-add.ll
@@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) {
; SSE-NEXT: psubd %xmm1, %xmm3
; SSE-NEXT: psubd %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: movdqu %xmm3, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/combine-mul.ll
@@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: paddq %xmm2, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
30 changes: 14 additions & 16 deletions llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2187,29 +2187,28 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: paddw %xmm4, %xmm4
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: paddb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: paddw %xmm0, %xmm3
; SSE41-NEXT: psllw $7, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: psllw $7, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: psllw $7, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
@@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/known-signbits-shl.ll
@@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind {
; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: por %xmm2, %xmm1
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: paddw %xmm0, %xmm2
; X64-NEXT: paddw %xmm2, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm3
; X64-NEXT: psraw $1, %xmm3
; X64-NEXT: pcmpeqw %xmm0, %xmm3
33 changes: 12 additions & 21 deletions llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
@@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
@@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
@@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
@@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
@@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
@@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
; X64-KNL-NEXT: kmovw %k1, %k2
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm2
; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/oddsubvector.ll
@@ -155,18 +155,18 @@ define <16 x i32> @PR42819(ptr %a0) {
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2: # %bb.0:
; SSE2-NEXT: movl b(%rip), %eax
; SSE2-NEXT: movdqa c+144(%rip), %xmm2
; SSE2-NEXT: movdqa c+128(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: addl b(%rip), %eax
; SSE2-NEXT: addl c+128(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: movdqa d+144(%rip), %xmm4
; SSE2-NEXT: psubd %xmm2, %xmm4
; SSE2-NEXT: paddd %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm5, %xmm5
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT: movdqa %xmm2, c+144(%rip)
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
@@ -191,17 +191,17 @@ define void @PR42833() {
;
; SSE42-LABEL: PR42833:
; SSE42: # %bb.0:
; SSE42-NEXT: movl b(%rip), %eax
; SSE42-NEXT: movdqa c+144(%rip), %xmm1
; SSE42-NEXT: movdqa c+128(%rip), %xmm0
; SSE42-NEXT: movd %xmm0, %eax
; SSE42-NEXT: addl b(%rip), %eax
; SSE42-NEXT: addl c+128(%rip), %eax
; SSE42-NEXT: movd %eax, %xmm2
; SSE42-NEXT: paddd %xmm0, %xmm2
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
; SSE42-NEXT: psubd %xmm1, %xmm3
; SSE42-NEXT: paddd %xmm1, %xmm1
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: paddd %xmm0, %xmm4
; SSE42-NEXT: paddd %xmm4, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: movdqa %xmm1, c+144(%rip)
; SSE42-NEXT: movdqa %xmm4, c+128(%rip)
38 changes: 22 additions & 16 deletions llvm/test/CodeGen/X86/pr62286.ll
@@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) {
; AVX1-LABEL: PR62286:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -59,12 +65,12 @@ define i64 @PR62286(i32 %a) {
; AVX512-LABEL: PR62286:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %edi, %xmm0
; AVX512-NEXT: movb $8, %al
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm1
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: movw $4369, %ax # imm = 0x1111
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0