Skip to content

Commit

Permalink
[X86] Teach getUndefRegClearance that we use undef for inputs to PUNP…
Browse files Browse the repository at this point in the history
…CK in some cases.

This enables the register to be changed from XMM/YMM/ZMM0 to
instead match the other source. This prevents a false
dependency.

I added all the integer unpck instructions, but the tests
only show changes for BW and WD.

Unfortunately, we can have undef on operand 1 or 2 of the AVX
instructions. This breaks the interface with hasUndefRegUpdate
which used to tell which operand to check.

Now we scan the input operands looking for an undef register and
then ask hasUndefRegUpdate if it's an instruction we care about
and which operands of that instruction we care about.

I also had to make some changes to the load folding code to
always pass operand 1 to hasUndefRegUpdate. I've updated
hasUndefRegUpdate to return false when ForLoadFold is set for
instructions that are not explicitly blocked for load folding in
isel patterns.

Differential Revision: https://reviews.llvm.org/D79615
  • Loading branch information
topperc committed May 9, 2020
1 parent 56bf0b5 commit c7be6a8
Show file tree
Hide file tree
Showing 50 changed files with 1,136 additions and 1,082 deletions.
98 changes: 76 additions & 22 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
Expand Up @@ -4834,15 +4834,29 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// operand into the unused high bits of the destination operand.
// Also returns true for instructions that have two inputs where one may
// be undef and we want it to use the same register as the other input.
static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
bool ForLoadFold = false) {
// Set the OpNum parameter to the first source operand.
OpNum = 1;
switch (Opcode) {
case X86::PACKSSWBrr:
case X86::PACKUSWBrr:
case X86::PACKSSDWrr:
case X86::PACKUSDWrr:
case X86::PUNPCKHBWrr:
case X86::PUNPCKLBWrr:
case X86::PUNPCKHWDrr:
case X86::PUNPCKLWDrr:
case X86::PUNPCKHDQrr:
case X86::PUNPCKLDQrr:
case X86::PUNPCKHQDQrr:
case X86::PUNPCKLQDQrr:
// These instructions are sometimes used with an undef first or second
// source. Return true here so BreakFalseDeps will assign this source to the
// same register as the first source to avoid a false dependency.
// Operand 1 of these instructions is tied so they're separate from their
// VEX counterparts.
return OpNum == 2 && !ForLoadFold;

case X86::VPACKSSWBrr:
case X86::VPACKUSWBrr:
case X86::VPACKSSDWrr:
Expand All @@ -4851,12 +4865,51 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VPACKUSWBZ128rr:
case X86::VPACKSSDWZ128rr:
case X86::VPACKUSDWZ128rr:
// These instructions are sometimes used with an undef second source to
// truncate 128-bit vectors to 64-bit with undefined high bits. Return
// true here so BreakFalseDeps will assign this source to the same register
// as the first source to avoid a false dependency.
OpNum = 2;
return true;
case X86::VPUNPCKHBWrr:
case X86::VPUNPCKLBWrr:
case X86::VPUNPCKHBWYrr:
case X86::VPUNPCKLBWYrr:
case X86::VPUNPCKHBWZ128rr:
case X86::VPUNPCKLBWZ128rr:
case X86::VPUNPCKHBWZ256rr:
case X86::VPUNPCKLBWZ256rr:
case X86::VPUNPCKHBWZrr:
case X86::VPUNPCKLBWZrr:
case X86::VPUNPCKHWDrr:
case X86::VPUNPCKLWDrr:
case X86::VPUNPCKHWDYrr:
case X86::VPUNPCKLWDYrr:
case X86::VPUNPCKHWDZ128rr:
case X86::VPUNPCKLWDZ128rr:
case X86::VPUNPCKHWDZ256rr:
case X86::VPUNPCKLWDZ256rr:
case X86::VPUNPCKHWDZrr:
case X86::VPUNPCKLWDZrr:
case X86::VPUNPCKHDQrr:
case X86::VPUNPCKLDQrr:
case X86::VPUNPCKHDQYrr:
case X86::VPUNPCKLDQYrr:
case X86::VPUNPCKHDQZ128rr:
case X86::VPUNPCKLDQZ128rr:
case X86::VPUNPCKHDQZ256rr:
case X86::VPUNPCKLDQZ256rr:
case X86::VPUNPCKHDQZrr:
case X86::VPUNPCKLDQZrr:
case X86::VPUNPCKHQDQrr:
case X86::VPUNPCKLQDQrr:
case X86::VPUNPCKHQDQYrr:
case X86::VPUNPCKLQDQYrr:
case X86::VPUNPCKHQDQZ128rr:
case X86::VPUNPCKLQDQZ128rr:
case X86::VPUNPCKHQDQZ256rr:
case X86::VPUNPCKLQDQZ256rr:
case X86::VPUNPCKHQDQZrr:
case X86::VPUNPCKLQDQZrr:
// These instructions are sometimes used with an undef first or second
// source. Return true here so BreakFalseDeps will assign this source to the
// same register as the first source to avoid a false dependency.
return (OpNum == 1 || OpNum == 2) && !ForLoadFold;

case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
case X86::VCVTSI2SSrr_Int:
Expand Down Expand Up @@ -4914,7 +4967,7 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VCVTUSI642SDZrm_Int:
// Load folding won't effect the undef register update since the input is
// a GPR.
return !ForLoadFold;
return OpNum == 1 && !ForLoadFold;
case X86::VCVTSD2SSrr:
case X86::VCVTSD2SSrm:
case X86::VCVTSD2SSrr_Int:
Expand Down Expand Up @@ -5013,15 +5066,13 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VSQRTSDZrb_Int:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
return true;
return OpNum == 1;
case X86::VMOVSSZrrk:
case X86::VMOVSDZrrk:
OpNum = 3;
return true;
return OpNum == 3 && !ForLoadFold;
case X86::VMOVSSZrrkz:
case X86::VMOVSDZrrkz:
OpNum = 2;
return true;
return OpNum == 2 && !ForLoadFold;
}

return false;
Expand All @@ -5044,13 +5095,17 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
unsigned
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
if (!hasUndefRegUpdate(MI.getOpcode(), OpNum))
return 0;

const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) {
return UndefRegClearance;
for (unsigned i = MI.getNumExplicitDefs(), e = MI.getNumExplicitOperands();
i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isUndef() &&
Register::isPhysicalRegister(MO.getReg()) &&
hasUndefRegUpdate(MI.getOpcode(), i)) {
OpNum = i;
return UndefRegClearance;
}
}

return 0;
}

Expand Down Expand Up @@ -5294,8 +5349,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(

static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
MachineInstr &MI) {
unsigned Ignored;
if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) ||
if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;

Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/avx2-arith.ll
Expand Up @@ -171,12 +171,12 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: mul_v32i8:
; X32: # %bb.0:
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; X32-NEXT: vpand %ymm3, %ymm2, %ymm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT: vpand %ymm3, %ymm0, %ymm0
Expand All @@ -185,12 +185,12 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
;
; X64-LABEL: mul_v32i8:
; X64: # %bb.0:
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; X64-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; X64-NEXT: vpand %ymm3, %ymm2, %ymm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT: vpand %ymm3, %ymm0, %ymm0
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/avx2-vector-shifts.ll
Expand Up @@ -575,7 +575,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpsraw $4, %ymm3, %ymm4
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
Expand All @@ -586,7 +586,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT: vpsrlw $8, %ymm2, %ymm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpsraw $4, %ymm0, %ymm3
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
Expand All @@ -603,7 +603,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X64-LABEL: ashr_32i8:
; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpsraw $4, %ymm3, %ymm4
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
Expand All @@ -614,7 +614,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT: vpsrlw $8, %ymm2, %ymm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpsraw $4, %ymm0, %ymm3
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/cast-vsel.ll
Expand Up @@ -295,9 +295,9 @@ define void @example25() nounwind {
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, dj+4112(%rax)
; SSE2-NEXT: movdqa %xmm1, dj+4096(%rax)
Expand Down Expand Up @@ -326,7 +326,7 @@ define void @example25() nounwind {
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pand %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, dj+4112(%rax)
; SSE41-NEXT: movdqa %xmm1, dj+4096(%rax)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/combine-mul.ll
Expand Up @@ -294,7 +294,7 @@ define <16 x i8> @combine_mul_to_abs_v16i8(<16 x i8> %x) {
; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: pmullw %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm2
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/combine-shl.ll
Expand Up @@ -240,15 +240,15 @@ define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $20, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT: pslld $20, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $20, %xmm1
; SSE41-NEXT: pslld $20, %xmm0
; SSE41-NEXT: retq
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
Expand Up @@ -264,14 +264,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst)
; X86-NEXT: movdqa %xmm5, %xmm2
; X86-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT: movdqa %xmm2, (%ecx)
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: movdqa %xmm1, %xmm2
; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-NEXT: pmullw %xmm3, %xmm2
; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-NEXT: pand %xmm3, %xmm2
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: pmullw %xmm5, %xmm1
; X86-NEXT: pand %xmm3, %xmm1
; X86-NEXT: packuswb %xmm2, %xmm1
Expand Down Expand Up @@ -377,14 +377,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst)
; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movdqa %xmm2, (%rax)
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: movdqa %xmm1, %xmm2
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT: pmullw %xmm3, %xmm2
; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X64-NEXT: pand %xmm3, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: pmullw %xmm6, %xmm1
; X64-NEXT: pand %xmm3, %xmm1
; X64-NEXT: packuswb %xmm2, %xmm1
Expand Down

0 comments on commit c7be6a8

Please sign in to comment.