Skip to content

Commit

Permalink
[X86] Correct the placement of ReadAfterLd in BEXTR and BZHI. Add ded…
Browse files Browse the repository at this point in the history
…icated SchedRW for BEXTR/BZHI.

These instructions have the memory operand before the register operand, so we need to put a ReadDefault for each of the load (memory address) operands first, and then the ReadAfterLd for the register operand.

Differential Revision: https://reviews.llvm.org/D44838

llvm-svn: 328823
  • Loading branch information
topperc committed Mar 29, 2018
1 parent 5c14ed8 commit 89310f5
Show file tree
Hide file tree
Showing 12 changed files with 63 additions and 72 deletions.
19 changes: 12 additions & 7 deletions llvm/lib/Target/X86/X86InstrInfo.td
Expand Up @@ -2378,30 +2378,35 @@ let Predicates = [HasBMI] in {

multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
PatFrag ld_frag, X86FoldableSchedWrite Sched> {
def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)], IIC_BIN_NONMEM>,
T8PS, VEX, Sched<[WriteALU]>;
T8PS, VEX, Sched<[Sched]>;
def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
(implicit EFLAGS)], IIC_BIN_MEM>, T8PS, VEX,
Sched<[WriteALULd, ReadAfterLd]>;
Sched<[Sched.Folded,
// x86memop:$src1
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
// RC:$src2
ReadAfterLd]>;
}

let Predicates = [HasBMI], Defs = [EFLAGS] in {
defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
int_x86_bmi_bextr_32, loadi32>;
int_x86_bmi_bextr_32, loadi32, WriteBEXTR>;
defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
int_x86_bmi_bextr_64, loadi64>, VEX_W;
int_x86_bmi_bextr_64, loadi64, WriteBEXTR>, VEX_W;
}

let Predicates = [HasBMI2], Defs = [EFLAGS] in {
defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
int_x86_bmi_bzhi_32, loadi32>;
int_x86_bmi_bzhi_32, loadi32, WriteBZHI>;
defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
int_x86_bmi_bzhi_64, loadi64>, VEX_W;
int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W;
}

def CountTrailingOnes : SDNodeXForm<imm, [{
Expand Down
16 changes: 5 additions & 11 deletions llvm/lib/Target/X86/X86SchedBroadwell.td
Expand Up @@ -120,6 +120,10 @@ defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>;
// Integer shifts and rotates.
defm : BWWriteResPair<WriteShift, [BWPort06], 1>;

// BMI1 BEXTR, BMI2 BZHI
defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>;
defm : BWWriteResPair<WriteBZHI, [BWPort15], 1>;

// Loads, stores, and moves, not folded with other operations.
def : WriteRes<WriteLoad, [BWPort23]> { let Latency = 5; }
def : WriteRes<WriteStore, [BWPort237, BWPort4]>;
Expand Down Expand Up @@ -492,7 +496,6 @@ def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr",
"BLSI(32|64)rr",
"BLSMSK(32|64)rr",
"BLSR(32|64)rr",
"BZHI(32|64)rr",
"LEA(16|32|64)(_32)?r",
"MMX_PABSBrr",
"MMX_PABSDrr",
Expand Down Expand Up @@ -780,8 +783,7 @@ def BWWriteResGroup19 : SchedWriteRes<[BWPort06,BWPort15]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
def: InstRW<[BWWriteResGroup19], (instregex "BEXTR(32|64)rr",
"BSWAP(16|32|64)r")>;
def: InstRW<[BWWriteResGroup19], (instregex "BSWAP(16|32|64)r")>;

def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 2;
Expand Down Expand Up @@ -1442,7 +1444,6 @@ def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm",
"BLSI(32|64)rm",
"BLSMSK(32|64)rm",
"BLSR(32|64)rm",
"BZHI(32|64)rm",
"MMX_PABSBrm",
"MMX_PABSDrm",
"MMX_PABSWrm",
Expand Down Expand Up @@ -1833,13 +1834,6 @@ def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
def: InstRW<[BWWriteResGroup84], (instregex "LRETQ",
"RETQ")>;

def BWWriteResGroup85 : SchedWriteRes<[BWPort23,BWPort06,BWPort15]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
def: InstRW<[BWWriteResGroup85], (instregex "BEXTR(32|64)rm")>;

def BWWriteResGroup86 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
Expand Down
16 changes: 5 additions & 11 deletions llvm/lib/Target/X86/X86SchedHaswell.td
Expand Up @@ -128,6 +128,10 @@ defm : HWWriteResPair<WriteLZCNT, [HWPort1], 3>;
defm : HWWriteResPair<WriteTZCNT, [HWPort1], 3>;
defm : HWWriteResPair<WritePOPCNT, [HWPort1], 3>;

// BMI1 BEXTR, BMI2 BZHI
defm : HWWriteResPair<WriteBEXTR, [HWPort06,HWPort15], 2, [1,1], 2>;
defm : HWWriteResPair<WriteBZHI, [HWPort15], 1>;

// This is quite rough, latency depends on the dividend.
defm : HWWriteResPair<WriteIDiv, [HWPort0, HWDivider], 25, [1,10], 1, 4>;
// Scalar and vector floating point.
Expand Down Expand Up @@ -844,7 +848,6 @@ def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr",
"BLSI(32|64)rr",
"BLSMSK(32|64)rr",
"BLSR(32|64)rr",
"BZHI(32|64)rr",
"LEA(16|32|64)(_32)?r",
"MMX_PABSBrr",
"MMX_PABSDrr",
Expand Down Expand Up @@ -1230,7 +1233,6 @@ def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm",
"BLSI(32|64)rm",
"BLSMSK(32|64)rm",
"BLSR(32|64)rm",
"BZHI(32|64)rm",
"MMX_PABSBrm",
"MMX_PABSDrm",
"MMX_PABSWrm",
Expand Down Expand Up @@ -1606,8 +1608,7 @@ def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
def: InstRW<[HWWriteResGroup34], (instregex "BEXTR(32|64)rr",
"BSWAP(16|32|64)r")>;
def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>;

def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 2;
Expand Down Expand Up @@ -1711,13 +1712,6 @@ def: InstRW<[HWWriteResGroup41], (instregex "LRETQ",
"RETL",
"RETQ")>;

def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
def: InstRW<[HWWriteResGroup42], (instregex "BEXTR(32|64)rm")>;

def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
let Latency = 7;
let NumMicroOps = 3;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86SchedSandyBridge.td
Expand Up @@ -119,6 +119,11 @@ defm : SBWriteResPair<WriteLZCNT, [SBPort1], 3, [1], 1, 5>;
defm : SBWriteResPair<WriteTZCNT, [SBPort1], 3, [1], 1, 5>;
defm : SBWriteResPair<WritePOPCNT, [SBPort1], 3, [1], 1, 5>;

// BMI1 BEXTR, BMI2 BZHI
// NOTE: These don't exist on Sandy Bridge. Ports are guesses.
defm : SBWriteResPair<WriteBEXTR, [SBPort05,SBPort1], 2, [1,1], 2>;
defm : SBWriteResPair<WriteBZHI, [SBPort1], 1>;

// Scalar and vector floating point.
def : WriteRes<WriteFStore, [SBPort23, SBPort4]>;
def : WriteRes<WriteFLoad, [SBPort23]> { let Latency = 6; }
Expand Down
16 changes: 5 additions & 11 deletions llvm/lib/Target/X86/X86SchedSkylakeClient.td
Expand Up @@ -120,6 +120,10 @@ defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>;
// Integer shifts and rotates.
defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>;

// BMI1 BEXTR, BMI2 BZHI
defm : SKLWriteResPair<WriteBEXTR, [SKLPort06,SKLPort15], 2, [1,1], 2>;
defm : SKLWriteResPair<WriteBZHI, [SKLPort15], 1>;

// Loads, stores, and moves, not folded with other operations.
def : WriteRes<WriteLoad, [SKLPort23]> { let Latency = 5; }
def : WriteRes<WriteStore, [SKLPort237, SKLPort4]>;
Expand Down Expand Up @@ -558,7 +562,6 @@ def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr",
"BLSI(32|64)rr",
"BLSMSK(32|64)rr",
"BLSR(32|64)rr",
"BZHI(32|64)rr",
"LEA(16|32|64)(_32)?r")>;

def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> {
Expand Down Expand Up @@ -802,8 +805,7 @@ def SKLWriteResGroup22 : SchedWriteRes<[SKLPort06,SKLPort15]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
def: InstRW<[SKLWriteResGroup22], (instregex "BEXTR(32|64)rr",
"BSWAP(16|32|64)r")>;
def: InstRW<[SKLWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;

def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 2;
Expand Down Expand Up @@ -1464,7 +1466,6 @@ def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm",
"BLSI(32|64)rm",
"BLSMSK(32|64)rm",
"BLSR(32|64)rm",
"BZHI(32|64)rm",
"MOVBE(16|32|64)rm")>;

def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
Expand Down Expand Up @@ -1806,13 +1807,6 @@ def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> {
def: InstRW<[SKLWriteResGroup98], (instregex "LRETQ",
"RETQ")>;

def SKLWriteResGroup99 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
def: InstRW<[SKLWriteResGroup99], (instregex "BEXTR(32|64)rm")>;

def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
let Latency = 7;
let NumMicroOps = 5;
Expand Down
16 changes: 5 additions & 11 deletions llvm/lib/Target/X86/X86SchedSkylakeServer.td
Expand Up @@ -120,6 +120,10 @@ defm : SKXWriteResPair<WriteLZCNT, [SKXPort1], 3>;
defm : SKXWriteResPair<WriteTZCNT, [SKXPort1], 3>;
defm : SKXWriteResPair<WritePOPCNT, [SKXPort1], 3>;

// BMI1 BEXTR, BMI2 BZHI
defm : SKXWriteResPair<WriteBEXTR, [SKXPort06,SKXPort15], 2, [1,1], 2>;
defm : SKXWriteResPair<WriteBZHI, [SKXPort15], 1>;

// Loads, stores, and moves, not folded with other operations.
def : WriteRes<WriteLoad, [SKXPort23]> { let Latency = 5; }
def : WriteRes<WriteStore, [SKXPort237, SKXPort4]>;
Expand Down Expand Up @@ -1034,7 +1038,6 @@ def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr",
"BLSI(32|64)rr",
"BLSMSK(32|64)rr",
"BLSR(32|64)rr",
"BZHI(32|64)rr",
"LEA(16|32|64)(_32)?r")>;

def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> {
Expand Down Expand Up @@ -1597,8 +1600,7 @@ def SKXWriteResGroup22 : SchedWriteRes<[SKXPort06,SKXPort15]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
def: InstRW<[SKXWriteResGroup22], (instregex "BEXTR(32|64)rr",
"BSWAP(16|32|64)r")>;
def: InstRW<[SKXWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;

def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 2;
Expand Down Expand Up @@ -3094,7 +3096,6 @@ def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm",
"BLSI(32|64)rm",
"BLSMSK(32|64)rm",
"BLSR(32|64)rm",
"BZHI(32|64)rm",
"MOVBE(16|32|64)rm")>;

def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> {
Expand Down Expand Up @@ -3753,13 +3754,6 @@ def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> {
def: InstRW<[SKXWriteResGroup104], (instregex "LRETQ",
"RETQ")>;

def SKXWriteResGroup105 : SchedWriteRes<[SKXPort23,SKXPort06,SKXPort15]> {
let Latency = 7;
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
def: InstRW<[SKXWriteResGroup105], (instregex "BEXTR(32|64)rm")>;

def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
let Latency = 7;
let NumMicroOps = 4;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86Schedule.td
Expand Up @@ -54,6 +54,10 @@ defm WriteTZCNT : X86SchedWritePair; // Trailing zero count.
// Integer shifts and rotates.
defm WriteShift : X86SchedWritePair;

// BMI1 BEXTR, BMI2 BZHI
defm WriteBEXTR : X86SchedWritePair;
defm WriteBZHI : X86SchedWritePair;

// Loads, stores, and moves, not folded with other operations.
def WriteLoad : SchedWrite;
def WriteStore : SchedWrite;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86ScheduleBtVer2.td
Expand Up @@ -141,6 +141,10 @@ defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2]>;

// BMI1 BEXTR, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBZHI, [JALU01], 1>; // NOTE: Doesn't exist on Jaguar.

def JWriteIMul64 : SchedWriteRes<[JALU1, JMul]> {
let Latency = 6;
let ResourceCycles = [1, 4];
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86ScheduleSLM.td
Expand Up @@ -104,6 +104,11 @@ defm : SLMWriteResPair<WriteLZCNT, [SLM_IEC_RSV0], 3>;
defm : SLMWriteResPair<WriteTZCNT, [SLM_IEC_RSV0], 3>;
defm : SLMWriteResPair<WritePOPCNT, [SLM_IEC_RSV0], 3>;

// BMI1 BEXTR, BMI2 BZHI
// NOTE: These don't exist on Silvermont. Ports are guesses.
defm : SLMWriteResPair<WriteBEXTR, [SLM_IEC_RSV0], 1>;
defm : SLMWriteResPair<WriteBZHI, [SLM_IEC_RSV0], 1>;

// This is quite rough, latency depends on the dividend.
defm : SLMWriteResPair<WriteIDiv, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>;

Expand Down
18 changes: 5 additions & 13 deletions llvm/lib/Target/X86/X86ScheduleZnver1.td
Expand Up @@ -162,6 +162,10 @@ defm : ZnWriteResPair<WritePOPCNT, [ZnALU], 1>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

// BMI1 BEXTR, BMI2 BZHI
defm : ZnWriteResPair<WriteBEXTR, [ZnALU], 1>;
defm : ZnWriteResPair<WriteBZHI, [ZnALU], 1>;

// IDIV
def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> {
let Latency = 41;
Expand Down Expand Up @@ -564,25 +568,13 @@ def : InstRW<[ZnWriteALULat2], (instregex "BLS(I|MSK|R)(32|64)rr")>;
// r,m.
def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "BLS(I|MSK|R)(32|64)rm")>;

// BEXTR.
// r,r,r.
def : InstRW<[WriteALU], (instregex "BEXTR(32|64)rr")>;
// r,m,r.
def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BEXTR(32|64)rm")>;

// BZHI.
// r,r,r.
def : InstRW<[WriteALU], (instregex "BZHI(32|64)rr")>;
// r,m,r.
def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BZHI(32|64)rm")>;

// CLD STD.
def : InstRW<[WriteALU], (instregex "STD", "CLD")>;

// PDEP PEXT.
// r,r,r.
def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
// r,m,r.
// r,r,m.
def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;

// ROR ROL.
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/bmi-schedule.ll
Expand Up @@ -172,8 +172,8 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_bextr_i32:
; GENERIC: # %bb.0:
; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx # sched: [5:0.50]
; GENERIC-NEXT: bextrl %edi, %esi, %eax # sched: [1:0.33]
; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:1.00]
; GENERIC-NEXT: bextrl %edi, %esi, %eax # sched: [2:1.00]
; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
Expand Down Expand Up @@ -222,8 +222,8 @@ declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_bextr_i64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [5:0.50]
; GENERIC-NEXT: bextrq %rdi, %rsi, %rax # sched: [1:0.33]
; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:1.00]
; GENERIC-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:1.00]
; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/bmi2-schedule.ll
Expand Up @@ -9,8 +9,8 @@
define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_bzhi_i32:
; GENERIC: # %bb.0:
; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:0.50]
; GENERIC-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.33]
; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:1.00]
; GENERIC-NEXT: bzhil %edi, %esi, %eax # sched: [1:1.00]
; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
Expand Down Expand Up @@ -59,8 +59,8 @@ declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_bzhi_i64:
; GENERIC: # %bb.0:
; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:0.50]
; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.33]
; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:1.00]
; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:1.00]
; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
Expand Down

0 comments on commit 89310f5

Please sign in to comment.