Skip to content

Commit

Permalink
[AVX-512] Add support for commuting VPERMT2(B/W/D/Q/PS/PD) to/from VP…
Browse files Browse the repository at this point in the history
…ERMI2(B/W/D/Q/PS/PD).

Summary:
The index and one of the table operands can be swapped by changing the opcode to the other version. Neither of these operands are the one that can load from memory so this can't be used to increase memory folding opportunities.

We need to handle the unmasked forms and the kz forms. Since the load operand isn't being commuted we can commute the load and broadcast instructions too.

Reviewers: igorb, delena, Ayal, Farhana, RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D25652

llvm-svn: 287621
  • Loading branch information
topperc committed Nov 22, 2016
1 parent 81a4b26 commit cada9f2
Show file tree
Hide file tree
Showing 15 changed files with 389 additions and 316 deletions.
18 changes: 9 additions & 9 deletions llvm/lib/Target/X86/X86InstrAVX512.td
Expand Up @@ -1352,14 +1352,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V,
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
AVX5128IBase;

defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
(_.VT (bitconvert (_.LdFrag addr:$src3)))))>,
(_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
EVEX_4V, AVX5128IBase;
}
}
Expand All @@ -1371,8 +1371,8 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermi2X _.RC:$src1,
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
AVX5128IBase, EVEX_4V, EVEX_B;
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
1>, AVX5128IBase, EVEX_4V, EVEX_B;
}

multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
Expand Down Expand Up @@ -1420,14 +1420,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V,
AVX5128IBase;
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
EVEX_4V, AVX5128IBase;

defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
(bitconvert (_.LdFrag addr:$src3))))>,
(bitconvert (_.LdFrag addr:$src3)))), 1>,
EVEX_4V, AVX5128IBase;
}
}
Expand All @@ -1439,8 +1439,8 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>,
AVX5128IBase, EVEX_4V, EVEX_B;
IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
1>, AVX5128IBase, EVEX_4V, EVEX_B;
}

multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
Expand Down
121 changes: 115 additions & 6 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
Expand Up @@ -3533,6 +3533,92 @@ static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
return true;
}

// Returns true if this is a VPERMI2 or VPERMT2 instrution that can be
// commuted.
static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
#define VPERM_CASES(Suffix) \
case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:

#define VPERM_CASES_BROADCAST(Suffix) \
VPERM_CASES(Suffix) \
case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:

switch (Opcode) {
default: return false;
VPERM_CASES(B)
VPERM_CASES_BROADCAST(D)
VPERM_CASES_BROADCAST(PD)
VPERM_CASES_BROADCAST(PS)
VPERM_CASES_BROADCAST(Q)
VPERM_CASES(W)
return true;
}
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
}

// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
// from the I opcod to the T opcode and vice versa.
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
#define VPERM_CASES(Orig, New) \
case X86::Orig##128rr: return X86::New##128rr; \
case X86::Orig##128rrkz: return X86::New##128rrkz; \
case X86::Orig##128rm: return X86::New##128rm; \
case X86::Orig##128rmkz: return X86::New##128rmkz; \
case X86::Orig##256rr: return X86::New##256rr; \
case X86::Orig##256rrkz: return X86::New##256rrkz; \
case X86::Orig##256rm: return X86::New##256rm; \
case X86::Orig##256rmkz: return X86::New##256rmkz; \
case X86::Orig##rr: return X86::New##rr; \
case X86::Orig##rrkz: return X86::New##rrkz; \
case X86::Orig##rm: return X86::New##rm; \
case X86::Orig##rmkz: return X86::New##rmkz;

#define VPERM_CASES_BROADCAST(Orig, New) \
VPERM_CASES(Orig, New) \
case X86::Orig##128rmb: return X86::New##128rmb; \
case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
case X86::Orig##256rmb: return X86::New##256rmb; \
case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
case X86::Orig##rmb: return X86::New##rmb; \
case X86::Orig##rmbkz: return X86::New##rmbkz;

switch (Opcode) {
VPERM_CASES(VPERMI2B, VPERMT2B)
VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
VPERM_CASES(VPERMI2W, VPERMT2W)
VPERM_CASES(VPERMT2B, VPERMI2B)
VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
VPERM_CASES(VPERMT2W, VPERMI2W)
}

llvm_unreachable("Unreachable!");
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
}

MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
Expand Down Expand Up @@ -3854,7 +3940,15 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
default:
default: {
if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}

const X86InstrFMA3Group *FMA3Group =
X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
if (FMA3Group) {
Expand All @@ -3870,6 +3964,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,

return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
}
}

bool X86InstrInfo::findFMA3CommutedOpIndices(
Expand Down Expand Up @@ -4041,12 +4136,26 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
// Handled masked instructions since we need to skip over the mask input
// and the preserved input.
if (Desc.TSFlags & X86II::EVEX_K) {
// First assume that the first input is the mask operand and skip past it.
unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
// If there is no preserved input we only need to skip 1 operand.
if (MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
MCOI::TIED_TO) != -1)
++CommutableOpIdx1;
unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1;
unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
// Check if the first input is tied. If there isn't one then we only
// need to skip the mask operand which we did above.
if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
MCOI::TIED_TO) != -1)) {
// If this is zero masking instruction with a tied operand, we need to
// move the first index back to the first input since this must
// be a 3 input instruction and we want the first two non-mask inputs.
// Otherwise this is a 2 input instruction with a preserved input and
// mask, so we need to move the indices to skip one more input.
if (Desc.TSFlags & X86II::EVEX_Z)
--CommutableOpIdx1;
else {
++CommutableOpIdx1;
++CommutableOpIdx2;
}
}

if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
CommutableOpIdx1, CommutableOpIdx2))
return false;
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/X86/avx512-insert-extract.ll
Expand Up @@ -369,8 +369,8 @@ define i16 @test16(i1 *%addr, i16 %a) {
; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
; KNL-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm0
; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
Expand All @@ -384,8 +384,8 @@ define i16 @test16(i1 *%addr, i16 %a) {
; SKX-NEXT: vpmovm2d %k1, %zmm0
; SKX-NEXT: vpmovm2d %k0, %zmm1
; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
; SKX-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovd2m %zmm0, %k0
; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovd2m %zmm2, %k0
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
Expand All @@ -406,8 +406,8 @@ define i8 @test17(i1 *%addr, i8 %a) {
; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
; KNL-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm0
; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
Expand All @@ -421,8 +421,8 @@ define i8 @test17(i1 *%addr, i8 %a) {
; SKX-NEXT: vpmovm2q %k1, %zmm0
; SKX-NEXT: vpmovm2q %k0, %zmm1
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
; SKX-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovq2m %zmm0, %k0
; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
Expand Down Expand Up @@ -1217,8 +1217,8 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; SKX-NEXT: vpmovm2w %k1, %zmm0
; SKX-NEXT: vpmovm2w %k0, %zmm1
; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovw2m %zmm0, %k0
; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovw2m %zmm2, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
Expand Down Expand Up @@ -1249,23 +1249,23 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm2, %zmm4, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm2
; KNL-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
; KNL-NEXT: vpsllq $63, %zmm4, %zmm2
; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2
; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z}
; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
; KNL-NEXT: vpsllq $63, %zmm4, %zmm2
; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z}
; KNL-NEXT: vpextrd $3, %xmm0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
Expand Down Expand Up @@ -1310,8 +1310,8 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm0
; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/avx512-mask-op.ll
Expand Up @@ -643,8 +643,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
; KNL-NEXT: vpermt2q %zmm2, %zmm3, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: kshiftlw $1, %k1, %k1
; KNL-NEXT: kshiftrw $1, %k1, %k1
Expand All @@ -665,8 +665,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; SKX-NEXT: vpmovm2q %k0, %zmm0
; SKX-NEXT: vpmovm2q %k1, %zmm1
; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
; SKX-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; SKX-NEXT: vpmovq2m %zmm0, %k0
; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k0
; SKX-NEXT: kshiftlb $7, %k2, %k1
Expand Down

0 comments on commit cada9f2

Please sign in to comment.