Skip to content

Commit

Permalink
[X86][InstCombine] Add constant folding and simplification support fo…
Browse files Browse the repository at this point in the history
…r pdep and pext

The instructions use a mask to either pack disjoint bits together (pext) or spread bits to disjoint locations (pdep). If the mask is all zeros, then no bits are extracted or deposited. If the mask is all ones, then the source value is written to the result, since no compression or expansion happens. Otherwise, if both the source and mask are constant, we can walk the bits in the source/mask and calculate the result.

There are other, crazier things we could do, like using computeKnownBits or turning pext into shift/and if only a single contiguous range of bits is extracted.

Fixes PR44389

Differential Revision: https://reviews.llvm.org/D71952
  • Loading branch information
topperc committed Dec 31, 2019
1 parent 1cc8a74 commit 374e029
Show file tree
Hide file tree
Showing 2 changed files with 190 additions and 0 deletions.
58 changes: 58 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
Expand Up @@ -2487,6 +2487,64 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// TODO should we convert this to an AND if the RHS is constant?
}
break;
case Intrinsic::x86_bmi_pext_32:
case Intrinsic::x86_bmi_pext_64:
  // pext gathers the source bits selected by the mask (operand 1) and packs
  // them into consecutive result bits starting at bit 0. Nothing is simplified
  // unless the mask is a constant.
  if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
    // An empty mask selects no bits, so the result is always zero.
    if (MaskC->isNullValue())
      return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
    // A full mask selects every bit in place, so the source passes through.
    if (MaskC->isAllOnesValue())
      return replaceInstUsesWith(CI, II->getArgOperand(0));

    // Source and mask are both constant: evaluate the extraction right here.
    if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
      const uint64_t SrcBits = SrcC->getZExtValue();
      uint64_t Packed = 0;
      // Result position that the next selected source bit lands in.
      uint64_t OutBit = 1;

      // Visit the set mask bits from least to most significant; the loop
      // step strips the lowest set bit that was just handled.
      for (uint64_t Remaining = MaskC->getZExtValue(); Remaining != 0;
           Remaining &= Remaining - 1) {
        // Lowest mask bit that has not been consumed yet.
        const uint64_t Selected = Remaining & -Remaining;
        if ((SrcBits & Selected) != 0)
          Packed |= OutBit;

        // Every mask bit claims one result position, whether set or clear.
        OutBit <<= 1;
      }

      return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Packed));
    }
  }
  break;
case Intrinsic::x86_bmi_pdep_32:
case Intrinsic::x86_bmi_pdep_64:
  // pdep takes consecutive source bits starting at bit 0 and scatters them to
  // the positions of the set bits in the mask (operand 1). Nothing is
  // simplified unless the mask is a constant.
  if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
    // An empty mask offers no destination bits, so the result is always zero.
    if (MaskC->isNullValue())
      return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
    // A full mask maps every source bit onto itself, so the source passes
    // through.
    if (MaskC->isAllOnesValue())
      return replaceInstUsesWith(CI, II->getArgOperand(0));

    // Source and mask are both constant: evaluate the deposit right here.
    if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
      const uint64_t SrcBits = SrcC->getZExtValue();
      uint64_t Deposited = 0;
      // Source position that feeds the next destination bit.
      uint64_t InBit = 1;

      // Visit the set mask bits from least to most significant; the loop
      // step strips the lowest set bit that was just handled.
      for (uint64_t Remaining = MaskC->getZExtValue(); Remaining != 0;
           Remaining &= Remaining - 1) {
        // Lowest mask bit that has not been filled yet.
        const uint64_t Destination = Remaining & -Remaining;
        if ((SrcBits & InBit) != 0)
          Deposited |= Destination;

        // Every mask bit consumes one source position, whether set or clear.
        InBit <<= 1;
      }

      return replaceInstUsesWith(CI,
                                 ConstantInt::get(II->getType(), Deposited));
    }
  }
  break;

case Intrinsic::x86_vcvtph2ps_128:
case Intrinsic::x86_vcvtph2ps_256: {
Expand Down
132 changes: 132 additions & 0 deletions llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
Expand Up @@ -7,6 +7,10 @@ declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
declare i64 @llvm.x86.bmi.bextr.64(i64, i64) nounwind readnone
declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) nounwind readnone
declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) nounwind readnone
declare i32 @llvm.x86.bmi.pext.32(i32, i32) nounwind readnone
declare i64 @llvm.x86.bmi.pext.64(i64, i64) nounwind readnone
declare i32 @llvm.x86.bmi.pdep.32(i32, i32) nounwind readnone
declare i64 @llvm.x86.bmi.pdep.64(i64, i64) nounwind readnone

define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
; CHECK-LABEL: @test_x86_tbm_bextri_u32(
Expand Down Expand Up @@ -269,3 +273,131 @@ define i64 @test_x86_bmi_bzhi_64_constfold() nounwind readnone {
%1 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 5, i64 1)
ret i64 %1
}

; 32-bit pext with a constant zero mask and a non-constant source: no bits are
; selected, so the checked output is the constant 0.
define i32 @test_x86_pext_32_zero_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_zero_mask(
; CHECK-NEXT: ret i32 0
;
%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 0)
ret i32 %1
}

; 64-bit pext with a constant zero mask and a non-constant source: no bits are
; selected, so the checked output is the constant 0.
define i64 @test_x86_pext_64_zero_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_64_zero_mask(
; CHECK-NEXT: ret i64 0
;
%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 0)
ret i64 %1
}

; 32-bit pext with an all-ones mask (-1): every bit is selected in place, so
; the checked output returns the source %x unchanged.
define i32 @test_x86_pext_32_allones_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_allones_mask(
; CHECK-NEXT: ret i32 %x
;
%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 -1)
ret i32 %1
}

; 64-bit pext with an all-ones mask (-1): every bit is selected in place, so
; the checked output returns the source %x unchanged.
define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pext_64_allones_mask(
; CHECK-NEXT: ret i64 %x
;
%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 -1)
ret i64 %1
}

; 32-bit pext with both operands constant.
;   source 1985229328 = 0x76543210
;   mask   4042322160 = 0xF0F0F0F0 (upper nibble of every byte)
; Packing the selected nibbles 7,5,3,1 gives 0x00007531 = 30001.
define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_constant_fold(
; CHECK-NEXT: ret i32 30001
;
%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 1985229328, i32 4042322160)
ret i32 %1
}

; 64-bit pext with both operands constant.
;   source 8526495043095935640  = 0x76543210FEDCBA98
;   mask   -1085102592571150096 = 0xF0F0F0F0F0F0F0F0 (upper nibble of every byte)
; Packing the selected nibbles 7,5,3,1,F,D,B,9 gives 0x7531FDB9 = 1966210489.
define i64 @test_x86_pext_64_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pext_64_constant_fold(
; CHECK-NEXT: ret i64 1966210489
;
%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 8526495043095935640, i64 -1085102592571150096)
ret i64 %1
}

; 32-bit pext with both operands constant and a mask made of whole bytes.
;   source 1985229328 = 0x76543210
;   mask   4278190335 = 0xFF0000FF (top and bottom bytes)
; Packing bytes 0x76 and 0x10 gives 0x00007610 = 30224.
define i32 @test_x86_pext_32_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_constant_fold_2(
; CHECK-NEXT: ret i32 30224
;
%1 = tail call i32 @llvm.x86.bmi.pext.32(i32 1985229328, i32 4278190335)
ret i32 %1
}

; 64-bit pext with both operands constant and a mask made of whole bytes.
;   source 8526495043095935640 = 0x76543210FEDCBA98
;   mask   -72056498804490496  = 0xFF0000FF00FFFF00 (bytes 7, 4, 2 and 1)
; Packing bytes 0x76, 0x10, 0xDC, 0xBA gives 0x7610DCBA = 1980816570.
define i64 @test_x86_pext_64_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_x86_pext_64_constant_fold_2(
; CHECK-NEXT: ret i64 1980816570
;
%1 = tail call i64 @llvm.x86.bmi.pext.64(i64 8526495043095935640, i64 -72056498804490496)
ret i64 %1
}

; 32-bit pdep with a constant zero mask and a non-constant source: there are no
; destination bits, so the checked output is the constant 0.
define i32 @test_x86_pdep_32_zero_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_zero_mask(
; CHECK-NEXT: ret i32 0
;
%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 0)
ret i32 %1
}

; 64-bit pdep with a constant zero mask and a non-constant source: there are no
; destination bits, so the checked output is the constant 0.
define i64 @test_x86_pdep_64_zero_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_64_zero_mask(
; CHECK-NEXT: ret i64 0
;
%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 0)
ret i64 %1
}

; 32-bit pdep with an all-ones mask (-1): every source bit is deposited at its
; own position, so the checked output returns the source %x unchanged.
define i32 @test_x86_pdep_32_allones_mask(i32 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_allones_mask(
; CHECK-NEXT: ret i32 %x
;
%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 -1)
ret i32 %1
}

; 64-bit pdep with an all-ones mask (-1): every source bit is deposited at its
; own position, so the checked output returns the source %x unchanged.
define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_64_allones_mask(
; CHECK-NEXT: ret i64 %x
;
%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 -1)
ret i64 %1
}

; 32-bit pdep with both operands constant.
;   source 1985229328 = 0x76543210 (only the low 16 bits, 0x3210, are consumed)
;   mask   4042322160 = 0xF0F0F0F0 (upper nibble of every byte)
; Depositing nibbles 0,1,2,3 from the low end gives 0x30201000 = 807407616.
define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
; CHECK-NEXT: ret i32 807407616
;
%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 1985229328, i32 4042322160)
ret i32 %1
}

; 64-bit pdep with both operands constant.
;   source 8526495043095935640  = 0x76543210FEDCBA98 (only the low 32 bits,
;                                 0xFEDCBA98, are consumed)
;   mask   -1085102592571150096 = 0xF0F0F0F0F0F0F0F0 (upper nibble of every byte)
; Depositing nibbles 8,9,A,...,F from the low end gives 0xF0E0D0C0B0A09080,
; which prints as the signed value -1089641583808049024.
define i64 @test_x86_pdep_64_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_64_constant_fold(
; CHECK-NEXT: ret i64 -1089641583808049024
;
%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 8526495043095935640, i64 -1085102592571150096)
ret i64 %1
}

; 32-bit pdep with both operands constant and a mask made of whole bytes.
;   source 1985229328 = 0x76543210 (only the low 16 bits, 0x3210, are consumed)
;   mask   4278190335 = 0xFF0000FF (top and bottom bytes)
; Depositing 0x10 into the bottom byte and 0x32 into the top byte gives
; 0x32000010 = 838860816.
define i32 @test_x86_pdep_32_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_constant_fold_2(
; CHECK-NEXT: ret i32 838860816
;
%1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 1985229328, i32 4278190335)
ret i32 %1
}

; 64-bit pdep with both operands constant and a mask made of whole bytes.
;   source 8526495043095935640 = 0x76543210FEDCBA98 (only the low 32 bits,
;                                0xFEDCBA98, are consumed)
;   mask   -72056498804490496  = 0xFF0000FF00FFFF00 (bytes 7, 4, 2 and 1)
; Depositing bytes 0x98, 0xBA, 0xDC, 0xFE into mask bytes 1, 2, 4, 7 gives
; 0xFE0000DC00BA9800, which prints as the signed value -144114243170822144.
define i64 @test_x86_pdep_64_constant_fold_2() nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_64_constant_fold_2(
; CHECK-NEXT: ret i64 -144114243170822144
;
%1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 8526495043095935640, i64 -72056498804490496)
ret i64 %1
}

0 comments on commit 374e029

Please sign in to comment.