Skip to content

Commit

Permalink
[AArch64] Use tbl for truncating vector FPtoUI conversions.
Browse files Browse the repository at this point in the history
On AArch64, a truncating vector fptoui conversion can be lowered more
efficiently by first converting to a wide integer vector and then doing
the vector truncate separately using tbl.4, building on D133495.

https://alive2.llvm.org/ce/z/T538CC

Depends on D133495

Reviewed By: t.p.northover

Differential Revision: https://reviews.llvm.org/D133496
  • Loading branch information
fhahn committed Sep 16, 2022
1 parent e596422 commit 6b86b48
Show file tree
Hide file tree
Showing 3 changed files with 221 additions and 125 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/CodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8047,7 +8047,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
return true;

if ((isa<UIToFPInst>(I) || isa<TruncInst>(I)) &&
if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
TLI->optimizeExtendOrTruncateConversion(I,
LI->getLoopFor(I->getParent())))
return true;
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13289,6 +13289,23 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
return true;
}

// Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
// followed by a truncate lowered using tbl.4.
auto *FPToUI = dyn_cast<FPToUIInst>(I);
if (FPToUI &&
(SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
SrcTy->getElementType()->isFloatTy() &&
DstTy->getElementType()->isIntegerTy(8)) {
IRBuilder<> Builder(I);
auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
VectorType::getInteger(SrcTy));
auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
I->replaceAllUsesWith(TruncI);
I->eraseFromParent();
createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
return true;
}

// Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
// instruction selecting the lowest 8 bits per lane of the input interpreted
// as 2 or 4 <4 x i32> vectors.
Expand Down
Loading

0 comments on commit 6b86b48

Please sign in to comment.