Skip to content

Commit

Permalink
[ARM] Add FP handling for MVE lane interleaving
Browse files Browse the repository at this point in the history
FP16 to FP32 conversions can be handled in MVE lane interleaving, much like
the sext/zext lowering we already do. This expands the pass with fpext and
fptrunc handling, plus basic fp operations (fcmp, fadd, fmul), allowing more
efficient lowering of fp vectors.

Differential Revision: https://reviews.llvm.org/D97292
  • Loading branch information
davemgreen committed Apr 12, 2021
1 parent bcbea2a commit 6c0a1ed
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 33 deletions.
22 changes: 16 additions & 6 deletions llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
Expand Up @@ -123,17 +123,20 @@ static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
// T=VLDRH.16; A=VMOVNB T; B=VMOVNT T
// But those VMOVL may be folded into a VMULL.

// But expensive extends/truncs are always good to remove.
for (auto *E : Exts)
if (!isa<LoadInst>(E->getOperand(0))) {
// But expensive extends/truncs are always good to remove. FPExts always
// involve extra VCVT's so are always considered to be beneficial to convert.
for (auto *E : Exts) {
if (isa<FPExtInst>(E) || !isa<LoadInst>(E->getOperand(0))) {
LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
return true;
}
for (auto *T : Truncs)
}
for (auto *T : Truncs) {
if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
return true;
}
}

// Otherwise, we know we have a load(ext), see if any of the Extends are a
// vmull. This is a simple heuristic and certainly not perfect.
Expand Down Expand Up @@ -172,6 +175,7 @@ static bool tryInterleave(Instruction *Start,
switch (I->getOpcode()) {
// Truncs
case Instruction::Trunc:
case Instruction::FPTrunc:
if (Truncs.count(I))
continue;
Truncs.insert(I);
Expand All @@ -181,6 +185,7 @@ static bool tryInterleave(Instruction *Start,
// Extend leafs
case Instruction::SExt:
case Instruction::ZExt:
case Instruction::FPExt:
if (Exts.count(I))
continue;
for (auto *Use : I->users())
Expand All @@ -196,6 +201,9 @@ static bool tryInterleave(Instruction *Start,
case Instruction::LShr:
case Instruction::Shl:
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::FAdd:
case Instruction::FMul:
case Instruction::Select:
if (Ops.count(I))
continue;
Expand Down Expand Up @@ -297,9 +305,11 @@ static bool tryInterleave(Instruction *Start,
LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
Builder.SetInsertPoint(I);
Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
bool FPext = isa<FPExtInst>(I);
bool Sext = isa<SExtInst>(I);
Value *Ext = Sext ? Builder.CreateSExt(Shuffle, I->getType())
: Builder.CreateZExt(Shuffle, I->getType());
Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
: Sext ? Builder.CreateSExt(Shuffle, I->getType())
: Builder.CreateZExt(Shuffle, I->getType());
I->replaceAllUsesWith(Ext);
LLVM_DEBUG(dbgs() << " with " << *Shuffle << "\n");
}
Expand Down
48 changes: 21 additions & 27 deletions llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll
Expand Up @@ -360,16 +360,14 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
; CHECK-NEXT: vldrh.u32 q1, [r0], #16
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vldrh.u16 q1, [r0], #16
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q2, q2, q0
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1], #16
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: le lr, .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
Expand Down Expand Up @@ -412,26 +410,22 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u32 q1, [r0, #24]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #24]
; CHECK-NEXT: vldrh.u32 q1, [r0, #16]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #16]
; CHECK-NEXT: vldrh.u32 q1, [r0, #8]
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vldrh.u16 q1, [r0, #16]
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q2, q2, q0
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1, #8]
; CHECK-NEXT: vldrh.u32 q1, [r0], #32
; CHECK-NEXT: vcvtb.f32.f16 q1, q1
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
; CHECK-NEXT: vldrh.u16 q1, [r0], #32
; CHECK-NEXT: vstrh.16 q2, [r1, #16]
; CHECK-NEXT: vcvtb.f32.f16 q2, q1
; CHECK-NEXT: vcvtt.f32.f16 q1, q1
; CHECK-NEXT: vmul.f32 q2, q2, q0
; CHECK-NEXT: vmul.f32 q1, q1, q0
; CHECK-NEXT: vcvtb.f16.f32 q1, q1
; CHECK-NEXT: vstrh.32 q1, [r1], #32
; CHECK-NEXT: vcvtb.f16.f32 q2, q2
; CHECK-NEXT: vcvtt.f16.f32 q2, q1
; CHECK-NEXT: vstrh.16 q2, [r1], #32
; CHECK-NEXT: le lr, .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
Expand Down

0 comments on commit 6c0a1ed

Please sign in to comment.