Skip to content

Commit

Permalink
[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1.
Browse files Browse the repository at this point in the history
This extends the existing support for shufflevector to handle cases like
<2 x float>, which we can implement by concating the vectors and using a TBL1.

Differential Revision: https://reviews.llvm.org/D58684

llvm-svn: 355104
  • Loading branch information
aemerson committed Feb 28, 2019
1 parent 5d4d168 commit 85c3afd
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 51 deletions.
144 changes: 118 additions & 26 deletions llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
Expand Up @@ -82,6 +82,8 @@ class AArch64InstructionSelector : public InstructionSelector {
unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
MachineIRBuilder &MIRBuilder) const;

ComplexRendererFns selectArithImmed(MachineOperand &Root) const;

Expand Down Expand Up @@ -1965,6 +1967,98 @@ MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
return &*Load;
}

/// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
/// size and RB.
static std::pair<unsigned, unsigned>
getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
unsigned Opc, SubregIdx;
if (RB.getID() == AArch64::GPRRegBankID) {
if (EltSize == 32) {
Opc = AArch64::INSvi32gpr;
SubregIdx = AArch64::ssub;
} else if (EltSize == 64) {
Opc = AArch64::INSvi64gpr;
SubregIdx = AArch64::dsub;
} else {
llvm_unreachable("invalid elt size!");
}
} else {
if (EltSize == 8) {
Opc = AArch64::INSvi8lane;
SubregIdx = AArch64::bsub;
} else if (EltSize == 16) {
Opc = AArch64::INSvi16lane;
SubregIdx = AArch64::hsub;
} else if (EltSize == 32) {
Opc = AArch64::INSvi32lane;
SubregIdx = AArch64::ssub;
} else if (EltSize == 64) {
Opc = AArch64::INSvi64lane;
SubregIdx = AArch64::dsub;
} else {
llvm_unreachable("invalid elt size!");
}
}
return std::make_pair(Opc, SubregIdx);
}

MachineInstr *AArch64InstructionSelector::emitVectorConcat(
unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
// We implement a vector concat by:
// 1. Use scalar_to_vector to insert the lower vector into the larger dest
// 2. Insert the upper vector into the destination's upper element
// TODO: some of this code is common with G_BUILD_VECTOR handling.
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();

const LLT Op1Ty = MRI.getType(Op1);
const LLT Op2Ty = MRI.getType(Op2);

if (Op1Ty != Op2Ty) {
LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
return nullptr;
}
assert(Op1Ty.isVector() && "Expected a vector for vector concat");

if (Op1Ty.getSizeInBits() >= 128) {
LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
return nullptr;
}

// At the moment we just support 64 bit vector concats.
if (Op1Ty.getSizeInBits() != 64) {
LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
return nullptr;
}

const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
const LLT &DstTy = LLT::vector(2, ScalarTy);
const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
const TargetRegisterClass *DstRC =
getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);

MachineInstr *WidenedOp1 = emitScalarToVector(DstTy, DstRC, Op1, MIRBuilder);
MachineInstr *WidenedOp2 = emitScalarToVector(DstTy, DstRC, Op2, MIRBuilder);
if (!WidenedOp1 || !WidenedOp2) {
LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
return nullptr;
}

// Now do the insert of the upper element.
unsigned InsertOpc, InsSubRegIdx;
std::tie(InsertOpc, InsSubRegIdx) =
getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());

auto InsElt =
MIRBuilder
.buildInstr(InsertOpc, {DstRC}, {WidenedOp1->getOperand(0).getReg()})
.addImm(1) /* Lane index */
.addUse(WidenedOp2->getOperand(0).getReg())
.addImm(0);

constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
return &*InsElt;
}

bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
Expand Down Expand Up @@ -2002,21 +2096,37 @@ bool AArch64InstructionSelector::selectShuffleVector(
}
}

if (DstTy.getSizeInBits() != 128) {
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
// This case can be done with TBL1.
return false;
}
MachineIRBuilder MIRBuilder(I);

// Use a constant pool to load the index vector for TBL.
Constant *CPVal = ConstantVector::get(CstIdxs);
MachineIRBuilder MIRBuilder(I);
MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
if (!IndexLoad) {
LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
return false;
}

if (DstTy.getSizeInBits() != 128) {
assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
// This case can be done with TBL1.
MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
if (!Concat) {
LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
return false;
}
auto TBL1 = MIRBuilder.buildInstr(
AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
{Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);

auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
.addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
I.eraseFromParent();
return true;
}

// For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
// Q registers for regalloc.
auto RegSeq = MIRBuilder
Expand Down Expand Up @@ -2048,26 +2158,8 @@ bool AArch64InstructionSelector::selectBuildVector(
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
unsigned Opc;
unsigned SubregIdx;
if (RB.getID() == AArch64::GPRRegBankID) {
if (EltSize == 32) {
Opc = AArch64::INSvi32gpr;
SubregIdx = AArch64::ssub;
} else {
Opc = AArch64::INSvi64gpr;
SubregIdx = AArch64::dsub;
}
} else {
if (EltSize == 16) {
Opc = AArch64::INSvi16lane;
SubregIdx = AArch64::hsub;
} else if (EltSize == 32) {
Opc = AArch64::INSvi32lane;
SubregIdx = AArch64::ssub;
} else {
Opc = AArch64::INSvi64lane;
SubregIdx = AArch64::dsub;
}
}

std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);

MachineIRBuilder MIRBuilder(I);

Expand Down
72 changes: 47 additions & 25 deletions llvm/test/CodeGen/AArch64/GlobalISel/select-shuffle-vector.mir
@@ -1,11 +1,17 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# WARNING: update_mir_test_checks.py does not include the constant pools output,
# so this test requires manual fixing up after running the script.

# RUN: llc -mtriple=aarch64-- -O0 -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
--- |
; ModuleID = 'shufflevec-only-legal.ll'
source_filename = "shufflevec-only-legal.ll"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"

define <2 x float> @shuffle_v2f32(<2 x float> %a, <2 x float> %b) {
%shuf = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 0>
ret <2 x float> %shuf
}

define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
%shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
ret <4 x i32> %shuf
Expand All @@ -21,21 +27,52 @@
ret <2 x i64> %shuf
}

...
---
name: shuffle_v2f32
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.1 (%ir-block.0):
liveins: $d0, $d1
; CHECK-LABEL: name: shuffle_v2f32
; CHECK: constants:
; CHECK: - id: 0
; CHECK: value: '<8 x i8> <i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>'
; CHECK: alignment: 8
; CHECK: liveins: $d0, $d1
; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG]], 1, [[INSERT_SUBREG1]], 0
; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[LDRQui]]
; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
; CHECK: $d0 = COPY [[COPY2]]
; CHECK: RET_ReallyLR implicit $d0
%0:fpr(<2 x s32>) = COPY $d0
%1:fpr(<2 x s32>) = COPY $d1
%4:gpr(s32) = G_CONSTANT i32 1
%5:gpr(s32) = G_CONSTANT i32 0
%3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32)
%2:fpr(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1, %3(<2 x s32>)
$d0 = COPY %2(<2 x s32>)
RET_ReallyLR implicit $d0
...
---
name: shuffle_v4i32
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
- { id: 2, class: fpr }
- { id: 3, class: fpr }
- { id: 4, class: gpr }
- { id: 5, class: gpr }
- { id: 6, class: gpr }
body: |
bb.1 (%ir-block.0):
liveins: $q0, $q1
Expand Down Expand Up @@ -71,15 +108,6 @@ alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
- { id: 2, class: fpr }
- { id: 3, class: fpr }
- { id: 4, class: gpr }
- { id: 5, class: gpr }
- { id: 6, class: gpr }
- { id: 7, class: gpr }
body: |
bb.1 (%ir-block.0):
liveins: $q0, $q1
Expand Down Expand Up @@ -116,12 +144,6 @@ alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
- { id: 2, class: fpr }
- { id: 3, class: fpr }
- { id: 4, class: gpr }
body: |
bb.1 (%ir-block.0):
liveins: $q0, $q1
Expand Down

0 comments on commit 85c3afd

Please sign in to comment.