Supported lowerInterleavedStore() in X86InterleavedAccess.
Reviewers: RKSimon, DavidKreitzer

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D32658

llvm-svn: 306068
Farhana Aleen committed Jun 22, 2017
1 parent 5a7c2f1 commit 4b652a5
Showing 4 changed files with 197 additions and 100 deletions.
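For context on what this patch lowers: a factor-4 interleaved store writes four source vectors to memory in array-of-structs order, so element i of source j lands at offset Factor*i + j of the wide stored vector. Below is a minimal standalone C++ sketch of that layout (plain arrays, no LLVM APIs; the 4 x <4 x double> shape mirrors the test further down, everything else is illustrative):

#include <array>
#include <cstdio>

// Standalone model of a factor-4 interleaved store of four <4 x double>
// vectors: the wide result holds v0[i], v1[i], v2[i], v3[i] back to back
// for each element index i (an array-of-structs layout).
int main() {
  constexpr unsigned Factor = 4, NumElems = 4;
  std::array<std::array<double, NumElems>, Factor> Src = {{
      {0, 1, 2, 3},     // v0
      {10, 11, 12, 13}, // v1
      {20, 21, 22, 23}, // v2
      {30, 31, 32, 33}, // v3
  }};

  std::array<double, Factor * NumElems> Wide{};
  for (unsigned i = 0; i < NumElems; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      Wide[Factor * i + j] = Src[j][i]; // interleave element i of every source

  for (double D : Wide)
    std::printf("%g ", D); // 0 10 20 30 1 11 21 31 2 12 22 32 3 13 23 33
  std::printf("\n");
}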
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
@@ -1070,6 +1070,12 @@ namespace llvm {
ArrayRef<unsigned> Indices,
unsigned Factor) const override;

/// \brief Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;


void finalizeLowering(MachineFunction &MF) const override;

protected:
126 changes: 95 additions & 31 deletions llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -16,6 +16,7 @@

#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

@@ -50,9 +51,8 @@ class X86InterleavedAccessGroup {
IRBuilder<> &Builder;

/// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
/// sub vectors of type \p T. Returns true and the sub-vectors in
/// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);

/// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -80,8 +80,7 @@ class X86InterleavedAccessGroup {
/// target information \p STarget.
explicit X86InterleavedAccessGroup(Instruction *I,
ArrayRef<ShuffleVectorInst *> Shuffs,
ArrayRef<unsigned> Ind,
const unsigned F,
ArrayRef<unsigned> Ind, const unsigned F,
const X86Subtarget &STarget,
IRBuilder<> &B)
: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
@@ -102,48 +101,61 @@ bool X86InterleavedAccessGroup::isSupported() const {
uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();

if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
return false;
// Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
uint64_t ExpectedShuffleVecSize;
if (isa<LoadInst>(Inst))
ExpectedShuffleVecSize = 256;
else
ExpectedShuffleVecSize = 1024;

// Currently, lowering is supported for 64 bits on AVX.
if (!Subtarget.hasAVX() || ShuffleVecSize != 256 ||
if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
return false;

return true;
}
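Note on the size check above: on the load side each candidate shuffle produces a single sub-vector (4 x 64 bits = 256 bits), while on the store side Shuffles[0] is the one wide interleaved shuffle (4 * 4 * 64 bits = 1024 bits), hence the two expected sizes. A minimal sketch of that arithmetic, assuming the factor-4, 64-bit-element shape this lowering supports:

#include <cstdint>

// Minimal model of the ExpectedShuffleVecSize arithmetic: 64-bit elements,
// factor 4, 4-element sub-vectors.
int main() {
  constexpr uint64_t EltBits = 64, Factor = 4, SubVecElems = 4;
  // Load path: each candidate shuffle yields one sub-vector.
  constexpr uint64_t LoadShuffleBits = SubVecElems * EltBits;           // 256
  // Store path: the single shuffle is the whole interleaved vector.
  constexpr uint64_t StoreShuffleBits = Factor * SubVecElems * EltBits; // 1024
  static_assert(LoadShuffleBits == 256 && StoreShuffleBits == 1024,
                "expected shuffle sizes");
  return 0;
}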

bool X86InterleavedAccessGroup::decompose(
void X86InterleavedAccessGroup::decompose(
Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {

assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
"Expected Load or Shuffle");

Type *VecTy = VecInst->getType();
(void)VecTy;
assert(VecTy->isVectorTy() &&
DL.getTypeSizeInBits(VecTy) >=
DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
"Invalid Inst-size!!!");
assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
"Element type mismatched!!!");

if (!isa<LoadInst>(VecInst))
return false;
if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);

// Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
for (unsigned i = 0; i < NumSubVectors; ++i)
DecomposedVectors.push_back(
cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Op0, Op1, createSequentialMask(Builder, Indices[i],
SubVecTy->getVectorNumElements(), 0))));
return;
}

// Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);
Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());

Value *VecBasePtr =
Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);

// Generate N loads of T type
// Generate N loads of T type.
for (unsigned i = 0; i < NumSubVectors; i++) {
// TODO: Support inbounds GEP
// TODO: Support inbounds GEP.
Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
DecomposedVectors.push_back(NewLoad);
}

return true;
}
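In the shuffle branch above, decompose() rebuilds the four original source vectors by emitting one shuffle per sub-vector whose mask is a run of SubVecElems consecutive lanes starting at Indices[i]. The sketch below models just that mask arithmetic with plain integers rather than the real createSequentialMask() helper; the starting indices {0, 4, 8, 12} are an assumption based on the usual factor-4 interleave mask (the full mask sits in the collapsed part of the test):

#include <cstdio>
#include <vector>

// Standalone model of the sequential masks used to decompose the wide
// shuffle: each sub-shuffle selects SubVecElems consecutive lanes starting
// at Indices[i], which recovers one original source vector.
int main() {
  const unsigned SubVecElems = 4;
  const std::vector<unsigned> Indices = {0, 4, 8, 12}; // assumed store indices

  for (unsigned Start : Indices) {
    std::printf("mask:");
    for (unsigned k = 0; k < SubVecElems; ++k)
      std::printf(" %u", Start + k); // e.g. 0 1 2 3, then 4 5 6 7, ...
    std::printf("\n");
  }
}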

void X86InterleavedAccessGroup::transpose_4x4(
@@ -181,21 +193,46 @@ void X86InterleavedAccessGroup::transpose_4x4(
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
SmallVector<Instruction *, 4> DecomposedVectors;
VectorType *VecTy = Shuffles[0]->getType();
// Try to generate target-sized register(/instruction).
if (!decompose(Inst, Factor, VecTy, DecomposedVectors))
return false;

SmallVector<Value *, 4> TransposedVectors;
// Perform matrix-transposition in order to compute interleaved
// results by generating some sort of (optimized) target-specific
// instructions.
VectorType *ShuffleTy = Shuffles[0]->getType();

if (isa<LoadInst>(Inst)) {
// Try to generate target-sized register(/instruction).
decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

// Perform matrix-transposition in order to compute interleaved
// results by generating some sort of (optimized) target-specific
// instructions.
transpose_4x4(DecomposedVectors, TransposedVectors);

// Now replace the unoptimized-interleaved-vectors with the
// transposed-interleaved vectors.
for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

return true;
}

Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

// Lower the interleaved stores:
// 1. Decompose the interleaved wide shuffle into individual shuffle
// vectors.
decompose(Shuffles[0], Factor,
VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);

// 2. Transpose the interleaved-vectors into vectors of contiguous
// elements.
transpose_4x4(DecomposedVectors, TransposedVectors);

// Now replace the unoptimized-interleaved-vectors with the
// transposed-interleaved vectors.
for (unsigned i = 0; i < Shuffles.size(); i++)
Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
// 3. Concatenate the contiguous-vectors back into a wide vector.
Value *WideVec = concatenateVectors(Builder, TransposedVectors);

// 4. Generate a store instruction for wide-vec.
StoreInst *SI = cast<StoreInst>(Inst);
Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
SI->getAlignment());

return true;
}
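Putting the store path together: decompose() recovers the four sources from the wide shuffle, transpose_4x4() regroups them so each output holds one element from every source, and concatenating those outputs is exactly the interleaved data that the final aligned store writes. A standalone C++ model of that pipeline follows; the real code builds the transpose out of target shuffles, and this is only an element-wise sketch of the same data movement (it reproduces the layout from the sketch near the top):

#include <array>
#include <cstdio>

using Vec4 = std::array<double, 4>;

// Model of transpose_4x4: output j gathers element j of every input,
// which is the interleaved grouping the store needs.
static std::array<Vec4, 4> transpose4x4(const std::array<Vec4, 4> &In) {
  std::array<Vec4, 4> Out{};
  for (unsigned j = 0; j < 4; ++j)
    for (unsigned i = 0; i < 4; ++i)
      Out[j][i] = In[i][j];
  return Out;
}

int main() {
  // The four sources recovered by decompose() from the wide shuffle.
  std::array<Vec4, 4> Sources = {{{0, 1, 2, 3},
                                  {10, 11, 12, 13},
                                  {20, 21, 22, 23},
                                  {30, 31, 32, 33}}};

  std::array<Vec4, 4> Transposed = transpose4x4(Sources);

  // Concatenating the transposed vectors yields the interleaved wide vector
  // that the final aligned store writes out.
  for (const Vec4 &V : Transposed)
    for (double D : V)
      std::printf("%g ", D); // 0 10 20 30 1 11 21 31 2 12 22 32 3 13 23 33
  std::printf("\n");
}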
@@ -220,3 +257,30 @@ bool X86TargetLowering::lowerInterleavedLoad(

return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");

VectorType *VecTy = SVI->getType();
assert(VecTy->getVectorNumElements() % Factor == 0 &&
"Invalid interleaved store");

// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
SmallVector<unsigned, 4> Indices;
auto Mask = SVI->getShuffleMask();
for (unsigned i = 0; i < Factor; i++)
Indices.push_back(Mask[i]);

ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

// Create an interleaved access group.
IRBuilder<> Builder(SI);
X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
Builder);

return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
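lowerInterleavedStore() takes the group's starting indices from the first Factor entries of the store's shuffle mask. Assuming the usual factor-4 interleave mask of four 4-element vectors (the concrete mask is in the collapsed portion of the test), that yields {0, 4, 8, 12}, matching the starting indices assumed in the decompose sketch above. A small standalone model:

#include <cstdio>
#include <vector>

int main() {
  // Assumed factor-4 interleave mask for four 4-element vectors
  // (the collapsed part of the test uses a mask of this shape).
  const std::vector<int> Mask = {0, 4, 8, 12, 1, 5, 9, 13,
                                 2, 6, 10, 14, 3, 7, 11, 15};
  const unsigned Factor = 4;

  std::vector<unsigned> Indices;
  for (unsigned i = 0; i < Factor; ++i)
    Indices.push_back(Mask[i]); // first Factor mask entries: 0, 4, 8, 12

  for (unsigned Idx : Indices)
    std::printf("%u ", Idx);
  std::printf("\n");
}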
92 changes: 32 additions & 60 deletions llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -129,26 +129,18 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
; AVX-LABEL: store_factorf64_4:
; AVX: # BB#0:
; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],xmm3[0]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],xmm1[0]
; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm2[1],xmm3[1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm1[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
; AVX-NEXT: vmovupd %ymm0, 96(%rdi)
; AVX-NEXT: vmovupd %ymm6, 64(%rdi)
; AVX-NEXT: vmovupd %ymm5, 32(%rdi)
; AVX-NEXT: vmovupd %ymm4, (%rdi)
; AVX-NEXT: vmovupd %ymm3, 64(%rdi)
; AVX-NEXT: vmovupd %ymm4, 32(%rdi)
; AVX-NEXT: vmovupd %ymm2, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -161,55 +153,35 @@ define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x doubl
define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
; AVX1-LABEL: store_factori64_4:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm0[0],xmm1[0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm3[1]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm0[1],xmm1[1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
; AVX1-NEXT: vmovupd %ymm0, 96(%rdi)
; AVX1-NEXT: vmovupd %ymm6, 64(%rdi)
; AVX1-NEXT: vmovupd %ymm5, 32(%rdi)
; AVX1-NEXT: vmovupd %ymm4, (%rdi)
; AVX1-NEXT: vmovupd %ymm3, 64(%rdi)
; AVX1-NEXT: vmovupd %ymm4, 32(%rdi)
; AVX1-NEXT: vmovupd %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_factori64_4:
; AVX2: # BB#0:
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6
; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm0[3,1,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6
; AVX2-NEXT: vpbroadcastq %xmm3, %ymm7
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm6, (%rdi)
; AVX2-NEXT: vmovdqu %ymm5, 96(%rdi)
; AVX2-NEXT: vmovdqu %ymm4, 64(%rdi)
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi)
; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
