Skip to content

Commit

Permalink
[X86][LLVM]Expanding Supports lowerInterleavedStore() in X86Interleav…
Browse files Browse the repository at this point in the history
…edAccess.

This patch expands the support of lowerInterleavedStore to 32x8i stride 4.

LLVM creates suboptimal shuffle code-gen for AVX2. In overall, this patch is a specific fix for the pattern (Strid=4 VF=32) and we plan to include more patterns in the future. To reach our goal of "more patterns". We include two mask creators. The first function creates shuffle's mask equivalent to unpacklo/unpackhi instructions. The other creator creates mask equivalent to a concat of two half vectors(high/low).

The patch goal is to optimize the following sequence:
At the end of the computation, we have ymm2, ymm0, ymm12 and ymm3 holding
each 32 chars:

c0, c1, , c31
m0, m1, , m31
y0, y1, , y31
k0, k1, ., k31

And these need to be transposed/interleaved and stored like so:

c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3 ....

Reviewers:
dorit
Farhana
RKSimon
guyblank
DavidKreitzer

Differential Revision: https://reviews.llvm.org/D34601

llvm-svn: 309086
  • Loading branch information
Michael Zuckerman authored and Michael Zuckerman committed Jul 26, 2017
1 parent 1b73682 commit c1918ad
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 146 deletions.
139 changes: 132 additions & 7 deletions llvm/lib/Target/X86/X86InterleavedAccess.cpp
Expand Up @@ -14,7 +14,6 @@
///
//===--------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/VectorUtils.h"

Expand Down Expand Up @@ -69,8 +68,9 @@ class X86InterleavedAccessGroup {
/// Out-V2 = p3, q3, r3, s3
/// Out-V3 = P4, q4, r4, s4
void transpose_4x4(ArrayRef<Instruction *> InputVectors,
SmallVectorImpl<Value *> &TrasposedVectors);

SmallVectorImpl<Value *> &TransposedMatrix);
void interleave8bit_32x4(ArrayRef<Instruction *> InputVectors,
SmallVectorImpl<Value *> &TransposedMatrix);
public:
/// In order to form an interleaved access group X86InterleavedAccessGroup
/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
Expand Down Expand Up @@ -101,18 +101,27 @@ bool X86InterleavedAccessGroup::isSupported() const {
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
unsigned SupportedNumElem = 4;
if (ShuffleElemSize == 8)
SupportedNumElem = 32;
unsigned WideInstSize;

// Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
// Currently, lowering is supported for the following vectors:
// 1. 4-element vectors of 64 bits on AVX.
// 2. 32-element vectors of 8 bits on AVX.
if (isa<LoadInst>(Inst)) {
if (DL.getTypeSizeInBits(ShuffleVecTy) != SupportedNumElem * ShuffleElemSize)
if (DL.getTypeSizeInBits(ShuffleVecTy) !=
SupportedNumElem * ShuffleElemSize)
return false;

WideInstSize = DL.getTypeSizeInBits(Inst->getType());
} else
WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

if (!Subtarget.hasAVX() || Factor != 4 || ShuffleElemSize != 64 ||
if (DL.getTypeSizeInBits(ShuffleEltTy) == 8 && !isa<StoreInst>(Inst))
return false;

if (!Subtarget.hasAVX() || Factor != 4 ||
(ShuffleElemSize != 64 && ShuffleElemSize != 8) ||
WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem))
return false;

Expand Down Expand Up @@ -163,6 +172,113 @@ void X86InterleavedAccessGroup::decompose(
}
}

// Create shuffle mask for concatenation of two half vectors.
// Low = false: mask generated for the shuffle
// shuffle(VEC1,VEC2,{NumElement/2, NumElement/2+1, NumElement/2+2...,
// NumElement-1, NumElement+NumElement/2,
// NumElement+NumElement/2+1..., 2*NumElement-1})
// = concat(high_half(VEC1),high_half(VEC2))
// Low = true: mask generated for the shuffle
// shuffle(VEC1,VEC2,{0,1,2,...,NumElement/2-1,NumElement,
// NumElement+1...,NumElement+NumElement/2-1})
// = concat(low_half(VEC1),low_half(VEC2))
static void createConcatShuffleMask(int NumElements,
SmallVectorImpl<uint32_t> &Mask, bool Low) {
int NumHalfElements = NumElements / 2;
int Offset = Low ? 0 : NumHalfElements;
for (int i = 0; i < NumHalfElements; ++i)
Mask.push_back(i + Offset);
for (int i = 0; i < NumHalfElements; ++i)
Mask.push_back(i + Offset + NumElements);
}

void X86InterleavedAccessGroup::interleave8bit_32x4(
ArrayRef<Instruction *> Matrix,
SmallVectorImpl<Value *> &TransposedMatrix) {

// Example: Assuming we start from the following vectors:
// Matrix[0]= c0 c1 c2 c3 c4 ... c31
// Matrix[1]= m0 m1 m2 m3 m4 ... m31
// Matrix[2]= y0 y1 y2 y3 y4 ... y31
// Matrix[3]= k0 k1 k2 k3 k4 ... k31

TransposedMatrix.resize(4);

SmallVector<uint32_t, 32> MaskHighTemp;
SmallVector<uint32_t, 32> MaskLowTemp;
SmallVector<uint32_t, 32> MaskHighTemp1;
SmallVector<uint32_t, 32> MaskLowTemp1;
SmallVector<uint32_t, 32> MaskHighTemp2;
SmallVector<uint32_t, 32> MaskLowTemp2;
SmallVector<uint32_t, 32> ConcatLow;
SmallVector<uint32_t, 32> ConcatHigh;

// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
// shuffle pattern.

createUnpackShuffleMask<uint32_t>(MVT::v32i8, MaskHighTemp, false, false);
createUnpackShuffleMask<uint32_t>(MVT::v32i8, MaskLowTemp, true, false);
ArrayRef<uint32_t> MaskHigh = makeArrayRef(MaskHighTemp);
ArrayRef<uint32_t> MaskLow = makeArrayRef(MaskLowTemp);

// ConcatHigh and ConcatLow built in the vperm2i128 and vinserti128 X86
// shuffle pattern.

createConcatShuffleMask(32, ConcatLow, true);
createConcatShuffleMask(32, ConcatHigh, false);
ArrayRef<uint32_t> MaskConcatLow = makeArrayRef(ConcatLow);
ArrayRef<uint32_t> MaskConcatHigh = makeArrayRef(ConcatHigh);

// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
// shuffle pattern.

createUnpackShuffleMask<uint32_t>(MVT::v16i16, MaskLowTemp1, true, false);
createUnpackShuffleMask<uint32_t>(MVT::v16i16, MaskHighTemp1, false, false);
scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskHighTemp1), MaskHighTemp2);
scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskLowTemp1), MaskLowTemp2);
ArrayRef<uint32_t> MaskHighWord = makeArrayRef(MaskHighTemp2);
ArrayRef<uint32_t> MaskLowWord = makeArrayRef(MaskLowTemp2);

// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23
// IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
// IntrVec2Low = y0 k0 y1 k1 ... y7 k7 | y16 k16 y17 k17 ... y23 k23
// IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31

Value *IntrVec1Low =
Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
Value *IntrVec1High =
Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
Value *IntrVec2Low =
Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
Value *IntrVec2High =
Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

// cmyk4 cmyk5 cmyk6 cmyk7 | cmyk20 cmyk21 cmyk22 cmyk23
// cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
// cmyk0 cmyk1 cmyk2 cmyk3 | cmyk16 cmyk17 cmyk18 cmyk19
// cmyk8 cmyk9 cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

Value *High =
Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
Value *High1 =
Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskHighWord);
Value *Low =
Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
Value *Low1 =
Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskLowWord);

// cmyk0 cmyk1 cmyk2 cmyk3 | cmyk4 cmyk5 cmyk6 cmyk7
// cmyk8 cmyk9 cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
// cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
// cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, MaskConcatLow);
TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, MaskConcatLow);
TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, MaskConcatHigh);
TransposedMatrix[3] =
Builder.CreateShuffleVector(Low1, High1, MaskConcatHigh);
}

void X86InterleavedAccessGroup::transpose_4x4(
ArrayRef<Instruction *> Matrix,
SmallVectorImpl<Value *> &TransposedMatrix) {
Expand Down Expand Up @@ -229,7 +345,16 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {

// 2. Transpose the interleaved-vectors into vectors of contiguous
// elements.
transpose_4x4(DecomposedVectors, TransposedVectors);
switch (NumSubVecElems) {
case 4:
transpose_4x4(DecomposedVectors, TransposedVectors);
break;
case 32:
interleave8bit_32x4(DecomposedVectors, TransposedVectors);
break;
default:
return false;
}

// 3. Concatenate the contiguous-vectors back into a wide vector.
Value *WideVec = concatenateVectors(Builder, TransposedVectors);
Expand Down

0 comments on commit c1918ad

Please sign in to comment.