[X86][LLVM] Expand support for lowerInterleavedStore() in X86InterleavedAccess (VF8 stride 4)

This patch expands the support of lowerInterleavedStore() to the v8i8, stride-4 case.

LLVM currently generates suboptimal shuffle code for this pattern on AVX2.
Overall, this patch is a targeted fix for the (Stride=4, VF=8) pattern; we plan to support more patterns in the future.

The goal of the patch is to optimize the following sequence.
At the end of the computation, xmm2, xmm0, xmm12, and xmm3 each hold
8 chars:

c0, c1, ..., c7
m0, m1, ..., m7
y0, y1, ..., y7
k0, k1, ..., k7

And these need to be transposed/interleaved and stored like so:

c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3 ....
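
For reference, the scalar form of this store is sketched below. This is purely illustrative and not part of the patch; the function name and signature are hypothetical:

// Illustrative sketch only: the scalar equivalent of the VF8/stride-4
// interleaved store this patch optimizes. Names are hypothetical.
#include <cstdint>

void storeInterleavedVF8Stride4(const uint8_t c[8], const uint8_t m[8],
                                const uint8_t y[8], const uint8_t k[8],
                                uint8_t out[32]) {
  for (int i = 0; i < 8; ++i) {
    out[4 * i + 0] = c[i]; // produces c0 m0 y0 k0 c1 m1 y1 k1 ...
    out[4 * i + 1] = m[i];
    out[4 * i + 2] = y[i];
    out[4 * i + 3] = k[i];
  }
}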

Reviewers: DavidKreitzer, Farhana, zvi, igorb, guyblank, RKSimon, Ayal

Differential Revision: https://reviews.llvm.org/D36058

Change-Id: I3cc5c2ca5d6318901c192a4428493b99ef424c32
llvm-svn: 314109
Michael Zuckerman authored and committed on Sep 25, 2017
1 parent 31fef4d commit 4a97df0
Showing 3 changed files with 79 additions and 36 deletions.
47 changes: 46 additions & 1 deletion llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -72,6 +72,8 @@ class X86InterleavedAccessGroup {
   void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                              SmallVectorImpl<Value *> &TransposedMatrix,
                              unsigned NumSubVecElems);
+  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
+                                SmallVectorImpl<Value *> &TransposedMatrix);
   void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix,
                                unsigned NumSubVecElems);
@@ -127,7 +129,7 @@ bool X86InterleavedAccessGroup::isSupported() const {
     return true;
 
   if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
-      (WideInstSize == 512 || WideInstSize == 1024))
+      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024))
     return true;
 
   if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
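
Note that WideInstSize is the bit width of the full interleaved vector, i.e. VF x Factor x element size, so the newly accepted value 256 corresponds exactly to the VF=8, stride-4, i8 case. A minimal sanity check, assuming that definition (illustrative, not part of the patch):

// Minimal sanity check (illustrative; assumes WideInstSize is the bit width
// of the whole interleaved vector: VF * Factor * element size in bits).
#include <cassert>

int main() {
  const unsigned VF = 8;           // elements per sub-vector
  const unsigned Factor = 4;       // interleave stride
  const unsigned ElemSizeBits = 8; // i8 elements
  assert(VF * Factor * ElemSizeBits == 256); // the case this patch enables
  return 0;
}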
@@ -218,6 +220,46 @@ static MVT scaleVectorType(MVT VT) {
                                  VT.getVectorNumElements() / 2);
 }
 
+void X86InterleavedAccessGroup::interleave8bitStride4VF8(
+    ArrayRef<Instruction *> Matrix,
+    SmallVectorImpl<Value *> &TransposedMatrix) {
+  // Assuming we start from the following vectors:
+  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
+  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
+  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
+  // Matrix[3]= k0 k1 k2 k3 k4 ... k7
+
+  MVT VT = MVT::v8i16;
+  TransposedMatrix.resize(2);
+  SmallVector<uint32_t, 16> MaskLow;
+  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
+  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
+
+  for (unsigned i = 0; i < 8; ++i) {
+    MaskLow.push_back(i);
+    MaskLow.push_back(i + 8);
+  }
+
+  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
+  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
+  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
+  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
+  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
+  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
+  Value *IntrVec1Low =
+      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
+  Value *IntrVec2Low =
+      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
+
+  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
+  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
+
+  TransposedMatrix[0] =
+      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
+  TransposedMatrix[1] =
+      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
+}
+
 void X86InterleavedAccessGroup::interleave8bitStride4(
     ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
     unsigned numberOfElement) {
@@ -538,6 +580,9 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
   case 4:
     transpose_4x4(DecomposedVectors, TransposedVectors);
     break;
+  case 8:
+    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
+    break;
   case 16:
   case 32:
     interleave8bitStride4(DecomposedVectors, TransposedVectors, NumSubVecElems);
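Under the assumption that createUnpackShuffleMask for v8i16 yields {0,8,1,9,2,10,3,11} (low) and {4,12,5,13,6,14,7,15} (high), and that scaleShuffleMask(2, M) expands each index i into the pair 2*i, 2*i+1, the concrete masks interleave8bitStride4VF8 feeds to CreateShuffleVector can be reproduced standalone. A sketch, independent of LLVM:

// Standalone sketch (no LLVM dependency) reproducing the shuffle masks built
// by interleave8bitStride4VF8 above. The unpack/scale behavior below is an
// assumption mirrored from the calls in the patch, not LLVM's implementation.
#include <cstdio>
#include <vector>

int main() {
  // MaskLow: byte-interleave two 8-element vectors -> 0 8 1 9 ... 7 15.
  std::vector<unsigned> MaskLow;
  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  // v8i16 unpack-low/high masks, scaled by 2 so they index bytes.
  const std::vector<unsigned> UnpackLo = {0, 8, 1, 9, 2, 10, 3, 11};
  const std::vector<unsigned> UnpackHi = {4, 12, 5, 13, 6, 14, 7, 15};
  auto scaleBy2 = [](const std::vector<unsigned> &M) {
    std::vector<unsigned> Out;
    for (unsigned Idx : M) {
      Out.push_back(2 * Idx);
      Out.push_back(2 * Idx + 1);
    }
    return Out;
  };

  // Prints: 0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23
  for (unsigned Idx : scaleBy2(UnpackLo)) printf("%u ", Idx);
  printf("\n");
  // Prints: 8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31
  for (unsigned Idx : scaleBy2(UnpackHi)) printf("%u ", Idx);
  printf("\n");
  return 0;
}

These values match the <16 x i32> shuffle masks visible in the updated interleavedStore.ll CHECK lines below.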
52 changes: 21 additions & 31 deletions llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -936,47 +936,37 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
 ; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,5,13,6,14,7,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,1,9,2,10,3,11,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovaps %ymm0, (%rdi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX-LABEL: interleaved_store_vf8_i8_stride4:
 ; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,5,13,6,14,7,15,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,1,9,2,10,3,11,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX-NEXT: vmovdqa %ymm0, (%rdi)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
   %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %v2 = shufflevector <8 x i8> %x2, <8 x i8> %x3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
   store <32 x i8> %interleaved.vec, <32 x i8>* %p
   ret void
16 changes: 12 additions & 4 deletions llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
@@ -66,13 +66,21 @@ ret void
 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
 ; CHECK-LABEL: @interleaved_store_vf8_i8_stride4(
 ; CHECK-NEXT: [[V1:%.*]] = shufflevector <8 x i8> [[X1:%.*]], <8 x i8> [[X2:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[V2:%.*]] = shufflevector <8 x i8> [[X2]], <8 x i8> [[X3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
-; CHECK-NEXT: store <32 x i8> [[INTERLEAVED_VEC]], <32 x i8>* [[P:%.*]]
+; CHECK-NEXT: [[V2:%.*]] = shufflevector <8 x i8> [[X3:%.*]], <8 x i8> [[X4:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]]
 ; CHECK-NEXT: ret void
 ;
   %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %v2 = shufflevector <8 x i8> %x2, <8 x i8> %x3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
   store <32 x i8> %interleaved.vec, <32 x i8>* %p
   ret void
