[ARM] - Fix lowering of shufflevectors in AArch32
Some shufflevectors are currently being incorrectly lowered in the AArch32
backend, as the existing checks for detecting NEON operations from a
shufflevector instruction expect the shuffle mask and the vector operands to be
of the same length.

This is not always the case, as the mask may be twice as long as the operands;
in that case only the lower half of the shuffle mask gets checked, so provided
the lower half of the shuffle mask looks like a vector transpose (or even is
just all -1 for undef), the shuffle may get incorrectly lowered into a vector
transpose (VTRN) instruction.

This patch fixes this by accommodating both cases and adds regression tests.

Differential Revision: http://reviews.llvm.org/D11407

llvm-svn: 243103
Luke Cheeseman committed Jul 24, 2015
1 parent f8b5874 commit 4d45ff2
Showing 5 changed files with 221 additions and 38 deletions.
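
To make the failure mode concrete before diving into the diff, here is a minimal standalone C++ sketch of the pre-patch VTRN check (an illustration only, not the LLVM code; isVTRNMaskOld is an invented name for this sketch). The mask [-1, -1, 1, 0] over two-element operands mirrors the test_reverse_and_extract regression test below: its all-undef lower half satisfies the old check even though the upper half is a reversal, not a transpose.

#include <cstdio>
#include <vector>

// Pre-patch logic: only the first NumElts mask entries are inspected, so a
// mask twice as long as the operands is never checked past halfway.
// Entries of -1 denote undef lanes, which the check skips.
static bool isVTRNMaskOld(const std::vector<int> &M, unsigned NumElts,
                          unsigned &WhichResult) {
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
      return false;
  }
  return true;
}

int main() {
  // Double-length mask whose upper half [1, 0] is a reversal, not a
  // transpose; the all-undef lower half slips through the old check.
  std::vector<int> Mask = {-1, -1, 1, 0};
  unsigned WhichResult = 0;
  bool Match = isVTRNMaskOld(Mask, /*NumElts=*/2, WhichResult);
  std::printf("old check claims VTRN: %s\n", Match ? "yes (bug)" : "no");
}

The patched checks below close this hole by validating every NumElts-sized chunk of the mask.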
165 changes: 127 additions & 38 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -5043,18 +5043,50 @@ static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
   return VT == MVT::v8i8 && M.size() == 8;
 }
 
+// Checks whether the shuffle mask represents a vector transpose (VTRN) by
+// checking that pairs of elements in the shuffle mask represent the same index
+// in each vector, incrementing the expected index by 2 at each step.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
+//  v2={e,f,g,h}
+// WhichResult gives the offset for each element in the mask based on which
+// of the two results it belongs to.
+//
+// The transpose can be represented either as:
+// result1 = shufflevector v1, v2, result1_shuffle_mask
+// result2 = shufflevector v1, v2, result2_shuffle_mask
+// where v1/v2 and the shuffle masks have the same number of elements
+// (here WhichResult (see below) indicates which result is being checked).
+//
+// Or as:
+// results = shufflevector v1, v2, shuffle_mask
+// where both results are returned in one vector and the shuffle mask has twice
+// as many elements as v1/v2 (here WhichResult will always be 0 if true); here
+// we check the low half and the high half of the shuffle mask as if each were
+// a mask of the first form.
 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
   if (EltSz == 64)
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
-      return false;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  // If the mask is twice as long as the result then we need to check the upper
+  // and lower parts of the mask.
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
+        return false;
+    }
   }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   return true;
 }
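
For contrast with the sketch above, here is the same standalone re-implementation updated with the patched walk (again illustrative, not the LLVM code; isVTRNMaskNew is an invented name): the pairwise check now re-runs once per NumElts-sized chunk of the mask, so the non-transpose upper half of [-1, -1, 1, 0] is rejected.

#include <cstdio>
#include <vector>

// Patched logic: validate every NumElts-sized chunk of the mask, then
// normalise WhichResult to 0 for the double-length (both-results) form.
static bool isVTRNMaskNew(const std::vector<int> &M, unsigned NumElts,
                          unsigned &WhichResult) {
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = M[i] == 0 ? 0 : 1;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i + j] >= 0 && (unsigned)M[i + j] != j + WhichResult) ||
          (M[i + j + 1] >= 0 &&
           (unsigned)M[i + j + 1] != j + NumElts + WhichResult))
        return false;
    }
  }
  if (M.size() == NumElts * 2)
    WhichResult = 0;
  return true;
}

int main() {
  unsigned W = 0;
  std::vector<int> Mask = {-1, -1, 1, 0};
  // The upper-half pair (1, 0) now fails the
  // (j + WhichResult, j + NumElts + WhichResult) test.
  bool Match = isVTRNMaskNew(Mask, /*NumElts=*/2, W);
  std::printf("new check claims VTRN: %s\n", Match ? "yes" : "no (fixed)");
}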

@@ -5067,28 +5099,52 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
-      return false;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
+        return false;
+    }
   }
+
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   return true;
 }
 
+// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
+// that the mask elements are either all even and in steps of size 2 or all odd
+// and in steps of size 2.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
+//  v2={e,f,g,h}
+// Requires checks similar to those of isVTRNMask with respect to how the
+// results are returned.
 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
   if (EltSz == 64)
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i != NumElts; ++i) {
-    if (M[i] < 0) continue; // ignore UNDEF indices
-    if ((unsigned) M[i] != 2 * i + WhichResult)
-      return false;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; ++j) {
+      if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
+        return false;
+    }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
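
A quick standalone sketch of the unzip check in the same style (illustrative only, not the LLVM code; isVUZPMaskSketch is an invented name) shows a double-length mask with an entirely undef lower half being accepted chunk by chunk — the shape exercised by the vuzp_lower_shufflemask_undef test added below.

#include <cstdio>
#include <vector>

// Mirror of the patched VUZP mask check: every NumElts-sized chunk must pick
// all-even or all-odd lanes in steps of 2; -1 entries (undef) are skipped.
static bool isVUZPMaskSketch(const std::vector<int> &M, unsigned NumElts,
                             unsigned &WhichResult) {
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = M[i] == 0 ? 0 : 1;
    for (unsigned j = 0; j < NumElts; ++j) {
      if (M[i + j] >= 0 && (unsigned)M[i + j] != 2 * j + WhichResult)
        return false;
    }
  }
  if (M.size() == NumElts * 2)
    WhichResult = 0;
  return true;
}

int main() {
  unsigned W = 0;
  // <8 x i32> mask over <4 x i16> operands: undef lower half, odd upper half.
  std::vector<int> Mask = {-1, -1, -1, -1, 1, 3, 5, 7};
  bool Match = isVUZPMaskSketch(Mask, /*NumElts=*/4, W);
  std::printf("VUZP match: %s, WhichResult=%u\n", Match ? "yes" : "no", W);
}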
@@ -5104,40 +5160,65 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
   if (EltSz == 64)
     return false;
 
-  unsigned Half = VT.getVectorNumElements() / 2;
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned j = 0; j != 2; ++j) {
-    unsigned Idx = WhichResult;
-    for (unsigned i = 0; i != Half; ++i) {
-      int MIdx = M[i + j * Half];
-      if (MIdx >= 0 && (unsigned) MIdx != Idx)
-        return false;
-      Idx += 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  unsigned Half = NumElts / 2;
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    for (unsigned j = 0; j < NumElts; j += Half) {
+      unsigned Idx = WhichResult;
+      for (unsigned k = 0; k < Half; ++k) {
+        int MIdx = M[i + j + k];
+        if (MIdx >= 0 && (unsigned) MIdx != Idx)
+          return false;
+        Idx += 2;
+      }
     }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
 
   return true;
 }
 
+// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
+// that pairs of elements of the shufflemask represent the same index in each
+// vector, incrementing sequentially through the vectors.
+// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
+//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
+//  v2={e,f,g,h}
+// Requires checks similar to those of isVTRNMask with respect to how the
+// results are returned.
 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
   if (EltSz == 64)
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  unsigned Idx = WhichResult * NumElts / 2;
-  for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
-      return false;
-    Idx += 1;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
+        return false;
+      Idx += 1;
+    }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
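
The zip check follows the same chunked pattern; this standalone sketch (illustrative only, not the LLVM code; isVZIPMaskSketch is an invented name) accepts a full double-length interleave mask over four-element operands and normalises WhichResult to 0, matching the both-results form.

#include <cstdio>
#include <vector>

// Mirror of the patched VZIP mask check: within each chunk, each pair must
// pick lane Idx from the first vector and lane Idx from the second (that is,
// Idx + NumElts), with Idx advancing by one per pair; -1 (undef) is skipped.
static bool isVZIPMaskSketch(const std::vector<int> &M, unsigned NumElts,
                             unsigned &WhichResult) {
  if (M.size() != NumElts && M.size() != NumElts * 2)
    return false;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = M[i] == 0 ? 0 : 1;
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i + j] >= 0 && (unsigned)M[i + j] != Idx) ||
          (M[i + j + 1] >= 0 && (unsigned)M[i + j + 1] != Idx + NumElts))
        return false;
      Idx += 1;
    }
  }
  if (M.size() == NumElts * 2)
    WhichResult = 0;
  return true;
}

int main() {
  unsigned W = 0;
  // Both zip results in one <8 x i32> mask: the low half zips the low lanes,
  // the high half zips the high lanes of the two four-element operands.
  std::vector<int> Mask = {0, 4, 1, 5, 2, 6, 3, 7};
  bool Match = isVZIPMaskSketch(Mask, /*NumElts=*/4, W);
  std::printf("VZIP match: %s, WhichResult=%u\n", Match ? "yes" : "no", W);
}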
@@ -5154,15 +5235,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  unsigned Idx = WhichResult * NumElts / 2;
-  for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
-        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
-      return false;
-    Idx += 1;
+  if (M.size() != NumElts && M.size() != NumElts*2)
+    return false;
+
+  for (unsigned i = 0; i < M.size(); i += NumElts) {
+    WhichResult = M[i] == 0 ? 0 : 1;
+    unsigned Idx = WhichResult * NumElts / 2;
+    for (unsigned j = 0; j < NumElts; j += 2) {
+      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
+          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
+        return false;
+      Idx += 1;
+    }
   }
 
+  if (M.size() == NumElts*2)
+    WhichResult = 0;
+
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   if (VT.is64BitVector() && EltSz == 32)
     return false;
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/ARM/vext.ll
@@ -196,3 +196,35 @@ define arm_aapcscc void @test_elem_mismatch(<2 x i64>* nocapture %src, <4 x i16>
   store <4 x i16> %tmp7, <4 x i16>* %dest, align 4
   ret void
 }
+
+define <4 x i32> @test_reverse_and_extract(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: test_reverse_and_extract
+  ; CHECK-NOT: vtrn
+  ; CHECK: vrev
+  ; CHECK: vext
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 0>
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_dup_and_extract(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: test_dup_and_extract
+  ; CHECK-NOT: vtrn
+  ; CHECK: vdup
+  ; CHECK: vext
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @test_zip_and_extract(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: test_zip_and_extract
+  ; CHECK: vzip
+  ; CHECK: vext
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 1>
+  ret <4 x i32> %0
+}
10 changes: 10 additions & 0 deletions llvm/test/CodeGen/ARM/vtrn.ll
@@ -325,3 +325,13 @@ define <16 x i16> @vtrnQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14, i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
   ret <16 x i16> %tmp3
 }
+
+define <8 x i16> @vtrn_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+entry:
+  ; CHECK-LABEL: vtrn_lower_shufflemask_undef
+  ; CHECK: vtrn
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 5, i32 3, i32 7>
+  ret <8 x i16> %0
+}
21 changes: 21 additions & 0 deletions llvm/test/CodeGen/ARM/vuzp.ll
@@ -264,3 +264,24 @@ define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
   ret <16 x i16> %tmp3
 }
+
+define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+entry:
+  ; CHECK-LABEL: vuzp_lower_shufflemask_undef
+  ; CHECK: vuzp
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
+  ret <8 x i16> %0
+}
+
+define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
+entry:
+  ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed
+  ; CHECK-NOT: vtrn
+  ; CHECK: vuzp
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = load <2 x i32>, <2 x i32>* %B
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
+  ret <4 x i32> %0
+}
31 changes: 31 additions & 0 deletions llvm/test/CodeGen/ARM/vzip.ll
@@ -264,3 +264,34 @@ define <32 x i8> @vzipQi8_undef_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
   %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 undef, i32 14, i32 30, i32 undef, i32 31>
   ret <32 x i8> %tmp3
 }
+
+define <8 x i16> @vzip_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
+entry:
+  ; CHECK-LABEL: vzip_lower_shufflemask_undef
+  ; CHECK: vzip
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x i16> %0
+}
+
+define <4 x i32> @vzip_lower_shufflemask_zeroed(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: vzip_lower_shufflemask_zeroed
+  ; CHECK-NOT: vtrn
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @vzip_lower_shufflemask_vuzp(<2 x i32>* %A) {
+entry:
+  ; CHECK-LABEL: vzip_lower_shufflemask_vuzp
+  ; CHECK-NOT: vuzp
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 0>
+  ret <4 x i32> %0
+}