Skip to content

Commit fe2dc19

Browse files
authored
[LoongArch] Refine 256-bit vector_shuffle legalization for LASX (#160254)
1 parent a274ffe commit fe2dc19

File tree

3 files changed

+49
-39
lines changed

3 files changed

+49
-39
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 47 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2068,7 +2068,10 @@ lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
20682068

20692069
const auto &Begin = Mask.begin();
20702070
const auto &End = Mask.end();
2071-
unsigned HalfSize = Mask.size() / 2;
2071+
int HalfSize = Mask.size() / 2;
2072+
2073+
if (SplatIndex >= HalfSize)
2074+
return SDValue();
20722075

20732076
assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
20742077
if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
@@ -2363,8 +2366,10 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
23632366
/// The first case is the closest to LoongArch instructions and the other
23642367
/// cases need to be converted to it for processing.
23652368
///
2366-
/// This function may modify V1, V2 and Mask
2367-
static void canonicalizeShuffleVectorByLane(
2369+
/// This function returns true for the last three cases above, in which case
2370+
/// it modifies V1, V2 and Mask. It returns false for the first case and for
2371+
/// cross-lane shuffle cases.
2372+
static bool canonicalizeShuffleVectorByLane(
23682373
const SDLoc &DL, MutableArrayRef<int> Mask, MVT VT, SDValue &V1,
23692374
SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) {
23702375

@@ -2388,15 +2393,15 @@ static void canonicalizeShuffleVectorByLane(
23882393
preMask = LowLaneTy;
23892394

23902395
if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
2391-
return M < 0 || (M >= 0 && M < HalfSize) ||
2392-
(M >= MaskSize && M < MaskSize + HalfSize);
2396+
return M < 0 || (M >= HalfSize && M < MaskSize) ||
2397+
(M >= MaskSize + HalfSize && M < MaskSize * 2);
23932398
}))
2394-
postMask = HighLaneTy;
2399+
postMask = LowLaneTy;
23952400
else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
2396-
return M < 0 || (M >= HalfSize && M < MaskSize) ||
2397-
(M >= MaskSize + HalfSize && M < MaskSize * 2);
2401+
return M < 0 || (M >= 0 && M < HalfSize) ||
2402+
(M >= MaskSize && M < MaskSize + HalfSize);
23982403
}))
2399-
postMask = LowLaneTy;
2404+
postMask = HighLaneTy;
24002405

24012406
// The pre-half of mask is high lane type, and the post-half of mask
24022407
// is low lane type, which is closest to the LoongArch instructions.
@@ -2405,7 +2410,7 @@ static void canonicalizeShuffleVectorByLane(
24052410
// to the lower 128-bit of vector register, and the low lane of mask
24062411
// corresponds to the higher 128-bit of vector register.
24072412
if (preMask == HighLaneTy && postMask == LowLaneTy) {
2408-
return;
2413+
return false;
24092414
}
24102415
if (preMask == LowLaneTy && postMask == HighLaneTy) {
24112416
V1 = DAG.getBitcast(MVT::v4i64, V1);
@@ -2459,8 +2464,10 @@ static void canonicalizeShuffleVectorByLane(
24592464
*it = *it < 0 ? *it : *it + HalfSize;
24602465
}
24612466
} else { // cross-lane
2462-
return;
2467+
return false;
24632468
}
2469+
2470+
return true;
24642471
}
24652472

24662473
/// Lower VECTOR_SHUFFLE as lane permute and then shuffle (if possible).
@@ -2526,28 +2533,21 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
25262533
assert(Mask.size() % 2 == 0 && "Expected even mask size.");
25272534
assert(Mask.size() >= 4 && "Mask size is less than 4.");
25282535

2529-
// canonicalize non cross-lane shuffle vector
2530-
SmallVector<int> NewMask(Mask);
2531-
canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget);
2532-
25332536
APInt KnownUndef, KnownZero;
2534-
computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero);
2537+
computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
25352538
APInt Zeroable = KnownUndef | KnownZero;
25362539

25372540
SDValue Result;
25382541
// TODO: Add more comparison patterns.
25392542
if (V2.isUndef()) {
2540-
if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG,
2543+
if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, V2, DAG,
25412544
Subtarget)))
25422545
return Result;
2543-
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG,
2546+
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
25442547
Subtarget)))
25452548
return Result;
2546-
if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG,
2547-
Subtarget)))
2548-
return Result;
2549-
if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
2550-
V1, V2, DAG)))
2549+
if ((Result =
2550+
lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, V2, DAG, Subtarget)))
25512551
return Result;
25522552

25532553
// TODO: This comment may be enabled in the future to better match the
@@ -2557,24 +2557,39 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
25572557

25582558
// It is recommended not to change the pattern comparison order for better
25592559
// performance.
2560-
if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG)))
2560+
if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, Mask, VT, V1, V2, DAG)))
25612561
return Result;
2562-
if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG)))
2562+
if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, Mask, VT, V1, V2, DAG)))
25632563
return Result;
2564-
if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG)))
2564+
if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, Mask, VT, V1, V2, DAG)))
25652565
return Result;
2566-
if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG)))
2566+
if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, Mask, VT, V1, V2, DAG)))
25672567
return Result;
2568-
if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG)))
2568+
if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, Mask, VT, V1, V2, DAG)))
25692569
return Result;
2570-
if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
2570+
if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, Mask, VT, V1, V2, DAG)))
25712571
return Result;
2572-
if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG,
2573-
Subtarget, Zeroable)))
2572+
if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget,
2573+
Zeroable)))
25742574
return Result;
2575-
if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG,
2575+
if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG,
25762576
Subtarget)))
25772577
return Result;
2578+
2579+
// canonicalize non cross-lane shuffle vector
2580+
SmallVector<int> NewMask(Mask);
2581+
if (canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget))
2582+
return lower256BitShuffle(DL, NewMask, VT, V1, V2, DAG, Subtarget);
2583+
2584+
// FIXME: Handling the remaining cases earlier can degrade performance
2585+
// in some situations. Further analysis is required to enable more
2586+
// effective optimizations.
2587+
if (V2.isUndef()) {
2588+
if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
2589+
V1, V2, DAG)))
2590+
return Result;
2591+
}
2592+
25782593
if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG))
25792594
return NewShuffle;
25802595
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG)))

llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
88
; CHECK-LABEL: shufflevector_v4f64:
99
; CHECK: # %bb.0: # %entry
1010
; CHECK-NEXT: xvpickve.d $xr2, $xr1, 3
11-
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
11+
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 238
1212
; CHECK-NEXT: xvrepl128vei.d $xr3, $xr3, 1
1313
; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
1414
; CHECK-NEXT: xvpickve.d $xr1, $xr1, 2

llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
1717
define <32 x i8> @shufflevector_v32i8_undef(<32 x i8> %a) {
1818
; CHECK-LABEL: shufflevector_v32i8_undef:
1919
; CHECK: # %bb.0:
20-
; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
2120
; CHECK-NEXT: xvrepl128vei.b $xr0, $xr0, 1
2221
; CHECK-NEXT: ret
2322
%c = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
@@ -40,7 +39,6 @@ define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
4039
define <16 x i16> @shufflevector_v16i16_undef(<16 x i16> %a) {
4140
; CHECK-LABEL: shufflevector_v16i16_undef:
4241
; CHECK: # %bb.0:
43-
; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
4442
; CHECK-NEXT: xvrepl128vei.h $xr0, $xr0, 3
4543
; CHECK-NEXT: ret
4644
%c = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
@@ -63,7 +61,6 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
6361
define <8 x i32> @shufflevector_v8i32_undef(<8 x i32> %a) {
6462
; CHECK-LABEL: shufflevector_v8i32_undef:
6563
; CHECK: # %bb.0:
66-
; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
6764
; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 2
6865
; CHECK-NEXT: ret
6966
%c = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 8, i32 8, i32 8, i32 8>
@@ -84,7 +81,6 @@ define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
8481
define <4 x i64> @shufflevector_v4i64_undef(<4 x i64> %a) {
8582
; CHECK-LABEL: shufflevector_v4i64_undef:
8683
; CHECK: # %bb.0:
87-
; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
8884
; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 1
8985
; CHECK-NEXT: ret
9086
%c = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
@@ -105,7 +101,7 @@ define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
105101
define <8 x float> @shufflevector_v8f32_undef(<8 x float> %a) {
106102
; CHECK-LABEL: shufflevector_v8f32_undef:
107103
; CHECK: # %bb.0:
108-
; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78
104+
; CHECK-NEXT: xvpermi.d $xr0, $xr0, 238
109105
; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 1
110106
; CHECK-NEXT: ret
111107
%c = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 9, i32 9, i32 9, i32 9>
@@ -126,7 +122,6 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
126122
define <4 x double> @shufflevector_v4f64_undef(<4 x double> %a) {
127123
; CHECK-LABEL: shufflevector_v4f64_undef:
128124
; CHECK: # %bb.0:
129-
; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
130125
; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 0
131126
; CHECK-NEXT: ret
132127
%c = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 7, i32 7>

0 commit comments

Comments
 (0)