[LoongArch] Refine 256-bit vector_shuffle legalization for LASX #160254

zhaoqi5 · 2025-09-23T08:09:13Z

No description provided.

llvmbot · 2025-09-23T08:09:51Z

@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/160254.diff

2 Files Affected:

(modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+40-30)
(modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll (+1-1)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 32baa2d111270..c264a8a0f6a54 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2060,7 +2060,10 @@ lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
 
   const auto &Begin = Mask.begin();
   const auto &End = Mask.end();
-  unsigned HalfSize = Mask.size() / 2;
+  int HalfSize = Mask.size() / 2;
+
+  if (SplatIndex >= HalfSize)
+    return SDValue();
 
   assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
   if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
@@ -2354,7 +2357,7 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
 /// cases need to be converted to it for processing.
 ///
 /// This function may modify V1, V2 and Mask
-static void canonicalizeShuffleVectorByLane(
+static bool canonicalizeShuffleVectorByLane(
     const SDLoc &DL, MutableArrayRef<int> Mask, MVT VT, SDValue &V1,
     SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) {
 
@@ -2378,15 +2381,15 @@ static void canonicalizeShuffleVectorByLane(
     preMask = LowLaneTy;
 
   if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
-        return M < 0 || (M >= 0 && M < HalfSize) ||
-               (M >= MaskSize && M < MaskSize + HalfSize);
+        return M < 0 || (M >= HalfSize && M < MaskSize) ||
+               (M >= MaskSize + HalfSize && M < MaskSize * 2);
       }))
-    postMask = HighLaneTy;
+    postMask = LowLaneTy;
   else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
-             return M < 0 || (M >= HalfSize && M < MaskSize) ||
-                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
+             return M < 0 || (M >= 0 && M < HalfSize) ||
+                    (M >= MaskSize && M < MaskSize + HalfSize);
            }))
-    postMask = LowLaneTy;
+    postMask = HighLaneTy;
 
   // The pre-half of mask is high lane type, and the post-half of mask
   // is low lane type, which is closest to the LoongArch instructions.
@@ -2395,7 +2398,7 @@ static void canonicalizeShuffleVectorByLane(
   // to the lower 128-bit of vector register, and the low lane of mask
   // corresponds the higher 128-bit of vector register.
   if (preMask == HighLaneTy && postMask == LowLaneTy) {
-    return;
+    return false;
   }
   if (preMask == LowLaneTy && postMask == HighLaneTy) {
     V1 = DAG.getBitcast(MVT::v4i64, V1);
@@ -2449,8 +2452,10 @@ static void canonicalizeShuffleVectorByLane(
       *it = *it < 0 ? *it : *it + HalfSize;
     }
   } else { // cross-lane
-    return;
+    return false;
   }
+
+  return true;
 }
 
 /// Lower VECTOR_SHUFFLE as lane permute and then shuffle (if possible).
@@ -2516,27 +2521,20 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
   assert(Mask.size() % 2 == 0 && "Expected even mask size.");
   assert(Mask.size() >= 4 && "Mask size is less than 4.");
 
-  // canonicalize non cross-lane shuffle vector
-  SmallVector<int> NewMask(Mask);
-  canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget);
-
   APInt KnownUndef, KnownZero;
-  computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero);
+  computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero);
   APInt Zeroable = KnownUndef | KnownZero;
 
   SDValue Result;
   // TODO: Add more comparison patterns.
   if (V2.isUndef()) {
-    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG,
+    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, V2, DAG,
                                                 Subtarget)))
       return Result;
-    if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG,
+    if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG,
                                                Subtarget)))
       return Result;
-    if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG)))
-      return Result;
-    if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
-                                                             V1, V2, DAG)))
+    if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, V2, DAG)))
       return Result;
 
     // TODO: This comment may be enabled in the future to better match the
@@ -2546,24 +2544,36 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
 
   // It is recommended not to change the pattern comparison order for better
   // performance.
-  if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
+  if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, Mask, VT, V1, V2, DAG)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG,
-                                           Subtarget, Zeroable)))
+  if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget,
+                                           Zeroable)))
     return Result;
-  if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG,
+  if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG,
                                                 Subtarget)))
     return Result;
+
+  // canonicalize non cross-lane shuffle vector
+  SmallVector<int> NewMask(Mask);
+  if (canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget))
+    return lower256BitShuffle(DL, NewMask, VT, V1, V2, DAG, Subtarget);
+
+  if (V2.isUndef()) {
+    if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
+                                                             V1, V2, DAG)))
+      return Result;
+  }
+
   if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG))
     return NewShuffle;
   if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG)))
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index 765473ce166df..30539427a1a0a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -8,7 +8,7 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: shufflevector_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvpickve.d $xr2, $xr1, 3
-; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 78
+; CHECK-NEXT:    xvpermi.d $xr3, $xr0, 238
 ; CHECK-NEXT:    xvrepl128vei.d $xr3, $xr3, 1
 ; CHECK-NEXT:    vextrins.d $vr3, $vr2, 16
 ; CHECK-NEXT:    xvpickve.d $xr1, $xr1, 2

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

heiher

LGTM.

github-actions · 2025-09-26T01:32:51Z

✅ With the latest revision this PR passed the C/C++ code formatter.

llvm-ci · 2025-09-26T06:42:50Z

LLVM Buildbot has detected a new failure on builder mlir-nvidia-gcc7 running on mlir-nvidia while building llvm at step 7 "test-build-check-mlir-build-only-check-mlir".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/18902

Here is the relevant piece of the build log for the reference

Step 7 (test-build-check-mlir-build-only-check-mlir) failure: test (failure)
******************** TEST 'MLIR :: Integration/GPU/CUDA/async.mlir' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=fatbin"  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so    --entry-point-result=void -O0  | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt '-pass-pipeline=builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary=format=fatbin
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0
# .---command stderr------------
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventSynchronize(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# `-----------------------------
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# .---command stderr------------
# | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir:68:12: error: CHECK: expected string not found in input
# |  // CHECK: [84, 84]
# |            ^
# | <stdin>:1:1: note: scanning from here
# | Unranked Memref base@ = 0x5b8babf22740 rank = 1 offset = 0 sizes = [2] strides = [1] data = 
# | ^
# | <stdin>:2:1: note: possible intended match here
# | [42, 42]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Unranked Memref base@ = 0x5b8babf22740 rank = 1 offset = 0 sizes = [2] strides = [1] data =  
# | check:68'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: [42, 42] 
# | check:68'0     ~~~~~~~~~
# | check:68'1     ?         possible intended match
...

…#160254)

llvmbot added the backend:loongarch label Sep 23, 2025

tangaac mentioned this pull request Sep 23, 2025

LLVM optimization on PR 160254 [bot] llvm-ci-la/llvm-opt-ci#61

Open

[LoongArch] Refine 256-bit vector_shuffle legalization for LASX

80abdb7

zhaoqi5 force-pushed the users/zhaoqi5/refine-vectorshuffle-legalizing branch from 86d0a42 to 80abdb7 Compare September 24, 2025 05:06

zhaoqi5 requested review from heiher, SixWeining and tangaac September 24, 2025 06:18

heiher reviewed Sep 25, 2025

View reviewed changes

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp Show resolved Hide resolved

address heiher's comment

7ca0db1

heiher approved these changes Sep 25, 2025

View reviewed changes

Merge branch 'main' into users/zhaoqi5/refine-vectorshuffle-legalizing

ad61aae

clang format

0a1b187

zhaoqi5 merged commit fe2dc19 into main Sep 26, 2025
9 checks passed

zhaoqi5 deleted the users/zhaoqi5/refine-vectorshuffle-legalizing branch September 26, 2025 06:38

mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Oct 3, 2025

[LoongArch] Refine 256-bit vector_shuffle legalization for LASX (llvm…

774a529

…#160254)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[LoongArch] Refine 256-bit vector_shuffle legalization for LASX #160254

[LoongArch] Refine 256-bit vector_shuffle legalization for LASX #160254

Uh oh!

zhaoqi5 commented Sep 23, 2025

Uh oh!

llvmbot commented Sep 23, 2025

Uh oh!

Uh oh!

heiher left a comment

Uh oh!

github-actions bot commented Sep 26, 2025 •

edited

Loading

Uh oh!

Uh oh!

llvm-ci commented Sep 26, 2025

Uh oh!

Uh oh!

[LoongArch] Refine 256-bit vector_shuffle legalization for LASX #160254

[LoongArch] Refine 256-bit vector_shuffle legalization for LASX #160254

Uh oh!

Conversation

zhaoqi5 commented Sep 23, 2025

Uh oh!

llvmbot commented Sep 23, 2025

Uh oh!

Uh oh!

heiher left a comment

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Sep 26, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

llvm-ci commented Sep 26, 2025

Uh oh!

Uh oh!

github-actions bot commented Sep 26, 2025 •

edited

Loading