
Conversation

@zhaoqi5 (Contributor) commented Oct 30, 2025

Note: This may be worse if the same mask is reused by several shuffle operations (i.e. pcalau12i+vld+vshuf+vshuf+vshuf+..., where the mask load is amortized across the shuffles). How should we trade this off?
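For context, a minimal IR sketch of the mask shapes this patch targets; the function names and concrete index choices below are illustrative only and are not taken from the test files. With the previous lowering these shuffles loaded a mask from the constant pool (pcalau12i+vld+vshuf), while the new lowering emits vreplvei plus vpackev/vpackod.

; Halves pattern on <8 x i16>: mask <i, i+4, -1, ...> with i in [0, 4).
define <8 x i16> @halves_example(<8 x i16> %v) {
  %r = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 0, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %r
}

; Quarters pattern on <16 x i8>: mask <i, i+4, i+8, i+12, -1, ...> with i in [0, 4).
define <16 x i8> @quarters_example(<16 x i8> %v) {
  %r = shufflevector <16 x i8> %v, <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i8> %r
}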

@llvmbot (Member) commented Oct 30, 2025

@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/165670.diff

3 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+102)
  • (modified) llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll (+34-32)
  • (modified) llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll (+29-29)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 80c96c6dc8eb6..8564fb1fe5560 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1738,6 +1738,105 @@ lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
                      DAG.getConstant(27, DL, Subtarget.getGRLenVT()));
 }
 
+/// Lower VECTOR_SHUFFLE whose result elements are all undef except for the
+/// first two or four elements, which are taken from the halves or quarters of
+/// the source vector.
+///
+/// It is possible to optimize a VECTOR_SHUFFLE whose mask looks like:
+///   <i, i+n/2, -1, ...>
+/// where n is the number of elements in the vector and i is in [0, n/2). Or:
+///   <i, i+4, i+8, i+12, -1, ...> (v16i8 only; any of the first four may be undef)
+/// where i is in [0, 4).
+///
+/// For example: <0, 4, -1, ...> or <0, 4, 8, 12, -1, ...>, which appear when
+/// legalizing ISD::TRUNCATE in ReplaceNodeResults().
+static SDValue
+lowerVECTOR_SHUFFLE_HalvesOrQuarters(const SDLoc &DL, ArrayRef<int> Mask,
+                                     MVT VT, SDValue V1, SelectionDAG &DAG,
+                                     const LoongArchSubtarget &Subtarget) {
+  if (VT != MVT::v16i8 && VT != MVT::v8i16)
+    return SDValue();
+
+  int HalfSize = Mask.size() / 2;
+  int QuarterSize = Mask.size() / 4;
+  MVT GRLenVT = Subtarget.getGRLenVT();
+
+  auto allUndefFrom = [&](unsigned Idx) -> bool {
+    return llvm::all_of(Mask.drop_front(Idx), [](int M) { return M == -1; });
+  };
+
+  auto buildShuffled = [&](MVT CastVT, ArrayRef<int> ShuffleMask) {
+    SDValue Cast = DAG.getBitcast(CastVT, V1);
+    SDValue Shuf = DAG.getVectorShuffle(CastVT, DL, Cast, Cast, ShuffleMask);
+    return DAG.getBitcast(VT, Shuf);
+  };
+
+  // Check pattern: <i, i+HalfSize, -1, ...>
+  int M0 = Mask[0], M1 = Mask[1];
+  if (M0 >= 0 && M0 < HalfSize && M1 == M0 + HalfSize && allUndefFrom(2)) {
+    SDValue SrcVec = V1;
+    // Pre-shuffle the source vector for these masks so that the needed
+    // elements end up at the front.
+    if (M0 >= QuarterSize && M0 < QuarterSize + 2)
+      SrcVec = buildShuffled(MVT::v4i32, {1, 0, 3, 2});
+    else if (M0 >= 2 && M0 < 4) // Only v16i8 meets this.
+      SrcVec = buildShuffled(MVT::v8i16, {1, 0, 3, 2, 5, 4, 7, 6});
+    else if (M0 >= 6 && M0 < 8) // Only v16i8 meets this.
+      SrcVec = buildShuffled(MVT::v8i16, {3, 2, 1, 0, 7, 6, 5, 4});
+
+    // Broadcast the needed high part elements.
+    SDValue VecHi = DAG.getNode(LoongArchISD::VREPLVEI, DL, MVT::v4i32,
+                                DAG.getBitcast(MVT::v4i32, SrcVec),
+                                DAG.getConstant(2, DL, GRLenVT));
+
+    unsigned Opc = (M0 % 2) ? LoongArchISD::VPACKOD : LoongArchISD::VPACKEV;
+    return DAG.getNode(Opc, DL, VT, DAG.getBitcast(VT, VecHi), SrcVec);
+  }
+
+  // Only consider quarter cases for v16i8.
+  if (VT != MVT::v16i8)
+    return SDValue();
+
+  // Check pattern: <i, i+4, i+8, i+12, -1, ...>
+  // This still matches even if some of the first four elements are undef.
+  bool FromQuarters = false;
+  int First = -1;
+  for (int i = 0; i < QuarterSize && !FromQuarters; ++i) {
+    FromQuarters = llvm::all_of(llvm::seq<int>(0, 4), [&](int j) {
+      return Mask[j] == -1 || Mask[j] == i + j * 4;
+    });
+    if (FromQuarters)
+      First = i;
+  }
+
+  if (FromQuarters && allUndefFrom(4)) {
+    SmallVector<int, 8> ShufMask =
+        (First < 2) ? SmallVector<int, 8>{0, 2, 1, 3, 4, 6, 5, 7}
+                    : SmallVector<int, 8>{1, 3, 0, 2, 5, 7, 4, 6};
+    SmallVector<int, 16> ExtractMask =
+        (First % 2) ? SmallVector<int, 16>{1, 3,  0, 2,  5,  7,  4,  6,
+                                           9, 11, 8, 10, 13, 15, 12, 14}
+                    : SmallVector<int, 16>{0, 2,  1, 3,  4,  6,  5,  7,
+                                           8, 10, 9, 11, 12, 14, 13, 15};
+
+    // Pre-shuffle the source vector for these masks so that the needed
+    // elements end up at the front.
+    MVT ShufVT = MVT::v8i16;
+    SDValue SrcVec = buildShuffled(ShufVT, ShufMask);
+    SDValue Extract = DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, ExtractMask);
+
+    // Broadcast the needed high part elements.
+    SDValue VecHi = DAG.getNode(LoongArchISD::VREPLVEI, DL, ShufVT,
+                                DAG.getBitcast(ShufVT, Extract),
+                                DAG.getConstant(4, DL, GRLenVT));
+
+    unsigned Opc = (First % 2) ? LoongArchISD::VPACKOD : LoongArchISD::VPACKEV;
+    SDValue Result =
+        DAG.getNode(Opc, DL, ShufVT, VecHi, DAG.getBitcast(ShufVT, Extract));
+    return DAG.getBitcast(VT, Result);
+  }
+
+  return SDValue();
+}
+
 /// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
 ///
 /// VPACKEV interleaves the even elements from each vector.
@@ -2044,6 +2143,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
     if ((Result =
              lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
       return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_HalvesOrQuarters(DL, Mask, VT, V1, DAG,
+                                                       Subtarget)))
+      return Result;
 
     // TODO: This comment may be enabled in the future to better match the
     // pattern for instruction selection.
diff --git a/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
index 2a0a107a2b76e..946a4e5524bc0 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
@@ -6,9 +6,8 @@ define void @shufflevector_halves_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -23,9 +22,9 @@ define void @shufflevector_halves_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b_1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 177
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -40,9 +39,9 @@ define void @shufflevector_halves_b_2(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 177
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -57,9 +56,9 @@ define void @shufflevector_halves_b_3(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_b_3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 27
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -74,10 +73,9 @@ define void @shufflevector_halves_h(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_h:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vst $vr1, $a0, 0
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i16>, ptr %a
@@ -91,10 +89,10 @@ define void @shufflevector_halves_h_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_halves_h_1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT:    vst $vr1, $a0, 0
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 177
+; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %va = load <8 x i16>, ptr %a
@@ -108,9 +106,10 @@ define void @shufflevector_quarters_b(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -125,9 +124,10 @@ define void @shufflevector_quarters_b_1(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b_1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 141
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -142,9 +142,10 @@ define void @shufflevector_quarters_b_2(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 141
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -159,9 +160,10 @@ define void @shufflevector_quarters_b_3(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-LABEL: shufflevector_quarters_b_3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT:    vld $vr1, $a1, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 141
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 141
+; CHECK-NEXT:    vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
index 314350acd23d6..9b9016b4e5972 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
@@ -28,20 +28,18 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind {
 ; LA32-LABEL: load_trunc_2i64_to_2i16:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    vld $vr0, $a0, 0
-; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA32-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; LA32-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; LA32-NEXT:    vpickve2gr.w $a0, $vr1, 0
+; LA32-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA32-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; LA32-NEXT:    vpickve2gr.w $a0, $vr0, 0
 ; LA32-NEXT:    st.w $a0, $a1, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: load_trunc_2i64_to_2i16:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a0, 0
-; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA64-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; LA64-NEXT:    vshuf.h $vr1, $vr0, $vr0
-; LA64-NEXT:    vstelm.w $vr1, $a1, 0, 0
+; LA64-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA64-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; LA64-NEXT:    vstelm.w $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
   %a = load <2 x i64>, ptr %ptr
   %trunc = trunc <2 x i64> %a to <2 x i16>
@@ -53,18 +51,16 @@ define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind {
 ; LA32-LABEL: load_trunc_2i64_to_2i8:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    vld $vr0, $a0, 0
-; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; LA32-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; LA32-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA32-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA32-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; LA32-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: load_trunc_2i64_to_2i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a0, 0
-; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; LA64-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; LA64-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA64-NEXT:    vreplvei.w $vr1, $vr0, 2
+; LA64-NEXT:    vpackev.b $vr0, $vr1, $vr0
 ; LA64-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
   %a = load <2 x i64>, ptr %ptr
@@ -100,9 +96,10 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind {
 ; LA32-LABEL: load_trunc_4i32_to_4i8:
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    vld $vr0, $a0, 0
-; LA32-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; LA32-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; LA32-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA32-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA32-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA32-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA32-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA32-NEXT:    vpickve2gr.w $a0, $vr0, 0
 ; LA32-NEXT:    st.w $a0, $a1, 0
 ; LA32-NEXT:    ret
@@ -110,9 +107,10 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind {
 ; LA64-LABEL: load_trunc_4i32_to_4i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    vld $vr0, $a0, 0
-; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; LA64-NEXT:    vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; LA64-NEXT:    vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA64-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA64-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA64-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA64-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA64-NEXT:    vstelm.w $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
   %a = load <4 x i32>, ptr %ptr
@@ -174,21 +172,23 @@ define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind {
 ; LA32:       # %bb.0:
 ; LA32-NEXT:    ld.w $a2, $a0, 0
 ; LA32-NEXT:    ld.w $a0, $a0, 4
-; LA32-NEXT:    pcalau12i $a3, %pc_hi20(.LCPI7_0)
-; LA32-NEXT:    vld $vr0, $a3, %pc_lo12(.LCPI7_0)
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a2, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a0, 1
-; LA32-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a2, 0
+; LA32-NEXT:    vinsgr2vr.w $vr0, $a0, 1
+; LA32-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA32-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA32-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA32-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA32-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: load_trunc_2i32_to_2i8:
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    ld.d $a0, $a0, 0
-; LA64-NEXT:    pcalau12i $a2, %pc_hi20(.LCPI7_0)
-; LA64-NEXT:    vld $vr0, $a2, %pc_lo12(.LCPI7_0)
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a0, 0
-; LA64-NEXT:    vshuf.b $vr0, $vr0, $vr1, $vr0
+; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT:    vshuf4i.h $vr0, $vr0, 216
+; LA64-NEXT:    vshuf4i.b $vr0, $vr0, 216
+; LA64-NEXT:    vreplvei.h $vr1, $vr0, 4
+; LA64-NEXT:    vpackev.h $vr0, $vr1, $vr0
 ; LA64-NEXT:    vstelm.h $vr0, $a1, 0, 0
 ; LA64-NEXT:    ret
   %a = load <2 x i32>, ptr %ptr
