[LoongArch] Custom legalize vector_shuffle whose elements come from halves or quarters #165670
Open
zhaoqi5 wants to merge 1 commit into users/zhaoqi5/tests-for-shuffle-halves-quarters from users/zhaoqi5/opt-shuffle-halves-quarters
+165
−61
Conversation
@llvm/pr-subscribers-backend-loongarch
Author: ZhaoQi (zhaoqi5)
Changes
Full diff: https://github.com/llvm/llvm-project/pull/165670.diff
3 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 80c96c6dc8eb6..8564fb1fe5560 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1738,6 +1738,105 @@ lowerVECTOR_SHUFFLE_IsReverse(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
DAG.getConstant(27, DL, Subtarget.getGRLenVT()));
}
+/// Lower VECTOR_SHUFFLE whose result elements are all undef except for the
+/// first two or four elements, which come from the half or quarter parts of
+/// the source vector.
+///
+/// It is possible to optimize a VECTOR_SHUFFLE whose mask looks like:
+/// <i, i+n/2, -1, ...>
+/// where n is the number of elements in the vector and i is in [0, n/2). Or:
+/// <i, i+4, i+8, i+12, -1, ...> (Only v16i8, and the first four can be undef)
+/// where i is in [0, 4).
+///
+/// For example: <0, 4, -1, ...> or <0, 4, 8, 12, -1, ...>, which appear when
+/// legalizing ISD::TRUNCATE in ReplaceNodeResults().
+static SDValue
+lowerVECTOR_SHUFFLE_HalvesOrQuarters(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
+ if (VT != MVT::v16i8 && VT != MVT::v8i16)
+ return SDValue();
+
+ int HalfSize = Mask.size() / 2;
+ int QuarterSize = Mask.size() / 4;
+ MVT GRLenVT = Subtarget.getGRLenVT();
+
+ auto allUndefFrom = [&](unsigned idx) -> bool {
+ return llvm::all_of(Mask.drop_front(idx), [](int M) { return M == -1; });
+ };
+
+ auto buildShuffled = [&](MVT CastVT, ArrayRef<int> ShuffleMask) {
+ SDValue Cast = DAG.getBitcast(CastVT, V1);
+ SDValue Shuf = DAG.getVectorShuffle(CastVT, DL, Cast, Cast, ShuffleMask);
+ return DAG.getBitcast(VT, Shuf);
+ };
+
+ // Check pattern: <i, i+HalfSize, -1, ...>
+ int M0 = Mask[0], M1 = Mask[1];
+ if (M0 >= 0 && M0 < HalfSize && M1 == M0 + HalfSize && allUndefFrom(2)) {
+ SDValue SrcVec = V1;
+ // Pre-shuffle the vector so that the needed elements end up at the front.
+ if (M0 >= QuarterSize && M0 < QuarterSize + 2)
+ SrcVec = buildShuffled(MVT::v4i32, {1, 0, 3, 2});
+ else if (M0 >= 2 && M0 < 4) // Only v16i8 meets this.
+ SrcVec = buildShuffled(MVT::v8i16, {1, 0, 3, 2, 5, 4, 7, 6});
+ else if (M0 >= 6 && M0 < 8) // Only v16i8 meets this.
+ SrcVec = buildShuffled(MVT::v8i16, {3, 2, 1, 0, 7, 6, 5, 4});
+
+ // Broadcast the needed high part elements.
+ SDValue VecHi = DAG.getNode(LoongArchISD::VREPLVEI, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, SrcVec),
+ DAG.getConstant(2, DL, GRLenVT));
+
+ unsigned Opc = (M0 % 2) ? LoongArchISD::VPACKOD : LoongArchISD::VPACKEV;
+ return DAG.getNode(Opc, DL, VT, DAG.getBitcast(VT, VecHi), SrcVec);
+ }
+
+ // Only consider quarter cases for v16i8.
+ if (VT != MVT::v16i8)
+ return SDValue();
+
+ // Check pattern: <i, i+4, i+8, i+12, -1, ...>
+ // Still succeeds even if the first four elements have undef.
+ bool FromQuarters = false;
+ int First = -1;
+ for (int i = 0; i < QuarterSize && !FromQuarters; ++i) {
+ FromQuarters = llvm::all_of(llvm::seq<int>(0, 4), [&](int j) {
+ return Mask[j] == -1 || Mask[j] == i + j * 4;
+ });
+ if (FromQuarters)
+ First = i;
+ }
+
+ if (FromQuarters && allUndefFrom(4)) {
+ SmallVector<int, 8> ShufMask =
+ (First < 2) ? SmallVector<int, 8>{0, 2, 1, 3, 4, 6, 5, 7}
+ : SmallVector<int, 8>{1, 3, 0, 2, 5, 7, 4, 6};
+ SmallVector<int, 16> ExtractMask =
+ (First % 2) ? SmallVector<int, 16>{1, 3, 0, 2, 5, 7, 4, 6,
+ 9, 11, 8, 10, 13, 15, 12, 14}
+ : SmallVector<int, 16>{0, 2, 1, 3, 4, 6, 5, 7,
+ 8, 10, 9, 11, 12, 14, 13, 15};
+
+ // Pre-shuffle the vector so that the needed elements end up at the front.
+ MVT ShufVT = MVT::v8i16;
+ SDValue SrcVec = buildShuffled(ShufVT, ShufMask);
+ SDValue Extract = DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, ExtractMask);
+
+ // Broadcast the needed high part elements.
+ SDValue VecHi = DAG.getNode(LoongArchISD::VREPLVEI, DL, ShufVT,
+ DAG.getBitcast(ShufVT, Extract),
+ DAG.getConstant(4, DL, GRLenVT));
+
+ unsigned Opc = (First % 2) ? LoongArchISD::VPACKOD : LoongArchISD::VPACKEV;
+ SDValue Result =
+ DAG.getNode(Opc, DL, ShufVT, VecHi, DAG.getBitcast(ShufVT, Extract));
+ return DAG.getBitcast(VT, Result);
+ }
+
+ return SDValue();
+}
+
/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
///
/// VPACKEV interleaves the even elements from each vector.
@@ -2044,6 +2143,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
if ((Result =
lowerVECTOR_SHUFFLE_IsReverse(DL, Mask, VT, V1, DAG, Subtarget)))
return Result;
+ if ((Result = lowerVECTOR_SHUFFLE_HalvesOrQuarters(DL, Mask, VT, V1, DAG,
+ Subtarget)))
+ return Result;
// TODO: This comment may be enabled in the future to better match the
// pattern for instruction selection.
diff --git a/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
index 2a0a107a2b76e..946a4e5524bc0 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-halves-quarters.ll
@@ -6,9 +6,8 @@ define void @shufflevector_halves_b(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_halves_b:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT: vpackev.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -23,9 +22,9 @@ define void @shufflevector_halves_b_1(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_halves_b_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI1_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 177
+; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT: vpackod.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -40,9 +39,9 @@ define void @shufflevector_halves_b_2(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_halves_b_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI2_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI2_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 177
+; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT: vpackod.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -57,9 +56,9 @@ define void @shufflevector_halves_b_3(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_halves_b_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 27
+; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT: vpackev.b $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -74,10 +73,9 @@ define void @shufflevector_halves_h(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_halves_h:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vst $vr1, $a0, 0
+; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT: vpackev.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%va = load <8 x i16>, ptr %a
@@ -91,10 +89,10 @@ define void @shufflevector_halves_h_1(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_halves_h_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI5_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI5_0)
-; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0
-; CHECK-NEXT: vst $vr1, $a0, 0
+; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 177
+; CHECK-NEXT: vreplvei.w $vr1, $vr0, 2
+; CHECK-NEXT: vpackod.h $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%va = load <8 x i16>, ptr %a
@@ -108,9 +106,10 @@ define void @shufflevector_quarters_b(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_quarters_b:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI6_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI6_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 216
+; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 216
+; CHECK-NEXT: vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT: vpackev.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -125,9 +124,10 @@ define void @shufflevector_quarters_b_1(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_quarters_b_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI7_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI7_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 216
+; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 141
+; CHECK-NEXT: vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT: vpackod.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -142,9 +142,10 @@ define void @shufflevector_quarters_b_2(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_quarters_b_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI8_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI8_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 141
+; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 216
+; CHECK-NEXT: vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT: vpackev.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -159,9 +160,10 @@ define void @shufflevector_quarters_b_3(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-LABEL: shufflevector_quarters_b_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0)
-; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI9_0)
-; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 141
+; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 141
+; CHECK-NEXT: vreplvei.h $vr1, $vr0, 4
+; CHECK-NEXT: vpackod.h $vr0, $vr1, $vr0
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
index 314350acd23d6..9b9016b4e5972 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll
@@ -28,20 +28,18 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind {
; LA32-LABEL: load_trunc_2i64_to_2i16:
; LA32: # %bb.0:
; LA32-NEXT: vld $vr0, $a0, 0
-; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; LA32-NEXT: vshuf.h $vr1, $vr0, $vr0
-; LA32-NEXT: vpickve2gr.w $a0, $vr1, 0
+; LA32-NEXT: vreplvei.w $vr1, $vr0, 2
+; LA32-NEXT: vpackev.h $vr0, $vr1, $vr0
+; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0
; LA32-NEXT: st.w $a0, $a1, 0
; LA32-NEXT: ret
;
; LA64-LABEL: load_trunc_2i64_to_2i16:
; LA64: # %bb.0:
; LA64-NEXT: vld $vr0, $a0, 0
-; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
-; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0)
-; LA64-NEXT: vshuf.h $vr1, $vr0, $vr0
-; LA64-NEXT: vstelm.w $vr1, $a1, 0, 0
+; LA64-NEXT: vreplvei.w $vr1, $vr0, 2
+; LA64-NEXT: vpackev.h $vr0, $vr1, $vr0
+; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0
; LA64-NEXT: ret
%a = load <2 x i64>, ptr %ptr
%trunc = trunc <2 x i64> %a to <2 x i16>
@@ -53,18 +51,16 @@ define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind {
; LA32-LABEL: load_trunc_2i64_to_2i8:
; LA32: # %bb.0:
; LA32-NEXT: vld $vr0, $a0, 0
-; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA32-NEXT: vreplvei.w $vr1, $vr0, 2
+; LA32-NEXT: vpackev.b $vr0, $vr1, $vr0
; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0
; LA32-NEXT: ret
;
; LA64-LABEL: load_trunc_2i64_to_2i8:
; LA64: # %bb.0:
; LA64-NEXT: vld $vr0, $a0, 0
-; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
-; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0)
-; LA64-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA64-NEXT: vreplvei.w $vr1, $vr0, 2
+; LA64-NEXT: vpackev.b $vr0, $vr1, $vr0
; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0
; LA64-NEXT: ret
%a = load <2 x i64>, ptr %ptr
@@ -100,9 +96,10 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind {
; LA32-LABEL: load_trunc_4i32_to_4i8:
; LA32: # %bb.0:
; LA32-NEXT: vld $vr0, $a0, 0
-; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA32-NEXT: vshuf4i.h $vr0, $vr0, 216
+; LA32-NEXT: vshuf4i.b $vr0, $vr0, 216
+; LA32-NEXT: vreplvei.h $vr1, $vr0, 4
+; LA32-NEXT: vpackev.h $vr0, $vr1, $vr0
; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0
; LA32-NEXT: st.w $a0, $a1, 0
; LA32-NEXT: ret
@@ -110,9 +107,10 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind {
; LA64-LABEL: load_trunc_4i32_to_4i8:
; LA64: # %bb.0:
; LA64-NEXT: vld $vr0, $a0, 0
-; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
-; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0)
-; LA64-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1
+; LA64-NEXT: vshuf4i.h $vr0, $vr0, 216
+; LA64-NEXT: vshuf4i.b $vr0, $vr0, 216
+; LA64-NEXT: vreplvei.h $vr1, $vr0, 4
+; LA64-NEXT: vpackev.h $vr0, $vr1, $vr0
; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0
; LA64-NEXT: ret
%a = load <4 x i32>, ptr %ptr
@@ -174,21 +172,23 @@ define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind {
; LA32: # %bb.0:
; LA32-NEXT: ld.w $a2, $a0, 0
; LA32-NEXT: ld.w $a0, $a0, 4
-; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI7_0)
-; LA32-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI7_0)
-; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0
-; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1
-; LA32-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0
+; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1
+; LA32-NEXT: vshuf4i.h $vr0, $vr0, 216
+; LA32-NEXT: vshuf4i.b $vr0, $vr0, 216
+; LA32-NEXT: vreplvei.h $vr1, $vr0, 4
+; LA32-NEXT: vpackev.h $vr0, $vr1, $vr0
; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0
; LA32-NEXT: ret
;
; LA64-LABEL: load_trunc_2i32_to_2i8:
; LA64: # %bb.0:
; LA64-NEXT: ld.d $a0, $a0, 0
-; LA64-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0)
-; LA64-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI7_0)
-; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0
-; LA64-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0
+; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT: vshuf4i.h $vr0, $vr0, 216
+; LA64-NEXT: vshuf4i.b $vr0, $vr0, 216
+; LA64-NEXT: vreplvei.h $vr1, $vr0, 4
+; LA64-NEXT: vpackev.h $vr0, $vr1, $vr0
; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0
; LA64-NEXT: ret
%a = load <2 x i32>, ptr %ptr
Note: This may be worse when the same mask is reused by several shuffle operations (i.e. pcalau12i+vld+vshuf+vshuf+vshuf+...): the constant-pool load is paid once and each additional shuffle then costs only a single vshuf, whereas this lowering emits its own two-to-four instruction sequence for every shuffle. How should we trade this off?