From e2ad2a1bef796543c050a922d65ed486aa0ca79e Mon Sep 17 00:00:00 2001
From: Qi Zhao
Date: Sat, 18 Oct 2025 17:24:34 +0800
Subject: [PATCH] [LoongArch] Custom legalize vector_shuffle to `vextrins`

TODO: LASX support will be added in a later commit.
---
 .../LoongArch/LoongArchISelLowering.cpp       | 88 +++++++++++++++++++
 .../lsx/ir-instruction/shuffle-as-vextrins.ll | 48 +++++-----
 llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll   | 51 +++++------
 .../LoongArch/lsx/vec-shuffle-any-ext.ll      | 39 ++++----
 llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll  | 34 +++----
 llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll   | 14 +--
 6 files changed, 169 insertions(+), 105 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7deeafc9ccfc..a6818ab1378c4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -1948,6 +1948,91 @@ static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
   return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
 }
 
+/// Lower VECTOR_SHUFFLE into VEXTRINS (if possible).
+///
+/// VEXTRINS copies one element of a vector into any place of the result
+/// vector and leaves the rest of the elements of the result vector unchanged.
+///
+/// It is possible to lower into VEXTRINS when the mask takes the form:
+///   <0, 1, 2, ..., n+i, ..., n-1> or <n, n+1, ..., i, ..., 2n-1> or
+///   <0, 1, 2, ..., i, ..., n-1> or <n, n+1, ..., n+i, ..., 2n-1>
+/// where n is the number of elements in the vector and i is in [0, n).
+/// For example:
+///   <0, 1, 2, 3, 4, 5, 6, 8>, <2, 9, 10, 11, 12, 13, 14, 15>,
+///   <0, 1, 2, 6, 4, 5, 6, 7>, <8, 9, 10, 11, 12, 9, 14, 15>
+///
+/// When undefs appear in the mask, they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue
+lowerVECTOR_SHUFFLE_VEXTRINS(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                             SDValue V1, SDValue V2, SelectionDAG &DAG,
+                             const LoongArchSubtarget &Subtarget) {
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT EltVT = VT.getVectorElementType();
+  MVT GRLenVT = Subtarget.getGRLenVT();
+
+  if (Mask.size() != NumElts)
+    return SDValue();
+
+  auto tryLowerToExtrAndIns = [&](unsigned Base) -> SDValue {
+    int DiffCount = 0;
+    int DiffPos = -1;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      if (Mask[i] == -1)
+        continue;
+      if (Mask[i] != int(Base + i)) {
+        ++DiffCount;
+        DiffPos = int(i);
+        if (DiffCount > 1)
+          return SDValue();
+      }
+    }
+
+    // Need exactly one differing element to lower into VEXTRINS.
+    if (DiffCount != 1)
+      return SDValue();
+
+    // DiffMask must be in [0, 2N).
+    int DiffMask = Mask[DiffPos];
+    if (DiffMask < 0 || DiffMask >= int(2 * NumElts))
+      return SDValue();
+
+    // Determine source vector and source index.
+    SDValue SrcVec;
+    unsigned SrcIdx;
+    if (unsigned(DiffMask) < NumElts) {
+      SrcVec = V1;
+      SrcIdx = unsigned(DiffMask);
+    } else {
+      SrcVec = V2;
+      SrcIdx = unsigned(DiffMask) - NumElts;
+    }
+
+    // Replace with EXTRACT_VECTOR_ELT + INSERT_VECTOR_ELT; this will match
+    // the VEXTRINS patterns in tablegen.
+    bool IsEltFP = EltVT.isFloatingPoint();
+    SDValue Extracted =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IsEltFP ? EltVT : GRLenVT,
+                    SrcVec, DAG.getConstant(SrcIdx, DL, GRLenVT));
+
+    SDValue InsertVal = Extracted;
+    if (!IsEltFP && EltVT != GRLenVT)
+      InsertVal = DAG.getNode(ISD::ANY_EXTEND, DL, GRLenVT,
+                              DAG.getNode(ISD::TRUNCATE, DL, EltVT, Extracted));
+
+    SDValue Result =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, (Base == 0) ?
V1 : V2, + InsertVal, DAG.getConstant(DiffPos, DL, GRLenVT)); + + return Result; + }; + + // Try [0, n-1) insertion then [n, 2n-1) insertion. + if (SDValue Result = tryLowerToExtrAndIns(0)) + return Result; + return tryLowerToExtrAndIns(NumElts); +} + /// Lower VECTOR_SHUFFLE into VSHUF. /// /// This mostly consists of converting the shuffle mask into a BUILD_VECTOR and @@ -2028,6 +2113,9 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, (Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; + if ((Result = + lowerVECTOR_SHUFFLE_VEXTRINS(DL, Mask, VT, V1, V2, DAG, Subtarget))) + return Result; if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG, Zeroable))) return Result; diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vextrins.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vextrins.ll index a504067772e81..0715c7911345c 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vextrins.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vextrins.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 ;; vextrins.b define void @shufflevector_v16i8(ptr %res, ptr %a, ptr %b) nounwind { @@ -8,9 +8,7 @@ define void @shufflevector_v16i8(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0) -; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI0_0) -; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: vextrins.b $vr0, $vr1, 240 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -26,10 +24,8 @@ define void @shufflevector_v8i16(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK-LABEL: shufflevector_v8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI1_0) -; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0 -; CHECK-NEXT: vst $vr1, $a0, 0 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 53 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <8 x i16>, ptr %a @@ -41,15 +37,21 @@ entry: ;; vextrins.w define void @shufflevector_v4i32(ptr %res, ptr %a, ptr %b) nounwind { -; CHECK-LABEL: shufflevector_v4i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI2_0) -; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI2_0) -; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0 -; CHECK-NEXT: vst $vr2, $a0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: shufflevector_v4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a2, 0 +; LA32-NEXT: ld.w $a1, $a1, 12 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 0 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: shufflevector_v4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vld $vr1, $a2, 0 +; LA64-NEXT: vextrins.w $vr1, $vr0, 3 +; LA64-NEXT: vst $vr1, $a0, 0 +; LA64-NEXT: ret entry: %va = load <4 x i32>, ptr %a %vb = load <4 x i32>, ptr %b @@ -62,12 +64,10 @@ entry: define void @shufflevector_v4f32(ptr 
%res, ptr %a, ptr %b) nounwind { ; CHECK-LABEL: shufflevector_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI3_0) -; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0 -; CHECK-NEXT: vst $vr2, $a0, 0 +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: fld.s $fa1, $a1, 8 +; CHECK-NEXT: vextrins.w $vr0, $vr1, 48 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <4 x float>, ptr %a diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll index dce6dc9f2aa37..9fa0b3838967b 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll @@ -7,9 +7,7 @@ define void @load_sext_2i8_to_2i64(ptr %ptr, ptr %dst) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: ld.h $a0, $a0, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 129 ; CHECK-NEXT: vslli.d $vr0, $vr0, 56 ; CHECK-NEXT: vsrai.d $vr0, $vr0, 56 ; CHECK-NEXT: vst $vr0, $a1, 0 @@ -73,8 +71,7 @@ define void @load_sext_2i16_to_2i64(ptr %ptr, ptr %dst) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: ld.w $a0, $a0, 0 ; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 65 ; CHECK-NEXT: vslli.d $vr0, $vr0, 48 ; CHECK-NEXT: vsrai.d $vr0, $vr0, 48 ; CHECK-NEXT: vst $vr0, $a1, 0 @@ -199,42 +196,42 @@ define void @load_sext_16i8_to_16i64(ptr %ptr, ptr %dst) { ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0 ; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1 -; CHECK-NEXT: vilvl.w $vr3, $vr2, $vr2 -; CHECK-NEXT: vslli.d $vr3, $vr3, 56 -; CHECK-NEXT: vsrai.d $vr3, $vr3, 56 ; CHECK-NEXT: vilvh.w $vr2, $vr2, $vr2 ; CHECK-NEXT: vslli.d $vr2, $vr2, 56 ; CHECK-NEXT: vsrai.d $vr2, $vr2, 56 ; CHECK-NEXT: vilvh.h $vr1, $vr1, $vr1 -; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr1 -; CHECK-NEXT: vslli.d $vr4, $vr4, 56 -; CHECK-NEXT: vsrai.d $vr4, $vr4, 56 +; CHECK-NEXT: vilvl.w $vr3, $vr1, $vr1 +; CHECK-NEXT: vslli.d $vr3, $vr3, 56 +; CHECK-NEXT: vsrai.d $vr3, $vr3, 56 ; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1 ; CHECK-NEXT: vslli.d $vr1, $vr1, 56 ; CHECK-NEXT: vsrai.d $vr1, $vr1, 56 -; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.h $vr5, $vr0, $vr0 +; CHECK-NEXT: vilvh.b $vr4, $vr0, $vr0 +; CHECK-NEXT: vilvl.h $vr5, $vr4, $vr4 ; CHECK-NEXT: vilvl.w $vr6, $vr5, $vr5 ; CHECK-NEXT: vslli.d $vr6, $vr6, 56 ; CHECK-NEXT: vsrai.d $vr6, $vr6, 56 ; CHECK-NEXT: vilvh.w $vr5, $vr5, $vr5 ; CHECK-NEXT: vslli.d $vr5, $vr5, 56 ; CHECK-NEXT: vsrai.d $vr5, $vr5, 56 -; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr7, $vr0, $vr0 +; CHECK-NEXT: vilvh.h $vr4, $vr4, $vr4 +; CHECK-NEXT: vilvl.w $vr7, $vr4, $vr4 ; CHECK-NEXT: vslli.d $vr7, $vr7, 56 ; CHECK-NEXT: vsrai.d $vr7, $vr7, 56 -; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vilvh.w $vr4, $vr4, $vr4 +; CHECK-NEXT: vslli.d $vr4, $vr4, 56 +; CHECK-NEXT: vsrai.d $vr4, $vr4, 56 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 129 ; CHECK-NEXT: vslli.d $vr0, $vr0, 56 ; CHECK-NEXT: vsrai.d $vr0, $vr0, 56 -; CHECK-NEXT: vst $vr0, $a1, 112 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: vst $vr4, $a1, 112 ; CHECK-NEXT: vst $vr7, $a1, 96 ; CHECK-NEXT: vst $vr5, $a1, 80 ; CHECK-NEXT: vst $vr6, $a1, 64 ; CHECK-NEXT: vst $vr1, $a1, 48 -; CHECK-NEXT: vst $vr4, 
$a1, 32 +; CHECK-NEXT: vst $vr3, $a1, 32 ; CHECK-NEXT: vst $vr2, $a1, 16 -; CHECK-NEXT: vst $vr3, $a1, 0 ; CHECK-NEXT: ret entry: %A = load <16 x i8>, ptr %ptr @@ -268,23 +265,23 @@ define void @load_sext_8i16_to_8i64(ptr %ptr, ptr %dst) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr1 -; CHECK-NEXT: vslli.d $vr2, $vr2, 48 -; CHECK-NEXT: vsrai.d $vr2, $vr2, 48 ; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1 ; CHECK-NEXT: vslli.d $vr1, $vr1, 48 ; CHECK-NEXT: vsrai.d $vr1, $vr1, 48 -; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr3, $vr0, $vr0 +; CHECK-NEXT: vilvh.h $vr2, $vr0, $vr0 +; CHECK-NEXT: vilvl.w $vr3, $vr2, $vr2 ; CHECK-NEXT: vslli.d $vr3, $vr3, 48 ; CHECK-NEXT: vsrai.d $vr3, $vr3, 48 -; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vilvh.w $vr2, $vr2, $vr2 +; CHECK-NEXT: vslli.d $vr2, $vr2, 48 +; CHECK-NEXT: vsrai.d $vr2, $vr2, 48 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 65 ; CHECK-NEXT: vslli.d $vr0, $vr0, 48 ; CHECK-NEXT: vsrai.d $vr0, $vr0, 48 -; CHECK-NEXT: vst $vr0, $a1, 48 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: vst $vr2, $a1, 48 ; CHECK-NEXT: vst $vr3, $a1, 32 ; CHECK-NEXT: vst $vr1, $a1, 16 -; CHECK-NEXT: vst $vr2, $a1, 0 ; CHECK-NEXT: ret entry: %A = load <8 x i16>, ptr %ptr diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll index bb008ee5eb903..06ab85a84fb28 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll @@ -7,9 +7,7 @@ define void @shuffle_any_ext_2i8_to_2i64(ptr %ptr, ptr %dst) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: ld.h $a0, $a0, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 129 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: ret %x = load <2 x i8>, ptr %ptr @@ -24,8 +22,7 @@ define void @shuffle_any_ext_2i16_to_2i64(ptr %ptr, ptr %dst) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: ld.w $a0, $a0, 0 ; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 65 ; CHECK-NEXT: vst $vr0, $a1, 0 ; CHECK-NEXT: ret %x = load <2 x i16>, ptr %ptr @@ -162,15 +159,15 @@ define void @shuffle_any_ext_8i16_to_8i64(ptr %ptr, ptr %dst) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vilvl.h $vr1, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr2, $vr1, $vr1 ; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1 -; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr3, $vr0, $vr0 -; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 48 +; CHECK-NEXT: vilvh.h $vr2, $vr0, $vr0 +; CHECK-NEXT: vilvl.w $vr3, $vr2, $vr2 +; CHECK-NEXT: vilvh.w $vr2, $vr2, $vr2 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 65 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: vst $vr2, $a1, 48 ; CHECK-NEXT: vst $vr3, $a1, 32 ; CHECK-NEXT: vst $vr1, $a1, 16 -; CHECK-NEXT: vst $vr2, $a1, 0 ; CHECK-NEXT: ret %x = load <8 x i16>, ptr %ptr %y = shufflevector <8 x i16> %x, <8 x i16> poison, <32 x i32> @@ -223,26 +220,26 @@ define void @shuffle_any_ext_16i8_to_16i64(ptr %ptr, ptr %dst) nounwind { ; CHECK-NEXT: vld $vr0, $a0, 0 ; CHECK-NEXT: vilvl.b $vr1, $vr0, $vr0 ; CHECK-NEXT: vilvl.h $vr2, $vr1, $vr1 -; CHECK-NEXT: vilvl.w $vr3, $vr2, $vr2 ; CHECK-NEXT: vilvh.w $vr2, $vr2, $vr2 ; CHECK-NEXT: 
vilvh.h $vr1, $vr1, $vr1 -; CHECK-NEXT: vilvl.w $vr4, $vr1, $vr1 +; CHECK-NEXT: vilvl.w $vr3, $vr1, $vr1 ; CHECK-NEXT: vilvh.w $vr1, $vr1, $vr1 -; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.h $vr5, $vr0, $vr0 +; CHECK-NEXT: vilvh.b $vr4, $vr0, $vr0 +; CHECK-NEXT: vilvl.h $vr5, $vr4, $vr4 ; CHECK-NEXT: vilvl.w $vr6, $vr5, $vr5 ; CHECK-NEXT: vilvh.w $vr5, $vr5, $vr5 -; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr7, $vr0, $vr0 -; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 112 +; CHECK-NEXT: vilvh.h $vr4, $vr4, $vr4 +; CHECK-NEXT: vilvl.w $vr7, $vr4, $vr4 +; CHECK-NEXT: vilvh.w $vr4, $vr4, $vr4 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 129 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: vst $vr4, $a1, 112 ; CHECK-NEXT: vst $vr7, $a1, 96 ; CHECK-NEXT: vst $vr5, $a1, 80 ; CHECK-NEXT: vst $vr6, $a1, 64 ; CHECK-NEXT: vst $vr1, $a1, 48 -; CHECK-NEXT: vst $vr4, $a1, 32 +; CHECK-NEXT: vst $vr3, $a1, 32 ; CHECK-NEXT: vst $vr2, $a1, 16 -; CHECK-NEXT: vst $vr3, $a1, 0 ; CHECK-NEXT: ret %x = load <16 x i8>, ptr %ptr %y = shufflevector <16 x i8> %x, <16 x i8> poison, <128 x i32> diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll index 314350acd23d6..8d94b76b061c7 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll @@ -28,20 +28,16 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind { ; LA32-LABEL: load_trunc_2i64_to_2i16: ; LA32: # %bb.0: ; LA32-NEXT: vld $vr0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) -; LA32-NEXT: vshuf.h $vr1, $vr0, $vr0 -; LA32-NEXT: vpickve2gr.w $a0, $vr1, 0 +; LA32-NEXT: vextrins.h $vr0, $vr0, 20 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 ; LA32-NEXT: st.w $a0, $a1, 0 ; LA32-NEXT: ret ; ; LA64-LABEL: load_trunc_2i64_to_2i16: ; LA64: # %bb.0: ; LA64-NEXT: vld $vr0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) -; LA64-NEXT: vshuf.h $vr1, $vr0, $vr0 -; LA64-NEXT: vstelm.w $vr1, $a1, 0, 0 +; LA64-NEXT: vextrins.h $vr0, $vr0, 20 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 ; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr %trunc = trunc <2 x i64> %a to <2 x i16> @@ -53,18 +49,14 @@ define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind { ; LA32-LABEL: load_trunc_2i64_to_2i8: ; LA32: # %bb.0: ; LA32-NEXT: vld $vr0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA32-NEXT: vextrins.b $vr0, $vr0, 24 ; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 ; LA32-NEXT: ret ; ; LA64-LABEL: load_trunc_2i64_to_2i8: ; LA64: # %bb.0: ; LA64-NEXT: vld $vr0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA64-NEXT: vextrins.b $vr0, $vr0, 24 ; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 ; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr @@ -174,21 +166,17 @@ define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind { ; LA32: # %bb.0: ; LA32-NEXT: ld.w $a2, $a0, 0 ; LA32-NEXT: ld.w $a0, $a0, 4 -; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI7_0) -; LA32-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI7_0) -; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 -; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 -; LA32-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vextrins.b $vr0, $vr0, 20 ; 
LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 ; LA32-NEXT: ret ; ; LA64-LABEL: load_trunc_2i32_to_2i8: ; LA64: # %bb.0: ; LA64-NEXT: ld.d $a0, $a0, 0 -; LA64-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) -; LA64-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI7_0) -; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; LA64-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vextrins.b $vr0, $vr0, 20 ; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 ; LA64-NEXT: ret %a = load <2 x i32>, ptr %ptr diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll index 8bdeebef13dd2..7a098c1e26a43 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll @@ -178,9 +178,7 @@ define i2 @vmsk_sgt_v2i8(<2 x i8> %a, <2 x i8> %b) { ; CHECK-LABEL: vmsk_sgt_v2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vslt.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 129 ; CHECK-NEXT: vslli.d $vr0, $vr0, 56 ; CHECK-NEXT: vmskltz.d $vr0, $vr0 ; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 @@ -194,8 +192,7 @@ define i2 @vmsk_sgt_v2i16(<2 x i16> %a, <2 x i16> %b) { ; CHECK-LABEL: vmsk_sgt_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vslt.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 65 ; CHECK-NEXT: vslli.d $vr0, $vr0, 48 ; CHECK-NEXT: vmskltz.d $vr0, $vr0 ; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 @@ -340,9 +337,7 @@ define i2 @vmsk_sgt_and_sgt_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> ; CHECK-NEXT: vslt.b $vr0, $vr1, $vr0 ; CHECK-NEXT: vslt.b $vr1, $vr3, $vr2 ; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 129 ; CHECK-NEXT: vslli.d $vr0, $vr0, 56 ; CHECK-NEXT: vmskltz.d $vr0, $vr0 ; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 @@ -360,8 +355,7 @@ define i2 @vmsk_sgt_and_sgt_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x ; CHECK-NEXT: vslt.h $vr0, $vr1, $vr0 ; CHECK-NEXT: vslt.h $vr1, $vr3, $vr2 ; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 65 ; CHECK-NEXT: vslli.d $vr0, $vr0, 48 ; CHECK-NEXT: vmskltz.d $vr0, $vr0 ; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
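
As an aside for readers of this patch, below is a minimal standalone sketch of the mask classification that lowerVECTOR_SHUFFLE_VEXTRINS performs: accept a shuffle mask only if it is the identity of V1 or of V2 with exactly one lane replaced, and report which lane is overwritten and where the replacement element comes from. The names matchVextrins and VextrinsMatch are illustrative only and do not appear in the patch.

// Standalone illustration (not part of the patch) of the VEXTRINS mask check.
#include <cstdio>
#include <optional>
#include <vector>

struct VextrinsMatch {
  unsigned DstVec; // 0 = result is V1 with one lane replaced, 1 = V2
  unsigned DstIdx; // lane of the result that is overwritten
  unsigned SrcVec; // 0 = replacement element comes from V1, 1 = from V2
  unsigned SrcIdx; // lane it is copied from
};

// Returns a match if Mask is an identity of V1 (Base = 0) or V2 (Base = N)
// with exactly one differing lane; -1 entries are treated as "don't care".
static std::optional<VextrinsMatch> matchVextrins(const std::vector<int> &Mask) {
  unsigned N = Mask.size();
  for (unsigned Base : {0u, N}) {
    int DiffPos = -1, DiffCount = 0;
    for (unsigned i = 0; i < N; ++i) {
      if (Mask[i] == -1)
        continue;
      if (Mask[i] != int(Base + i)) {
        ++DiffCount;
        DiffPos = int(i);
      }
    }
    if (DiffCount != 1)
      continue;
    int M = Mask[DiffPos];
    if (M < 0 || M >= int(2 * N))
      continue;
    return VextrinsMatch{Base ? 1u : 0u, unsigned(DiffPos),
                         unsigned(M) < N ? 0u : 1u,
                         unsigned(M) < N ? unsigned(M) : unsigned(M) - N};
  }
  return std::nullopt;
}

int main() {
  // <0, 1, 2, 3, 4, 5, 6, 8>: identity of V1 with lane 7 taken from V2[0].
  std::vector<int> Mask = {0, 1, 2, 3, 4, 5, 6, 8};
  if (auto M = matchVextrins(Mask))
    std::printf("insert V%u[%u] into V%u[%u]\n", M->SrcVec + 1, M->SrcIdx,
                M->DstVec + 1, M->DstIdx);
  return 0;
}

In the patch itself this decision is not emitted directly as a machine node; the shuffle is rewritten as EXTRACT_VECTOR_ELT followed by INSERT_VECTOR_ELT, and the existing tablegen patterns then select the vextrins.{b/h/w/d} instruction, as the updated FileCheck lines above show.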