diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 637c2c71b0241..6afaea3f3fc5c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -908,6 +908,18 @@ class LegalizeRuleSet { LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize)); } + /// Widen the scalar or vector element type to the next power of two that is + /// at least MinSize. No effect if the size is already a pow2 >= MinSize. + LegalizeRuleSet &widenScalarOrEltToNextPow2OrMinSize(unsigned TypeIdx, + unsigned MinSize = 0) { + using namespace LegalityPredicates; + return actionIf( + LegalizeAction::WidenScalar, + any(scalarOrEltNarrowerThan(TypeIdx, MinSize), + scalarOrEltSizeNotPow2(typeIdx(TypeIdx))), + LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize)); + } + LegalizeRuleSet &narrowScalar(unsigned TypeIdx, LegalizeMutation Mutation) { using namespace LegalityPredicates; return actionIf(LegalizeAction::NarrowScalar, isScalar(typeIdx(TypeIdx)), diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 8079f853aef85..1d016e684c48f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2495,6 +2495,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_OR: case TargetOpcode::G_XOR: case TargetOpcode::G_SUB: + case TargetOpcode::G_SHUFFLE_VECTOR: // Perform operation at larger width (any extension is fines here, high bits // don't affect the result) and then truncate the result back to the // original type. 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 117c4004d41df..33f04e6ad0c76 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -956,6 +956,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) }, changeTo(1, 0)) .moreElementsToNextPow2(0) + .widenScalarOrEltToNextPow2OrMinSize(0, 8) + .clampNumElements(0, v8s8, v16s8) + .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v4s32, v4s32) .clampNumElements(0, v2s64, v2s64) .moreElementsIf( diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 5a8031641ae09..3cee2de4f5df8 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -780,6 +780,8 @@ bool matchScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI) { auto &Unmerge = cast(MI); Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1); const LLT SrcTy = MRI.getType(Src1Reg); + if (SrcTy.getSizeInBits() != 128 && SrcTy.getSizeInBits() != 64) + return false; return SrcTy.isVector() && !SrcTy.isScalable() && Unmerge.getNumOperands() == (unsigned)SrcTy.getNumElements() + 1; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir index 4879ffd28784c..63a26dcfea476 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-select.mir @@ -287,39 +287,47 @@ body: | ; CHECK-NEXT: %q0:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: %q1:_(<4 x s32>) = COPY $q1 ; CHECK-NEXT: %q2:_(<4 x s32>) = COPY $q2 - ; CHECK-NEXT: %vec_cond0:_(<4 x s1>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q1 - ; CHECK-NEXT: %vec_cond1:_(<4 x s1>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q2 + 
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q1 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(<4 x s32>) = G_ICMP intpred(eq), %q0(<4 x s32>), %q2 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4100 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32) - ; CHECK-NEXT: %cmp:_(s1) = G_ICMP intpred(eq), %w0(s32), [[C]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %cmp(s1) - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ZEXT]], 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[SEXT_INREG]](s32) - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s1>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s1>) = G_INSERT_VECTOR_ELT [[DEF]], [[TRUNC]](s1), [[C2]](s64) - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s1>) = G_SHUFFLE_VECTOR [[IVEC]](<4 x s1>), [[DEF]], shufflemask(0, 0, 0, 0) - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 1 - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[C3]](s8) - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s1>) = G_BUILD_VECTOR [[TRUNC1]](s1), [[TRUNC1]](s1), [[TRUNC1]](s1), [[TRUNC1]](s1) - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[SHUF]](<4 x s1>) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[BUILD_VECTOR1]](<4 x s1>) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[ANYEXT]], [[ANYEXT1]] - ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[XOR]](<4 x s16>) - ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT %vec_cond0(<4 x s1>) - ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[SHUF]](<4 x s1>) - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[ANYEXT2]], [[ANYEXT3]] - ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[AND]](<4 x s16>) - ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT %vec_cond1(<4 x s1>) - ; 
CHECK-NEXT: [[ANYEXT5:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[TRUNC2]](<4 x s1>) - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[ANYEXT4]], [[ANYEXT5]] - ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s1>) = G_TRUNC [[AND1]](<4 x s16>) - ; CHECK-NEXT: [[ANYEXT6:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[TRUNC3]](<4 x s1>) - ; CHECK-NEXT: [[ANYEXT7:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[TRUNC4]](<4 x s1>) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[ANYEXT6]], [[ANYEXT7]] - ; CHECK-NEXT: %select:_(<4 x s1>) = G_TRUNC [[OR]](<4 x s16>) - ; CHECK-NEXT: %zext_select:_(<4 x s32>) = G_ZEXT %select(<4 x s1>) + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), %w0(s32), [[C]] + ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ICMP2]], 1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[DEF1]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY]](s16), [[COPY1]](s16), [[COPY2]](s16), [[DEF1]](s16) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32) + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s16>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[TRUNC]](s16), [[C1]](s64) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[IVEC]](<4 x s16>) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16) + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16) + ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16) + ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC1]](s8), [[TRUNC2]](s8), 
[[TRUNC3]](s8), [[TRUNC4]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8), [[DEF2]](s8) + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<8 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 0, 0, 0, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s8>), [[UV5:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[SHUF]](<8 x s8>) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[C2]](s16) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s16) = COPY [[C2]](s16) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s16) = COPY [[C2]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[COPY3]](s16), [[COPY4]](s16), [[COPY5]](s16), [[C2]](s16) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[UV4]](<4 x s8>) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<4 x s16>) = G_XOR [[ANYEXT]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP]](<4 x s32>) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[UV4]](<4 x s8>) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC5]], [[ANYEXT1]] + ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[ICMP1]](<4 x s32>) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[TRUNC6]], [[XOR]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[AND]], [[AND1]] + ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(<4 x s32>) = G_ANYEXT [[OR]](<4 x s16>) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C3]](s32), [[C3]](s32), [[C3]](s32), [[C3]](s32) + ; CHECK-NEXT: %zext_select:_(<4 x s32>) = G_AND [[ANYEXT2]], [[BUILD_VECTOR4]] ; CHECK-NEXT: $q0 = COPY %zext_select(<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %w0:_(s32) = COPY $w0 diff --git 
a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index df59eb8e629f4..b408bc1c38976 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -3,17 +3,7 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; CHECK-GI: warning: Instruction selection used fallback path for shufflevector_v2i1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v4i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v32i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v2i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v16i16 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v2i1_zeroes -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v4i8_zeroes -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v32i8_zeroes -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v2i16_zeroes -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v16i16_zeroes -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v3i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shufflevector_v3i8_zeroes ; ===== Legal Vector Types ===== @@ -205,68 +195,142 @@ define <2 x i1> @shufflevector_v2i1(<2 x i1> %a, <2 x i1> %b){ } define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){ -; CHECK-LABEL: shufflevector_v4i8: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ext v0.8b, v1.8b, v0.8b, #6 -; CHECK-NEXT: zip1 v1.4h, v1.4h, v0.4h -; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: 
fmov w0, s0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v4i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ext v0.8b, v1.8b, v0.8b, #6 +; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v4i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov h2, v0.h[1] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov h3, v1.h[1] +; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: mov h4, v0.h[2] +; CHECK-GI-NEXT: mov h5, v0.h[3] +; CHECK-GI-NEXT: mov h6, v1.h[3] +; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] +; CHECK-GI-NEXT: mov h2, v1.h[2] +; CHECK-GI-NEXT: mov v1.b[1], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[2], v4.b[0] +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v5.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v6.b[0] +; CHECK-GI-NEXT: mov v0.b[4], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[4], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[7], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[7], v0.b[0] +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0] +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> %d = bitcast <4 x i8> %c to i32 ret i32 %d } define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b){ -; CHECK-LABEL: shufflevector_v32i8: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 def $q1_q2 -; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: adrp x9, .LCPI16_1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: 
ldr q3, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI16_1] -; CHECK-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v3.16b -; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v32i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 def $q1_q2 +; CHECK-SD-NEXT: adrp x8, .LCPI16_0 +; CHECK-SD-NEXT: adrp x9, .LCPI16_1 +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI16_0] +; CHECK-SD-NEXT: ldr q4, [x9, :lo12:.LCPI16_1] +; CHECK-SD-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v3.16b +; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v32i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v3.16b, v0.16b +; CHECK-GI-NEXT: adrp x8, .LCPI16_1 +; CHECK-GI-NEXT: adrp x9, .LCPI16_0 +; CHECK-GI-NEXT: mov v4.16b, v2.16b +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI16_1] +; CHECK-GI-NEXT: ldr q1, [x9, :lo12:.LCPI16_0] +; CHECK-GI-NEXT: tbl v0.16b, { v3.16b, v4.16b }, v0.16b +; CHECK-GI-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v1.16b +; CHECK-GI-NEXT: ret %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %c } define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ -; CHECK-LABEL: shufflevector_v2i16: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: ldr w0, [sp, #12] -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v2i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [sp, #12] +; CHECK-SD-NEXT: strh w8, [sp, #14] +; CHECK-SD-NEXT: ldr w0, [sp, #12] +; 
CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v2i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: adrp x8, .LCPI17_0 +; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v3.h[0] +; CHECK-GI-NEXT: mov v0.h[2], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[2], v0.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[3], v0.h[0] +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0] +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> %d = bitcast <2 x i16> %c to i32 ret i32 %d } define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){ -; CHECK-LABEL: shufflevector_v16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q2 killed $q2 def $q1_q2 -; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: adrp x9, .LCPI18_1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_0] -; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI18_1] -; CHECK-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v3.16b -; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v4.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v16i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $q2 killed $q2 def $q1_q2 +; CHECK-SD-NEXT: adrp x8, .LCPI18_0 +; CHECK-SD-NEXT: adrp x9, .LCPI18_1 +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI18_0] +; CHECK-SD-NEXT: ldr q4, [x9, :lo12:.LCPI18_1] +; CHECK-SD-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v3.16b +; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v4.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v16i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov v3.16b, v0.16b +; CHECK-GI-NEXT: adrp x8, .LCPI18_1 +; CHECK-GI-NEXT: adrp x9, 
.LCPI18_0 +; CHECK-GI-NEXT: mov v4.16b, v2.16b +; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI18_1] +; CHECK-GI-NEXT: ldr q1, [x9, :lo12:.LCPI18_0] +; CHECK-GI-NEXT: tbl v0.16b, { v3.16b, v4.16b }, v0.16b +; CHECK-GI-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v1.16b +; CHECK-GI-NEXT: ret %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %c } @@ -332,16 +396,23 @@ define <2 x i1> @shufflevector_v2i1_zeroes(<2 x i1> %a, <2 x i1> %b){ } define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){ -; CHECK-LABEL: shufflevector_v4i8_zeroes: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.4h, v0.h[0] -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v4i8_zeroes: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: dup v0.4h, v0.h[0] +; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v4i8_zeroes: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: dup v0.8b, w8 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> %d = bitcast <4 x i8> %c to i32 ret i32 %d @@ -358,19 +429,26 @@ define <32 x i8> @shufflevector_v32i8_zeroes(<32 x i8> %a, <32 x i8> %b){ } define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ -; CHECK-LABEL: shufflevector_v2i16_zeroes: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v1.2s, v0.s[0] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: strh w8, [sp, #14] -; 
CHECK-NEXT: ldr w0, [sp, #12] -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v2i16_zeroes: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: dup v1.2s, v0.s[0] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [sp, #12] +; CHECK-SD-NEXT: mov w8, v1.s[1] +; CHECK-SD-NEXT: strh w8, [sp, #14] +; CHECK-SD-NEXT: ldr w0, [sp, #12] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v2i16_zeroes: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: dup v0.4h, w8 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> %d = bitcast <2 x i16> %c to i32 ret i32 %d @@ -417,12 +495,45 @@ define <4 x i64> @shufflevector_v4i64_zeroes(<4 x i64> %a, <4 x i64> %b) { ; ===== Vectors with Non-Pow 2 Widths ===== define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) { -; CHECK-LABEL: shufflevector_v3i8: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, w1 -; CHECK-NEXT: mov w1, w2 -; CHECK-NEXT: mov w2, w4 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v3i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w0, w1 +; CHECK-SD-NEXT: mov w1, w2 +; CHECK-SD-NEXT: mov w2, w4 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v3i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: adrp x8, .LCPI30_0 +; CHECK-GI-NEXT: fmov s2, w3 +; CHECK-GI-NEXT: fmov s3, w4 +; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w2 +; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] +; CHECK-GI-NEXT: fmov s3, w5 +; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[4], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[4], 
v0.b[0] +; CHECK-GI-NEXT: mov v0.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[7], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[7], v0.b[0] +; CHECK-GI-NEXT: mov v0.d[1], v2.d[0] +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 +; CHECK-GI-NEXT: ret %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> ret <3 x i8> %c } @@ -517,11 +628,21 @@ define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) { ; ===== Vectors with Non-Pow 2 Widths with Zero Masks ===== define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) { -; CHECK-LABEL: shufflevector_v3i8_zeroes: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w1, w0 -; CHECK-NEXT: mov w2, w0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shufflevector_v3i8_zeroes: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w1, w0 +; CHECK-SD-NEXT: mov w2, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shufflevector_v3i8_zeroes: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v0.8b, w0 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 +; CHECK-GI-NEXT: ret %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> ret <3 x i8> %c }