diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index fd04dfde95c2a..d9a2035e8a0ee 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -1166,6 +1166,22 @@ let params = T.Int32 in {
   defm vsbc: vadcsbc;
 }
 
+let params = T.Int in {
+  def vshlcq: Intrinsic<
+    Vector, (args Vector:$v, Ptr<u32>:$ps, imm_1to32:$imm),
+    (seq (load $ps):$s,
+         (IRInt<"vshlc", [Vector]> $v, $s, $imm):$pair,
+         (store (xval $pair, 0), $ps),
+         (xval $pair, 1))>;
+  def vshlcq_m: Intrinsic<
+    Vector, (args Vector:$v, Ptr<u32>:$ps, imm_1to32:$imm, Predicate:$pred),
+    (seq (load $ps):$s,
+         (IRInt<"vshlc_predicated", [Vector, Predicate]>
+              $v, $s, $imm, $pred):$pair,
+         (store (xval $pair, 0), $ps),
+         (xval $pair, 1))>;
+}
+
 multiclass VectorComplexAddPred<dag not_halving, dag angle> {
   def "" : Intrinsic<Vector, (args Vector:$a, Vector:$b),
       (IRInt<"vcaddq", [Vector]> not_halving, angle, $a, $b)>;
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vshlc.c b/clang/test/CodeGen/arm-mve-intrinsics/vshlc.c
new file mode 100644
index 0000000000000..1a53a90f26fac
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vshlc.c
@@ -0,0 +1,221 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vshlcq_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 18)
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: ret <16 x i8> [[TMP3]]
+//
+int8x16_t test_vshlcq_s8(int8x16_t a, uint32_t *b) {
+#ifdef POLYMORPHIC
+  return vshlcq(a, b, 18);
+#else /* POLYMORPHIC */
+  return vshlcq_s8(a, b, 18);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
+int16x8_t test_vshlcq_s16(int16x8_t a, uint32_t *b) {
+#ifdef POLYMORPHIC
+  return vshlcq(a, b, 16);
+#else /* POLYMORPHIC */
+  return vshlcq_s16(a, b, 16);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
+int32x4_t test_vshlcq_s32(int32x4_t a, uint32_t *b) {
+#ifdef POLYMORPHIC
+  return vshlcq(a, b, 4);
+#else /* POLYMORPHIC */
+  return vshlcq_s32(a, b, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 17)
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 0
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 1
+// CHECK-NEXT: ret <16 x i8> [[TMP3]]
+//
+uint8x16_t test_vshlcq_u8(uint8x16_t a, uint32_t *b) {
+#ifdef POLYMORPHIC
+  return vshlcq(a, b, 17);
+#else /* POLYMORPHIC */
+  return vshlcq_u8(a, b, 17);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 17)
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 0
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 1
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
+uint16x8_t test_vshlcq_u16(uint16x8_t a, uint32_t *b) {
+#ifdef POLYMORPHIC
+  return vshlcq(a, b, 17);
+#else /* POLYMORPHIC */
+  return vshlcq_u16(a, b, 17);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 20)
+// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 0
+// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 1
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
+uint32x4_t test_vshlcq_u32(uint32x4_t a, uint32_t *b) {
+#ifdef POLYMORPHIC
+  return vshlcq(a, b, 20);
+#else /* POLYMORPHIC */
+  return vshlcq_u32(a, b, 20);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_m_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 29, <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 0
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 1
+// CHECK-NEXT: ret <16 x i8> [[TMP5]]
+//
+int8x16_t test_vshlcq_m_s8(int8x16_t a, uint32_t *b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vshlcq_m(a, b, 29, p);
+#else /* POLYMORPHIC */
+  return vshlcq_m_s8(a, b, 29, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_m_s16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 17, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 0
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 1
+// CHECK-NEXT: ret <8 x i16> [[TMP5]]
+//
+int16x8_t test_vshlcq_m_s16(int16x8_t a, uint32_t *b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vshlcq_m(a, b, 17, p);
+#else /* POLYMORPHIC */
+  return vshlcq_m_s16(a, b, 17, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_m_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 9, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 0
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 1
+// CHECK-NEXT: ret <4 x i32> [[TMP5]]
+//
+int32x4_t test_vshlcq_m_s32(int32x4_t a, uint32_t *b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vshlcq_m(a, b, 9, p);
+#else /* POLYMORPHIC */
+  return vshlcq_m_s32(a, b, 9, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_m_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 21, <16 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 0
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 1
+// CHECK-NEXT: ret <16 x i8> [[TMP5]]
+//
+uint8x16_t test_vshlcq_m_u8(uint8x16_t a, uint32_t *b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vshlcq_m(a, b, 21, p);
+#else /* POLYMORPHIC */
+  return vshlcq_m_u8(a, b, 21, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_m_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 24, <8 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 0
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 1
+// CHECK-NEXT: ret <8 x i16> [[TMP5]]
+//
+uint16x8_t test_vshlcq_m_u16(uint16x8_t a, uint32_t *b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vshlcq_m(a, b, 24, p);
+#else /* POLYMORPHIC */
+  return vshlcq_m_u16(a, b, 24, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vshlcq_m_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 26, <4 x i1> [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 0
+// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 1
+// CHECK-NEXT: ret <4 x i32> [[TMP5]]
+//
+uint32x4_t test_vshlcq_m_u32(uint32x4_t a, uint32_t *b, mve_pred16_t p) {
+#ifdef POLYMORPHIC
+  return vshlcq_m(a, b, 26, p);
+#else /* POLYMORPHIC */
+  return vshlcq_m_u32(a, b, 26, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 1325085436009..9018ea1e081a1 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1031,6 +1031,14 @@ def int_arm_mve_vsbc_predicated: Intrinsic<
    [llvm_anyvector_ty, llvm_i32_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
     llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+def int_arm_mve_vshlc: Intrinsic<
+   [llvm_i32_ty /* bits shifted out */, llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty /* bits shifted in */,
+    llvm_i32_ty /* shift count */], [IntrNoMem]>;
+def int_arm_mve_vshlc_predicated: Intrinsic<
+   [llvm_i32_ty /* bits shifted out */, llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty /* bits shifted in */,
+    llvm_i32_ty /* shift count */, llvm_anyvector_ty], [IntrNoMem]>;
 def int_arm_mve_vmulh: Intrinsic<
    [llvm_anyvector_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 97283aa29c9d6..69f4ceb5c9247 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -237,6 +237,10 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
   void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
                          uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);
 
+  /// SelectMVE_VSHLC - Select MVE intrinsics for a shift that carries between
+  /// vector lanes.
+  void SelectMVE_VSHLC(SDNode *N, bool Predicated);
+
   /// Select long MVE vector reductions with two vector operands
   /// Stride is the number of vector element widths the instruction can operate
   /// on:
@@ -2569,6 +2573,25 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
 }
 
+void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) {
+  SDLoc Loc(N);
+  SmallVector<SDValue, 4> Ops;
+
+  // One vector input, followed by a 32-bit word of bits to shift in
+  // and then an immediate shift count
+  Ops.push_back(N->getOperand(1));
+  Ops.push_back(N->getOperand(2));
+  int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+  Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
+
+  if (Predicated)
+    AddMVEPredicateToOps(Ops, Loc, N->getOperand(4));
+  else
+    AddEmptyMVEPredicateToOps(Ops, Loc);
+
+  CurDAG->SelectNodeTo(N, ARM::MVE_VSHLC, N->getVTList(), makeArrayRef(Ops));
+}
+
 static bool SDValueToConstBool(SDValue SDVal) {
   assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
   ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
@@ -4593,6 +4616,10 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI,
                       false, IntNo == Intrinsic::arm_mve_vsbc_predicated);
     return;
+  case Intrinsic::arm_mve_vshlc:
+  case Intrinsic::arm_mve_vshlc_predicated:
+    SelectMVE_VSHLC(N, IntNo == Intrinsic::arm_mve_vshlc_predicated);
+    return;
 
   case Intrinsic::arm_mve_vmlldava:
   case Intrinsic::arm_mve_vmlldava_predicated: {
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vshlc.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vshlc.ll
new file mode 100644
index 0000000000000..cc5fd36bf066a
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vshlc.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_s8(<16 x i8> %a, i32* nocapture %b) {
+; CHECK-LABEL: test_vshlcq_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vshlc q0, r1, #18
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 18)
+  %2 = extractvalue { i32, <16 x i8> } %1, 0
+  store i32 %2, i32* %b, align 4
+  %3 = extractvalue { i32, <16 x i8> } %1, 1
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_s16(<8 x i16> %a, i32* nocapture %b) {
+; CHECK-LABEL: test_vshlcq_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vshlc q0, r1, #16
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 16)
+  %2 = extractvalue { i32, <8 x i16> } %1, 0
+  store i32 %2, i32* %b, align 4
+  %3 = extractvalue { i32, <8 x i16> } %1, 1
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_s32(<4 x i32> %a, i32* nocapture %b) {
+; CHECK-LABEL: test_vshlcq_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vshlc q0, r1, #4
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 4)
+  %2 = extractvalue { i32, <4 x i32> } %1, 0
+  store i32 %2, i32* %b, align 4
+  %3 = extractvalue { i32, <4 x i32> } %1, 1
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_u8(<16 x i8> %a, i32* nocapture %b) {
+; CHECK-LABEL: test_vshlcq_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vshlc q0, r1, #17
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 17)
+  %2 = extractvalue { i32, <16 x i8> } %1, 0
+  store i32 %2, i32* %b, align 4
+  %3 = extractvalue { i32, <16 x i8> } %1, 1
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_u16(<8 x i16> %a, i32* nocapture %b) {
+; CHECK-LABEL: test_vshlcq_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vshlc q0, r1, #17
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 17)
+  %2 = extractvalue { i32, <8 x i16> } %1, 0
+  store i32 %2, i32* %b, align 4
+  %3 = extractvalue { i32, <8 x i16> } %1, 1
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_u32(<4 x i32> %a, i32* nocapture %b) {
+; CHECK-LABEL: test_vshlcq_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vshlc q0, r1, #20
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 20)
+  %2 = extractvalue { i32, <4 x i32> } %1, 0
+  store i32 %2, i32* %b, align 4
+  %3 = extractvalue { i32, <4 x i32> } %1, 1
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_s8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlcq_m_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlct q0, r1, #29
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 29, <16 x i1> %2)
+  %4 = extractvalue { i32, <16 x i8> } %3, 0
+  store i32 %4, i32* %b, align 4
+  %5 = extractvalue { i32, <16 x i8> } %3, 1
+  ret <16 x i8> %5
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_s16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlcq_m_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlct q0, r1, #17
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 17, <8 x i1> %2)
+  %4 = extractvalue { i32, <8 x i16> } %3, 0
+  store i32 %4, i32* %b, align 4
+  %5 = extractvalue { i32, <8 x i16> } %3, 1
+  ret <8 x i16> %5
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_s32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlcq_m_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlct q0, r1, #9
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 9, <4 x i1> %2)
+  %4 = extractvalue { i32, <4 x i32> } %3, 0
+  store i32 %4, i32* %b, align 4
+  %5 = extractvalue { i32, <4 x i32> } %3, 1
+  ret <4 x i32> %5
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_u8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlcq_m_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlct q0, r1, #21
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 21, <16 x i1> %2)
+  %4 = extractvalue { i32, <16 x i8> } %3, 0
+  store i32 %4, i32* %b, align 4
+  %5 = extractvalue { i32, <16 x i8> } %3, 1
+  ret <16 x i8> %5
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_u16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlcq_m_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlct q0, r1, #24
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 24, <8 x i1> %2)
+  %4 = extractvalue { i32, <8 x i16> } %3, 0
+  store i32 %4, i32* %b, align 4
+  %5 = extractvalue { i32, <8 x i16> } %3, 1
+  ret <8 x i16> %5
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_u32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vshlcq_m_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vshlct q0, r1, #26
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 26, <4 x i1> %2)
+  %4 = extractvalue { i32, <4 x i32> } %3, 0
+  store i32 %4, i32* %b, align 4
+  %5 = extractvalue { i32, <4 x i32> } %3, 1
+  ret <4 x i32> %5
+}
+
+declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8>, i32, i32)
+declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
+declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32>, i32, i32)
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
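
For reference, the tablegen 'seq' above makes the user-facing semantics easy to read off: vshlcq loads the carry word through its pointer argument, passes it to the IR intrinsic as the "bits shifted in", stores the "bits shifted out" result back through the pointer, and returns the shifted vector. Chaining calls therefore lets a long buffer be shifted as one continuous bit string. A minimal usage sketch in C follows; the helper function, its name, and the buffer layout (element 0 least significant) are illustrative only and not part of this patch:

    #include <arm_mve.h>

    /* Hypothetical helper: shift an array of 128-bit vectors left by 4
     * bits, treating the whole array as one contiguous bit string with
     * v[0] holding the least significant 128 bits. The shift count must
     * be a compile-time constant in [1,32]. */
    uint32_t shift_buffer_left4(uint32x4_t *v, int nvecs) {
      uint32_t carry = 0; /* bits shifted into the bottom of v[0] */
      for (int i = 0; i < nvecs; i++)
        v[i] = vshlcq(v[i], &carry, 4); /* polymorphic form of vshlcq_u32 */
      return carry; /* bits shifted out of the top of v[nvecs - 1] */
    }

Each iteration consumes the carry produced by the previous one, mirroring how the VSHLC instruction threads its RdM register across a multi-vector shift.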