clang/test/CodeGen/arm-mve-intrinsics/vshlc.c (new file, 221 additions)
@@ -0,0 +1,221 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

#include <arm_mve.h>

// CHECK-LABEL: @test_vshlcq_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 18)
// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 0
// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 1
// CHECK-NEXT: ret <16 x i8> [[TMP3]]
//
int8x16_t test_vshlcq_s8(int8x16_t a, uint32_t *b) {
#ifdef POLYMORPHIC
return vshlcq(a, b, 18);
#else /* POLYMORPHIC */
return vshlcq_s8(a, b, 18);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 16)
// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 0
// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 1
// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
int16x8_t test_vshlcq_s16(int16x8_t a, uint32_t *b) {
#ifdef POLYMORPHIC
return vshlcq(a, b, 16);
#else /* POLYMORPHIC */
return vshlcq_s16(a, b, 16);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 4)
// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 0
// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 1
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
int32x4_t test_vshlcq_s32(int32x4_t a, uint32_t *b) {
#ifdef POLYMORPHIC
return vshlcq(a, b, 4);
#else /* POLYMORPHIC */
return vshlcq_s32(a, b, 4);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 17)
// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 0
// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <16 x i8> } [[TMP1]], 1
// CHECK-NEXT: ret <16 x i8> [[TMP3]]
//
uint8x16_t test_vshlcq_u8(uint8x16_t a, uint32_t *b) {
#ifdef POLYMORPHIC
return vshlcq(a, b, 17);
#else /* POLYMORPHIC */
return vshlcq_u8(a, b, 17);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 17)
// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 0
// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <8 x i16> } [[TMP1]], 1
// CHECK-NEXT: ret <8 x i16> [[TMP3]]
//
uint16x8_t test_vshlcq_u16(uint16x8_t a, uint32_t *b) {
#ifdef POLYMORPHIC
return vshlcq(a, b, 17);
#else /* POLYMORPHIC */
return vshlcq_u16(a, b, 17);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 20)
// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 0
// CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, <4 x i32> } [[TMP1]], 1
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
uint32x4_t test_vshlcq_u32(uint32x4_t a, uint32_t *b) {
#ifdef POLYMORPHIC
return vshlcq(a, b, 20);
#else /* POLYMORPHIC */
return vshlcq_u32(a, b, 20);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_m_s8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 29, <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 0
// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 1
// CHECK-NEXT: ret <16 x i8> [[TMP5]]
//
int8x16_t test_vshlcq_m_s8(int8x16_t a, uint32_t *b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vshlcq_m(a, b, 29, p);
#else /* POLYMORPHIC */
return vshlcq_m_s8(a, b, 29, p);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_m_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 17, <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 0
// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 1
// CHECK-NEXT: ret <8 x i16> [[TMP5]]
//
int16x8_t test_vshlcq_m_s16(int16x8_t a, uint32_t *b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vshlcq_m(a, b, 17, p);
#else /* POLYMORPHIC */
return vshlcq_m_s16(a, b, 17, p);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_m_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 9, <4 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 0
// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 1
// CHECK-NEXT: ret <4 x i32> [[TMP5]]
//
int32x4_t test_vshlcq_m_s32(int32x4_t a, uint32_t *b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vshlcq_m(a, b, 9, p);
#else /* POLYMORPHIC */
return vshlcq_m_s32(a, b, 9, p);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_m_u8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 [[TMP0]], i32 21, <16 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 0
// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <16 x i8> } [[TMP3]], 1
// CHECK-NEXT: ret <16 x i8> [[TMP5]]
//
uint8x16_t test_vshlcq_m_u8(uint8x16_t a, uint32_t *b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vshlcq_m(a, b, 21, p);
#else /* POLYMORPHIC */
return vshlcq_m_u8(a, b, 21, p);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_m_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 [[TMP0]], i32 24, <8 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 0
// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <8 x i16> } [[TMP3]], 1
// CHECK-NEXT: ret <8 x i16> [[TMP5]]
//
uint16x8_t test_vshlcq_m_u16(uint16x8_t a, uint32_t *b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vshlcq_m(a, b, 24, p);
#else /* POLYMORPHIC */
return vshlcq_m_u16(a, b, 24, p);
#endif /* POLYMORPHIC */
}

// CHECK-LABEL: @test_vshlcq_m_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 [[TMP0]], i32 26, <4 x i1> [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 0
// CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, <4 x i32> } [[TMP3]], 1
// CHECK-NEXT: ret <4 x i32> [[TMP5]]
//
uint32x4_t test_vshlcq_m_u32(uint32x4_t a, uint32_t *b, mve_pred16_t p) {
#ifdef POLYMORPHIC
return vshlcq_m(a, b, 26, p);
#else /* POLYMORPHIC */
return vshlcq_m_u32(a, b, 26, p);
#endif /* POLYMORPHIC */
}
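
All twelve tests above share one shape: load the 32-bit carry word, call the intrinsic with a constant shift count between 1 and 32, store the updated carry word back, and return the shifted vector. For intuition, here is a rough scalar model of what VSHLC computes, assuming the Armv8.1-M whole-vector semantics (the 128-bit vector shifts left, the vacated low bits come from the carry word, and the bits shifted out of the top replace it). This is an illustrative sketch, not the compiler's or the architecture's definition.

#include <stdint.h>

/* Rough scalar model of VSHLC (assumed semantics; illustrative only).
   The vector is viewed as four 32-bit words, v[0] lowest. It shifts left
   by 'shift' (expected 1..32); the low 'shift' bits of *carry fill the
   vacated bottom, and the top 'shift' bits of the old vector are written
   back to *carry. */
static void vshlc_model(uint32_t v[4], uint32_t *carry, unsigned shift) {
  uint32_t in = (shift < 32) ? (*carry & ((1u << shift) - 1)) : *carry;
  for (int i = 0; i < 4; i++) {
    uint64_t wide = ((uint64_t)v[i] << shift) | in;
    v[i] = (uint32_t)wide;       /* bits that stay in this word */
    in = (uint32_t)(wide >> 32); /* bits carried into the next word */
  }
  *carry = in;                   /* bits shifted out of the top */
}

Chaining successive calls through the same carry word is what lets a sequence of VSHLCs shift a bitstring spanning several vectors, which appears to be the intended use of the in/out pointer parameter.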
llvm/include/llvm/IR/IntrinsicsARM.td (15 additions, 0 deletions)
@@ -1020,10 +1020,25 @@ def int_arm_mve_vabd: Intrinsic<
def int_arm_mve_vadc: Intrinsic<
[llvm_anyvector_ty, llvm_i32_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
def int_arm_mve_vsbc: Intrinsic<
[llvm_anyvector_ty, llvm_i32_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
def int_arm_mve_vadc_predicated: Intrinsic<
[llvm_anyvector_ty, llvm_i32_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
def int_arm_mve_vsbc_predicated: Intrinsic<
[llvm_anyvector_ty, llvm_i32_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
def int_arm_mve_vshlc: Intrinsic<
[llvm_i32_ty /* bits shifted out */, llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_i32_ty /* bits shifted in */,
llvm_i32_ty /* shift count */], [IntrNoMem]>;
def int_arm_mve_vshlc_predicated: Intrinsic<
[llvm_i32_ty /* bits shifted out */, llvm_anyvector_ty],
[LLVMMatchType<0>, llvm_i32_ty /* bits shifted in */,
llvm_i32_ty /* shift count */, llvm_anyvector_ty], [IntrNoMem]>;
def int_arm_mve_vmulh: Intrinsic<
[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */],
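
The comments in the vshlc definitions above spell out the operand layout: the intrinsic returns the bits shifted out together with the shifted vector, and takes the vector, the bits to shift in, and an immediate shift count. At the C level those two results surface as a returned vector plus a carry word updated through a pointer. A minimal usage sketch, assuming only the ACLE vshlcq_u32 intrinsic already exercised in the clang tests above:

#include <arm_mve.h>

/* Minimal usage sketch (requires an MVE target, e.g.
   -march=armv8.1-m.main+mve). The IR intrinsic's two results surface in
   C as the returned vector and the carry word updated through 'carry':
   on entry it supplies the bits shifted in, on exit it holds the bits
   shifted out. */
uint32x4_t shift_quad_left_4(uint32x4_t v, uint32_t *carry) {
  return vshlcq_u32(v, carry, 4);
}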
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp (32 additions, 0 deletions)
@@ -237,6 +237,10 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);

/// SelectMVE_VSHLC - Select MVE intrinsics for a shift that carries between
/// vector lanes.
void SelectMVE_VSHLC(SDNode *N, bool Predicated);

/// Select long MVE vector reductions with two vector operands
/// Stride is the number of vector element widths the instruction can operate
/// on:
@@ -2569,6 +2573,25 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
}

void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) {
SDLoc Loc(N);
SmallVector<SDValue, 8> Ops;

// One vector input, followed by a 32-bit word of bits to shift in
// and then an immediate shift count
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(2));
int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count

if (Predicated)
AddMVEPredicateToOps(Ops, Loc, N->getOperand(4));
else
AddEmptyMVEPredicateToOps(Ops, Loc);

CurDAG->SelectNodeTo(N, ARM::MVE_VSHLC, N->getVTList(), makeArrayRef(Ops));
}

static bool SDValueToConstBool(SDValue SDVal) {
assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
@@ -4588,6 +4611,15 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true,
IntNo == Intrinsic::arm_mve_vadc_predicated);
return;
case Intrinsic::arm_mve_vsbc:
case Intrinsic::arm_mve_vsbc_predicated:
SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, false,
IntNo == Intrinsic::arm_mve_vsbc_predicated);
return;
case Intrinsic::arm_mve_vshlc:
case Intrinsic::arm_mve_vshlc_predicated:
SelectMVE_VSHLC(N, IntNo == Intrinsic::arm_mve_vshlc_predicated);
return;

case Intrinsic::arm_mve_vmlldava:
case Intrinsic::arm_mve_vmlldava_predicated: {
llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll (184 additions, 0 deletions)
@@ -96,3 +96,187 @@ entry:
%8 = extractvalue { <4 x i32>, i32 } %4, 0
ret <4 x i32> %8
}

declare { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32>, <4 x i32>, i32)

define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_s32(<4 x i32> %a, <4 x i32> %b, i32* nocapture %carry_out) {
; CHECK-LABEL: test_vsbciq_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vsbci.i32 q0, q0, q1
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
%2 = lshr i32 %1, 29
%3 = and i32 %2, 1
store i32 %3, i32* %carry_out, align 4
%4 = extractvalue { <4 x i32>, i32 } %0, 0
ret <4 x i32> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_u32(<4 x i32> %a, <4 x i32> %b, i32* nocapture %carry_out) {
; CHECK-LABEL: test_vsbciq_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vsbci.i32 q0, q0, q1
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
%2 = lshr i32 %1, 29
%3 = and i32 %2, 1
store i32 %3, i32* %carry_out, align 4
%4 = extractvalue { <4 x i32>, i32 } %0, 0
ret <4 x i32> %4
}

define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_s32(<4 x i32> %a, <4 x i32> %b, i32* nocapture %carry) {
; CHECK-LABEL: test_vsbcq_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: lsls r1, r1, #29
; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
; CHECK-NEXT: vsbc.i32 q0, q0, q1
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %carry, align 4
%1 = shl i32 %0, 29
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 1
%4 = lshr i32 %3, 29
%5 = and i32 %4, 1
store i32 %5, i32* %carry, align 4
%6 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %6
}

define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_u32(<4 x i32> %a, <4 x i32> %b, i32* nocapture %carry) {
; CHECK-LABEL: test_vsbcq_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: lsls r1, r1, #29
; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
; CHECK-NEXT: vsbc.i32 q0, q0, q1
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %carry, align 4
%1 = shl i32 %0, 29
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 1
%4 = lshr i32 %3, 29
%5 = and i32 %4, 1
store i32 %5, i32* %carry, align 4
%6 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %6
}

declare { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i32>, i32, <4 x i1>)

define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32* nocapture %carry_out, i16 zeroext %p) {
; CHECK-LABEL: test_vsbciq_m_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vsbcit.i32 q0, q1, q2
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 1
%4 = lshr i32 %3, 29
%5 = and i32 %4, 1
store i32 %5, i32* %carry_out, align 4
%6 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %6
}

define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32* nocapture %carry_out, i16 zeroext %p) {
; CHECK-LABEL: test_vsbciq_m_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vsbcit.i32 q0, q1, q2
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 1
%4 = lshr i32 %3, 29
%5 = and i32 %4, 1
store i32 %5, i32* %carry_out, align 4
%6 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %6
}

define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32* nocapture %carry, i16 zeroext %p) {
; CHECK-LABEL: test_vsbcq_m_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: lsls r1, r2, #29
; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vsbct.i32 q0, q1, q2
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %carry, align 4
%1 = shl i32 %0, 29
%2 = zext i16 %p to i32
%3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
%4 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 %1, <4 x i1> %3)
%5 = extractvalue { <4 x i32>, i32 } %4, 1
%6 = lshr i32 %5, 29
%7 = and i32 %6, 1
store i32 %7, i32* %carry, align 4
%8 = extractvalue { <4 x i32>, i32 } %4, 0
ret <4 x i32> %8
}

define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_u32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32* nocapture %carry, i16 zeroext %p) {
; CHECK-LABEL: test_vsbcq_m_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: lsls r1, r2, #29
; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vsbct.i32 q0, q1, q2
; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %carry, align 4
%1 = shl i32 %0, 29
%2 = zext i16 %p to i32
%3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2)
%4 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 %1, <4 x i1> %3)
%5 = extractvalue { <4 x i32>, i32 } %4, 1
%6 = lshr i32 %5, 29
%7 = and i32 %6, 1
store i32 %7, i32* %carry, align 4
%8 = extractvalue { <4 x i32>, i32 } %4, 0
ret <4 x i32> %8
}
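
The vsbciq tests make the carry-flag convention visible: the flag is read and written through the fpscr_nzcvqc view, where C sits at bit 29 (hence the lsls/ubfx by #29), and VSBCI is the no-carry-operand form with an implied carry-in of 1 (subtraction with no borrow), which is why the vsbciq intrinsics pass 536870912 (bit 29 set). A rough lane-chained model of VSBC, assuming the borrow propagates from the low element upward as in a 128-bit subtraction and the usual ARM convention that a - b - !c is computed as a + ~b + c; again a sketch for intuition, not the instruction's definition:

#include <stdint.h>

/* Rough model of VSBC on four 32-bit lanes, assuming the borrow chains
   from lane 0 upward so the four lanes act as one 128-bit subtraction.
   ARM convention: carry = 1 means "no borrow", and a - b - !c is
   computed as a + ~b + c; VSBCI is the form with an implied carry-in
   of 1. Illustrative sketch only. */
static void vsbc_model(uint32_t d[4], const uint32_t a[4],
                       const uint32_t b[4], uint32_t *carry) {
  uint32_t c = *carry & 1; /* carry-in: 1 = no borrow */
  for (int i = 0; i < 4; i++) {
    uint64_t wide = (uint64_t)a[i] + (uint32_t)~b[i] + c;
    d[i] = (uint32_t)wide;
    c = (uint32_t)(wide >> 32); /* carry-out feeds the next lane */
  }
  *carry = c; /* lands in FPSCR.C, i.e. bit 29 of fpscr_nzcvqc */
}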
llvm/test/CodeGen/Thumb2/mve-intrinsics/vshlc.ll (new file, 228 additions)
@@ -0,0 +1,228 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_s8(<16 x i8> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vshlc q0, r1, #18
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 18)
%2 = extractvalue { i32, <16 x i8> } %1, 0
store i32 %2, i32* %b, align 4
%3 = extractvalue { i32, <16 x i8> } %1, 1
ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_s16(<8 x i16> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vshlc q0, r1, #16
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 16)
%2 = extractvalue { i32, <8 x i16> } %1, 0
store i32 %2, i32* %b, align 4
%3 = extractvalue { i32, <8 x i16> } %1, 1
ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_s32(<4 x i32> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vshlc q0, r1, #4
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 4)
%2 = extractvalue { i32, <4 x i32> } %1, 0
store i32 %2, i32* %b, align 4
%3 = extractvalue { i32, <4 x i32> } %1, 1
ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_u8(<16 x i8> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vshlc q0, r1, #17
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 17)
%2 = extractvalue { i32, <16 x i8> } %1, 0
store i32 %2, i32* %b, align 4
%3 = extractvalue { i32, <16 x i8> } %1, 1
ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_u16(<8 x i16> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vshlc q0, r1, #17
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 17)
%2 = extractvalue { i32, <8 x i16> } %1, 0
store i32 %2, i32* %b, align 4
%3 = extractvalue { i32, <8 x i16> } %1, 1
ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_u32(<4 x i32> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vshlc q0, r1, #20
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 20)
%2 = extractvalue { i32, <4 x i32> } %1, 0
store i32 %2, i32* %b, align 4
%3 = extractvalue { i32, <4 x i32> } %1, 1
ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_s8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlct q0, r1, #29
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 29, <16 x i1> %2)
%4 = extractvalue { i32, <16 x i8> } %3, 0
store i32 %4, i32* %b, align 4
%5 = extractvalue { i32, <16 x i8> } %3, 1
ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_s16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlct q0, r1, #17
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 17, <8 x i1> %2)
%4 = extractvalue { i32, <8 x i16> } %3, 0
store i32 %4, i32* %b, align 4
%5 = extractvalue { i32, <8 x i16> } %3, 1
ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_s32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlct q0, r1, #9
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = zext i16 %p to i32
%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
%3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 9, <4 x i1> %2)
%4 = extractvalue { i32, <4 x i32> } %3, 0
store i32 %4, i32* %b, align 4
%5 = extractvalue { i32, <4 x i32> } %3, 1
ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_u8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlct q0, r1, #21
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 21, <16 x i1> %2)
%4 = extractvalue { i32, <16 x i8> } %3, 0
store i32 %4, i32* %b, align 4
%5 = extractvalue { i32, <16 x i8> } %3, 1
ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_u16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlct q0, r1, #24
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 24, <8 x i1> %2)
%4 = extractvalue { i32, <8 x i16> } %3, 0
store i32 %4, i32* %b, align 4
%5 = extractvalue { i32, <8 x i16> } %3, 1
ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_u32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlct q0, r1, #26
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %b, align 4
%1 = zext i16 %p to i32
%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
%3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 26, <4 x i1> %2)
%4 = extractvalue { i32, <4 x i32> } %3, 0
store i32 %4, i32* %b, align 4
%5 = extractvalue { i32, <4 x i32> } %3, 1
ret <4 x i32> %5
}

declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8>, i32, i32)
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32>, i32, i32)
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)